ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

deep_q_network.py

(5304B)


      1 from __future__ import print_function, division
      2 import random
      3 import numpy as np
      4 import gym
      5 from collections import deque
      6 
      7 
      8 class DeepQNetwork():
      9     """Q-Learning with deep neural network to learn the control policy. 
     10     Uses a deep neural network model to predict the expected utility (Q-value) of executing an action in a given state. 
     11 
     12     Reference: https://arxiv.org/abs/1312.5602
     13     Parameters:
     14     -----------
     15     env_name: string
     16         The environment that the agent will explore. 
     17         Check: https://gym.openai.com/envs
     18     epsilon: float
     19         The epsilon-greedy value. The probability that the agent should select a random action instead of
     20         the action that will maximize the expected utility. 
     21     gamma: float
     22         Determines how much the agent should consider future rewards. 
     23     decay_rate: float
     24         The rate of decay for the epsilon value after each epoch.
     25     min_epsilon: float
     26         The value which epsilon will approach as the training progresses.
     27     """
     28     def __init__(self, env_name='CartPole-v1', epsilon=1, gamma=0.9, decay_rate=0.005, min_epsilon=0.1):
     29         self.epsilon = epsilon
     30         self.gamma = gamma
     31         self.decay_rate = decay_rate
     32         self.min_epsilon = min_epsilon
     33         self.memory_size = 300
     34         self.memory = []
     35 
     36         # Initialize the environment
     37         self.env = gym.make(env_name)
     38         self.n_states = self.env.observation_space.shape[0]
     39         self.n_actions = self.env.action_space.n
     40     
     41     def set_model(self, model):
     42         self.model = model(n_inputs=self.n_states, n_outputs=self.n_actions)
     43 
     44     def _select_action(self, state):
     45         if np.random.rand() < self.epsilon:
     46             # Choose action randomly
     47             action = np.random.randint(self.n_actions)
     48         else:
     49             # Take action with highest predicted utility given state
     50             action = np.argmax(self.model.predict(state), axis=1)[0]
     51 
     52         return action
     53 
     54     def _memorize(self, state, action, reward, new_state, done):
     55         self.memory.append((state, action, reward, new_state, done))
     56         # Make sure we restrict memory size to specified limit
     57         if len(self.memory) > self.memory_size:
     58             self.memory.pop(0)
     59 
     60     def _construct_training_set(self, replay):
     61         # Select states and new states from replay
     62         states = np.array([a[0] for a in replay])
     63         new_states = np.array([a[3] for a in replay])
     64 
     65         # Predict the expected utility of current state and new state
     66         Q = self.model.predict(states)
     67         Q_new = self.model.predict(new_states)
     68 
     69         replay_size = len(replay)
     70         X = np.empty((replay_size, self.n_states))
     71         y = np.empty((replay_size, self.n_actions))
     72         
     73         # Construct training set
     74         for i in range(replay_size):
     75             state_r, action_r, reward_r, new_state_r, done_r = replay[i]
     76 
     77             target = Q[i]
     78             target[action_r] = reward_r
     79             # If we're done the utility is simply the reward of executing action a in
     80             # state s, otherwise we add the expected maximum future reward as well
     81             if not done_r:
     82                 target[action_r] += self.gamma * np.amax(Q_new[i])
     83 
     84             X[i] = state_r
     85             y[i] = target
     86 
     87         return X, y
     88 
     89     def train(self, n_epochs=500, batch_size=32):
     90         max_reward = 0
     91 
     92         for epoch in range(n_epochs):
     93             state = self.env.reset()
     94             total_reward = 0
     95 
     96             epoch_loss = []
     97             while True:
     98 
     99                 action = self._select_action(state)
    100                 # Take a step
    101                 new_state, reward, done, _ = self.env.step(action)
    102 
    103                 self._memorize(state, action, reward, new_state, done)
    104 
    105                 # Sample replay batch from memory
    106                 _batch_size = min(len(self.memory), batch_size)
    107                 replay = random.sample(self.memory, _batch_size)
    108 
    109                 # Construct training set from replay
    110                 X, y = self._construct_training_set(replay)
    111 
    112                 # Learn control policy
    113                 loss = self.model.train_on_batch(X, y)
    114                 epoch_loss.append(loss)
    115 
    116                 total_reward += reward
    117                 state = new_state
    118 
    119                 if done: break
    120             
    121             epoch_loss = np.mean(epoch_loss)
    122 
    123             # Reduce the epsilon parameter
    124             self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(-self.decay_rate * epoch)
    125             
    126             max_reward = max(max_reward, total_reward)
    127 
    128             print ("%d [Loss: %.4f, Reward: %s, Epsilon: %.4f, Max Reward: %s]" % (epoch, epoch_loss, total_reward, self.epsilon, max_reward))
    129 
    130         print ("Training Finished")
    131 
    132     def play(self, n_epochs):
    133         # self.env = gym.wrappers.Monitor(self.env, '/tmp/cartpole-experiment-1', force=True)
    134         for epoch in range(n_epochs):
    135             state = self.env.reset()
    136             total_reward = 0
    137             while True:
    138                 self.env.render()
    139                 action = np.argmax(self.model.predict(state), axis=1)[0]
    140                 state, reward, done, _ = self.env.step(action)
    141                 total_reward += reward
    142                 if done: break
    143             print ("%d Reward: %s" % (epoch, total_reward))
    144         self.env.close()