ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
deep_q_network.py
(5304B)
1 from __future__ import print_function, division
2 import random
3 import numpy as np
4 import gym
5 from collections import deque
6
7
class DeepQNetwork:
    """Q-Learning with deep neural network to learn the control policy.

    Uses a neural network model to predict the expected utility (Q-value) of
    executing each action in a given state, and trains that model on
    transitions sampled from a bounded replay memory.

    Reference: https://arxiv.org/abs/1312.5602

    Parameters:
    -----------
    env_name: string
        The environment that the agent will explore.
        Check: https://gym.openai.com/envs
    epsilon: float
        The epsilon-greedy value. The probability that the agent should select
        a random action instead of the action that will maximize the expected
        utility. Also the starting point of the decay schedule used by train().
    gamma: float
        Determines how much the agent should consider future rewards.
    decay_rate: float
        The rate of decay for the epsilon value after each epoch.
    min_epsilon: float
        The value which epsilon will approach as the training progresses.
    memory_size: int
        Maximum number of transitions kept in the replay memory; the oldest
        transitions are discarded first.
    """

    def __init__(self, env_name='CartPole-v1', epsilon=1, gamma=0.9,
                 decay_rate=0.005, min_epsilon=0.1, memory_size=300):
        self.epsilon = epsilon
        # Remember the starting value so the decay schedule in train() begins
        # at the epsilon the caller asked for rather than at a fixed 1.0.
        self.initial_epsilon = epsilon
        self.gamma = gamma
        self.decay_rate = decay_rate
        self.min_epsilon = min_epsilon
        self.memory_size = memory_size
        self.memory = []

        # Initialize the environment
        self.env = gym.make(env_name)
        self.n_states = self.env.observation_space.shape[0]
        self.n_actions = self.env.action_space.n

    def set_model(self, model):
        """Build the Q-value model.

        `model` is called with `n_inputs` (state dimension) and `n_outputs`
        (number of actions); the object it returns must provide `predict` and
        `train_on_batch`, which are the only methods this class calls on it.
        """
        self.model = model(n_inputs=self.n_states, n_outputs=self.n_actions)

    def _greedy_action(self, state):
        """Return the action with the highest predicted utility for `state`."""
        # The model is queried with a batch, so present the single state as a
        # batch of one row; argmax over axis 1 then picks the best action.
        batch = np.asarray(state).reshape(1, -1)
        return np.argmax(self.model.predict(batch), axis=1)[0]

    def _select_action(self, state):
        """Epsilon-greedy choice: random with probability epsilon, else greedy."""
        if np.random.rand() < self.epsilon:
            # Choose action randomly
            return np.random.randint(self.n_actions)
        # Take action with highest predicted utility given state
        return self._greedy_action(state)

    def _memorize(self, state, action, reward, new_state, done):
        """Append a transition to the replay memory, dropping the oldest ones."""
        self.memory.append((state, action, reward, new_state, done))
        # Make sure we restrict memory size to specified limit
        excess = len(self.memory) - self.memory_size
        if excess > 0:
            del self.memory[:excess]

    def _construct_training_set(self, replay):
        """Turn sampled transitions into a supervised batch `(X, y)`.

        `replay` is a sequence of `(state, action, reward, new_state, done)`
        tuples. `X` holds the states. `y` holds the model's current Q-values
        with the entry of the taken action replaced by the reward, plus the
        discounted maximum Q-value of the next state when the episode did not
        end on that transition.
        """
        replay_size = len(replay)
        if replay_size == 0:
            return (np.empty((0, self.n_states)),
                    np.empty((0, self.n_actions)))

        # Select states and new states from replay
        states = np.array([t[0] for t in replay])
        new_states = np.array([t[3] for t in replay])
        actions = np.array([t[1] for t in replay], dtype=int)
        rewards = np.array([t[2] for t in replay], dtype=float)
        done = np.array([bool(t[4]) for t in replay])

        # Predict the expected utility of current state and new state
        Q = self.model.predict(states)
        Q_new = np.asarray(self.model.predict(new_states), dtype=float)

        # If we're done the utility is simply the reward of executing action a
        # in state s, otherwise we add the expected maximum future reward.
        future = np.where(done, 0.0, self.gamma * np.max(Q_new, axis=1))

        # Work on a float copy so the array returned by the model is untouched.
        y = np.array(Q, dtype=float).reshape(replay_size, self.n_actions)
        y[np.arange(replay_size), actions] = rewards + future

        X = states.astype(float).reshape(replay_size, self.n_states)
        return X, y

    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        result = self.env.reset()
        # Newer gym releases return (observation, info); older ones return the
        # observation alone.
        if (isinstance(result, tuple) and len(result) == 2
                and isinstance(result[1], dict)):
            return result[0]
        return result

    def _step_env(self, action):
        """Advance the environment and return `(new_state, reward, done)`."""
        result = self.env.step(action)
        if len(result) == 5:
            # Newer gym API: (obs, reward, terminated, truncated, info).
            # Either flag ends the episode, matching the older single flag.
            new_state, reward, terminated, truncated, _ = result
            return new_state, reward, bool(terminated or truncated)
        new_state, reward, done, _ = result
        return new_state, reward, done

    def train(self, n_epochs=500, batch_size=32):
        """Train the model for `n_epochs` episodes.

        After every environment step the transition is memorized, up to
        `batch_size` transitions are sampled from memory, and the model is
        updated on them. Epsilon is decayed after each episode and a progress
        line is printed.
        """
        # Start below any achievable return so negative-reward environments
        # report their true maximum.
        max_reward = float('-inf')

        for epoch in range(n_epochs):
            state = self._reset_env()
            total_reward = 0

            epoch_loss = []
            done = False
            while not done:
                action = self._select_action(state)
                # Take a step
                new_state, reward, done = self._step_env(action)

                self._memorize(state, action, reward, new_state, done)

                # Sample replay batch from memory
                _batch_size = min(len(self.memory), batch_size)
                replay = random.sample(self.memory, _batch_size)

                # Construct training set from replay
                X, y = self._construct_training_set(replay)

                # Learn control policy
                loss = self.model.train_on_batch(X, y)
                epoch_loss.append(loss)

                total_reward += reward
                state = new_state

            epoch_loss = np.mean(epoch_loss)

            # Reduce the epsilon parameter: exponential decay from the initial
            # epsilon towards min_epsilon.
            self.epsilon = self.min_epsilon + (
                (self.initial_epsilon - self.min_epsilon)
                * np.exp(-self.decay_rate * epoch)
            )

            max_reward = max(max_reward, total_reward)

            print("%d [Loss: %.4f, Reward: %s, Epsilon: %.4f, Max Reward: %s]" % (
                epoch, epoch_loss, total_reward, self.epsilon, max_reward))

        print("Training Finished")

    def play(self, n_epochs):
        """Run `n_epochs` rendered episodes with the greedy policy.

        Prints the total reward of each episode and closes the environment
        afterwards, even if an episode raises.
        """
        # To record episodes, older gym releases allowed wrapping the env:
        # gym.wrappers.Monitor(self.env, '/tmp/cartpole-experiment-1', force=True)
        try:
            for epoch in range(n_epochs):
                state = self._reset_env()
                total_reward = 0
                done = False
                while not done:
                    self.env.render()
                    action = self._greedy_action(state)
                    state, reward, done = self._step_env(action)
                    total_reward += reward
                print("%d Reward: %s" % (epoch, total_reward))
        finally:
            self.env.close()