learn.py
import numpy as np
import tensorflow as tf
import perceptron
import matplotlib.pyplot as plt
import gym
import sys
import subprocess
# Prevent sleeping
if 'darwin' in sys.platform:
    print('Running \'caffeinate\' on macOS to prevent the system from sleeping')
    subprocess.Popen('caffeinate')
# Import CartPole environment
env = gym.make('CartPole-v0')
# Start with a clean slate
tf.reset_default_graph()
# Parameters
# Needs to be large enough to make progress, but small enough for stability
learning_rate = 0.005
num_episodes = 1501
# Encourage exploring initially
initial_eps = 0.5
# Needs to be large to make progress
discount_factor = 0.99
batch_size = 100
# Need big enough buffer for IID, but not so big that it triggers catastrophic forgetting
buffer_size = 500
log_freq = 10
visualisation_freq = 10
visualise = False
num_runs = 12
# Don't set this too high, or else outliers skew the success
max_episode_length = 200
target_performance = 195
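# An average of 195 over 100 consecutive episodes is the standard "solved" threshold for CartPole-v0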
# Network Parameters
# Needs high enough capacity, but small enough to permit efficient fitting
n_hidden_1 = 30 # 1st layer number of nodes
n_hidden_2 = 30 # 2nd layer number of nodes
n_input = 4 # input layer (state)
n_output = 2 # output layer (q values for actions)
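# The 4-dimensional input is the CartPole observation (cart position, cart velocity,
# pole angle, pole angular velocity); the 2 outputs are Q-values for pushing left or right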
# Create computation graph
model = perceptron.MultilayerPerceptron(n_input, n_hidden_1, n_hidden_2, n_output)
inputs = tf.placeholder(shape=[1, n_input], dtype=tf.float32)
current_output, reg = model.evaluate(inputs)
target_output = tf.placeholder(shape=[1, n_output],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(target_output - current_output)) + reg
trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize = trainer.minimize(loss)
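# The loss is the squared TD error on the chosen action plus the regularisation term
# returned by the perceptron model; note that the same network produces both the
# prediction and the bootstrap target (there is no separate target network)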
init = tf.global_variables_initializer()
rewardToPlot = []
for alpha in range(num_runs):
    print("Run number {} of {}".format(alpha + 1, num_runs))
    # Create lists to contain total rewards per episode
    rList = []
    rAggList = []
    # Stores tuples of (S, A, R, S')
    # For experience replay
    replay = []
    h = 0
    eps = initial_eps
    with tf.Session() as sess:
        sess.run(init)
        for i in range(num_episodes):
            # Anneal epsilon
            eps = max(eps - (initial_eps / 500), 0.05)
            # Reset environment and get first new observation
            s = env.reset()
            rTotal = 0
            lossTotal = 0
            j = 0
            # The q-network
            while j < max_episode_length:
                if i % visualisation_freq == 0 and visualise:
                    env.render()
                j += 1
                # Choose an action greedily from the Q-network
                q = sess.run(current_output, feed_dict={inputs: [s]})[0]
                a = np.argmax(q)
                if np.random.rand(1) < eps:
                    a = env.action_space.sample()
                # Get new state and reward from environment
                sPrime, r, d, _ = env.step(a)
                if d:
                    # Nowhere to go in terminal state
                    sPrime = None
                # If buffer not filled, add to it
                if len(replay) < buffer_size:
                    replay.append((s, a, r, sPrime))
                # Buffer is full - overwrite values
                else:
                    if h < (buffer_size - 1):
                        h += 1
                    else:
                        h = 0
                    replay[h] = (s, a, r, sPrime)
                # Prepare for next iteration
                if d:
                    break
                s = sPrime
                rTotal += r
            # Update only once per episode for stability
            if batch_size < len(replay):
                # Randomly sample our experience replay memory
                minibatch_indices = np.random.choice(range(len(replay)), batch_size)
                for index in minibatch_indices:
                    s_mem, a_mem, r_mem, sPrime_mem = replay[index]
                    target_q = sess.run(current_output, feed_dict={inputs: [s_mem]})[0]
                    # Expected return in the terminal state is known to be 0
                    max_q_prime = 0
                    if sPrime_mem is not None:
                        # Obtain the Q' values by feeding the new state through our network
                        q_prime = sess.run(current_output, feed_dict={inputs: [sPrime_mem]})[0]
                        # Obtain max Q' and set our target value for chosen action
                        max_q_prime = np.max(q_prime)
                    # Q-learning update: target for the chosen action is r + gamma * max_a' Q(s', a')
                    target_q[a_mem] = r_mem + discount_factor * max_q_prime
                    sess.run(minimize, feed_dict={inputs: [s_mem], target_output: [target_q]})
            rList.append(rTotal)
            if i % log_freq == 0:
                rAggList.append(np.mean(rList[-log_freq:]))
                print("Average reward after " + str(i) + " episodes = " + str(np.mean(rList[-log_freq:])))
            if i > 100 and np.mean(rList[-100:]) > target_performance:
                print("Mission complete after {} episodes! The average over the last 100 was {}.".format(
                    i, np.mean(rList[-100:])))
                s = env.reset()
                j = 0
                while j < max_episode_length:
                    if visualise:
                        env.render()
                    j += 1
                    if j == max_episode_length:
                        print("Pole did not fall in first {} steps".format(max_episode_length))
                    # Choose an action greedily from the Q-network
                    q = sess.run(current_output, feed_dict={inputs: [s]})[0]
                    a = np.argmax(q)
                    # Get new state and reward from environment
                    sPrime, r, d, _ = env.step(a)
                    if d:
                        print("Pole fell on step {}".format(j))
                        break
                    s = sPrime
                break
            if i == num_episodes - 1:
                print("*** Mission failed! ***")
                s = env.reset()
                j = 0
                while j < max_episode_length:
                    if visualise:
                        env.render()
                    j += 1
                    if j == max_episode_length:
                        print("Pole did not fall in first {} steps".format(max_episode_length))
                    # Choose an action greedily from the Q-network
                    q = sess.run(current_output, feed_dict={inputs: [s]})[0]
                    a = np.argmax(q)
                    # Get new state and reward from environment
                    sPrime, r, d, _ = env.step(a)
                    if d:
                        print("Pole fell on step {}".format(j))
                        break
                    s = sPrime
    rewardToPlot.append(rAggList)
# Thanks http://stackoverflow.com/questions/18296755/python-max-function-using-key-and-lambda-expression
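# Plot the per-interval reward averaged across runs, skipping runs that stopped logging before a given interval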
plt.plot([np.mean([x[i] for x in rewardToPlot if len(x) > i]) for i in range(len(max(rewardToPlot, key=len)))])
plt.savefig('reward.png')
plt.clf()