from awsenv import *
import os
import numpy as np
import tensorflow as tf
import cPickle as pkl
import tensorflow.contrib.slim as slim


def scale_obs(obs):
    obs_2 = obs[2]
    if obs_2 > 0:
        obs_2 = np.log(obs[2])
    obs_3 = obs[3]
    if obs_3 > 0:
        obs_3 = np.log(obs[3])
    return (obs[0] / 20., obs[1], obs_2, obs_3, obs[4])
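
# Worked example of scale_obs (illustrative input only, not from the original
# script): scale_obs((10, 0.5, 100., 1., 3)) returns
# (0.5, 0.5, log(100.) ~ 4.6, 0.0, 3) -- the first field is divided by 20,
# the two positive middle fields are log-scaled, and the rest pass through.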


class Dueling_DQN():
    def __init__(self, D, l, learning_rate):
        self.inputs = tf.placeholder(tf.float32, [None, D, l, 1], name="input_x")
        self.conv1 = slim.conv2d(
            inputs=self.inputs, num_outputs=16, kernel_size=[1, 8], stride=[1, 1],
            padding='SAME', biases_initializer=None)
        self.maxpool1 = slim.max_pool2d(self.conv1, [1, 2])
        self.conv2 = slim.conv2d(
            inputs=self.maxpool1, num_outputs=32, kernel_size=[1, 5], stride=[1, 1],
            padding='SAME', biases_initializer=None)
        self.maxpool2 = slim.max_pool2d(self.conv2, [1, 2])
        # Take the output of the final convolutional layer and split it into
        # separate advantage and value streams.
        self.streamAC, self.streamVC = tf.split(self.maxpool2, 2, 3)
        self.streamA = slim.flatten(self.streamAC)
        self.streamV = slim.flatten(self.streamVC)
        self.xavier_init = tf.contrib.layers.xavier_initializer()
        self.AW = tf.Variable(self.xavier_init([128, O]))
        self.VW = tf.Variable(self.xavier_init([128, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)
        # Combine the two streams to get the final Q-values.
        self.Qout = self.Value + tf.subtract(
            self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)
        # The loss is the mean squared difference between the target and
        # predicted Q-values for the chosen actions.
        self.nextQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, O, dtype=tf.float32)
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
        self.td_error = tf.square(self.nextQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.updateModel = self.trainer.minimize(self.loss)
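
# Note on the dueling head above: with slim's default pooling stride of 2 and
# VALID padding, the two conv/pool stages reduce the (5, 16, 1) input to a
# (2, 4, 32) feature map, which is split channel-wise into two (2, 4, 16)
# streams and flattened to 128 features each -- hence the [128, O] and [128, 1]
# weight matrices. The streams are recombined as
#     Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')),
# so only relative advantages matter. Worked example (illustrative numbers
# only): with V = 2.0 and A = [1., 3., -1., 0., 2.], mean(A) = 1.0 and
# Qout = [2., 4., 0., 1., 3.], so self.predict selects action 1.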


# Functions that update the parameters of the target network toward those of the primary network.
def updateTargetGraph(tfVars, tau):
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        op_holder.append(tfVars[idx + total_vars // 2].assign(
            (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value())))
    return op_holder


def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)
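
# With tau = 0.01 (set below), each call to updateTarget moves every target-net
# variable one percent of the way toward its primary-net counterpart:
#     theta_target <- tau * theta_primary + (1 - tau) * theta_target.
# This relies on the primary network being constructed before the target
# network, so the first half of tf.trainable_variables() belongs to the primary
# net and the second half to the target net.

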
as_group = 'AUTOSCALING_GROUP'
elb_name = 'ELB_NAME'
elb_url = 'ELB_URL'
env = AWSEnv(as_group=as_group, elb=elb_name, elb_url=elb_url)

# Environment constants
n_num_instance_states = 5
n_cpu_util_states = 101

# Learning parameters
y = .99                 # discount factor
e = 0.1                 # initial chance of taking a random (exploratory) action
reward_sum = 0
num_episodes = 1000
batch_size = 10         # environment steps per episode
plot_reward = []        # for plot bookkeeping
plot_observation = []   # for plot bookkeeping
# Lists to contain total rewards and steps per episode
jList = []
rList = []
running_reward = None
checkpoint_dir = 'training_model_D3Q'
checkpoint_name = os.path.join(checkpoint_dir, 'D3Q')
reload = False
start_i = 0
D = 5                   # features per scaled observation (see scale_obs)
l = 16                  # number of past observations stacked as network input
O = 5                   # number of discrete actions; env.step receives a - O//2, i.e. -2..2
learning_rate = 1e-4
tau = 0.01              # rate at which the target network is updated toward the primary network

# Build the primary and target Q-networks and the target-update ops.
tf.reset_default_graph()
PQN = Dueling_DQN(D, l, learning_rate)  # the primary Q-network
TQN = Dueling_DQN(D, l, learning_rate)  # the target Q-network
init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)

with tf.Session() as sess:
    if reload:
        saver.restore(sess, checkpoint_name)
    else:
        sess.run(init)
    # Warm up: fill the observation history with l scaled observations.
    slist = []
    for i in xrange(l):
        s, r, d, info = env.step(0)
        s = scale_obs(s)
        slist.append(list(s))
        env.step(np.random.randint(0, O) - O // 2)
    x = np.asarray(slist).transpose().reshape(1, D, l, 1)
    reward_sum = 0
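
    # Input layout note: x stacks the last l = 16 scaled observations into a
    # (1, D, l, 1) tensor, with the D = 5 observation features on the "height"
    # axis and the l timesteps on the "width" axis that the [1, k] convolution
    # kernels slide over.
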
    for i in range(start_i, num_episodes):
        # Reset per-episode bookkeeping.
        rAll = 0
        reward_sum = 0
        d = False
        j = 0
        # The Q-network loop: one episode is batch_size environment steps.
        while j < batch_size:
            j += 1
            # Choose an action greedily (with probability e of a random action) from the Q-network.
            a = sess.run(PQN.predict, feed_dict={PQN.inputs: x})
            if np.random.rand(1) < e:
                a[0] = np.random.randint(O)
            # Get the new state and reward from the environment.
            s1, r, d, _ = env.step(a[0] - O // 2)
            s1 = scale_obs(s1)
            plot_reward.append(r)  # record reward for plotting
            plot_observation.append(s1)
            reward_sum += r
            jList.append(r)
            # Slide the observation history forward by one step.
            slist.pop(0)
            slist.append(s1)
            x1 = np.asarray(slist).transpose().reshape(1, D, l, 1)
            # Double DQN target: the primary network selects the next action (Q1)
            # and the target network evaluates it (Q2).
            Q1 = sess.run(PQN.predict, feed_dict={PQN.inputs: x1})
            Q2 = sess.run(TQN.Qout, feed_dict={TQN.inputs: x1})
            doubleQ = Q2[0][Q1[0]]
            targetQ = r + y * doubleQ
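            # Worked example (illustrative numbers only): if Q1[0] = 3,
            # Q2[0] = [0.1, 0.4, 0.2, 0.9, 0.3], r = 1 and y = 0.99, then
            # doubleQ = 0.9 and targetQ = 1 + 0.99 * 0.9 = 1.891. Evaluating the
            # primary network's argmax action with the target network reduces
            # the overestimation bias of plain Q-learning.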
            # Train the primary network using the target and predicted Q-values,
            # then softly update the target network.
            sess.run([PQN.updateModel],
                     feed_dict={PQN.inputs: x, PQN.nextQ: [targetQ], PQN.actions: a})
            updateTarget(targetOps, sess)
            rAll += r
            x = x1
        # Exponential moving average of the per-episode reward.
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print 'Episode %d. Average reward for episode %f. Total average reward %f.' % (
            i, reward_sum / batch_size, running_reward / batch_size)
        saver.save(sess, checkpoint_name)  # save the model
        with open('training_model_conv5_D3Q/obs_Q.pkl', 'wb') as fp:
            pkl.dump(plot_observation, fp)
        with open('training_model_conv5_D3Q/reward_Q.pkl', 'wb') as fp:
            pkl.dump(plot_reward, fp)
        with open('training_model_conv5_D3Q/e_Q.pkl', 'wb') as fp:
            pkl.dump((e, i), fp)
        # Reduce the chance of a random action as training progresses: e starts
        # at 0.1 and decays toward zero (integer division, so it drops in steps
        # of 50 episodes).
        e = 1. / ((i / 50) + 10)
        rList.append(rAll)
    print "Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%"

##############################################
# import matplotlib.pyplot as plt
# import pandas as pd
# df = pd.DataFrame(jList)
# plt.figure();plt.subplot(121);plt.scatter(xrange(len(df.rolling(12).mean())),df.rolling(12).mean())
# plt.subplot(122);plt.plot(df.rolling(12).mean())
##############################################
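# A minimal sketch (not part of the original script) for inspecting the reward
# curve pickled above; assumes the files under training_model_conv5_D3Q/ exist.
# import cPickle as pkl
# import matplotlib.pyplot as plt
# with open('training_model_conv5_D3Q/reward_Q.pkl', 'rb') as fp:
#     rewards = pkl.load(fp)
# plt.plot(rewards); plt.xlabel('step'); plt.ylabel('reward'); plt.show()
##############################################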