-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrl.py
95 lines (70 loc) · 2.79 KB
/
rl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
if 'jupyter' in globals():
get_ipython().run_line_magic('matplotlib', 'inline')
# ### Load the environment
env = gym.make('FrozenLake-v0')
# ## The Q-Network Approach
# ### Implementing the network itself
tf.reset_default_graph()
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,16],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16,4],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)
#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
# ### Training the network
init = tf.initialize_all_variables()
# Set learning parameters
y = .99
e = 0.1
num_episodes = 4000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
sess.run(init)
for i in range(num_episodes):
#Reset environment and get first new observation
s = env.reset()
rAll = 0
d = False
j = 0
#The Q-Network
while j < 99:
j+=1
#Choose an action by greedily (with e chance of random action) from the Q-network
a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.identity(16)[s:s+1]})
if np.random.rand(1) < e:
a[0] = env.action_space.sample()
#Get new state and reward from environment
s1,r,d,_ = env.step(a[0])
#Obtain the Q' values by feeding the new state through our network
Q1 = sess.run(Qout,feed_dict={inputs1:np.identity(16)[s1:s1+1]})
#Obtain maxQ' and set our target value for chosen action.
maxQ1 = np.max(Q1)
targetQ = allQ
targetQ[0,a[0]] = r + y*maxQ1
#Train our network using target and predicted Q values
_,W1 = sess.run([updateModel,W],feed_dict={inputs1:np.identity(16)[s:s+1],nextQ:targetQ})
rAll += r
s = s1
if d == True:
#Reduce chance of random action as we train the model.
e = 1./((i/50) + 10)
break
jList.append(j)
rList.append(rAll)
print ("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")
# ### Some statistics on network performance
# We can see that the network beings to consistly reach the goal around the 750 episode mark.
plt.plot(rList)
# It also begins to progress through the environment for longer than chance aroudn the 750 mark as well.
plt.plot(jList)