dqn.py
import tensorflow as tf
import numpy as np
import math
import random
import gym
import matplotlib.pyplot as plt
BATCH_SIZE = 10
MAX_EPSILON = 0.7
MIN_EPSILON = 0.1
LAMBDA = 0.1
GAMMA = 0.1
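# Hyperparameters (as used below):
#   BATCH_SIZE   - number of replay-buffer samples drawn per training step
#   MAX_EPSILON / MIN_EPSILON - upper / lower bounds of the epsilon-greedy exploration rate
#   LAMBDA       - decay rate of epsilon as steps accumulate
#   GAMMA        - discount factor in the Bellman target r + GAMMA * max(Q(s', a'))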
### functions on replay buffer ###
def add_experience(lst, max_memory, new_exp):  ## add experience to replay buffer
    lst.append(new_exp)
    if len(lst) > max_memory:
        lst.pop(0)
    return lst

def get_prev_experience(lst, number_of_samples):  ## retrieve sample from replay buffer
    if number_of_samples > len(lst):
        return random.sample(lst, len(lst))
    else:
        return random.sample(lst, number_of_samples)
### functions on replay buffer ###
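# train() runs a simple DQN loop on the chosen gym environment:
#   1. pick an action epsilon-greedily from the current Q-network
#   2. shape the reward using the car's position (MountainCar specific)
#   3. store the transition in the replay buffer
#   4. sample a random minibatch each step and fit the network to the
#      Bellman targets r + GAMMA * max(Q(next_state, a'))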
def train(batch_size, game_name="MountainCar-v0", render=True):  ## method to train
    max_memory = 5000
    num_episodes = 300  ### Number of episodes
    lst = []  ### List containing <state, action, reward, next_state>
    max_x_store = []
    reward_store = []
    #### Start: Environment related details
    env_name = game_name
    env = gym.make(env_name)
    num_states = env.env.observation_space.shape[0]
    num_actions = env.env.action_space.n
    #### End: Environment related details
    with tf.Session() as sess:
        ### building graph
        states = tf.placeholder(shape=[None, num_states], dtype=tf.float32)
        Qsa = tf.placeholder(shape=[None, num_actions], dtype=tf.float32)
        fc1 = tf.layers.dense(states, 30, activation=tf.nn.relu)
        fc2 = tf.layers.dense(fc1, 30, activation=tf.nn.relu)
        logits = tf.layers.dense(fc2, num_actions)
        loss = tf.losses.mean_squared_error(Qsa, logits)
        opt = tf.train.AdamOptimizer().minimize(loss)
        ### building graph
        sess.run(tf.global_variables_initializer())  ### initialize the neural network variables
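        # The network above maps a state vector to one Q-value per action
        # (two 30-unit ReLU layers, linear output); the loss is the MSE between
        # these predictions and the Bellman targets fed in via the Qsa placeholder.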
        cnt = 0
        while cnt < num_episodes:
            if cnt % 10 == 0:
                print('Episode {} / {}'.format(cnt + 1, num_episodes))
            #----### Start: Adding experience to replay buffer
            replay_state = env.reset()
            total_reward = 0
            max_x = 0
            eps = MAX_EPSILON
            steps = 0
            while True:
                if render:
                    env.render()
                #### Start: select action based on random threshold (epsilon-greedy)
                comp_val = random.random()
                if comp_val < eps:
                    replay_action = random.randint(0, num_actions - 1)
                else:
                    replay_action = np.argmax(sess.run(logits, feed_dict={states: replay_state.reshape(1, num_states)}))
                #### End: select action based on random threshold
                replay_next_state, replay_reward, replay_done, info = env.step(replay_action)
                #### Start: Reward allocation system
                if replay_next_state[0] >= 0.1:
                    replay_reward = replay_reward + 10
                if replay_next_state[0] >= 0.25:
                    replay_reward = replay_reward + 20
                if replay_next_state[0] >= 0.5:
                    replay_reward = replay_reward + 100
                #### End: Reward allocation system
                if replay_next_state[0] > max_x:
                    max_x = replay_next_state[0]
                if replay_done:
                    replay_next_state = None
                lst = add_experience(lst, max_memory, (replay_state, replay_action, replay_reward, replay_next_state))
                #----### End: Adding experience to replay buffer
                ###----# Start: Training using replay buffer
                train_batch = get_prev_experience(lst, batch_size)
                train_states = np.array([val[0] for val in train_batch])
                train_next_states = np.array([(np.zeros(num_states) if val[3] is None else val[3]) for val in train_batch])
                train_qsa = sess.run(logits, feed_dict={states: train_states})
                train_qsad = sess.run(logits, feed_dict={states: train_next_states})
                train_x = np.zeros((len(train_batch), num_states))
                train_y = np.zeros((len(train_batch), num_actions))
                for idx, b in enumerate(train_batch):
                    tmp_state, tmp_action, tmp_reward, tmp_next_state = b[0], b[1], b[2], b[3]
                    current_q = train_qsa[idx]
                    if tmp_next_state is None:
                        current_q[tmp_action] = tmp_reward  # terminal state: target is just the reward
                    else:
                        current_q[tmp_action] = tmp_reward + GAMMA * np.amax(train_qsad[idx])  # Q(s,a) = r + gamma * max(Q(s',a'))
                    train_x[idx] = tmp_state
                    train_y[idx] = current_q
                sess.run(opt, feed_dict={states: train_x, Qsa: train_y})  ### TRAINING DONE HERE !!!
                ###----# End: Training using replay buffer
                steps = steps + 1
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)
                replay_state = replay_next_state
                total_reward = total_reward + replay_reward
                if replay_done:
                    reward_store.append(total_reward)
                    max_x_store.append(max_x)
                    break  #### BREAK WHILE LOOP HERE
            print("Step {}, Total reward: {}, Eps: {}".format(steps, total_reward, eps))
            cnt = cnt + 1
        #### Start: plot visual graph
        plt.plot(reward_store)
        plt.show()
        plt.close("all")
        plt.plot(max_x_store)
        plt.show()
        #### End: plot visual graph

train(BATCH_SIZE)
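# Rendering slows training considerably; the same loop can also be run
# headless, e.g. train(BATCH_SIZE, render=False).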