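"""Train MADDPG agents: decentralized actors with a centralized critic per
agent, Ornstein-Uhlenbeck exploration noise, and a shared replay buffer."""
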
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from maddpg.buffer import Buffer, update_target
from maddpg.model import get_actor, get_critic
from maddpg.noise import OUActionNoise
from env.env import NUM_AGENTS, DIM_AGENT_STATE, ENVIRONMENT, reward
from config import NUM_EPISODES, NUM_BUFFER, NUM_STEPS, STD_DEV, MODEL_PATH, BATCH_SIZE, TAU, CHECKPOINTS
save_path = MODEL_PATH
# Dimension of the state space for a single agent
dim_agent_state = DIM_AGENT_STATE
# Number of agents
num_agents = NUM_AGENTS
# Dimension of the joint state space
dim_state = dim_agent_state * num_agents
# Number of episodes
num_episodes = NUM_EPISODES
# Number of steps in each episode
num_steps = NUM_STEPS
# Standard deviation of the noise added for exploration
std_dev = STD_DEV
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
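# (Ornstein-Uhlenbeck noise is temporally correlated, which tends to give
# smoother exploration for continuous actions than independent Gaussian noise.)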
# Actor and critic networks for each agent are stored in these lists
ac_models = []
cr_models = []
target_ac = []
target_cr = []
# Build the networks for each agent
for i in range(num_agents):
    ac_models.append(get_actor())
    cr_models.append(get_critic(dim_state))
    target_ac.append(get_actor())
    target_cr.append(get_critic(dim_state))
    # Make the target weights equal to the online weights initially
    target_ac[i].set_weights(ac_models[i].get_weights())
    target_cr[i].set_weights(cr_models[i].get_weights())
# Create the replay buffer
buffer = Buffer(NUM_BUFFER, BATCH_SIZE)
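# Each buffer entry stores the joint transition (joint state, all agents'
# actions, all agents' rewards, next joint state), so the critics can be
# trained on full joint information.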
# Execute the policy of one agent using its actor model
def policy(state, noise_object, model):
    sampled_actions = tf.squeeze(model(state))
    noise = noise_object()
    # Add noise to the action for exploration
    sampled_actions = sampled_actions.numpy() + noise
    # Make sure the action is within bounds
    legal_action = np.clip(sampled_actions, -1.0, 1.0)
    return [np.squeeze(legal_action)]
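# Note that each actor only sees its own dim_agent_state-wide slice of the
# joint state (see the training loop below), so execution is fully
# decentralized, as in MADDPG.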
# To store the total reward of each episode
ep_reward_list = []
# To store the running average reward (over the last 40 episodes)
avg_reward_list = []
print("Training has started")
# Takes about long time to train, about a day on PC with intel core i3 processor
for ep in range(num_episodes):
# Initializing environment
env = ENVIRONMENT()
prev_state = env.initial_obs()
episodic_reward = 0
for i in range(num_steps):
# Expanding dimension of state from 1-d array to 2-d array
tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
# Action Value for each agents will be stored in this list
actions = []
# Get actions for each agents from respective models and store them in list
for j, model in enumerate(ac_models):
action = policy(tf_prev_state[:,dim_agent_state*j:dim_agent_state*(j+1)],
ou_noise, model)
actions.append(float(action[0]))
# Recieve new state and reward from environment.
new_state = env.step(actions)
# Rewards recieved is in form of list
# i.e for all agents we will get rewards
# for all agents in this list
rewards = reward(new_state)
# Record the experience of all the agents
# in the replay buffer
buffer.record((prev_state, actions, rewards, new_state))
# Sum of rewards of all agents
episodic_reward += sum(rewards)
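        # MADDPG update: each critic is trained with centralized information
        # (the joint state and all agents' actions), while each actor only
        # conditions on its own observation. buffer.learn is expected to
        # sample a minibatch of BATCH_SIZE transitions and apply one such
        # update per step.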
        # Update the actor and critic parameters
        # of all agents using the MADDPG algorithm
        buffer.learn(ac_models, cr_models, target_ac, target_cr)
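        # update_target is assumed to apply the standard soft update to every
        # (online, target) pair, roughly
        #   target_w = TAU * online_w + (1 - TAU) * target_w,
        # so the target networks track the online networks slowly.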
        # Update the target networks of each agent
        update_target(TAU, ac_models, cr_models, target_ac, target_cr)
        # Replace the old state with the new state
        prev_state = new_state
    # Save the models every CHECKPOINTS episodes
    if ep % CHECKPOINTS == 0 and ep != 0:
        for k in range(num_agents):
            ac_models[k].save(save_path + 'actor' + str(k) + '.h5')
            cr_models[k].save(save_path + 'critic' + str(k) + '.h5')
            target_ac[k].save(save_path + 'target_actor' + str(k) + '.h5')
            target_cr[k].save(save_path + 'target_critic' + str(k) + '.h5')
    ep_reward_list.append(episodic_reward)
    # Mean reward of the last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep + 1, avg_reward))
    avg_reward_list.append(avg_reward)

# Plot average reward vs. episode
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()
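# The learning curve could also be written to disk for later comparison,
# e.g. (hypothetical filename):
# plt.savefig(save_path + 'avg_reward.png')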