main.py
import sys
import platform
import argparse
import numpy as np
from numpy_ringbuffer import RingBuffer
import copy
from scipy import signal
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn
from setup import unityagents
from unityagents import UnityEnvironment
from code.UniformReplayBuffer import UniformReplayBuffer
from code.OUNoise import OUNoise
from code.Actor import Actor
from code.Critic import Critic
from code.MADDPG import MADDPG
parser = argparse.ArgumentParser(description='Train or execute a pair of agents with MADDPG in the Unity Tennis environment. ' +
                                 'Models are stored and loaded in the file final.pth.')
parser.add_argument('-t', '--train', dest='train_mode', action='store_true',
                    help='train a new model and store it as final.pth')
train_mode = parser.parse_args().train_mode
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# set up the environment for the current operating system
env = None
system = platform.system()
if system == 'Linux':
    env = UnityEnvironment(file_name="setup/Tennis_Linux/Tennis.x86_64")
elif system == 'Darwin':
    env = UnityEnvironment(file_name="setup/Tennis.app")
elif system == 'Windows':
    env = UnityEnvironment(file_name="setup/Tennis_Windows_x86_64/Tennis.exe")
else:
    print('Cannot find environment for this system.')
    exit(0)
# use the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = env_info.vector_observations.shape[1]
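# create the MADDPG agent with a uniform replay buffer holding up to 1,000,000 experiences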
agent = MADDPG(state_size, action_size, num_agents,
               UniformReplayBuffer(1_000_000))
def execute_episode(agent, env, train_mode):
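    """Run a single episode with the given agent and return the maximum score across agents.

    In train mode every observed transition is handed to the agent, which stores it
    in its replay buffer (and learns from it automatically when required).
    """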
    # prepare the environment
    scores = np.zeros(num_agents)
    env_info = env.reset(train_mode)[brain_name]
    agent.new_episode()
    # get the initial state
    states = env_info.vector_observations
    while True:
        # evaluate the current state
        actions = agent.act(states)
        # execute the chosen actions and get the outcome
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        # store the experience (also automatically learn if required)
        if train_mode:
            agent.store(states, actions, rewards, next_states, dones)
        # prepare for the next iteration
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    # return the maximum total reward obtained by any of the agents
    return np.max(scores)
def train(agent, env, episodes=2000, consecutive_episodes=100, show_output=True, save_as=None):
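    """Train the agent for the given number of episodes and return (moving average, raw scores).

    After the first 100 episodes, the actor weights are saved to '<save_as>.pth'
    whenever the average score over the last 100 episodes improves on the best
    average seen so far.
    """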
    results = [None] * episodes
    best_avg_score = 0
    current_avg_score = 0
    # reset the agent to start learning from scratch
    agent.reset()
    for i in range(episodes):
        # execute one episode and store its score
        score = execute_episode(agent, env, train_mode=True)
        results[i] = score
        # store the trained model if it is requested
        if i+1 >= 100 and save_as is not None:
            # but only if the model actually improved
            current_avg_score = np.mean(np.array(results[i-99:i+1]))
            if current_avg_score > best_avg_score:
                best_avg_score = current_avg_score
                torch.save(agent.actor.state_dict(), '{}.pth'.format(save_as))
        if show_output:
            print("\rEpisode: {}, Score: {:.2f}, Avg: {:.2f}".format(
                i+1, score, current_avg_score), end="")
            sys.stdout.flush()
    if show_output:
        print()
    # use a convolution to calculate the moving average, summarizing the training run
    results = np.array(results)
    mean = signal.convolve(results, np.ones(
        [consecutive_episodes]) / consecutive_episodes, mode='valid')
    return mean, results
if train_mode:
    # train
    mean, full_report = train(agent, env, save_as='final')
    if np.any(mean > 0.5):
        episode_solved = np.argmax(mean > 0.5) + 100
        print('Solved after {} episodes'.format(episode_solved))
    max_mean, max_mean_i = np.max(mean), np.argmax(mean)
    print('Best avg. score over 100 consecutive episodes: {} achieved during episodes {} ... {}'.format(
        max_mean, max_mean_i + 1, max_mean_i + 100))
else:
    # play
    agent.actor.load_state_dict(torch.load('final.pth', map_location='cpu'))
    agent.learning = False
    score = execute_episode(agent, env, train_mode=False)
    print('Score: {}'.format(score))
env.close()