imitation.py
import sys
import argparse

import numpy as np
import keras
import gym
from keras.optimizers import Adam


def one_hot_vc(acs_probs, ac_space):
    # Convert a batch of predicted action probabilities into a one-hot vector
    # for the highest-probability action.
    a = np.zeros(ac_space)
    idx = np.argmax(acs_probs[0])
    a[idx] = 1
    return a
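
# For example (a sketch assuming the 4-action space of LunarLander-v2), a
# prediction of [[0.1, 0.7, 0.1, 0.1]] maps to array([0., 1., 0., 0.]):
#
#   one_hot_vc(np.array([[0.1, 0.7, 0.1, 0.1]]), 4)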


class Imitation():
    def __init__(self, model_config_path, expert_weights_path):
        # Load the expert model.
        with open(model_config_path, 'r') as f:
            self.expert = keras.models.model_from_json(f.read())
        self.expert.load_weights(expert_weights_path)

        # Initialize the cloned model (to be trained).
        with open(model_config_path, 'r') as f:
            self.model = keras.models.model_from_json(f.read())
        self.learning_rate = 0.001
        # self.model.summary()
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=Adam(lr=self.learning_rate),
                           metrics=['acc'])

    def run_expert(self, env, render=False):
        # Generates an episode by running the expert policy on the given env.
        return Imitation.generate_episode(self.expert, env, render)

    def run_model(self, env, render=False):
        # Generates an episode by running the cloned policy on the given env.
        return Imitation.generate_episode(self.model, env, render)

    @staticmethod
    def generate_episode(model, env, render=False):
        # Generates an episode by running the given model on the given env.
        # Returns:
        # - an array of states, indexed by time step
        # - an array of one-hot actions, indexed by time step
        # - an array of rewards, indexed by time step
        obs_space = env.observation_space.shape[0]
        ac_space = env.action_space.n
        states = []
        actions = []
        rewards = []
        is_t = False
        curr_state = np.expand_dims(env.reset(), axis=0)
        states.append(curr_state)
        while not is_t:
            if render:
                env.render()
            # Take the greedy action under the model's predicted probabilities.
            ac = model.predict(curr_state)
            n_st, rew, is_t, _ = env.step(np.argmax(ac[0]))
            n_st = np.expand_dims(n_st, axis=0)
            states.append(n_st)
            actions.append(one_hot_vc(ac, ac_space))
            rewards.append(rew)
            curr_state = np.copy(n_st)
        actions = np.reshape(np.array(actions), [-1, ac_space])
        states = np.reshape(np.array(states), [-1, obs_space])
        # Drop the final (terminal) state so states and actions stay aligned.
        return states[:-1], actions, np.array(rewards)

    def train(self, env, num_episodes=100, num_epochs=50, render=False):
        # Trains the model on training data generated by the expert policy.
        # Args:
        # - env: The environment to run the expert policy on.
        # - num_episodes: # episodes to be generated by the expert per epoch.
        # - num_epochs: # epochs to train on the data generated by the expert.
        # - render: Whether to render the environment.
        # Returns the final loss and accuracy.
        loss = 0
        acc = 0
        for _ in range(num_epochs):
            states, actions, rewards = [], [], []
            for __ in range(num_episodes):
                st_epi, ac_epi, rew_epi = self.run_expert(env, render)
                states.append(st_epi)
                actions.append(ac_epi)
                rewards.append(rew_epi)
            # Fit on each expert episode in turn (episodes have varying lengths,
            # so they are kept as separate arrays rather than stacked).
            for i in range(num_episodes):
                epoch_st = states[i]
                epoch_ac = actions[i]
                history = self.model.fit(epoch_st, epoch_ac,
                                         batch_size=len(epoch_st),
                                         epochs=1, verbose=2)
                loss = history.history['loss'][-1]
                acc = history.history['acc'][-1]
        return loss, acc
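

# A minimal usage sketch for the class above (not part of the original entry
# point; it assumes the default LunarLander-v2 config and weights files named
# in parse_arguments() are present and that gym's Box2D envs are installed):
#
#   env = gym.make('LunarLander-v2')
#   agent = Imitation('LunarLander-v2-config.json', 'LunarLander-v2-weights.h5')
#   loss, acc = agent.train(env, num_episodes=10, num_epochs=50)
#   states, actions, rewards = agent.run_model(env)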


def parse_arguments():
    # Command-line flags are defined here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-config-path', dest='model_config_path',
                        type=str, default='LunarLander-v2-config.json',
                        help="Path to the model config file.")
    parser.add_argument('--expert-weights-path', dest='expert_weights_path',
                        type=str, default='LunarLander-v2-weights.h5',
                        help="Path to the expert weights file.")
    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    parser_group = parser.add_mutually_exclusive_group(required=False)
    parser_group.add_argument('--render', dest='render',
                              action='store_true',
                              help="Whether to render the environment.")
    parser_group.add_argument('--no-render', dest='render',
                              action='store_false',
                              help="Whether to render the environment.")
    parser.set_defaults(render=False)
    return parser.parse_args()


def main(args):
    # Parse command-line arguments.
    args = parse_arguments()
    model_config_path = args.model_config_path
    expert_weights_path = args.expert_weights_path
    render = args.render

    # Create the environment.
    env = gym.make('LunarLander-v2')
    agent = Imitation(model_config_path, expert_weights_path)
    agent.train(env=env, num_episodes=10, num_epochs=50, render=render)


if __name__ == '__main__':
    main(sys.argv)
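
# Example invocation (a sketch; the flag defaults come from parse_arguments(),
# so both paths can be omitted when the default files are in the working directory):
#
#   python imitation.py --model-config-path LunarLander-v2-config.json \
#       --expert-weights-path LunarLander-v2-weights.h5 --no-render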