imitation.py
import sys
import argparse

import numpy as np
import keras
import gym
from keras.optimizers import Adam


def one_hot_vc(acs_probs, ac_space):
    # Convert a batch of predicted action probabilities into a one-hot vector
    # for the highest-probability action.
    a = np.zeros(ac_space)
    idx = np.argmax(acs_probs[0])
    a[idx] = 1
    return a
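
# For example (a sketch assuming the 4-action space of LunarLander-v2), a
# prediction of [[0.1, 0.7, 0.1, 0.1]] maps to array([0., 1., 0., 0.]):
#
#   one_hot_vc(np.array([[0.1, 0.7, 0.1, 0.1]]), 4)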


class Imitation():
    def __init__(self, model_config_path, expert_weights_path):
        # Load the expert model.
        with open(model_config_path, 'r') as f:
            self.expert = keras.models.model_from_json(f.read())
        self.expert.load_weights(expert_weights_path)

        # Initialize the cloned model (to be trained).
        with open(model_config_path, 'r') as f:
            self.model = keras.models.model_from_json(f.read())
        self.learning_rate = 0.001
        # self.model.summary()
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=Adam(lr=self.learning_rate),
                           metrics=['acc'])

    def run_expert(self, env, render=False):
        # Generates an episode by running the expert policy on the given env.
        return Imitation.generate_episode(self.expert, env, render)

    def run_model(self, env, render=False):
        # Generates an episode by running the cloned policy on the given env.
        return Imitation.generate_episode(self.model, env, render)

    @staticmethod
    def generate_episode(model, env, render=False):
        # Generates an episode by running the given model on the given env.
        # Returns:
        # - an array of states, indexed by time step
        # - an array of one-hot actions, indexed by time step
        # - an array of rewards, indexed by time step
        obs_space = env.observation_space.shape[0]
        ac_space = env.action_space.n
        states = []
        actions = []
        rewards = []
        is_t = False
        curr_state = np.expand_dims(env.reset(), axis=0)
        states.append(curr_state)
        while not is_t:
            if render:
                env.render()
            # Take the greedy action under the model's predicted probabilities.
            ac = model.predict(curr_state)
            n_st, rew, is_t, _ = env.step(np.argmax(ac[0]))
            n_st = np.expand_dims(n_st, axis=0)
            states.append(n_st)
            actions.append(one_hot_vc(ac, ac_space))
            rewards.append(rew)
            curr_state = np.copy(n_st)
        actions = np.reshape(np.array(actions), [-1, ac_space])
        states = np.reshape(np.array(states), [-1, obs_space])
        # Drop the final (terminal) state so states and actions stay aligned.
        return states[:-1], actions, np.array(rewards)

    def train(self, env, num_episodes=100, num_epochs=50, render=False):
        # Trains the model on training data generated by the expert policy.
        # Args:
        # - env: The environment to run the expert policy on.
        # - num_episodes: # episodes to be generated by the expert per epoch.
        # - num_epochs: # epochs to train on the data generated by the expert.
        # - render: Whether to render the environment.
        # Returns the final loss and accuracy.
        loss = 0
        acc = 0
        for _ in range(num_epochs):
            states, actions, rewards = [], [], []
            for __ in range(num_episodes):
                st_epi, ac_epi, rew_epi = self.run_expert(env, render)
                states.append(st_epi)
                actions.append(ac_epi)
                rewards.append(rew_epi)
            # Fit on each expert episode in turn (episodes have varying lengths,
            # so they are kept as separate arrays rather than stacked).
            for i in range(num_episodes):
                epoch_st = states[i]
                epoch_ac = actions[i]
                history = self.model.fit(epoch_st, epoch_ac,
                                         batch_size=len(epoch_st),
                                         epochs=1, verbose=2)
                loss = history.history['loss'][-1]
                acc = history.history['acc'][-1]
        return loss, acc
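

# A minimal usage sketch for the class above (not part of the original entry
# point; it assumes the default LunarLander-v2 config and weights files named
# in parse_arguments() are present and that gym's Box2D envs are installed):
#
#   env = gym.make('LunarLander-v2')
#   agent = Imitation('LunarLander-v2-config.json', 'LunarLander-v2-weights.h5')
#   loss, acc = agent.train(env, num_episodes=10, num_epochs=50)
#   states, actions, rewards = agent.run_model(env)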


def parse_arguments():
    # Command-line flags are defined here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-config-path', dest='model_config_path',
                        type=str, default='LunarLander-v2-config.json',
                        help="Path to the model config file.")
    parser.add_argument('--expert-weights-path', dest='expert_weights_path',
                        type=str, default='LunarLander-v2-weights.h5',
                        help="Path to the expert weights file.")
    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    parser_group = parser.add_mutually_exclusive_group(required=False)
    parser_group.add_argument('--render', dest='render',
                              action='store_true',
                              help="Whether to render the environment.")
    parser_group.add_argument('--no-render', dest='render',
                              action='store_false',
                              help="Whether to render the environment.")
    parser.set_defaults(render=False)
    return parser.parse_args()


def main(args):
    # Parse command-line arguments.
    args = parse_arguments()
    model_config_path = args.model_config_path
    expert_weights_path = args.expert_weights_path
    render = args.render

    # Create the environment.
    env = gym.make('LunarLander-v2')
    agent = Imitation(model_config_path, expert_weights_path)
    agent.train(env=env, num_episodes=10, num_epochs=50, render=render)


if __name__ == '__main__':
    main(sys.argv)
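
# Example invocation (a sketch; the flag defaults come from parse_arguments(),
# so both paths can be omitted when the default files are in the working directory):
#
#   python imitation.py --model-config-path LunarLander-v2-config.json \
#       --expert-weights-path LunarLander-v2-weights.h5 --no-render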