In this example, we will train an agent to learn to navigate a grid. Specifically, we will be using the Gridworld environment from the book Deep Reinforcement Learning in Action. We have implemented this environment in rlenvs_from_cpp; check the class Gridworld.
We will use the DQN algorithm (see Deep Reinforcement Learning in Action and the references therein) to train the agent, and we will use TensorBoard to monitor the training. We will use a static environment configuration in this example, which makes the problem a lot easier to work on.
We code the same model as in the Deep Reinforcement Learning in Action book, so you may also want to follow the code there.
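Before walking through the full program, here is a minimal sketch of how the Gridworld environment is created and queried. It is only meant as orientation and uses nothing beyond the calls that also appear in the complete listing below, where the agent additionally steps the environment with env.step(action):

#include "rlenvs/envs/grid_world/grid_world_env.h"
#include <unordered_map>
#include <string>
#include <any>
#include <iostream>

int main(){
    using rlenvs_cpp::envs::grid_world::Gridworld;
    using rlenvs_cpp::envs::grid_world::GridWorldInitType;

    // a 4x4 board
    Gridworld<4> env;

    // choose how the board is initialized when the environment is made
    std::unordered_map<std::string, std::any> options;
    options["mode"] = std::any(rlenvs_cpp::envs::grid_world::to_string(GridWorldInitType::RANDOM));
    env.make("v0", options);

    // reset starts an episode and returns the first time step
    auto time_step = env.reset();
    std::cout<<"Environment name: "<<env.name<<std::endl;
    std::cout<<"Number of actions: "<<env.n_actions()<<std::endl;
    std::cout<<"First reward: "<<time_step.reward()<<std::endl;
    return 0;
}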
/**
* Use DQN on Gridworld
*
* */
#include "cubeai/base/cubeai_config.h"
#if defined(USE_PYTORCH) && defined(USE_RLENVS_CPP)
#include "cubeai/base/cubeai_types.h"
#include "cubeai/io/csv_file_writer.h"
#include "cubeai/rl/trainers/rl_serial_agent_trainer.h"
#include "cubeai/maths/optimization/optimizer_type.h"
#include "cubeai/maths/optimization/pytorch_optimizer_factory.h"
#include "cubeai/rl/policies/epsilon_greedy_policy.h"
#include "cubeai/utils/torch_adaptor.h"
#include "cubeai/maths/vector_math.h"
#include "cubeai/data_structs/experience_buffer.h"
#include "rlenvs/envs/grid_world/grid_world_env.h"
#include <boost/log/trivial.hpp>
#include <torch/torch.h>
#include <unordered_map>
#include <iostream>
#include <string>
#include <any>
#include <filesystem>
#include <map>
#include <vector>
#include <tuple>
namespace rl_example_12{
namespace F = torch::nn::functional;
using cubeai::real_t;
using cubeai::uint_t;
using cubeai::float_t;
using cubeai::torch_tensor_t;
using cubeai::rl::RLSerialAgentTrainer;
using cubeai::rl::RLSerialTrainerConfig;
using cubeai::rl::policies::EpsilonGreedyPolicy;
using cubeai::containers::ExperienceBuffer;
using rlenvs_cpp::envs::grid_world::Gridworld;
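// Experiment bookkeeping, network layer sizes and training hyper-parameters.
// l1 = 64 corresponds to the flattened Gridworld observation and l4 = 4 to
// the number of available actions.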
const std::string EXPERIMENT_ID = "1";
const uint_t l1 = 64;
const uint_t l2 = 150;
const uint_t l3 = 100;
const uint_t l4 = 4;
const uint_t SEED = 42;
const uint_t BATCH_SIZE = 200;
const uint_t EXPERIENCE_BUFFER_SIZE = 1000;
const uint_t TOTAL_EPOCHS = 1000;
const uint_t TOTAL_ITRS_PER_EPOCH = 50;
const real_t GAMMA = 0.9;
const real_t EPSILON = 0.3;
const real_t LEARNING_RATE = 1.0e-3;
// The class that models the Q-network to train
class QNetImpl: public torch::nn::Module
{
public:
QNetImpl();
torch_tensor_t forward(torch_tensor_t);
private:
torch::nn::Linear fc1_;
torch::nn::Linear fc2_;
torch::nn::Linear fc3_;
};
QNetImpl::QNetImpl()
:
fc1_(torch::nn::Linear(l1, l2)),
fc2_(torch::nn::Linear(l2, l3)),
fc3_(torch::nn::Linear(l3, l4))
{
register_module("fc1", fc1_);
register_module("fc2", fc2_);
register_module("fc3", fc3_);
}
torch_tensor_t
QNetImpl::forward(torch_tensor_t x){
x = F::relu(fc1_->forward(x));
x = F::relu(fc2_->forward(x));
// no activation on the output layer; the network predicts raw Q-values
x = fc3_->forward(x);
return x;
}
// create the model
TORCH_MODULE(QNet);
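// TORCH_MODULE generates the QNet holder type around QNetImpl, so the code
// below can declare QNet qnet; and use qnet(x) or qnet->parameters() directly.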
// 4x4 grid
typedef Gridworld<4> env_type;
typedef env_type::time_step_type time_step_type;
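// an experience is the tuple (state, action, reward, next state, done)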
typedef std::tuple<std::vector<float_t>, uint_t, real_t, std::vector<float_t>, bool> experience_tuple_type;
typedef ExperienceBuffer<experience_tuple_type> experience_buffer_type;
typedef env_type::state_type state_type;
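// Flatten the 3D Gridworld observation into a single vector of 64 floats
// so that it can be fed to the Q-network.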
std::vector<float_t>
flattened_observation(const state_type& state){
std::vector<float_t> data;
data.reserve(64);
for(uint_t i=0; i<state.size(); ++i){
for(uint_t j=0; j<state[i].size(); ++j){
for(uint_t k=0; k<state[i][j].size(); ++k){
data.push_back(state[i][j][k]);
}
}
}
return data;
}
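// Extract the index-th element from every experience tuple in a sampled
// batch, e.g. all states, all actions or all rewards.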
template<typename T, uint_t index>
std::vector<T>
get(const std::vector<experience_tuple_type>& experience){
std::vector<T> result;
result.reserve(experience.size());
auto b = experience.begin();
auto e = experience.end();
for(; b != e; ++b){
auto item = *b;
result.push_back(std::get<index>(item));
}
return result;
}
}
int main(){
using namespace rl_example_12;
using namespace cubeai;
try{
BOOST_LOG_TRIVIAL(info)<<"Starting agent training...";
BOOST_LOG_TRIVIAL(info)<<"Numebr of episodes to trina: "<<TOTAL_EPOCHS;
// let's create a directory where we want to
//store all the results from running a simulation
std::filesystem::create_directories("experiments/" + EXPERIMENT_ID);
// set the seed for PyTorch
torch::manual_seed(SEED);
BOOST_LOG_TRIVIAL(info)<<"Creating the environment...";
// create a 4x4 grid
auto env = env_type();
// initialize the environment using random mode
std::unordered_map<std::string, std::any> options;
options["mode"] = std::any(rlenvs_cpp::envs::grid_world::to_string(rlenvs_cpp::envs::grid_world::GridWorldInitType::RANDOM));
env.make("v0", options);
BOOST_LOG_TRIVIAL(info)<<"Done...";
BOOST_LOG_TRIVIAL(info)<<"Environment name: "<<env.name;
BOOST_LOG_TRIVIAL(info)<<"Number of actions available: "<<env.n_actions();
BOOST_LOG_TRIVIAL(info)<<"Number of states available: "<<env.n_states();
// the network to train for the q values
QNet qnet;
auto optimizer_ptr = std::make_unique<torch::optim::Adam>(qnet->parameters(),
torch::optim::AdamOptions(LEARNING_RATE));
// we will use an epsilon-greedy policy
EpsilonGreedyPolicy policy( EPSILON,
SEED,
rl::policies::EpsilonDecayOption::NONE);
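// with probability EPSILON the policy picks a random action, otherwise the
// action with the largest predicted Q-value; EpsilonDecayOption::NONE keeps
// EPSILON fixed throughout training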
// the loss function to use
auto loss_fn = torch::nn::MSELoss();
// track the average loss per epoch
std::vector<real_t> losses;
losses.reserve(TOTAL_EPOCHS);
experience_buffer_type experience_buffer(EXPERIENCE_BUFFER_SIZE);
// hold random values
std::vector<float_t> rand_vec(64, 0.0);
float_t a = 0.0;
float_t b = 1.0;
// loop over the epochs
for(uint_t epoch=0; epoch < TOTAL_EPOCHS; ++epoch){
BOOST_LOG_TRIVIAL(info)<<"Starting epoch: "<<epoch<<std::endl;
// for every new epoch we reset the environment
auto time_step = env.reset();
uint_t step_counter = 0;
// the loss associated with the epoch
std::vector<real_t> epoch_loss;
epoch_loss.reserve(TOTAL_ITRS_PER_EPOCH);
auto done = false;
while(!done){
auto obs1 = flattened_observation(time_step.observation());
rand_vec = cubeai::maths::randomize(rand_vec, a, b, 64);
rand_vec = cubeai::maths::divide(rand_vec, 100.0);
// randomize the flattened observation by adding
// some noise
obs1 = maths::add(obs1, rand_vec);
auto torch_state_1 = torch_utils::TorchAdaptor::to_torch(obs1,
DeviceType::CPU);
// get the qvals
auto qvals = qnet(torch_state_1);
auto action_idx = policy(qvals,
cubeai::torch_tensor_value_type<float_t>());
BOOST_LOG_TRIVIAL(info)<<"Action selected: "<<action_idx<<std::endl;
// step in the environment
time_step = env.step(action_idx);
auto reward = time_step.reward();
auto step_finished = time_step.done();
auto obs2 = flattened_observation(time_step.observation());
rand_vec = cubeai::maths::randomize(rand_vec, a, b, 64);
rand_vec = cubeai::maths::divide(rand_vec, 100.0);
obs2 = maths::add(obs2, rand_vec);
auto torch_state_2 = torch_utils::TorchAdaptor::to_torch(obs2,
DeviceType::CPU);
experience_tuple_type exp = {obs1, action_idx, reward, obs2, step_finished};
// put the observation into the buffer
experience_buffer.append(exp);
// if we reach the batch size we will do
// a backward propagation
if(experience_buffer.size() >= BATCH_SIZE){
std::vector<experience_tuple_type> batch_sample;
batch_sample.reserve(BATCH_SIZE);
// sample from the experience
experience_buffer.sample(BATCH_SIZE, batch_sample, SEED);
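// sampling from the replay buffer breaks the correlation between
// consecutive transitions, which is one of the key ingredients of DQN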
// stack the experiences
auto state_1_batch = get<std::vector<float_t>, 0>(batch_sample);
auto action_batch = get<int_t, 1>(batch_sample);
auto state_2_batch = get<std::vector<float_t>, 3>(batch_sample);
auto reward_batch = get<real_t, 2>(batch_sample);
auto done_batch = get<bool, 4>(batch_sample);
auto state_1_batch_t = torch_utils::TorchAdaptor::stack(state_1_batch,
cubeai::DeviceType::CPU);
auto q1 = qnet(state_1_batch_t);
// tell the model that we don't use grad here
qnet->eval();
auto state_2_batch_t = torch_utils::TorchAdaptor::stack(state_2_batch,
cubeai::DeviceType::CPU);
auto q2 = qnet(state_2_batch_t);
// we are training again
qnet->train();
auto reward_batch_t = torch_utils::TorchAdaptor::to_torch(reward_batch,
cubeai::DeviceType::CPU);
auto done_batch_t = torch_utils::TorchAdaptor::to_torch(done_batch,
cubeai::DeviceType::CPU);
auto action_batch_t = torch_utils::TorchAdaptor::to_torch(action_batch,
cubeai::DeviceType::CPU);
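// TD target: Y = r + GAMMA * (1 - done) * max_a Q(s', a); terminal
// transitions contribute only the reward. X gathers, for every sample in
// the batch, the Q-value of the action that was actually taken.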
auto state_max = torch::max(q2, 1);
auto state_max_val = std::get<0>(state_max);
auto Y = reward_batch_t + GAMMA * ((1-done_batch_t) * state_max_val);
auto X = q1.gather(1, action_batch_t.unsqueeze(1)).squeeze();
auto loss = loss_fn(X, Y.detach());
optimizer_ptr -> zero_grad();
loss.backward();
optimizer_ptr -> step();
BOOST_LOG_TRIVIAL(info)<<"Loss at epoch: "<<loss.item<real_t>();
epoch_loss.push_back(loss.item<real_t>());
}
step_counter += 1;
// update done
done = time_step.done();
if(done || step_counter > TOTAL_ITRS_PER_EPOCH){
// terminate the epoch either because the episode finished
// or because the step budget is exhausted
done = true;
BOOST_LOG_TRIVIAL(info)<<"Reward: "<<reward;
BOOST_LOG_TRIVIAL(info)<<"Finishing epoch at step: "<<step_counter;
step_counter = 0;
}
}
BOOST_LOG_TRIVIAL(info)<<"Epoch finished...";
// get the epsilon
/*auto eps = policy.eps_value();
if(eps > 0.1){
eps -= 1/static_cast<real_t>(TOTAL_EPOCHS);
policy.set_eps_value(eps);
}*/
// epoch_loss is empty until the buffer holds at least BATCH_SIZE items
losses.push_back(epoch_loss.empty() ? 0.0 :
cubeai::maths::mean(epoch_loss.begin(), epoch_loss.end()));
}
// save the average loss per epoch for visualization
// purposes
auto filename = std::string("experiments/") + EXPERIMENT_ID;
filename += "/dqn_grid_world_policy_rewards.csv";
cubeai::io::CSVWriter csv_writer(filename, cubeai::io::CSVWriter::default_delimiter());
csv_writer.open();
csv_writer.write_column_vector(losses);
// save the policy also so that we can load it and check
// use it
auto policy_model_filename = std::string("experiments/") + EXPERIMENT_ID;
policy_model_filename += std::string("/dqn_grid_world_policy.pth");
torch::serialize::OutputArchive archive;
qnet -> save(archive);
archive.save_to(policy_model_filename);
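// the archive can later be reloaded for evaluation, e.g. with
// torch::serialize::InputArchive in; in.load_from(policy_model_filename);
// followed by qnet->load(in);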
}
catch(std::exception& e){
std::cout<<e.what()<<std::endl;
}
catch(...){
std::cout<<"Unknown exception occured"<<std::endl;
}
return 0;
}
#else
#include <iostream>
int main(){
std::cout<<"This example requires PyTorch and rlenvscpplib."<<std::endl;
std::cout<<"Reconfigure cuberl with USE_PYTORCH and USE_RLENVS_CPP flags turned ON."<<std::endl;
return 0;
}
#endif
Running the code above produces the following output:
The average per-epoch loss is shown in the figure below.