polish(pu): polish comments and render in cartpole, pendulum, atari, …

…lunarlander
opendilab · Nov 29, 2023 · 9e97184 · 9e97184
1 parent e2531ce
commit 9e97184
Showing 9 changed files with 437 additions and 206 deletions.
diff --git a/zoo/atari/entry/atari_eval.py b/zoo/atari/entry/atari_eval.py
@@ -1,30 +1,41 @@
-# According to the model you want to evaluate, import the corresponding config.
 from lzero.entry import eval_muzero
 import numpy as np
 
 if __name__ == "__main__":
-    """ 
-    model_path (:obj:`Optional[str]`): The pretrained model path, which should
-    point to the ckpt file of the pretrained model, and an absolute path is recommended.
-    In LightZero, the path is usually something like ``exp_name/ckpt/ckpt_best.pth.tar``.
     """
-    # Take the config of sampled efficientzero as an example
-    from zoo.atari.config.atari_sampled_efficientzero_config import main_config, create_config
+    Overview:
+        Main script to evaluate the MuZero model on Atari games. The script will loop over multiple seeds,
+        evaluating a certain number of episodes per seed. Results are aggregated and printed.
 
-    model_path = "/path/ckpt/ckpt_best.pth.tar"
+    Variables:
+        - model_path (:obj:`Optional[str]`): The pretrained model path, pointing to the ckpt file of the pretrained model. 
+          The path is usually something like ``exp_name/ckpt/ckpt_best.pth.tar``.
+        - seeds (:obj:`List[int]`): List of seeds to use for the evaluations.
+        - num_episodes_each_seed (:obj:`int`): Number of episodes to evaluate for each seed.
+        - total_test_episodes (:obj:`int`): Total number of test episodes, calculated as num_episodes_each_seed * len(seeds).
+        - returns_mean_seeds (:obj:`np.array`): Array of mean return values for each seed.
+        - returns_seeds (:obj:`np.array`): Array of all return values for each seed.
+    """
+    # Take the config of MuZero as an example
+    from zoo.atari.config.atari_muzero_config import main_config, create_config
+
+    # model_path = "/path/ckpt/ckpt_best.pth.tar"
+    model_path = None
 
-    returns_mean_seeds = []
-    returns_seeds = []
     seeds = [0]
     num_episodes_each_seed = 1
     total_test_episodes = num_episodes_each_seed * len(seeds)
     create_config.env_manager.type = 'base'  # Visualization requires the 'type' to be set as base
     main_config.env.evaluator_env_num = 1  # Visualization requires the 'env_num' to be set as 1
     main_config.env.n_evaluator_episode = total_test_episodes
-    main_config.env.render_mode_human = True  # Whether to enable real-time rendering
-    main_config.env.save_video = True  # Whether to save the video, if save the video render_mode_human must to be True
-    main_config.env.save_path = '../config/'
-    main_config.env.eval_max_episode_steps = int(1e3)  # Adjust according to different environments
+    main_config.env.render_mode_human = False  # Whether to enable real-time rendering
+
+    main_config.env.save_replay = True  # Whether to save the video
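+    main_config.env.save_path = './video'  # NOTE(review): the env default config names this key ``replay_path``; confirm which key is actually read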
+    main_config.env.eval_max_episode_steps = int(20)  # Adjust according to different environments
+
+    returns_mean_seeds = []
+    returns_seeds = []
 
     for seed in seeds:
         returns_mean, returns = eval_muzero(
@@ -45,4 +56,4 @@
     print(f'We eval total {len(seeds)} seeds. In each seed, we eval {num_episodes_each_seed} episodes.')
     print(f'In seeds {seeds}, returns_mean_seeds is {returns_mean_seeds}, returns is {returns_seeds}')
     print('In all seeds, reward_mean:', returns_mean_seeds.mean())
-    print("=" * 20)
+    print("=" * 20)
diff --git a/zoo/atari/envs/atari_lightzero_env.py b/zoo/atari/envs/atari_lightzero_env.py
@@ -1,6 +1,6 @@
 import copy
 import sys
-from typing import List
+from typing import List, Any
 
 import gym
 import numpy as np
@@ -14,56 +14,108 @@
 
 @ENV_REGISTRY.register('atari_lightzero')
 class AtariLightZeroEnv(BaseEnv):
+    """
+    Overview:
+        AtariLightZeroEnv is derived from BaseEnv and wraps Atari games as an environment for LightZero.
+        This class provides the necessary interfaces to interact with the environment, including reset, step, seed,
+        close, etc. and manages the environment's properties such as observation_space, action_space, and reward_space.
+    Properties:
+        cfg, _init_flag, channel_last, clip_rewards, episode_life, _env, _observation_space, _action_space,
+        _reward_space, obs, reward, _eval_episode_return, _seed, _dynamic_seed
+    """
     config = dict(
+        # (int) The number of environment instances used for data collection.
         collector_env_num=8,
+        # (int) The number of environment instances used for evaluator.
         evaluator_env_num=3,
+        # (int) The number of episodes to evaluate during each evaluation period.
         n_evaluator_episode=3,
+        # (str) The name of the Atari game environment.
         env_name='PongNoFrameskip-v4',
+        # (str) The type of the environment, here it's Atari.
         env_type='Atari',
+        # (tuple) The shape of the observation space, which is a stacked frame of 4 images each of 96x96 pixels.
         obs_shape=(4, 96, 96),
+        # (int) The maximum number of steps in each episode during data collection.
         collect_max_episode_steps=int(1.08e5),
+        # (int) The maximum number of steps in each episode during evaluation.
         eval_max_episode_steps=int(1.08e5),
+        # (bool) If True, the game is rendered in real-time.
+        render_mode_human=False,
+        # (bool) If True, a video of the game play is saved.
+        save_replay=False,
+        # (str) The path to save the video.
+        replay_path='./video',
+        # (bool) If set to True, the game screen is converted to grayscale, reducing the complexity of the observation space.
         gray_scale=True,
+        # (int) The number of frames to skip between each action. Higher values result in faster simulation.
         frame_skip=4,
+        # (bool) If True, the game ends when the agent loses a life, otherwise, the game only ends when all lives are lost.
         episode_life=True,
+        # (bool) If True, the rewards are clipped to a certain range, usually between -1 and 1, to reduce variance.
         clip_rewards=True,
+        # (bool) If True, the channels of the observation images are placed last (e.g., height, width, channels).
         channel_last=True,
-        render_mode_human=False,
+        # (bool) If True, the pixel values of the game frames are scaled down to the range [0, 1].
         scale=True,
+        # (bool) If True, the game frames are preprocessed by cropping irrelevant parts and resizing to a smaller resolution.
         warp_frame=True,
-        save_video=False,
+        # (bool) If True, the game state is transformed into a string before being returned by the environment.
         transform2string=False,
+        # (bool) If True, additional wrappers for the game environment are used.
         game_wrapper=True,
+        # (dict) The configuration for the environment manager. ``shared_memory`` controls whether a subprocess env manager
+        # exchanges observations via shared memory; whether envs run in separate processes is set by the env manager type.
         manager=dict(shared_memory=False, ),
+        # (int) The value of the cumulative reward at which the training stops.
         stop_value=int(1e6),
     )
 
     @classmethod
     def default_config(cls: type) -> EasyDict:
+        """
+        Overview:
+            Return the default configuration for the Atari LightZero environment.
+        Arguments:
+            - cls (:obj:`type`): The class AtariLightZeroEnv.
+        Returns:
+            - cfg (:obj:`EasyDict`): The default configuration dictionary.
+        """
         cfg = EasyDict(copy.deepcopy(cls.config))
         cfg.cfg_type = cls.__name__ + 'Dict'
         return cfg
 
-    def __init__(self, cfg=None):
+    def __init__(self, cfg: EasyDict) -> None:
+        """
+        Overview:
+            Initialize the Atari LightZero environment with the given configuration.
+        Arguments:
+            - cfg (:obj:`EasyDict`): The configuration dictionary.
+        """
         self.cfg = cfg
         self._init_flag = False
         self.channel_last = cfg.channel_last
         self.clip_rewards = cfg.clip_rewards
         self.episode_life = cfg.episode_life
 
-    def _make_env(self):
-        return wrap_lightzero(self.cfg, episode_life=self.cfg.episode_life, clip_rewards=self.cfg.clip_rewards)
-
-    def reset(self):
+    def reset(self) -> dict:
+        """
+        Overview:
+            Reset the environment and return the initial observation.
+        Returns:
+            - obs (:obj:`dict`): The initial observation after reset.
+        """
         if not self._init_flag:
-            self._env = self._make_env()
+            # Create the wrapped environment for Atari LightZero.
+            self._env = wrap_lightzero(self.cfg, episode_life=self.cfg.episode_life, clip_rewards=self.cfg.clip_rewards)
             self._observation_space = self._env.env.observation_space
             self._action_space = self._env.env.action_space
             self._reward_space = gym.spaces.Box(
-                low=self._env.env.reward_range[0], high=self._env.env.reward_range[1], shape=(1, ), dtype=np.float32
+                low=self._env.env.reward_range[0], high=self._env.env.reward_range[1], shape=(1,), dtype=np.float32
             )
 
             self._init_flag = True
+
         if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed:
             np_seed = 100 * np.random.randint(1, 1000)
             self._env.env.seed(self._seed + np_seed)
@@ -73,29 +125,19 @@ def reset(self):
         obs = self._env.reset()
         self.obs = to_ndarray(obs)
         self._eval_episode_return = 0.
-        self.has_reset = True
         obs = self.observe()
-        # obs.shape: 96,96,1
         return obs
 
-    def observe(self):
+    def step(self, action: int) -> BaseEnvTimestep:
         """
         Overview:
-            add action_mask to obs to adapt with MCTS alg..
+            Execute the given action and return the resulting environment timestep.
+        Arguments:
+            - action (:obj:`int`): The action to be executed.
+        Returns:
+            - timestep (:obj:`BaseEnvTimestep`): The environment timestep after executing the action.
         """
-        observation = self.obs
-
-        if not self.channel_last:
-            # move the channel dim to the fist axis
-            # (96, 96, 3) -> (3, 96, 96)
-            observation = np.transpose(observation, (2, 0, 1))
-
-        action_mask = np.ones(self._action_space.n, 'int8')
-        return {'observation': observation, 'action_mask': action_mask, 'to_play': -1}
-
-    def step(self, action):
         obs, reward, done, info = self._env.step(action)
-        # self._env.render()
         self.obs = to_ndarray(obs)
         self.reward = np.array(reward).astype(np.float32)
         self._eval_episode_return += self.reward
@@ -105,6 +147,23 @@ def step(self, action):
 
         return BaseEnvTimestep(observation, self.reward, done, info)
 
+    def observe(self) -> dict:
+        """
+        Overview:
+            Return the current observation along with the action mask and to_play flag.
+        Returns:
+            - observation (:obj:`dict`): The dictionary containing current observation, action mask, and to_play flag.
+        """
+        observation = self.obs
+
+        if not self.channel_last:
+            # move the channel dim to the first axis
+            # (96, 96, 3) -> (3, 96, 96)
+            observation = np.transpose(observation, (2, 0, 1))
+
+        action_mask = np.ones(self._action_space.n, 'int8')
+        return {'observation': observation, 'action_mask': action_mask, 'to_play': -1}
+
     @property
     def legal_actions(self):
         return np.arange(self._action_space.n)
@@ -113,52 +172,41 @@ def random_action(self):
         action_list = self.legal_actions
         return np.random.choice(action_list)
 
-    def render(self, mode='human'):
-        self._env.render()
-
-    def human_to_action(self):
-        """
-        Overview:
-            For multiplayer games, ask the user for a legal action
-            and return the corresponding action number.
-        Returns:
-            An integer from the action space.
-        """
-        while True:
-            try:
-                print(f"Current available actions for the player are:{self.legal_actions}")
-                choice = int(input(f"Enter the index of next action: "))
-                if choice in self.legal_actions:
-                    break
-                else:
-                    print("Wrong input, try again")
-            except KeyboardInterrupt:
-                print("exit")
-                sys.exit(0)
-            except Exception as e:
-                print("Wrong input, try again")
-        return choice
-
     def close(self) -> None:
+        """
+        Close the environment, and set the initialization flag to False.
+        """
         if self._init_flag:
             self._env.close()
         self._init_flag = False
 
     def seed(self, seed: int, dynamic_seed: bool = True) -> None:
+        """
+        Set the seed for the environment's random number generator. Can handle both static and dynamic seeding.
+        """
         self._seed = seed
         self._dynamic_seed = dynamic_seed
         np.random.seed(self._seed)
 
     @property
     def observation_space(self) -> gym.spaces.Space:
+        """
+        Property to access the observation space of the environment.
+        """
         return self._observation_space
 
     @property
     def action_space(self) -> gym.spaces.Space:
+        """
+        Property to access the action space of the environment.
+        """
         return self._action_space
 
     @property
     def reward_space(self) -> gym.spaces.Space:
+        """
+        Property to access the reward space of the environment.
+        """
         return self._reward_space
 
     def __repr__(self) -> str: