From 52cec56f93cd1c1a4a1be5aa7c07d6fb09cfcb63 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sat, 11 Jul 2020 01:24:38 +0900
Subject: [PATCH 01/34] Adds basic code for bitflip DQN and adds basic code for
 hindsight replay buffer

 examples/her/ | 215 +++++++++++++++++++++++++++++
 pfrl/replay_buffers/   | 181 ++++++++++++++++++++++++
 2 files changed, 396 insertions(+)
 create mode 100644 examples/her/
 create mode 100644 pfrl/replay_buffers/

diff --git a/examples/her/ b/examples/her/
new file mode 100644
index 000000000..2a8b702ff
--- /dev/null
+++ b/examples/her/
@@ -0,0 +1,215 @@
+import argparse
+import gym
+import gym.spaces as spaces
+import torch.nn as nn
+import numpy as np
+import pfrl
+from pfrl.q_functions import DiscreteActionValueHead
+from pfrl import agents
+from pfrl import experiments
+from pfrl import explorers
+from pfrl import utils
+from pfrl import replay_buffers
+from pfrl.initializers import init_chainer_default
+class BitFlip(gym.GoalEnv):
+    """BitFlip environment from
+    Args:
+        n: State space is {0,1}^n
+    """
+    def __init__(self, n):
+        self.n = n
+        self.steps = 0
+        self.action_space = spaces.Discrete(n)
+        self.observation_space = spaces.Dict(dict(
+            desired_goal=spaces.MultiBinary(n),
+            achieved_goal=spaces.MultiBinary(n),
+            observation=spaces.MultiBinary(n),
+        ))
+    def compute_reward(self, achieved_goal, desired_goal, info):
+        return -1.0 if (achieved_goal != desired_goal).any() else 0.0
+    def step(self, action):
+        self.observation["observation"][action] = \
+            int(not self.observation["observation"][action])
+        reward = self.compute_reward(self.observation["achieved_goal"],
+                                     self.observation["desired_goal"], {})
+        done = (self.observation["desired_goal"] == \
+            self.observation["achieved_goal"]).all()
+        self.steps += 1
+        if self.steps == self.n:
+            done = True
+        return self.observation, reward, done, {}
+    def reset(self):
+        state = self.observation_space['desired_goal'].sample()
+        goal = self.observation_space['desired_goal'].sample()
+        while (state == goal).all():
+            goal = self.observation_space['desired_goal'].sample()
+        self.observation = dict()
+        self.observation["desired_goal"] = goal
+        self.observation["achieved_goal"] = state
+        self.observation["observation"] = state
+        self.steps = 0
+        return self.observation
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        default="results",
+        help=(
+            "Directory path to save output files."
+            " If it does not exist, it will be created."
+        ),
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
+    parser.add_argument(
+        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
+    )
+    parser.add_argument("--demo", action="store_true", default=False)
+    parser.add_argument("--load", type=str, default=None)
+    parser.add_argument(
+        "--log-level",
+        type=int,
+        default=20,
+        help="Logging level. 10:DEBUG, 20:INFO etc.",
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        default=5 * 10 ** 7,
+        help="Total number of timesteps to train the agent.",
+    )
+    parser.add_argument(
+        "--replay-start-size",
+        type=int,
+        default=5 * 10 ** 4,
+        help="Minimum replay buffer size before " + "performing gradient updates.",
+    )
+    parser.add_argument(
+        "--num-bits",
+        type=int,
+        default=10,
+        help="Number of bits for BitFlipping environment",
+    )
+    parser.add_argument("--eval-n-steps", type=int, default=125000)
+    parser.add_argument("--eval-interval", type=int, default=250000)
+    parser.add_argument("--n-best-episodes", type=int, default=30)
+    args = parser.parse_args()
+    import logging
+    logging.basicConfig(level=args.log_level)
+    # Set a random seed used in PFRL.
+    utils.set_random_seed(args.seed)
+    # Set different random seeds for train and test envs.
+    train_seed = args.seed
+    test_seed = 2 ** 31 - 1 - args.seed
+    args.outdir = experiments.prepare_output_dir(args, args.outdir)
+    print("Output files are saved in {}".format(args.outdir))
+    def make_env(test):
+        # Use different random seeds for train and test envs
+        env_seed = test_seed if test else train_seed
+        env = BitFlip(args.num_bits)
+        env.seed(int(env_seed))
+        return env
+    env = make_env(test=False)
+    eval_env = make_env(test=True)
+    n_actions = env.action_space.n
+    q_func = nn.Sequential(
+        init_chainer_default(nn.Linear(args.num_bits * 2, 256)),
+        nn.ReLU(),
+        init_chainer_default(nn.Linear(256, n_actions)),
+        DiscreteActionValueHead(),
+    )
+    # Use the same hyperparameters as the Nature paper
+    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
+        q_func.parameters(),
+        lr=2.5e-4,
+        alpha=0.95,
+        momentum=0.0,
+        eps=1e-2,
+        centered=True,
+    )
+    rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    explorer = explorers.LinearDecayEpsilonGreedy(
+        start_epsilon=1.0,
+        end_epsilon=0.1,
+        decay_steps=10 ** 6,
+        random_action_func=lambda: np.random.randint(n_actions),
+    )
+    def phi(observation):
+        # Feature extractor
+        obs = np.asarray(observation["observation"], dtype=np.float32) / 255
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32) / 255
+        return np.concatenate((obs, dg))
+    Agent = agents.DQN
+    agent = Agent(
+        q_func,
+        opt,
+        rbuf,
+        gpu=args.gpu,
+        gamma=0.99,
+        explorer=explorer,
+        replay_start_size=args.replay_start_size,
+        target_update_interval=10 ** 4,
+        clip_delta=True,
+        update_interval=4,
+        batch_accumulator="sum",
+        phi=phi,
+    )
+    if args.load:
+        agent.load(args.load)
+    if args.demo:
+        eval_stats = experiments.eval_performance(
+            env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
+        )
+        print(
+            "n_episodes: {} mean: {} median: {} stdev {}".format(
+                eval_stats["episodes"],
+                eval_stats["mean"],
+                eval_stats["median"],
+                eval_stats["stdev"],
+            )
+        )
+    else:
+        experiments.train_agent_with_evaluation(
+            agent=agent,
+            env=env,
+            steps=args.steps,
+            eval_n_steps=args.eval_n_steps,
+            eval_n_episodes=None,
+            eval_interval=args.eval_interval,
+            outdir=args.outdir,
+            save_best_so_far_agent=True,
+            eval_env=eval_env,
+        )
+if __name__ == "__main__":
+    main()
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
new file mode 100644
index 000000000..820bc8801
--- /dev/null
+++ b/pfrl/replay_buffers/
@@ -0,0 +1,181 @@
+import copy
+import numpy as np
+from pfrl.replay_buffer import EpisodicReplayBuffer
+from pfrl.replay_buffer import random_subseq
+def relabel_transition_goal(self, transition, goal_transition,
+                            reward_fn, swap_keys_list):
+    # Relabel/replace the desired goal for the transition with new_goal
+    for desired_obs_key, achieved_obs_key in swap_keys_list:
+        replacement = goal_transition["next_state"][achieved_obs_key]
+        transition["state"][desired_obs_key] = replacement
+        transition["next_state"][desired_obs_key] = replacement
+    new_goal = goal_transition["next_state"]["achieved_goal"]
+    achieved_goal = transition["next_state"]["achieved_goal"]
+    transition["reward"] = reward_fn(new_goal, achieved_goal)
+    return transition
+class HindsightReplayStrategy():
+    """ReplayStrategy for Hindsight experience replay
+    """
+    def __init__(self, reward_fn):
+        self.reward_fn = reward_fn
+    def apply(self, episodes):
+        return episodes
+class ReplayFinalGoal(HindsightReplayStrategy):
+    """Replay final goal.
+    """
+    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+        self.ignore_null_goals = ignore_null_goals
+        self.is_null_goal = is_null_goal  
+    def apply(self, episodes, reward_fn):
+        batch_size = len(episodes)
+        episode_lens = np.array([len(episode) for episode in episodes])
+        # Randomly select time-steps from each episode
+        ts = [np.random.randint(ep_len) for ep_len in episode_lens]
+        ts = np.array(ts)
+        # Select subset for hindsight goal replacement.
+        apply_hers = np.random.uniform(size=batch_size) < 0.5
+        batch = []
+        for episode, apply_her, t in zip(episodes, apply_hers, ts):
+            transition = episode[t]
+            if apply_her:
+                final_transition = episode[-1]
+                final_goal = final_transition["next_state"]["achieved_goal"]
+                if not (self.ignore_null_goals and
+                        self.is_null_goal(final_goal)):
+                    transition = copy.deepcopy(transition)
+                    transition = relabel_transition_goal(
+                        transition, final_transition, reward_fn, swap_keys_list)
+            batch.append([transition])
+        return batch
+class ReplayFutureGoal(HindsightReplayStrategy):
+    """Replay random future goal.
+        Args:
+            ignore_null_goals (bool): no replace with goal when nothing achieved
+            future_k (int): number of future goals to sample per true sample
+            swap_list (list): a list of tuples of keys to swap in the
+                observation. E.g. [(("desired_x", "achieved_x"))] This is used
+                to replace a transition's "desired_x" with a goal transition's
+                "achieved_x"
+    """
+    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+        self.ignore_null_goals = ignore_null_goals
+        self.is_null_goal = is_null_goal
+    def apply(self, episodes, reward_fn):
+        """Sample with the future strategy
+        """
+        batch_size = len(episodes)
+        episode_lens = np.array([len(episode) for episode in episodes])
+        # Randomly select time-steps from each episode
+        ts = [np.random.randint(ep_len) for ep_len in episode_lens]
+        ts = np.array(ts)
+        # Select subset for hindsight goal replacement. future_k controls ratio
+        apply_hers = np.random.uniform(size=batch_size) < self.future_prob
+        # Randomly select offsets for future goals
+        future_offset = np.random.uniform(
+            size=batch_size) * (episode_lens - ts)
+        future_offset = future_offset.astype(int)
+        future_ts = ts + future_offset
+        batch = []
+        for episode, apply_her, t, future_t in zip(episodes,
+                                                   apply_hers,
+                                                   ts, future_ts):
+            transition = episode[t]
+            if apply_her:
+                future_transition = episode[future_t]
+                future_goal = future_transition["next_state"]["achieved_goal"]
+                if not (self.ignore_null_goals and
+                        self.is_null_goal(future_goal)):
+                    transition = copy.deepcopy(transition)
+                    transition = relabel_transition_goal(
+                        transition, future_transition, reward_fn, swap_keys_list)
+            batch.append([transition])
+        return batch
+class HindsightReplayBuffer(EpisodicReplayBuffer):
+    """Hindsight Replay Buffer
+     We currently do not support N-step transitions for the
+     Hindsight Buffer.
+     Args:
+        reward_fn(fn): Calculate reward from achieved & observed goals
+        replay_strategy: instance of HindsightReplayStrategy()
+        capacity (int): Capacity of the replay buffer
+        future_k (int): number of future goals to sample per true sample
+        swap_list (list): a list of tuples of keys to swap in the
+            observation. E.g. [(("desired_x", "achieved_x"))] This is used
+            to replace a transition's "desired_x" with a goal transition's
+            "achieved_x"
+    """
+    def __init__(self,
+                 reward_fn,
+                 replay_strategy,
+                 capacity=None,
+                 is_null_goal=None,
+                 future_k=0,
+                 swap_list=[('desired_goal', 'achieved_goal')]):
+        assert replay_strategy in ["future", "final", "none"]
+        if ignore_null_goals:
+            assert is_null_goal is not None, "is_null_goal to detect when no\
+                goal was reached is required when ignore_null_goals=True"
+        self.reward_fn = reward_fn
+        self.replay_strategy = replay_strategy
+        self.is_null_goal = is_null_goal
+        self.swap_keys_list = swap_list
+        assert ('desired_goal', 'achieved_goal') in self.swap_keys_list
+        super(HindsightReplayBuffer, self).__init__(capacity)
+        # probability of sampling a future goal instead of a true goal
+        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
+    def sample(self, n):
+        # Sample n transitions from the hindsight replay buffer
+        assert len(self.memory) >= n
+        # Select n episodes
+        episodes = self.sample_episodes(n)
+        batch = self.replay_strategy.apply(episodes,
+                                           self.reward_fn,
+                                           self.swap_keys_list)
+        if self.replay_strategy == "future":
+            batch = self._replay_future(episodes)
+        elif self.replay_strategy == "final":
+            batch = self._replay_final(episodes)
+        else:
+            raise NotImplementedError()
+        return batch
+    def sample_episodes(self, n_episodes, max_len=None):
+        episodes = self.sample_with_replacement(n_episodes)
+        if max_len is not None:
+            return [random_subseq(ep, max_len) for ep in episodes]
+        else:
+            return episodes
+    def sample_with_replacement(self, k):
+        return [self.episodic_memory[i] for i in
+                np.random.randint(0, len(self.episodic_memory), k)]
\ No newline at end of file

From 6859259c35d86c0ee9234aad9056a7dc3abcf797 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sat, 11 Jul 2020 22:13:51 +0900
Subject: [PATCH 02/34] Adds hindsight to bit flip

 examples/her/ | 13 ++++++++++++-
 pfrl/replay_buffers/   | 24 ++++++------------------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 2a8b702ff..a3bd888f4 100644
--- a/examples/her/
+++ b/examples/her/
@@ -103,6 +103,7 @@ def main():
         help="Number of bits for BitFlipping environment",
+    parser.add_argument("--use-hindsight", type=bool, default=True)
     parser.add_argument("--eval-n-steps", type=int, default=125000)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=30)
@@ -150,7 +151,17 @@ def make_env(test):
-    rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    def reward_fn(dg, ag):
+        return -1.0 if (ag != dg).any() else 0.0
+    if args.use_hindsight:
+        rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
+            reward_fn=reward_fn,
+            replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+            capacity=10 ** 6
+            )
+    else:
+        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
     explorer = explorers.LinearDecayEpsilonGreedy(
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 820bc8801..4ff693e75 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -26,7 +26,7 @@ class HindsightReplayStrategy():
     def __init__(self, reward_fn):
         self.reward_fn = reward_fn
-    def apply(self, episodes):
+    def apply(self, episodes,  reward_fn, swap_keys_list):
         return episodes
 class ReplayFinalGoal(HindsightReplayStrategy):
@@ -37,7 +37,7 @@ def __init__(self, ignore_null_goals=True, is_null_goal=None):
         self.ignore_null_goals = ignore_null_goals
         self.is_null_goal = is_null_goal  
-    def apply(self, episodes, reward_fn):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
@@ -68,17 +68,14 @@ class ReplayFutureGoal(HindsightReplayStrategy):
             ignore_null_goals (bool): no replace with goal when nothing achieved
             future_k (int): number of future goals to sample per true sample
-            swap_list (list): a list of tuples of keys to swap in the
-                observation. E.g. [(("desired_x", "achieved_x"))] This is used
-                to replace a transition's "desired_x" with a goal transition's
-                "achieved_x"
-    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+    def __init__(self, ignore_null_goals=True, is_null_goal=None, future_k=4):
         self.ignore_null_goals = ignore_null_goals
         self.is_null_goal = is_null_goal
+        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
-    def apply(self, episodes, reward_fn):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         """Sample with the future strategy
         batch_size = len(episodes)
@@ -122,7 +119,6 @@ class HindsightReplayBuffer(EpisodicReplayBuffer):
         reward_fn(fn): Calculate reward from achieved & observed goals
         replay_strategy: instance of HindsightReplayStrategy()
         capacity (int): Capacity of the replay buffer
-        future_k (int): number of future goals to sample per true sample
         swap_list (list): a list of tuples of keys to swap in the
             observation. E.g. [(("desired_x", "achieved_x"))] This is used
             to replace a transition's "desired_x" with a goal transition's
@@ -137,7 +133,7 @@ def __init__(self,
                  swap_list=[('desired_goal', 'achieved_goal')]):
-        assert replay_strategy in ["future", "final", "none"]
+        assert replay_strategy is not None
         if ignore_null_goals:
             assert is_null_goal is not None, "is_null_goal to detect when no\
                 goal was reached is required when ignore_null_goals=True"
@@ -149,7 +145,6 @@ def __init__(self,
         super(HindsightReplayBuffer, self).__init__(capacity)
         # probability of sampling a future goal instead of a true goal
-        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
     def sample(self, n):
@@ -160,13 +155,6 @@ def sample(self, n):
         batch = self.replay_strategy.apply(episodes,
-        if self.replay_strategy == "future":
-            batch = self._replay_future(episodes)
-        elif self.replay_strategy == "final":
-            batch = self._replay_final(episodes)
-        else:
-            raise NotImplementedError()
         return batch
     def sample_episodes(self, n_episodes, max_len=None):

From 120cfa58eb25c1538de9ad268147cb5c03bb9d18 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sat, 11 Jul 2020 22:30:56 +0900
Subject: [PATCH 03/34] removes null_goals

 pfrl/replay_buffers/  |  4 ++++
 pfrl/replay_buffers/ | 31 +++++++++----------------------
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 1c5b0ef2b..3319041f8 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -1,4 +1,6 @@
 from pfrl.replay_buffers.episodic import EpisodicReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentEpisodicReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentReplayBuffer  # NOQA
 from pfrl.replay_buffers.prioritized import PrioritizedReplayBuffer  # NOQA
@@ -7,3 +9,5 @@
 from pfrl.replay_buffers.replay_buffer import ReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 4ff693e75..83cdad084 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -2,11 +2,11 @@
 import numpy as np
-from pfrl.replay_buffer import EpisodicReplayBuffer
+from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 from pfrl.replay_buffer import random_subseq
-def relabel_transition_goal(self, transition, goal_transition,
+def relabel_transition_goal(transition, goal_transition,
                             reward_fn, swap_keys_list):
     # Relabel/replace the desired goal for the transition with new_goal
     for desired_obs_key, achieved_obs_key in swap_keys_list:
@@ -33,10 +33,6 @@ class ReplayFinalGoal(HindsightReplayStrategy):
     """Replay final goal.
-    def __init__(self, ignore_null_goals=True, is_null_goal=None):
-        self.ignore_null_goals = ignore_null_goals
-        self.is_null_goal = is_null_goal  
     def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
@@ -54,11 +50,9 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             if apply_her:
                 final_transition = episode[-1]
                 final_goal = final_transition["next_state"]["achieved_goal"]
-                if not (self.ignore_null_goals and
-                        self.is_null_goal(final_goal)):
-                    transition = copy.deepcopy(transition)
-                    transition = relabel_transition_goal(
-                        transition, final_transition, reward_fn, swap_keys_list)
+                transition = copy.deepcopy(transition)
+                transition = relabel_transition_goal(
+                    transition, final_transition, reward_fn, swap_keys_list)
         return batch
@@ -70,9 +64,7 @@ class ReplayFutureGoal(HindsightReplayStrategy):
             future_k (int): number of future goals to sample per true sample
-    def __init__(self, ignore_null_goals=True, is_null_goal=None, future_k=4):
-        self.ignore_null_goals = ignore_null_goals
-        self.is_null_goal = is_null_goal
+    def __init__(self, future_k=4):
         self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
     def apply(self, episodes, reward_fn, swap_keys_list):
@@ -101,11 +93,9 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             if apply_her:
                 future_transition = episode[future_t]
                 future_goal = future_transition["next_state"]["achieved_goal"]
-                if not (self.ignore_null_goals and
-                        self.is_null_goal(future_goal)):
-                    transition = copy.deepcopy(transition)
-                    transition = relabel_transition_goal(
-                        transition, future_transition, reward_fn, swap_keys_list)
+                transition = copy.deepcopy(transition)
+                transition = relabel_transition_goal(
+                    transition, future_transition, reward_fn, swap_keys_list)
         return batch
@@ -134,9 +124,6 @@ def __init__(self,
                  swap_list=[('desired_goal', 'achieved_goal')]):
         assert replay_strategy is not None
-        if ignore_null_goals:
-            assert is_null_goal is not None, "is_null_goal to detect when no\
-                goal was reached is required when ignore_null_goals=True"
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
         self.is_null_goal = is_null_goal

From 3360fafd7312db1ea80b17c746c4743f4fdeab0a Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sun, 12 Jul 2020 00:15:25 +0900
Subject: [PATCH 04/34] modifies total steps

 examples/her/ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/her/ b/examples/her/
index a3bd888f4..288af1e1b 100644
--- a/examples/her/
+++ b/examples/her/
@@ -88,7 +88,7 @@ def main():
-        default=5 * 10 ** 7,
+        default=10 ** 7,
         help="Total number of timesteps to train the agent.",

From 1e572055a889033aa6bd2241c7316ae5568005d9 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sun, 12 Jul 2020 00:44:35 +0900
Subject: [PATCH 05/34] Updates space sampling

 examples/her/ | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 288af1e1b..beb702d63 100644
--- a/examples/her/
+++ b/examples/her/
@@ -49,10 +49,11 @@ def step(self, action):
         return self.observation, reward, done, {}
     def reset(self):
-        state = self.observation_space['desired_goal'].sample()
-        goal = self.observation_space['desired_goal'].sample()
+        sample_obs = self.observation_space.sample()
+        state, goal = sample_obs['observation'], sample_obs['desired_goal']
         while (state == goal).all():
-            goal = self.observation_space['desired_goal'].sample()
+            sample_obs = self.observation_space.sample()
+            state, goal = sample_obs['observation'], sample_obs['desired_goal']
         self.observation = dict()
         self.observation["desired_goal"] = goal
         self.observation["achieved_goal"] = state
@@ -61,7 +62,6 @@ def reset(self):
         return self.observation
 def main():
     parser = argparse.ArgumentParser()

From a60e1f5e982b523a85723d5add72b36610e3eb01 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Fri, 17 Jul 2020 17:43:34 +0900
Subject: [PATCH 06/34] Cleans hindsight buffer code

 pfrl/replay_buffers/ | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 83cdad084..72fbde3aa 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -119,14 +119,11 @@ def __init__(self,
-                 is_null_goal=None,
-                 future_k=0,
                  swap_list=[('desired_goal', 'achieved_goal')]):
         assert replay_strategy is not None
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
-        self.is_null_goal = is_null_goal
         self.swap_keys_list = swap_list
         assert ('desired_goal', 'achieved_goal') in self.swap_keys_list

From eece248a6be7458ce64d4ffafc6a010dbf669bb9 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sun, 19 Jul 2020 23:31:06 +0900
Subject: [PATCH 07/34] Modifies experiment params

 examples/her/ | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/ b/examples/her/
index beb702d63..304ce98ab 100644
--- a/examples/her/
+++ b/examples/her/
@@ -104,9 +104,9 @@ def main():
         help="Number of bits for BitFlipping environment",
     parser.add_argument("--use-hindsight", type=bool, default=True)
-    parser.add_argument("--eval-n-steps", type=int, default=125000)
+    parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
-    parser.add_argument("--n-best-episodes", type=int, default=30)
+    parser.add_argument("--n-best-episodes", type=int, default=100)
     args = parser.parse_args()
     import logging
@@ -213,8 +213,8 @@ def phi(observation):
-            eval_n_steps=args.eval_n_steps,
-            eval_n_episodes=None,
+            eval_n_steps=None,
+            eval_n_episodes=args.eval_n_episodes,

From e38e0d019fc649879a5f4bf4c39590716d589d3b Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 01:54:40 +0900
Subject: [PATCH 08/34] Applies black to pfrl

 pfrl/replay_buffers/ | 87 ++++++++++++++++----------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 72fbde3aa..f7750ce25 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -6,8 +6,7 @@
 from pfrl.replay_buffer import random_subseq
-def relabel_transition_goal(transition, goal_transition,
-                            reward_fn, swap_keys_list):
+def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_list):
     # Relabel/replace the desired goal for the transition with new_goal
     for desired_obs_key, achieved_obs_key in swap_keys_list:
         replacement = goal_transition["next_state"][achieved_obs_key]
@@ -19,19 +18,18 @@ def relabel_transition_goal(transition, goal_transition,
     return transition
-class HindsightReplayStrategy():
-    """ReplayStrategy for Hindsight experience replay
-    """
+class HindsightReplayStrategy:
+    """ReplayStrategy for Hindsight experience replay"""
     def __init__(self, reward_fn):
         self.reward_fn = reward_fn
-    def apply(self, episodes,  reward_fn, swap_keys_list):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         return episodes
 class ReplayFinalGoal(HindsightReplayStrategy):
-    """Replay final goal.
-    """
+    """Replay final goal."""
     def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
@@ -52,24 +50,25 @@ def apply(self, episodes, reward_fn, swap_keys_list):
                 final_goal = final_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
-                    transition, final_transition, reward_fn, swap_keys_list)
+                    transition, final_transition, reward_fn, swap_keys_list
+                )
         return batch
 class ReplayFutureGoal(HindsightReplayStrategy):
     """Replay random future goal.
-        Args:
-            ignore_null_goals (bool): no replace with goal when nothing achieved
-            future_k (int): number of future goals to sample per true sample
+    Args:
+        ignore_null_goals (bool): no replace with goal when nothing achieved
+        future_k (int): number of future goals to sample per true sample
     def __init__(self, future_k=4):
         self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
     def apply(self, episodes, reward_fn, swap_keys_list):
-        """Sample with the future strategy
-        """
+        """Sample with the future strategy"""
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
@@ -81,64 +80,64 @@ def apply(self, episodes, reward_fn, swap_keys_list):
         apply_hers = np.random.uniform(size=batch_size) < self.future_prob
         # Randomly select offsets for future goals
-        future_offset = np.random.uniform(
-            size=batch_size) * (episode_lens - ts)
+        future_offset = np.random.uniform(size=batch_size) * (episode_lens - ts)
         future_offset = future_offset.astype(int)
         future_ts = ts + future_offset
         batch = []
-        for episode, apply_her, t, future_t in zip(episodes,
-                                                   apply_hers,
-                                                   ts, future_ts):
+        for episode, apply_her, t, future_t in zip(episodes, apply_hers, ts, future_ts):
             transition = episode[t]
             if apply_her:
                 future_transition = episode[future_t]
                 future_goal = future_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
-                    transition, future_transition, reward_fn, swap_keys_list)
+                    transition, future_transition, reward_fn, swap_keys_list
+                )
         return batch
 class HindsightReplayBuffer(EpisodicReplayBuffer):
     """Hindsight Replay Buffer
-     We currently do not support N-step transitions for the
-     Hindsight Buffer.
-     Args:
-        reward_fn(fn): Calculate reward from achieved & observed goals
-        replay_strategy: instance of HindsightReplayStrategy()
-        capacity (int): Capacity of the replay buffer
-        swap_list (list): a list of tuples of keys to swap in the
-            observation. E.g. [(("desired_x", "achieved_x"))] This is used
-            to replace a transition's "desired_x" with a goal transition's
-            "achieved_x"
+    We currently do not support N-step transitions for the
+    Hindsight Buffer.
+    Args:
+       reward_fn(fn): Calculate reward from achieved & observed goals
+       replay_strategy: instance of HindsightReplayStrategy()
+       capacity (int): Capacity of the replay buffer
+       swap_list (list): a list of tuples of keys to swap in the
+           observation. E.g. [(("desired_x", "achieved_x"))] This is used
+           to replace a transition's "desired_x" with a goal transition's
+           "achieved_x"
-    def __init__(self,
-                 reward_fn,
-                 replay_strategy,
-                 capacity=None,
-                 swap_list=[('desired_goal', 'achieved_goal')]):
+    def __init__(
+        self,
+        reward_fn,
+        replay_strategy,
+        capacity=None,
+        swap_list=[("desired_goal", "achieved_goal")],
+    ):
         assert replay_strategy is not None
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
         self.swap_keys_list = swap_list
-        assert ('desired_goal', 'achieved_goal') in self.swap_keys_list
+        assert ("desired_goal", "achieved_goal") in self.swap_keys_list
         super(HindsightReplayBuffer, self).__init__(capacity)
         # probability of sampling a future goal instead of a true goal
     def sample(self, n):
         # Sample n transitions from the hindsight replay buffer
         assert len(self.memory) >= n
         # Select n episodes
         episodes = self.sample_episodes(n)
-        batch = self.replay_strategy.apply(episodes,
-                                           self.reward_fn,
-                                           self.swap_keys_list)
+        batch = self.replay_strategy.apply(
+            episodes, self.reward_fn, self.swap_keys_list
+        )
         return batch
     def sample_episodes(self, n_episodes, max_len=None):
@@ -149,5 +148,7 @@ def sample_episodes(self, n_episodes, max_len=None):
             return episodes
     def sample_with_replacement(self, k):
-        return [self.episodic_memory[i] for i in
-                np.random.randint(0, len(self.episodic_memory), k)]
\ No newline at end of file
+        return [
+            self.episodic_memory[i]
+            for i in np.random.randint(0, len(self.episodic_memory), k)
+        ]

From d89c788634aa64e7e412a78c9fbb01f64ca7481e Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 02:26:14 +0900
Subject: [PATCH 09/34] Updates docstring

 pfrl/replay_buffers/ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index f7750ce25..dc70b861e 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -104,7 +104,7 @@ class HindsightReplayBuffer(EpisodicReplayBuffer):
     We currently do not support N-step transitions for the
     Hindsight Buffer.
-       reward_fn(fn): Calculate reward from achieved & observed goals
+       reward_fn(fn): reward fn with input: (achieved_goal, desired_goal)
        replay_strategy: instance of HindsightReplayStrategy()
        capacity (int): Capacity of the replay buffer
        swap_list (list): a list of tuples of keys to swap in the

From 080916ff970f7a012615c711a41f3afdce8d4fe2 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 02:27:44 +0900
Subject: [PATCH 10/34] Implements step function and success rate calculation

 examples/her/ | 45 ++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 304ce98ab..9ef8f7be4 100644
--- a/examples/her/
+++ b/examples/her/
@@ -16,6 +16,10 @@
 from pfrl.initializers import init_chainer_default
+def reward_fn(dg, ag):
+    return -1.0 if (ag != dg).any() else 0.0
 class BitFlip(gym.GoalEnv):
     """BitFlip environment from
@@ -32,20 +36,33 @@ def __init__(self, n):
-    def compute_reward(self, achieved_goal, desired_goal, info):
-        return -1.0 if (achieved_goal != desired_goal).any() else 0.0
+        self.clear_statistics()
     def step(self, action):
-        self.observation["observation"][action] = \
-            int(not self.observation["observation"][action])
-        reward = self.compute_reward(self.observation["achieved_goal"],
-                                     self.observation["desired_goal"], {})
-        done = (self.observation["desired_goal"] == \
+        # Compute action outcome
+        bit_new = int(not self.observation["observation"][action])
+        new_obs = self.observation["observation"].copy()
+        new_obs[action] = bit_new
+        # Set new observation
+        dg = self.observation["desired_goal"]
+        self.observation["desired_goal"] = dg.copy()
+        self.observation["achieved_goal"] = new_obs
+        self.observation["observation"] = new_obs
+        reward = reward_fn(self.observation["desired_goal"],
+                           self.observation["achieved_goal"])
+        done_success = (self.observation["desired_goal"] == \
+        done = done_success
         self.steps += 1
         if self.steps == self.n:
             done = True
+        if done:
+            if done_success:
+                assert reward == 0
+                self.results.append(1)
+            else:
+                self.results.append(0)
         return self.observation, reward, done, {}
     def reset(self):
@@ -61,6 +78,15 @@ def reset(self):
         self.steps = 0
         return self.observation
+    def get_statistics(self):
+        failures =  self.results.count(0)
+        successes = self.results.count(1)
+        assert len(self.results) == failures + successes
+        success_rate = successes/float(self.results)
+        return [("success_rate", success_rate)]
+    def clear_statistics(self):
+        self.results = []
 def main():
     parser = argparse.ArgumentParser()
@@ -151,9 +177,6 @@ def make_env(test):
-    def reward_fn(dg, ag):
-        return -1.0 if (ag != dg).any() else 0.0
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(

From c363dc76f49f2b8ae8211153b66dc87499844efe Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 02:30:12 +0900
Subject: [PATCH 11/34] Updates agent, explorer, replay start size, and phi

 examples/her/ | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 9ef8f7be4..b4a25ff04 100644
--- a/examples/her/
+++ b/examples/her/
@@ -120,7 +120,7 @@ def main():
-        default=5 * 10 ** 4,
+        default=5 * 10 ** 2,
         help="Minimum replay buffer size before " + "performing gradient updates.",
@@ -188,18 +188,18 @@ def make_env(test):
     explorer = explorers.LinearDecayEpsilonGreedy(
-        end_epsilon=0.1,
-        decay_steps=10 ** 6,
+        end_epsilon=0.0,
+        decay_steps=5 * 10 ** 3,
         random_action_func=lambda: np.random.randint(n_actions),
     def phi(observation):
         # Feature extractor
-        obs = np.asarray(observation["observation"], dtype=np.float32) / 255
-        dg = np.asarray(observation["desired_goal"], dtype=np.float32) / 255
+        obs = np.asarray(observation["observation"], dtype=np.float32)
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32)
         return np.concatenate((obs, dg))
-    Agent = agents.DQN
+    Agent = agents.DoubleDQN
     agent = Agent(

From 4e15a76efeffe5d30dc11864978e3fd57cb15657 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 02:34:22 +0900
Subject: [PATCH 12/34] Applies black

 examples/her/ | 36 +++++++++++++++++-------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/examples/her/ b/examples/her/
index b4a25ff04..99c2f2daf 100644
--- a/examples/her/
+++ b/examples/her/
@@ -31,11 +31,13 @@ def __init__(self, n):
         self.n = n
         self.steps = 0
         self.action_space = spaces.Discrete(n)
-        self.observation_space = spaces.Dict(dict(
-            desired_goal=spaces.MultiBinary(n),
-            achieved_goal=spaces.MultiBinary(n),
-            observation=spaces.MultiBinary(n),
-        ))
+        self.observation_space = spaces.Dict(
+            dict(
+                desired_goal=spaces.MultiBinary(n),
+                achieved_goal=spaces.MultiBinary(n),
+                observation=spaces.MultiBinary(n),
+            )
+        )
     def step(self, action):
@@ -49,10 +51,12 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
-        reward = reward_fn(self.observation["desired_goal"],
-                           self.observation["achieved_goal"])
-        done_success = (self.observation["desired_goal"] == \
-            self.observation["achieved_goal"]).all()
+        reward = reward_fn(
+            self.observation["desired_goal"], self.observation["achieved_goal"]
+        )
+        done_success = (
+            self.observation["desired_goal"] == self.observation["achieved_goal"]
+        ).all()
         done = done_success
         self.steps += 1
         if self.steps == self.n:
@@ -67,10 +71,10 @@ def step(self, action):
     def reset(self):
         sample_obs = self.observation_space.sample()
-        state, goal = sample_obs['observation'], sample_obs['desired_goal']
+        state, goal = sample_obs["observation"], sample_obs["desired_goal"]
         while (state == goal).all():
             sample_obs = self.observation_space.sample()
-            state, goal = sample_obs['observation'], sample_obs['desired_goal']
+            state, goal = sample_obs["observation"], sample_obs["desired_goal"]
         self.observation = dict()
         self.observation["desired_goal"] = goal
         self.observation["achieved_goal"] = state
@@ -79,15 +83,16 @@ def reset(self):
         return self.observation
     def get_statistics(self):
-        failures =  self.results.count(0)
+        failures = self.results.count(0)
         successes = self.results.count(1)
         assert len(self.results) == failures + successes
-        success_rate = successes/float(self.results)
+        success_rate = successes / float(self.results)
         return [("success_rate", success_rate)]
     def clear_statistics(self):
         self.results = []
 def main():
     parser = argparse.ArgumentParser()
@@ -181,8 +186,8 @@ def make_env(test):
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
-            capacity=10 ** 6
-            )
+            capacity=10 ** 6,
+        )
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
@@ -218,7 +223,6 @@ def phi(observation):
     if args.load:
     if args.demo:
         eval_stats = experiments.eval_performance(
             env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None

From 71809a532b6d3a376c0bcbb7710551d40516698c Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 02:36:28 +0900
Subject: [PATCH 13/34] Updates optimizer, and target update interval

 examples/her/ | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 99c2f2daf..001e7a05b 100644
--- a/examples/her/
+++ b/examples/her/
@@ -172,15 +172,7 @@ def make_env(test):
-    # Use the same hyperparameters as the Nature paper
-    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
-        q_func.parameters(),
-        lr=2.5e-4,
-        alpha=0.95,
-        momentum=0.0,
-        eps=1e-2,
-        centered=True,
-    )
+    opt = torch.optim.Adam(q_func.parameters(), eps=1e-3)
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
@@ -192,7 +184,7 @@ def make_env(test):
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
     explorer = explorers.LinearDecayEpsilonGreedy(
-        start_epsilon=1.0,
+        start_epsilon=0.3,
         decay_steps=5 * 10 ** 3,
         random_action_func=lambda: np.random.randint(n_actions),
@@ -213,7 +205,7 @@ def phi(observation):
-        target_update_interval=10 ** 4,
+        target_update_interval=10 ** 3,

From 8c616e542ad7e3a72f46a1099f476459c5245081 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 04:00:11 +0900
Subject: [PATCH 14/34] Fixes minor errors

 examples/her/ | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 001e7a05b..ea1278636 100644
--- a/examples/her/
+++ b/examples/her/
@@ -2,8 +2,9 @@
 import gym
 import gym.spaces as spaces
-import torch.nn as nn
 import numpy as np
+import torch
+import torch.nn as nn
 import pfrl
 from pfrl.q_functions import DiscreteActionValueHead
@@ -40,6 +41,9 @@ def __init__(self, n):
+    def compute_reward(self, achieved_goal, desired_goal, info):
+        return reward_fn(desired_goal, achieved_goal)
     def step(self, action):
         # Compute action outcome
         bit_new = int(not self.observation["observation"][action])
@@ -51,9 +55,9 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
-        reward = reward_fn(
-            self.observation["desired_goal"], self.observation["achieved_goal"]
-        )
+        reward = self.compute_reward(self.observation["achieved_goal"],
+                                     self.observation["desired_goal"],
+                                     {})
         done_success = (
             self.observation["desired_goal"] == self.observation["achieved_goal"]
@@ -86,7 +90,9 @@ def get_statistics(self):
         failures = self.results.count(0)
         successes = self.results.count(1)
         assert len(self.results) == failures + successes
-        success_rate = successes / float(self.results)
+        if not self.results:
+            return [("success_rate", None)]
+        success_rate = successes / float(len(self.results))
         return [("success_rate", success_rate)]
     def clear_statistics(self):

From 9721d1afdae9fed11585cd0a58a8cb4a425dc0db Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 14:10:39 +0900
Subject: [PATCH 15/34] Applies black

 examples/her/ | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/her/ b/examples/her/
index ea1278636..ea4bcc915 100644
--- a/examples/her/
+++ b/examples/her/
@@ -55,9 +55,9 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
-        reward = self.compute_reward(self.observation["achieved_goal"],
-                                     self.observation["desired_goal"],
-                                     {})
+        reward = self.compute_reward(
+            self.observation["achieved_goal"], self.observation["desired_goal"], {}
+        )
         done_success = (
             self.observation["desired_goal"] == self.observation["achieved_goal"]

From 8643e0de6ae982dc55ab772d4e6a7cdde4f39eed Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 14:32:30 +0900
Subject: [PATCH 16/34] Addresses flakes

 examples/her/ | 1 -
 pfrl/replay_buffers/   | 2 --
 2 files changed, 3 deletions(-)

diff --git a/examples/her/ b/examples/her/
index ea4bcc915..7e2baa04a 100644
--- a/examples/her/
+++ b/examples/her/
@@ -6,7 +6,6 @@
 import torch
 import torch.nn as nn
-import pfrl
 from pfrl.q_functions import DiscreteActionValueHead
 from pfrl import agents
 from pfrl import experiments
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index dc70b861e..fa24cdffd 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -47,7 +47,6 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             transition = episode[t]
             if apply_her:
                 final_transition = episode[-1]
-                final_goal = final_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
                     transition, final_transition, reward_fn, swap_keys_list
@@ -88,7 +87,6 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             transition = episode[t]
             if apply_her:
                 future_transition = episode[future_t]
-                future_goal = future_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
                     transition, future_transition, reward_fn, swap_keys_list

From 4d34f1e629a9872d6cda4dd854951bed313e7519 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 29 Oct 2020 23:51:58 +0900
Subject: [PATCH 17/34] Cleans up code

 examples/her/ | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 7e2baa04a..868a67278 100644
--- a/examples/her/
+++ b/examples/her/
@@ -29,7 +29,6 @@ class BitFlip(gym.GoalEnv):
     def __init__(self, n):
         self.n = n
-        self.steps = 0
         self.action_space = spaces.Discrete(n)
         self.observation_space = spaces.Dict(
@@ -43,6 +42,12 @@ def __init__(self, n):
     def compute_reward(self, achieved_goal, desired_goal, info):
         return reward_fn(desired_goal, achieved_goal)
+    def _check_done(self):
+        success = (
+            self.observation["desired_goal"] == self.observation["achieved_goal"]
+        ).all()
+        return (self.steps >= self.n) or success, success
     def step(self, action):
         # Compute action outcome
         bit_new = int(not self.observation["observation"][action])
@@ -57,19 +62,12 @@ def step(self, action):
         reward = self.compute_reward(
             self.observation["achieved_goal"], self.observation["desired_goal"], {}
-        done_success = (
-            self.observation["desired_goal"] == self.observation["achieved_goal"]
-        ).all()
-        done = done_success
         self.steps += 1
-        if self.steps == self.n:
-            done = True
+        done, success = self._check_done()
+        assert success == (reward == 0)
         if done:
-            if done_success:
-                assert reward == 0
-                self.results.append(1)
-            else:
-                self.results.append(0)
+            result = 1 if success else 0
+            self.results.append(result)
         return self.observation, reward, done, {}
     def reset(self):

From f5a1bfa2f0116a017e3c001302f434d912c39c24 Mon Sep 17 00:00:00 2001
From: Prabhat Nagarajan <>
Date: Fri, 30 Oct 2020 04:15:43 +0900
Subject: [PATCH 18/34] Update examples/her/

Co-authored-by: Justin DuJardin <>
 examples/her/ | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 868a67278..76624cf7e 100644
--- a/examples/her/
+++ b/examples/her/
@@ -55,9 +55,11 @@ def step(self, action):
         new_obs[action] = bit_new
         # Set new observation
         dg = self.observation["desired_goal"]
-        self.observation["desired_goal"] = dg.copy()
-        self.observation["achieved_goal"] = new_obs
-        self.observation["observation"] = new_obs
+        self.observation = {
+            "desired_goal": dg.copy(),
+            "achieved_goal": new_obs,
+            "observation": new_obs,
+        }
         reward = self.compute_reward(
             self.observation["achieved_goal"], self.observation["desired_goal"], {}

From 3812384f06960e41ce7990d8f3f8de48467c27a0 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Fri, 30 Oct 2020 12:59:33 +0900
Subject: [PATCH 19/34] experiment and hyperparameter update

 examples/her/ | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 76624cf7e..5c056840e 100644
--- a/examples/her/
+++ b/examples/her/
@@ -124,7 +124,7 @@ def main():
-        default=10 ** 7,
+        default=5 * 10 ** 6,
         help="Total number of timesteps to train the agent.",
@@ -177,7 +177,7 @@ def make_env(test):
-    opt = torch.optim.Adam(q_func.parameters(), eps=1e-3)
+    opt = torch.optim.Adam(q_func.parameters(), eps=1e-4)
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
@@ -188,10 +188,12 @@ def make_env(test):
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    decay_steps = ((args.num_bits + 5) * 10 ** 3)
+    end_epsilon = min(0.1, 0.5/args.num_bits)
     explorer = explorers.LinearDecayEpsilonGreedy(
-        start_epsilon=0.3,
-        end_epsilon=0.0,
-        decay_steps=5 * 10 ** 3,
+        start_epsilon=0.5,
+        end_epsilon=end_epsilon,
+        decay_steps=decay_steps,
         random_action_func=lambda: np.random.randint(n_actions),

From 5cd21e08e74dc1e0f9b38caba06e34737a1076b4 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Fri, 30 Oct 2020 20:49:11 +0900
Subject: [PATCH 20/34] Switches parse args

 examples/her/ | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 5c056840e..d2d66ccfb 100644
--- a/examples/her/
+++ b/examples/her/
@@ -139,7 +139,7 @@ def main():
         help="Number of bits for BitFlipping environment",
-    parser.add_argument("--use-hindsight", type=bool, default=True)
+    parser.add_argument("--no-hindsight", action='store_true', default=False)
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)
@@ -179,14 +179,14 @@ def make_env(test):
     opt = torch.optim.Adam(q_func.parameters(), eps=1e-4)
-    if args.use_hindsight:
+    if args.no_hindsight:
+        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    else:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
             capacity=10 ** 6,
-    else:
-        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
     decay_steps = ((args.num_bits + 5) * 10 ** 3)
     end_epsilon = min(0.1, 0.5/args.num_bits)

From 035ad635b9539629f2672e9192a1421e7255611c Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Sat, 31 Oct 2020 02:19:08 +0900
Subject: [PATCH 21/34] Applies black

 examples/her/ | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/her/ b/examples/her/
index d2d66ccfb..0354bb344 100644
--- a/examples/her/
+++ b/examples/her/
@@ -139,7 +139,7 @@ def main():
         help="Number of bits for BitFlipping environment",
-    parser.add_argument("--no-hindsight", action='store_true', default=False)
+    parser.add_argument("--no-hindsight", action="store_true", default=False)
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)
@@ -188,8 +188,8 @@ def make_env(test):
             capacity=10 ** 6,
-    decay_steps = ((args.num_bits + 5) * 10 ** 3)
-    end_epsilon = min(0.1, 0.5/args.num_bits)
+    decay_steps = (args.num_bits + 5) * 10 ** 3
+    end_epsilon = min(0.1, 0.5 / args.num_bits)
     explorer = explorers.LinearDecayEpsilonGreedy(

From e481b857d5aae7c83a248727ec9b7673921ea204 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 5 Nov 2020 20:49:57 +0900
Subject: [PATCH 22/34] Adds HER to the Repo readme

--- | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ b/
index 08f75a8fe..daa070db4 100644
--- a/
+++ b/
@@ -83,6 +83,8 @@ Following useful techniques have been also implemented in PFRL:
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/
 - [Prioritized Experience Replay](
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/
+- [Hindsight Experience Replay](
+  - examples: [[Bit-flip DQN]](examples/her/
 - [Dueling Network](
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/
 - [Normalized Advantage Function](

From ed4ae2e01262668ce4ef698aa0796968696a74c8 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Fri, 6 Nov 2020 01:11:28 +0900
Subject: [PATCH 23/34] Applies isort

 examples/her/ | 9 ++-------
 pfrl/replay_buffers/    | 6 +++---
 pfrl/replay_buffers/   | 2 +-
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 0354bb344..a430a1bef 100644
--- a/examples/her/
+++ b/examples/her/
@@ -6,14 +6,9 @@
 import torch
 import torch.nn as nn
-from pfrl.q_functions import DiscreteActionValueHead
-from pfrl import agents
-from pfrl import experiments
-from pfrl import explorers
-from pfrl import utils
-from pfrl import replay_buffers
+from pfrl import agents, experiments, explorers, replay_buffers, utils
 from pfrl.initializers import init_chainer_default
+from pfrl.q_functions import DiscreteActionValueHead
 def reward_fn(dg, ag):
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 3319041f8..d33f1bfba 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -1,6 +1,8 @@
 from pfrl.replay_buffers.episodic import EpisodicReplayBuffer  # NOQA
-from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
 from pfrl.replay_buffers.hindsight import HindsightReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
 from pfrl.replay_buffers.persistent import PersistentEpisodicReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentReplayBuffer  # NOQA
 from pfrl.replay_buffers.prioritized import PrioritizedReplayBuffer  # NOQA
@@ -9,5 +11,3 @@
 from pfrl.replay_buffers.replay_buffer import ReplayBuffer  # NOQA
-from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
-from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index fa24cdffd..92b8f1eb2 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -2,8 +2,8 @@
 import numpy as np
-from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 from pfrl.replay_buffer import random_subseq
+from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_list):

From 9841438cbeb6d92b6c6226747d59430690bdb8ea Mon Sep 17 00:00:00 2001
From: muupan <>
Date: Sat, 7 Nov 2020 00:34:31 +0900
Subject: [PATCH 24/34] Make DDPG HER work for FetchReach-v1

 examples/her/ | 305 +++++++++++++++++++++++++++
 pfrl/agents/                  |  25 +++
 2 files changed, 330 insertions(+)
 create mode 100644 examples/her/

diff --git a/examples/her/ b/examples/her/
new file mode 100644
index 000000000..b1078a96b
--- /dev/null
+++ b/examples/her/
@@ -0,0 +1,305 @@
+import argparse
+import gym
+import gym.spaces
+import numpy as np
+import torch
+import torch.nn as nn
+import pfrl
+from pfrl import experiments, replay_buffers, utils
+from pfrl.nn import BoundByTanh, ConcatObsAndAction
+from pfrl.policies import DeterministicHead
+class ComputeSuccessRate(gym.Wrapper):
+    def __init__(self, env):
+        super().__init__(env)
+        self.success_record = []
+    def reset(self):
+        """Open a new record slot for the coming episode, then reset the env."""
+        # None marks an episode in which no step has been taken yet.
+        self.success_record.append(None)
+        return self.env.reset()
+    def step(self, action):
+        """Step the wrapped env and store the latest ``info["is_success"]``."""
+        observation, reward, finished, info = self.env.step(action)
+        assert "is_success" in info
+        # Overwrite the slot on every step so the last step's flag is kept.
+        self.success_record[-1] = info["is_success"]
+        return observation, reward, finished, info
+    def get_statistics(self):
+        """Return ``[("success_rate", rate)]``; ``rate`` is NaN without data."""
+        # Slots that are still None belong to episodes with zero steps.
+        stepped = [flag for flag in self.success_record if flag is not None]
+        if stepped:
+            rate = stepped.count(True) / len(stepped)
+        else:
+            rate = np.nan
+        return [("success_rate", rate)]
+    def clear_statistics(self):
+        """Forget all recorded episodes."""
+        self.success_record = []
+class ClipObservation(gym.ObservationWrapper):
+    """Observation wrapper that bounds every observation to ``[low, high]``.
+    Args:
+        env: Env to wrap.
+        low: Lower limit.
+        high: Upper limit.
+    Attributes:
+        original_observation: Most recent observation before clipping.
+    """
+    def __init__(self, env, low, high):
+        super().__init__(env)
+        self.low, self.high = low, high
+    def observation(self, observation):
+        # Remember the unclipped value on the instance before clipping.
+        self.original_observation = observation
+        clipped = np.clip(observation, self.low, self.high)
+        return clipped
+class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
+    """Epsilon-Greedy with Gaussian noise.
+    This type of explorer was used in 
+    Args:
+        epsilon: Probability of calling ``random_action_func`` instead of
+            taking the (noisy) greedy action.
+        random_action_func: Callable without arguments returning an action.
+        noise_scale: Standard deviation of the zero-mean Gaussian noise that
+            is added to greedy actions.
+        low: Optional lower clipping bound for the returned action.
+        high: Optional upper clipping bound for the returned action.
+    """
+    def __init__(self, epsilon, random_action_func, noise_scale, low=None, high=None):
+        self.low = low
+        self.high = high
+        self.noise_scale = noise_scale
+        self.random_action_func = random_action_func
+        self.epsilon = epsilon
+    def select_action(self, t, greedy_action_func, action_value=None):
+        """Return a random action or a noise-perturbed greedy one."""
+        take_random = np.random.rand() < self.epsilon
+        if take_random:
+            a = self.random_action_func()
+        else:
+            greedy = greedy_action_func()
+            perturbation = np.random.normal(scale=self.noise_scale, size=greedy.shape)
+            a = greedy + perturbation.astype(np.float32)
+        # Clipping is skipped only when neither bound was configured.
+        if self.low is None and self.high is None:
+            return a
+        return np.clip(a, self.low, self.high)
+    def __repr__(self):
+        template = "EpsilonGreedyWithGaussianNoise(epsilon={}, noise_scale={}, low={}, high={})"
+        return template.format(self.epsilon, self.noise_scale, self.low, self.high)
+def main():
+    """Parse CLI options, build a DDPG agent with hindsight replay, then train or demo it."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        default="results",
+        help=(
+            "Directory path to save output files."
+            " If it does not exist, it will be created."
+        ),
+    )
+    parser.add_argument(
+        "--env",
+        type=str,
+        default="FetchReach-v1",
+        help="OpenAI Gym MuJoCo env to perform algorithm on.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
+    parser.add_argument(
+        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
+    )
+    parser.add_argument("--demo", action="store_true", default=False)
+    parser.add_argument("--load", type=str, default=None)
+    parser.add_argument(
+        "--log-level",
+        type=int,
+        default=20,
+        help="Logging level. 10:DEBUG, 20:INFO etc.",
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        default=5 * 10 ** 3,
+        help="Total number of timesteps to train the agent.",
+    )
+    parser.add_argument(
+        "--replay-start-size",
+        type=int,
+        default=5 * 10 ** 2,
+        help="Minimum replay buffer size before " + "performing gradient updates.",
+    )
+    parser.add_argument(
+        "--num-bits",
+        type=int,
+        default=10,
+        help="Number of bits for BitFlipping environment",
+    )
+    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--eval-n-episodes", type=int, default=10)
+    parser.add_argument("--eval-interval", type=int, default=500)
+    parser.add_argument(
+        "--render", action="store_true", help="Render env states in a GUI window."
+    )
+    args = parser.parse_args()
+    import logging
+    logging.basicConfig(level=args.log_level)
+    # Set a random seed used in PFRL.
+    utils.set_random_seed(args.seed)
+    args.outdir = experiments.prepare_output_dir(args, args.outdir)
+    print("Output files are saved in {}".format(args.outdir))
+    def make_env(test):
+        env = gym.make(args.env)
+        # Unwrap TimeLimit wrapper
+        assert isinstance(env, gym.wrappers.TimeLimit)
+        env = env.env
+        # Use different random seeds for train and test envs
+        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
+        env.seed(env_seed)
+        # Render only the training env, and only when --render was given
+        if args.render and not test:
+            env = pfrl.wrappers.Render(env)
+        env = ComputeSuccessRate(env)
+        return env
+    env = make_env(test=False)
+    # TimeLimit was unwrapped above, so the episode length limit is passed
+    # explicitly to the evaluation/training helpers below.
+    timestep_limit = env.spec.max_episode_steps
+    obs_space = env.observation_space
+    action_space = env.action_space
+    print("Observation space:", obs_space)
+    print("Action space:", action_space)
+    assert isinstance(obs_space, gym.spaces.Dict)
+    # The networks consume the observation concatenated with the desired goal
+    # (see phi below), hence the summed size.
+    obs_size = obs_space["observation"].low.size + obs_space["desired_goal"].low.size
+    action_size = action_space.low.size
+    def reward_fn(dg, ag):
+        # Note the swapped order: the env expects (achieved_goal, desired_goal).
+        return env.compute_reward(ag, dg, None)
+    q_func = nn.Sequential(
+        ConcatObsAndAction(),
+        nn.Linear(obs_size + action_size, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 1),
+    )
+    policy = nn.Sequential(
+        nn.Linear(obs_size, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, action_size),
+        BoundByTanh(low=action_space.low, high=action_space.high),
+        DeterministicHead(),
+    )
+    def init_xavier_uniform(layer):
+        if isinstance(layer, nn.Linear):
+            nn.init.xavier_uniform_(layer.weight)
+            nn.init.zeros_(layer.bias)
+    with torch.no_grad():
+        q_func.apply(init_xavier_uniform)
+        policy.apply(init_xavier_uniform)
+    opt_a = torch.optim.Adam(policy.parameters())
+    opt_c = torch.optim.Adam(q_func.parameters())
+    rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
+        reward_fn=reward_fn,
+        replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+        capacity=10 ** 6,
+    )
+    explorer = EpsilonGreedyWithGaussianNoise(
+        epsilon=0.3,
+        random_action_func=lambda: env.action_space.sample(),
+        noise_scale=0.2,
+    )
+    # Normalize observations based on their empirical mean and variance
+    obs_normalizer = pfrl.nn.EmpiricalNormalization(obs_size, clip_threshold=5)
+    def phi(observation):
+        # Feature extractor
+        obs = np.asarray(observation["observation"], dtype=np.float32)
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32)
+        return np.concatenate((obs, dg)).clip(-200, 200)
+    # 1 eopch = 10 episodes = 500 steps
+    gamma = 1.0 - 1.0 / timestep_limit
+    agent = pfrl.agents.DDPG(
+        policy,
+        q_func,
+        opt_a,
+        opt_c,
+        rbuf,
+        phi=phi,
+        gamma=gamma,
+        explorer=explorer,
+        replay_start_size=500,
+        target_update_method="soft",
+        target_update_interval=50,
+        update_interval=50,
+        soft_update_tau=5e-2,
+        n_times_update=40,
+        gpu=args.gpu,
+        minibatch_size=256,
+        clip_return_range=(-1.0 / (1.0 - gamma), 0.0),
+        action_l2_penalty_coef=1.0,
+        obs_normalizer=obs_normalizer,
+    )
+    if args.load:
+        agent.load(args.load)
+    eval_env = make_env(test=True)
+    if args.demo:
+        # The parser defines --eval-n-episodes only (there is no
+        # --eval-n-steps), so the demo is sized in episodes and n_steps
+        # stays None.
+        eval_stats = experiments.eval_performance(
+            env=eval_env,
+            agent=agent,
+            n_steps=None,
+            n_episodes=args.eval_n_episodes,
+            max_episode_len=timestep_limit,
+        )
+        print(
+            "n_episodes: {} mean: {} median: {} stdev {}".format(
+                eval_stats["episodes"],
+                eval_stats["mean"],
+                eval_stats["median"],
+                eval_stats["stdev"],
+            )
+        )
+    else:
+        experiments.train_agent_with_evaluation(
+            agent=agent,
+            env=env,
+            steps=args.steps,
+            eval_n_steps=None,
+            eval_n_episodes=args.eval_n_episodes,
+            eval_interval=args.eval_interval,
+            outdir=args.outdir,
+            save_best_so_far_agent=True,
+            eval_env=eval_env,
+            train_max_episode_len=timestep_limit,
+        )
+# Script entry point: run the example only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/pfrl/agents/ b/pfrl/agents/
index 9d2d15589..e007f8489 100644
--- a/pfrl/agents/
+++ b/pfrl/agents/
@@ -80,13 +80,19 @@ def __init__(
+        clip_return_range=None,
+        action_l2_penalty_coef=None,
+        obs_normalizer=None,
         self.model = nn.ModuleList([policy, q_func])
+        self.obs_normalizer = obs_normalizer
         if gpu is not None and gpu >= 0:
             assert torch.cuda.is_available()
             self.device = torch.device("cuda:{}".format(gpu))
+            if self.obs_normalizer is not None:
             self.device = torch.device("cpu")
@@ -119,6 +125,8 @@ def __init__(
         self.batch_states = batch_states
         self.burnin_action_func = burnin_action_func
+        self.clip_return_range = clip_return_range
+        self.action_l2_penalty_coef = action_l2_penalty_coef
         self.t = 0
         self.last_state = None
@@ -163,6 +171,8 @@ def compute_critic_loss(self, batch):
             target_q = batch_rewards + self.gamma * (
                 1.0 - batch_terminal
             ) * next_q.reshape((batchsize,))
+            if self.clip_return_range is not None:
+                target_q = target_q.clamp(*self.clip_return_range)
         predict_q = self.q_function((batch_state, batch_actions)).reshape((batchsize,))
@@ -181,6 +191,9 @@ def compute_actor_loss(self, batch):
         q = self.q_function((batch_state, onpolicy_actions))
         loss = -q.mean()
+        if self.action_l2_penalty_coef is not None:
+            loss += self.action_l2_penalty_coef * (onpolicy_actions ** 2).mean()
         # Update stats
@@ -192,6 +205,10 @@ def update(self, experiences, errors_out=None):
         batch = batch_experiences(experiences, self.device, self.phi, self.gamma)
+        if self.obs_normalizer:
+            batch["state"] = self.obs_normalizer(batch["state"], update=False)
+            batch["next_state"] = self.obs_normalizer(batch["next_state"], update=False)
@@ -258,6 +275,8 @@ def batch_observe(self, batch_obs, batch_reward, batch_done, batch_reset):
     def _batch_select_greedy_actions(self, batch_obs):
         with torch.no_grad(), evaluating(self.policy):
             batch_xs = self.batch_states(batch_obs, self.device, self.phi)
+            if self.obs_normalizer:
+                batch_xs = self.obs_normalizer(batch_xs, update=False)
             batch_action = self.policy(batch_xs).sample()
             return batch_action.cpu().numpy()
@@ -300,6 +319,12 @@ def _batch_observe_train(self, batch_obs, batch_reward, batch_done, batch_reset)
+                if self.obs_normalizer is not None:
+                    self.obs_normalizer.experience(
+                        self.batch_states(
+                            [self.batch_last_obs[i]], self.device, self.phi
+                        )
+                    )
                 if batch_reset[i] or batch_done[i]:
                     self.batch_last_obs[i] = None
                     self.batch_last_action[i] = None

From d61d1dc90f584531b74f7b2c3ca9b302f943c309 Mon Sep 17 00:00:00 2001
From: muupan <>
Date: Tue, 10 Nov 2020 00:55:58 +0900
Subject: [PATCH 25/34] Start updates earlier to match performance of baselines

 examples/her/ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/her/ b/examples/her/
index b1078a96b..0f9599ae2 100644
--- a/examples/her/
+++ b/examples/her/
@@ -253,7 +253,7 @@ def phi(observation):
-        replay_start_size=500,
+        replay_start_size=256,

From 18177a4a3c373ed309f361cb738c00aba7b04cca Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Wed, 11 Nov 2020 00:28:11 +0900
Subject: [PATCH 26/34] Adds Fetch DDPG to readme

--- | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ b/
index daa070db4..448f7d645 100644
--- a/
+++ b/
@@ -84,7 +84,7 @@ Following useful techniques have been also implemented in PFRL:
 - [Prioritized Experience Replay](
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/
 - [Hindsight Experience Replay](
-  - examples: [[Bit-flip DQN]](examples/her/
+  - examples: [[Bit-flip DQN]](examples/her/ [[DDPG on Fetch Envs]](examples/her/
 - [Dueling Network](
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/
 - [Normalized Advantage Function](

From 383585f9a1dfa257785e29cf29f9e985c286d6be Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Wed, 11 Nov 2020 00:31:39 +0900
Subject: [PATCH 27/34] Updates descriptions for args in bit flip

 examples/her/ | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/her/ b/examples/her/
index a430a1bef..9775d5e42 100644
--- a/examples/her/
+++ b/examples/her/
@@ -126,15 +126,16 @@ def main():
         default=5 * 10 ** 2,
-        help="Minimum replay buffer size before " + "performing gradient updates.",
+        help="Minimum replay buffer size before performing gradient updates.",
-        help="Number of bits for BitFlipping environment",
+        help="Number of bits for BitFlipping environment.",
-    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--no-hindsight", action="store_true", default=False,
+                        help="Do not use Hindsight Replay.")
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)

From 88380f0ea3ed42de68849d2fcd16561483729610 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Wed, 11 Nov 2020 00:33:15 +0900
Subject: [PATCH 28/34] Updates docs in DDPG Fetch example

 examples/her/ | 32 +++++++++++++++++++---------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/examples/her/ b/examples/her/
index 0f9599ae2..8cc85924a 100644
--- a/examples/her/
+++ b/examples/her/
@@ -13,6 +13,14 @@
 class ComputeSuccessRate(gym.Wrapper):
+    """Environment wrapper that computes success rate.
+    Args:
+        env: Env to wrap
+    Attributes:
+        success_record: list of successes
+    """
     def __init__(self, env):
         self.success_record = []
@@ -65,6 +73,7 @@ class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
     """Epsilon-Greedy with Gaussian noise.
     This type of explorer was used in 
     def __init__(self, epsilon, random_action_func, noise_scale, low=None, high=None):
@@ -133,15 +142,14 @@ def main():
         default=5 * 10 ** 2,
-        help="Minimum replay buffer size before " + "performing gradient updates.",
+        help="Minimum replay buffer size before performing gradient updates.",
-    parser.add_argument(
-        "--num-bits",
-        type=int,
-        default=10,
-        help="Number of bits for BitFlipping environment",
-    )
-    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--replay-strategy",
+                        default="future",
+                        choices=["future", "final"],
+                        help="The replay strategy to use",)
+    parser.add_argument("--no-hindsight", action="store_true", default=False,
+                        help="Do not use Hindsight Replay")
     parser.add_argument("--eval-n-episodes", type=int, default=10)
     parser.add_argument("--eval-interval", type=int, default=500)
@@ -221,9 +229,13 @@ def init_xavier_uniform(layer):
     opt_a = torch.optim.Adam(policy.parameters())
     opt_c = torch.optim.Adam(q_func.parameters())
+    if args.replay_strategy == "future":
+        replay_strategy = replay_buffers.hindsight.ReplayFutureGoal()
+    else:
+        replay_strategy = replay_buffers.hindsight.ReplayFinalGoal()
     rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
-        replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+        replay_strategy=replay_strategy,
         capacity=10 ** 6,
@@ -242,7 +254,7 @@ def phi(observation):
         dg = np.asarray(observation["desired_goal"], dtype=np.float32)
         return np.concatenate((obs, dg)).clip(-200, 200)
-    # 1 eopch = 10 episodes = 500 steps
+    # 1 epoch = 10 episodes = 500 steps
     gamma = 1.0 - 1.0 / timestep_limit
     agent = pfrl.agents.DDPG(

From 453b04bdbd36589a781bdf9b76ae22ceceb4638c Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Wed, 11 Nov 2020 00:38:20 +0900
Subject: [PATCH 29/34] Minor cleanup of hindsight replay strategies

 pfrl/replay_buffers/ | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pfrl/replay_buffers/ b/pfrl/replay_buffers/
index 92b8f1eb2..284e96985 100644
--- a/pfrl/replay_buffers/
+++ b/pfrl/replay_buffers/
@@ -19,10 +19,7 @@ def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_li
 class HindsightReplayStrategy:
-    """ReplayStrategy for Hindsight experience replay"""
-    def __init__(self, reward_fn):
-        self.reward_fn = reward_fn
+    """ReplayStrategy for Hindsight experience replay."""
     def apply(self, episodes, reward_fn, swap_keys_list):
         return episodes
@@ -59,7 +56,6 @@ class ReplayFutureGoal(HindsightReplayStrategy):
     """Replay random future goal.
-        ignore_null_goals (bool): no replace with goal when nothing achieved
         future_k (int): number of future goals to sample per true sample

From 0a2efc612e6fe83c860ecc0732b55a58a63f0f06 Mon Sep 17 00:00:00 2001
From: Prabhat <>
Date: Thu, 12 Nov 2020 20:41:26 +0900
Subject: [PATCH 30/34] Adds bit flip to examples tests

 examples/her/      |  2 +-
 examples_tests/her/ | 12 ++++++++++++
+import collections
+import copy
+import os
+import tempfile
+import unittest
+import numpy as np
+import pytest
+import torch
+from pfrl import replay_buffer, replay_buffers
+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("num_steps", [1, 3])
+class TestHindsightReplayBuffer:
+    """Append/sample/terminate/persist checks for an n-step replay buffer.
+    NOTE(review): despite the class name, every test below instantiates
+    ``replay_buffers.ReplayBuffer``, not ``HindsightReplayBuffer`` - confirm
+    whether that is intended.
+    """
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, num_steps):
+        # Expose the parametrized values to the test methods.
+        self.capacity = capacity
+        self.num_steps = num_steps
+    def test_append_and_sample(self):
+        """Sampled items equal the n-step windows that were appended."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        assert s1[0] == list(correct_item)
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        assert len(s2) == 2
+        # Sampling order is not fixed, so identify the items by their last state.
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[1] == list(correct_item)
+            assert s2[0] == list(correct_item2)
+    def test_append_and_terminate(self):
+        """A terminal transition also stores the shorter trailing windows."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add one and sample one
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        # Add a terminal transition; the buffer is then expected to hold
+        # num_steps + 1 items
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        rbuf.append(**trans2)
+        assert len(rbuf) == self.num_steps + 1
+        s2 = rbuf.sample(self.num_steps + 1)
+        assert len(s2) == self.num_steps + 1
+        if self.num_steps == 1:
+            if s2[0][0]["state"] == 0:
+                assert s2[1][0]["state"] == 1
+            else:
+                assert s2[1][0]["state"] == 0
+        else:
+            for item in s2:
+                # e.g. if states are 0,0,0,1 then buffer looks like:
+                # [[0,0,0], [0, 0, 1], [0, 1], [1]]
+                if len(item) < self.num_steps:
+                    assert item[len(item) - 1]["state"] == 1
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+                else:
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+    def test_stop_current_episode(self):
+        """Stopping an episode early stores the partial windows gathered so far."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add one and sample one
+        for _ in range(num_steps - 1):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        # we haven't experienced n transitions yet
+        assert len(rbuf) == 0
+        # episode ends
+        rbuf.stop_current_episode()
+        # episode ends, so we should add n-1 transitions
+        assert len(rbuf) == self.num_steps - 1
+    def test_save_and_load(self):
+        """A buffer written with save() is restored unchanged by load()."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        correct_item = collections.deque([], maxlen=num_steps)
+        # Add two transitions
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        # Now it has two transitions
+        assert len(rbuf) == 2
+        # The temporary directory is deleted again when the block exits.
+        with tempfile.TemporaryDirectory() as tempdir:
+            # Save
+            filename = os.path.join(tempdir, "rbuf.pkl")
+            rbuf.save(filename)
+            # Initialize rbuf
+            rbuf = replay_buffers.ReplayBuffer(capacity)
+            # Of course it has no transition yet
+            assert len(rbuf) == 0
+            # Load the previously saved buffer
+            rbuf.load(filename)
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+@pytest.mark.parametrize("capacity", [100, None])
+class TestEpisodicReplayBuffer:
+    """Append/sample/persist checks for ``replay_buffers.EpisodicReplayBuffer``."""
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity):
+        # Expose the parametrized capacity to the test methods.
+        self.capacity = capacity
+    def test_append_and_sample(self):
+        """Transitions and whole episodes can be sampled after appending."""
+        capacity = self.capacity
+        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+        # Nine episodes of 10, 15 and 5 steps (three times): 90 transitions.
+        for n in [10, 15, 5] * 3:
+            transs = [
+                dict(
+                    state=i,
+                    action=100 + i,
+                    reward=200 + i,
+                    next_state=i + 1,
+                    next_action=101 + i,
+                    is_state_terminal=(i == n - 1),
+                )
+                for i in range(n)
+            ]
+            for trans in transs:
+                rbuf.append(**trans)
+        assert len(rbuf) == 90
+        assert rbuf.n_episodes == 9
+        for k in [10, 30, 90]:
+            s = rbuf.sample(k)
+            assert len(s) == k
+        for k in [1, 3, 9]:
+            s = rbuf.sample_episodes(k)
+            assert len(s) == k
+            s = rbuf.sample_episodes(k, max_len=10)
+            for ep in s:
+                assert len(ep) <= 10
+                # Consecutive transitions of a sampled episode must chain up.
+                for t0, t1 in zip(ep, ep[1:]):
+                    assert t0["next_state"] == t1["state"]
+                    assert t0["next_action"] == t1["action"]
+    def test_save_and_load(self):
+        """Episodes written with save() are restored unchanged by load()."""
+        capacity = self.capacity
+        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+        transs = [
+            dict(
+                state=n,
+                action=n + 10,
+                reward=n + 20,
+                next_state=n + 1,
+                next_action=n + 11,
+                is_state_terminal=False,
+            )
+            for n in range(5)
+        ]
+        # Add two episodes
+        rbuf.append(**transs[0])
+        rbuf.append(**transs[1])
+        rbuf.stop_current_episode()
+        rbuf.append(**transs[2])
+        rbuf.append(**transs[3])
+        rbuf.append(**transs[4])
+        rbuf.stop_current_episode()
+        assert len(rbuf) == 5
+        assert rbuf.n_episodes == 2
+        # The temporary directory is deleted again when the block exits.
+        with tempfile.TemporaryDirectory() as tempdir:
+            # Save
+            filename = os.path.join(tempdir, "rbuf.pkl")
+            rbuf.save(filename)
+            # Initialize rbuf
+            rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+            # Of course it has no transition yet
+            assert len(rbuf) == 0
+            # Load the previously saved buffer
+            rbuf.load(filename)
+        # Sampled transitions are exactly what I added!
+        s5 = rbuf.sample(5)
+        assert len(s5) == 5
+        for t in s5:
+            assert len(t) == 1
+            n = t[0]["state"]
+            assert n in range(5)
+            assert t[0] == transs[n]
+        # And sampled episodes are exactly what I added!
+        s2e = rbuf.sample_episodes(2)
+        assert len(s2e) == 2
+        if s2e[0][0]["state"] == 0:
+            assert s2e[0] == [transs[0], transs[1]]
+            assert s2e[1] == [transs[2], transs[3], transs[4]]
+        else:
+            assert s2e[0] == [transs[2], transs[3], transs[4]]
+            assert s2e[1] == [transs[0], transs[1]]
+        # Sizes are correct!
+        assert len(rbuf) == 5
+        assert rbuf.n_episodes == 2
+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
+class TestPrioritizedReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, normalize_by_max):
+        self.capacity = capacity
+        self.normalize_by_max = normalize_by_max
+        self.num_steps = 1
+    def test_append_and_sample(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.PrioritizedReplayBuffer(
+            capacity,
+            normalize_by_max=self.normalize_by_max,
+            error_max=5,
+            num_steps=num_steps,
+        )
+        assert len(rbuf) == 0
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        rbuf.update_errors([3.14])
+        assert len(s1) == 1
+        np.testing.assert_allclose(s1[0][0]["weight"], 1.0)
+        del s1[0][0]["weight"]
+        assert s1[0] == list(correct_item)
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        rbuf.update_errors([3.14, 2.71])
+        assert len(s2) == 2
+        del s2[0][0]["weight"]
+        del s2[1][0]["weight"]
+        if s2[0][num_steps - 1]["state"] == 1:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+        else:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        # Weights should be different for different TD-errors
+        s3 = rbuf.sample(2)
+        assert not np.allclose(s3[0][0]["weight"], s3[1][0]["weight"])
+        # Weights should be equal for different but clipped TD-errors
+        rbuf.update_errors([5, 10])
+        s3 = rbuf.sample(2)
+        np.testing.assert_allclose(s3[0][0]["weight"], s3[1][0]["weight"])
+        # Weights should be equal for the same TD-errors
+        rbuf.update_errors([3.14, 3.14])
+        s4 = rbuf.sample(2)
+        np.testing.assert_allclose(s4[0][0]["weight"], s4[1][0]["weight"])
+    def test_normalize_by_max(self):
+        """Check how the largest weight in a minibatch is normalized."""
+        rbuf = replay_buffers.PrioritizedReplayBuffer(
+            self.capacity,
+            normalize_by_max=self.normalize_by_max,
+            # NOTE(review): presumably large enough that the errors 0..99
+            # assigned below are never clipped -- confirm against the buffer
+            error_max=1000,
+            num_steps=self.num_steps,
+        )
+        # Add 100 transitions
+        for i in range(100):
+            trans = dict(
+                state=i,
+                action=1,
+                reward=2,
+                next_state=i + 1,
+                next_action=1,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans)
+        assert len(rbuf) == 100
+        def set_errors_based_on_state(rbuf, samples):
+            # Use the value of 'state' as an error, so that state 0 will have
+            # the smallest error, thus the largest weight
+            errors = [s[0]["state"] for s in samples]
+            rbuf.update_errors(errors)
+        # Assign different errors to all the transitions first
+        samples = rbuf.sample(100)
+        set_errors_based_on_state(rbuf, samples)
+        # Repeatedly check how weights are normalized
+        for i in range(100):
+            # Minibatch sizes grow from 1 up to 100
+            samples = rbuf.sample(i + 1)
+            # All the weights must be unique
+            assert len(set(s[0]["weight"] for s in samples)) == len(samples)
+            # Now check the maximum weight in a minibatch
+            max_w = max([s[0]["weight"] for s in samples])
+            if self.normalize_by_max == "batch":
+                # Maximum weight in a minibatch must be 1
+                np.testing.assert_allclose(max_w, 1)
+            elif self.normalize_by_max == "memory":
+                # Maximum weight in a minibatch must be less than 1 unless
+                # the minibatch contains the transition of least error.
+                if any(s[0]["state"] == 0 for s in samples):
+                    np.testing.assert_allclose(max_w, 1)
+                else:
+                    assert max_w < 1
+            # Report errors for this minibatch before sampling again
+            set_errors_based_on_state(rbuf, samples)
+    def test_capacity(self):
+        """A bounded buffer must not grow beyond its capacity."""
+        limit = self.capacity
+        if limit is None:
+            # Nothing to check for an unbounded buffer
+            return
+        def terminal_transition(state):
+            # Every transition in this test differs only in its state
+            return {
+                "state": state,
+                "action": 1,
+                "reward": 2,
+                "next_state": 3,
+                "next_action": 4,
+                "is_state_terminal": True,
+            }
+        buf = replay_buffers.PrioritizedReplayBuffer(limit)
+        # Insert exactly as many transitions as the buffer can hold
+        n_added = 0
+        while n_added < limit:
+            buf.append(**terminal_transition(0))
+            n_added += 1
+        assert len(buf) == limit
+        # One more transition goes in, but the length stays at the limit
+        buf.append(**terminal_transition(1))
+        assert len(buf) == limit
+    def test_save_and_load(self):
+        """Round-trip a buffer through save()/load() and compare its contents."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity, num_steps=num_steps)
+        # Add two transitions
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        # Now it has two transitions
+        assert len(rbuf) == 2
+        # The directory is removed on exit so the test leaves no files behind
+        with tempfile.TemporaryDirectory() as tempdir:
+            filename = os.path.join(tempdir, "rbuf.pkl")
+            # Save; without this the load() below has no file to read
+            rbuf.save(filename)
+            # Initialize rbuf
+            rbuf = replay_buffers.PrioritizedReplayBuffer(
+                capacity, num_steps=num_steps
+            )
+            # Of course it has no transition yet
+            assert len(rbuf) == 0
+            # Load the previously saved buffer
+            rbuf.load(filename)
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        # Sampled items carry an extra "weight" entry that was never appended
+        del s2[0][0]["weight"]
+        del s2[1][0]["weight"]
+        # Sampling order is not fixed, so accept either ordering of the two items
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+def exp_return_of_episode(episode):
+    """Return the sum of exp(reward) over all transitions of an episode."""
+    total = 0
+    for transition in episode:
+        total = total + np.exp(transition["reward"])
+    return total
+@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
+@pytest.mark.parametrize(
+    "wait_priority_after_sampling,default_priority_func",
+    [(True, None), (True, exp_return_of_episode), (False, exp_return_of_episode)],
+)
+@pytest.mark.parametrize("uniform_ratio", [0, 0.1, 1.0])
+@pytest.mark.parametrize("return_sample_weights", [True, False])
+class TestPrioritizedEpisodicReplayBuffer:
+    """Append/sample tests for PrioritizedEpisodicReplayBuffer over its options."""
+    # NOTE(review): the (False, None) combination is not parametrized; presumably
+    # a default priority is needed when update_errors is never called -- confirm
+    @pytest.fixture(autouse=True)
+    def setUp(
+        self,
+        normalize_by_max,
+        wait_priority_after_sampling,
+        default_priority_func,
+        uniform_ratio,
+        return_sample_weights,
+    ):
+        """Store the parametrized options on the test instance."""
+        self.capacity = 100
+        self.normalize_by_max = normalize_by_max
+        self.wait_priority_after_sampling = wait_priority_after_sampling
+        self.default_priority_func = default_priority_func
+        self.uniform_ratio = uniform_ratio
+        self.return_sample_weights = return_sample_weights
+    def test_append_and_sample(self):
+        """Append nine episodes, then sample transitions and whole episodes."""
+        rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(
+            capacity=self.capacity,
+            normalize_by_max=self.normalize_by_max,
+            default_priority_func=self.default_priority_func,
+            uniform_ratio=self.uniform_ratio,
+            wait_priority_after_sampling=self.wait_priority_after_sampling,
+            return_sample_weights=self.return_sample_weights,
+        )
+        # Episodes of length 10, 15 and 5, three times each: 9 episodes, 90 steps
+        for n in [10, 15, 5] * 3:
+            transs = [
+                dict(
+                    state=i,
+                    action=100 + i,
+                    reward=200 + i,
+                    next_state=i + 1,
+                    next_action=101 + i,
+                    # Only the last transition of each episode is terminal
+                    is_state_terminal=(i == n - 1),
+                )
+                for i in range(n)
+            ]
+            for trans in transs:
+                rbuf.append(**trans)
+        assert len(rbuf) == 90
+        assert rbuf.n_episodes == 9
+        # Transition-level sampling
+        for k in [10, 30, 90]:
+            s = rbuf.sample(k)
+            assert len(s) == k
+        # Episode-level sampling, first unbounded and then with max_len
+        for k in [1, 3, 9]:
+            ret = rbuf.sample_episodes(k)
+            if self.return_sample_weights:
+                s, wt = ret
+                assert len(s) == k
+                assert len(wt) == k
+            else:
+                s = ret
+                assert len(s) == k
+            if self.wait_priority_after_sampling:
+                rbuf.update_errors([1.0] * k)
+            ret = rbuf.sample_episodes(k, max_len=10)
+            if self.return_sample_weights:
+                s, wt = ret
+                assert len(s) == k
+                assert len(wt) == k
+            else:
+                s = ret
+                # Same length check as every other sampling branch
+                assert len(s) == k
+            if self.wait_priority_after_sampling:
+                rbuf.update_errors([1.0] * k)
+            for ep in s:
+                assert len(ep) <= 10
+                # A truncated episode must still be a contiguous run of steps
+                for t0, t1 in zip(ep, ep[1:]):
+                    assert t0["next_state"] == t1["state"]
+                    assert t0["next_action"] == t1["action"]
+@pytest.mark.parametrize(
+    "replay_buffer_type", ["ReplayBuffer", "PrioritizedReplayBuffer"]
+)
+class TestReplayBufferWithEnvID:
+    """Check n-step buffers when transitions from several env_ids interleave."""
+    @pytest.fixture(autouse=True)
+    def setUp(self, replay_buffer_type):
+        """Store the parametrized buffer class name on the test instance."""
+        self.replay_buffer_type = replay_buffer_type
+    def test(self):
+        """Append to three env_ids and check the buffer length at each stage."""
+        n = 5
+        if self.replay_buffer_type == "ReplayBuffer":
+            rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=n)
+        elif self.replay_buffer_type == "PrioritizedReplayBuffer":
+            rbuf = replay_buffers.PrioritizedReplayBuffer(capacity=None, num_steps=n)
+        else:
+            # Raised explicitly so the check survives `python -O`
+            raise AssertionError(
+                "unexpected replay_buffer_type: {}".format(self.replay_buffer_type)
+            )
+        def transition(terminal=False):
+            # All transitions in this test differ only in their terminal flag
+            return dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=terminal,
+            )
+        # 2 transitions for env_id=0
+        for _ in range(2):
+            rbuf.append(env_id=0, **transition())
+        # 4 transitions for env_id=1 with a terminal state
+        for i in range(4):
+            rbuf.append(env_id=1, **transition(terminal=(i == 3)))
+        # 9 transitions for env_id=2
+        for _ in range(9):
+            rbuf.append(env_id=2, **transition())
+        # It should have:
+        #   - 4 transitions from env_id=1
+        #   - 5 transitions from env_id=2
+        assert len(rbuf) == 9
+        # env_id=0 episode ends
+        rbuf.stop_current_episode(env_id=0)
+        # Now it should have 9 + 2 = 11 transitions
+        assert len(rbuf) == 11
+        # env_id=2 episode ends
+        rbuf.stop_current_episode(env_id=2)
+        # Finally it should have 9 + 2 + 4 = 15 transitions
+        assert len(rbuf) == 15
+@pytest.mark.parametrize(
+    "replay_buffer_type", ["EpisodicReplayBuffer", "PrioritizedEpisodicReplayBuffer"]
+)
+class TestEpisodicReplayBufferWithEnvID:
+    """Check episodic buffers when transitions from several env_ids interleave."""
+    @pytest.fixture(autouse=True)
+    def setUp(self, replay_buffer_type):
+        """Store the parametrized buffer class name on the test instance."""
+        self.replay_buffer_type = replay_buffer_type
+    def test(self):
+        """Append to three env_ids and check the buffer length at each stage."""
+        if self.replay_buffer_type == "EpisodicReplayBuffer":
+            rbuf = replay_buffers.EpisodicReplayBuffer(capacity=None)
+        elif self.replay_buffer_type == "PrioritizedEpisodicReplayBuffer":
+            rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(capacity=None)
+        else:
+            # Raised explicitly so the check survives `python -O`
+            raise AssertionError(
+                "unexpected replay_buffer_type: {}".format(self.replay_buffer_type)
+            )
+        def transition(terminal=False):
+            # All transitions in this test differ only in their terminal flag
+            return dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=terminal,
+            )
+        # 2 transitions for env_id=0
+        for _ in range(2):
+            rbuf.append(env_id=0, **transition())
+        # 4 transitions for env_id=1 with a terminal state
+        for i in range(4):
+            rbuf.append(env_id=1, **transition(terminal=(i == 3)))
+        # 9 transitions for env_id=2
+        for _ in range(9):
+            rbuf.append(env_id=2, **transition())
+        # It should have 4 transitions from env_id=1
+        assert len(rbuf) == 4
+        # env_id=0 episode ends
+        rbuf.stop_current_episode(env_id=0)
+        # Now it should have 4 + 2 = 6 transitions
+        assert len(rbuf) == 6
+        # env_id=2 episode ends
+        rbuf.stop_current_episode(env_id=2)
+        # Finally it should have 4 + 2 + 9 = 15 transitions
+        assert len(rbuf) == 15
+class TestReplayBufferFail(unittest.TestCase):
+    """Misordered sample()/update_errors() calls must raise AssertionError."""
+    def setUp(self):
+        # A buffer holding a single terminal transition
+        self.rbuf = replay_buffers.PrioritizedReplayBuffer(100)
+        self.trans1 = {
+            "state": 0,
+            "action": 1,
+            "reward": 2,
+            "next_state": 3,
+            "next_action": 4,
+            "is_state_terminal": True,
+        }
+        self.rbuf.append(**self.trans1)
+    def _sample1(self):
+        # Draw one item and discard it
+        self.rbuf.sample(1)
+    def _set1(self):
+        # Report one error value
+        self.rbuf.update_errors([1.0])
+    def test_fail_noupdate(self):
+        # Sampling twice with no update in between
+        self._sample1()
+        with self.assertRaises(AssertionError):
+            self._sample1()
+    def test_fail_update_first(self):
+        # Updating before anything has been sampled
+        with self.assertRaises(AssertionError):
+            self._set1()
+    def test_fail_doubleupdate(self):
+        # Updating twice after a single sample
+        self._sample1()
+        self._set1()
+        with self.assertRaises(AssertionError):
+            self._set1()
+class TestBatchExperiences(unittest.TestCase):
+    """Checks batch_experiences on experiences of 3, 1 and 4 steps."""
+    def test_batch_experiences(self):
+        def step(next_state, terminal=False):
+            # Only next_state and the terminal flag vary between steps here
+            return {
+                "state": 1,
+                "action": 1,
+                "reward": 1,
+                "next_state": next_state,
+                "next_action": 1,
+                "is_state_terminal": terminal,
+            }
+        three_step_transition = [step(i) for i in range(3)]
+        one_step_transition = [step(1)]
+        # The first three entries are one and the same dict object repeated
+        four_step_transition = [step(1)] * 3 + [step(5, terminal=True)]
+        experiences = [
+            three_step_transition,
+            one_step_transition,
+            four_step_transition,
+        ]
+        batch = replay_buffer.batch_experiences(
+            experiences, torch.device("cpu"), lambda x: x, 0.99
+        )
+        self.assertEqual(batch["state"][0], 1)
+        # Only the four-step experience ends in a terminal state
+        expected_terminal = np.asarray([0.0, 0.0, 1.0], dtype=np.float32)
+        self.assertSequenceEqual(
+            list(batch["is_state_terminal"]), list(expected_terminal)
+        )
+        # The discount is 0.99 raised to the number of steps in each experience
+        expected_discount = np.asarray(
+            [0.99 ** 3, 0.99 ** 1, 0.99 ** 4], dtype=np.float32
+        )
+        self.assertSequenceEqual(list(batch["discount"]), list(expected_discount))
+        # next_state comes from the last step of each experience
+        expected_next_state = np.asarray([2, 1, 5])
+        self.assertSequenceEqual(
+            list(batch["next_state"]), list(expected_next_state)
+        )

+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("num_steps", [1, 3])
+class TestHindsightReplayBuffer:
+    """Append, sample, terminate and save/load tests over capacity and num_steps."""
+    # NOTE(review): despite the class name, every test below constructs
+    # replay_buffers.ReplayBuffer -- confirm whether a hindsight buffer was intended
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, num_steps):
+        """Store the parametrized options on the test instance."""
+        self.capacity = capacity
+        self.num_steps = num_steps
+    def test_append_and_sample(self):
+        """Sampled n-step items must equal the transitions that were appended."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        assert s1[0] == list(correct_item)
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        assert len(s2) == 2
+        # Sampling order is not fixed, so accept either ordering of the two items
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[1] == list(correct_item)
+            assert s2[0] == list(correct_item2)
+    def test_append_and_terminate(self):
+        """A terminal transition must also store the shorter trailing sequences."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add one and sample one
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        # Add a terminal transition and sample everything stored so far
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        rbuf.append(**trans2)
+        assert len(rbuf) == self.num_steps + 1
+        s2 = rbuf.sample(self.num_steps + 1)
+        assert len(s2) == self.num_steps + 1
+        if self.num_steps == 1:
+            # The two single-step items must be the state-0 and state-1 ones
+            if s2[0][0]["state"] == 0:
+                assert s2[1][0]["state"] == 1
+            else:
+                assert s2[1][0]["state"] == 0
+        else:
+            for item in s2:
+                # e.g. if states are 0,0,0,1 then buffer looks like:
+                # [[0,0,0], [0, 0, 1], [0, 1], [1]]
+                if len(item) < self.num_steps:
+                    assert item[len(item) - 1]["state"] == 1
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+                else:
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+    def test_stop_current_episode(self):
+        """Stopping an episode must store its pending shorter-than-n sequences."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        assert len(rbuf) == 0
+        # Add num_steps - 1 transitions, one short of a full n-step sequence
+        for _ in range(num_steps - 1):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        # we haven't experienced n transitions yet
+        assert len(rbuf) == 0
+        # episode ends
+        rbuf.stop_current_episode()
+        # episode ends, so we should add n-1 transitions
+        assert len(rbuf) == self.num_steps - 1
+    def test_save_and_load(self):
+        """Round-trip a buffer through save()/load() and compare its contents."""
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+        correct_item = collections.deque([], maxlen=num_steps)
+        # Add two transitions
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        # Now it has two transitions
+        assert len(rbuf) == 2
+        # The directory is removed on exit so the test leaves no files behind
+        with tempfile.TemporaryDirectory() as tempdir:
+            filename = os.path.join(tempdir, "rbuf.pkl")
+            # Save; without this the load() below has no file to read
+            rbuf.save(filename)
+            # Initialize rbuf with the same settings as the saved one
+            rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+            # Of course it has no transition yet
+            assert len(rbuf) == 0
+            # Load the previously saved buffer
+            rbuf.load(filename)
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        # Sampling order is not fixed, so accept either ordering of the two items
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
 @pytest.mark.parametrize("capacity", [100, None])
 @pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
 class TestPrioritizedReplayBuffer: