From 52cec56f93cd1c1a4a1be5aa7c07d6fb09cfcb63 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sat, 11 Jul 2020 01:24:38 +0900
Subject: [PATCH 01/34] Adds basic code for bit-flip DQN and HER

---
 examples/her/train_dqn_bit_flip.py | 215 +++++++++++++++++++++++++++++
 pfrl/replay_buffers/hindsight.py   | 181 ++++++++++++++++++++++++
 2 files changed, 396 insertions(+)
 create mode 100644 examples/her/train_dqn_bit_flip.py
 create mode 100644 pfrl/replay_buffers/hindsight.py

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
new file mode 100644
index 000000000..2a8b702ff
--- /dev/null
+++ b/examples/her/train_dqn_bit_flip.py
@@ -0,0 +1,215 @@
+import argparse
+
+import gym
+import gym.spaces as spaces
+import torch.nn as nn
+import numpy as np
+
+import pfrl
+from pfrl.q_functions import DiscreteActionValueHead
+from pfrl import agents
+from pfrl import experiments
+from pfrl import explorers
+from pfrl import utils
+from pfrl import replay_buffers
+
+from pfrl.initializers import init_chainer_default
+
+
+class BitFlip(gym.GoalEnv):
+    """BitFlip environment from https://arxiv.org/pdf/1707.01495.pdf
+
+    Args:
+        n: State space is {0,1}^n
+    """
+
+    def __init__(self, n):
+        self.n = n
+        self.steps = 0
+        self.action_space = spaces.Discrete(n)
+        self.observation_space = spaces.Dict(dict(
+            desired_goal=spaces.MultiBinary(n),
+            achieved_goal=spaces.MultiBinary(n),
+            observation=spaces.MultiBinary(n),
+        ))
+
+    def compute_reward(self, achieved_goal, desired_goal, info):
+        return -1.0 if (achieved_goal != desired_goal).any() else 0.0
+
+    def step(self, action):
+        self.observation["observation"][action] = \
+            int(not self.observation["observation"][action])
+        reward = self.compute_reward(self.observation["achieved_goal"],
+                                     self.observation["desired_goal"], {})
+        done = (self.observation["desired_goal"] == \
+            self.observation["achieved_goal"]).all()
+        self.steps += 1
+        if self.steps == self.n:
+            done = True
+        return self.observation, reward, done, {}
+
+    def reset(self):
+        state = self.observation_space['desired_goal'].sample()
+        goal = self.observation_space['desired_goal'].sample()
+        while (state == goal).all():
+            goal = self.observation_space['desired_goal'].sample()
+        self.observation = dict()
+        self.observation["desired_goal"] = goal
+        self.observation["achieved_goal"] = state
+        self.observation["observation"] = state
+        self.steps = 0
+        return self.observation
+
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        default="results",
+        help=(
+            "Directory path to save output files."
+            " If it does not exist, it will be created."
+        ),
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
+    parser.add_argument(
+        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
+    )
+    parser.add_argument("--demo", action="store_true", default=False)
+    parser.add_argument("--load", type=str, default=None)
+    parser.add_argument(
+        "--log-level",
+        type=int,
+        default=20,
+        help="Logging level. 10:DEBUG, 20:INFO etc.",
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        default=5 * 10 ** 7,
+        help="Total number of timesteps to train the agent.",
+    )
+    parser.add_argument(
+        "--replay-start-size",
+        type=int,
+        default=5 * 10 ** 4,
+        help="Minimum replay buffer size before " + "performing gradient updates.",
+    )
+    parser.add_argument(
+        "--num-bits",
+        type=int,
+        default=10,
+        help="Number of bits for BitFlipping environment",
+    )
+    parser.add_argument("--eval-n-steps", type=int, default=125000)
+    parser.add_argument("--eval-interval", type=int, default=250000)
+    parser.add_argument("--n-best-episodes", type=int, default=30)
+    args = parser.parse_args()
+
+    import logging
+
+    logging.basicConfig(level=args.log_level)
+
+    # Set a random seed used in PFRL.
+    utils.set_random_seed(args.seed)
+
+    # Set different random seeds for train and test envs.
+    train_seed = args.seed
+    test_seed = 2 ** 31 - 1 - args.seed
+
+    args.outdir = experiments.prepare_output_dir(args, args.outdir)
+    print("Output files are saved in {}".format(args.outdir))
+
+    def make_env(test):
+        # Use different random seeds for train and test envs
+        env_seed = test_seed if test else train_seed
+        env = BitFlip(args.num_bits)
+        env.seed(int(env_seed))
+        return env
+
+    env = make_env(test=False)
+    eval_env = make_env(test=True)
+
+    n_actions = env.action_space.n
+    q_func = nn.Sequential(
+        init_chainer_default(nn.Linear(args.num_bits * 2, 256)),
+        nn.ReLU(),
+        init_chainer_default(nn.Linear(256, n_actions)),
+        DiscreteActionValueHead(),
+    )
+
+    # Use the same hyperparameters as the Nature paper
+    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
+        q_func.parameters(),
+        lr=2.5e-4,
+        alpha=0.95,
+        momentum=0.0,
+        eps=1e-2,
+        centered=True,
+    )
+
+    rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+
+    explorer = explorers.LinearDecayEpsilonGreedy(
+        start_epsilon=1.0,
+        end_epsilon=0.1,
+        decay_steps=10 ** 6,
+        random_action_func=lambda: np.random.randint(n_actions),
+    )
+
+    def phi(observation):
+        # Feature extractor
+        obs = np.asarray(observation["observation"], dtype=np.float32) / 255
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32) / 255
+        return np.concatenate((obs, dg))
+
+    Agent = agents.DQN
+    agent = Agent(
+        q_func,
+        opt,
+        rbuf,
+        gpu=args.gpu,
+        gamma=0.99,
+        explorer=explorer,
+        replay_start_size=args.replay_start_size,
+        target_update_interval=10 ** 4,
+        clip_delta=True,
+        update_interval=4,
+        batch_accumulator="sum",
+        phi=phi,
+    )
+
+    if args.load:
+        agent.load(args.load)
+
+
+    if args.demo:
+        eval_stats = experiments.eval_performance(
+            env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
+        )
+        print(
+            "n_episodes: {} mean: {} median: {} stdev {}".format(
+                eval_stats["episodes"],
+                eval_stats["mean"],
+                eval_stats["median"],
+                eval_stats["stdev"],
+            )
+        )
+    else:
+        experiments.train_agent_with_evaluation(
+            agent=agent,
+            env=env,
+            steps=args.steps,
+            eval_n_steps=args.eval_n_steps,
+            eval_n_episodes=None,
+            eval_interval=args.eval_interval,
+            outdir=args.outdir,
+            save_best_so_far_agent=True,
+            eval_env=eval_env,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
new file mode 100644
index 000000000..820bc8801
--- /dev/null
+++ b/pfrl/replay_buffers/hindsight.py
@@ -0,0 +1,181 @@
+import copy
+
+import numpy as np
+
+from pfrl.replay_buffer import EpisodicReplayBuffer
+from pfrl.replay_buffer import random_subseq
+
+
+def relabel_transition_goal(self, transition, goal_transition,
+                            reward_fn, swap_keys_list):
+    # Relabel/replace the desired goal for the transition with new_goal
+    for desired_obs_key, achieved_obs_key in swap_keys_list:
+        replacement = goal_transition["next_state"][achieved_obs_key]
+        transition["state"][desired_obs_key] = replacement
+        transition["next_state"][desired_obs_key] = replacement
+    new_goal = goal_transition["next_state"]["achieved_goal"]
+    achieved_goal = transition["next_state"]["achieved_goal"]
+    transition["reward"] = reward_fn(new_goal, achieved_goal)
+    return transition
+
+
+class HindsightReplayStrategy():
+    """ReplayStrategy for Hindsight experience replay
+    """
+
+    def __init__(self, reward_fn):
+        self.reward_fn = reward_fn
+
+    def apply(self, episodes):
+        return episodes
+
+class ReplayFinalGoal(HindsightReplayStrategy):
+    """Replay final goal.
+    """
+
+    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+        self.ignore_null_goals = ignore_null_goals
+        self.is_null_goal = is_null_goal  
+
+    def apply(self, episodes, reward_fn):
+        batch_size = len(episodes)
+        episode_lens = np.array([len(episode) for episode in episodes])
+
+        # Randomly select time-steps from each episode
+        ts = [np.random.randint(ep_len) for ep_len in episode_lens]
+        ts = np.array(ts)
+
+        # Select subset for hindsight goal replacement.
+        apply_hers = np.random.uniform(size=batch_size) < 0.5
+
+        batch = []
+        for episode, apply_her, t in zip(episodes, apply_hers, ts):
+            transition = episode[t]
+            if apply_her:
+                final_transition = episode[-1]
+                final_goal = final_transition["next_state"]["achieved_goal"]
+                if not (self.ignore_null_goals and
+                        self.is_null_goal(final_goal)):
+                    transition = copy.deepcopy(transition)
+                    transition = relabel_transition_goal(
+                        transition, final_transition, reward_fn, swap_keys_list)
+            batch.append([transition])
+        return batch
+
+class ReplayFutureGoal(HindsightReplayStrategy):
+    """Replay random future goal.
+
+        Args:
+            ignore_null_goals (bool): no replace with goal when nothing achieved
+            future_k (int): number of future goals to sample per true sample
+            swap_list (list): a list of tuples of keys to swap in the
+                observation. E.g. [(("desired_x", "achieved_x"))] This is used
+                to replace a transition's "desired_x" with a goal transition's
+                "achieved_x"
+    """
+
+    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+        self.ignore_null_goals = ignore_null_goals
+        self.is_null_goal = is_null_goal
+
+    def apply(self, episodes, reward_fn):
+        """Sample with the future strategy
+        """
+        batch_size = len(episodes)
+        episode_lens = np.array([len(episode) for episode in episodes])
+
+        # Randomly select time-steps from each episode
+        ts = [np.random.randint(ep_len) for ep_len in episode_lens]
+        ts = np.array(ts)
+
+        # Select subset for hindsight goal replacement. future_k controls ratio
+        apply_hers = np.random.uniform(size=batch_size) < self.future_prob
+
+        # Randomly select offsets for future goals
+        future_offset = np.random.uniform(
+            size=batch_size) * (episode_lens - ts)
+        future_offset = future_offset.astype(int)
+        future_ts = ts + future_offset
+        batch = []
+        for episode, apply_her, t, future_t in zip(episodes,
+                                                   apply_hers,
+                                                   ts, future_ts):
+            transition = episode[t]
+            if apply_her:
+                future_transition = episode[future_t]
+                future_goal = future_transition["next_state"]["achieved_goal"]
+                if not (self.ignore_null_goals and
+                        self.is_null_goal(future_goal)):
+                    transition = copy.deepcopy(transition)
+                    transition = relabel_transition_goal(
+                        transition, future_transition, reward_fn, swap_keys_list)
+            batch.append([transition])
+        return batch
+
+class HindsightReplayBuffer(EpisodicReplayBuffer):
+    """Hindsight Replay Buffer
+
+     https://arxiv.org/abs/1707.01495
+     We currently do not support N-step transitions for the
+     Hindsight Buffer.
+     Args:
+        reward_fn(fn): Calculate reward from achieved & observed goals
+        replay_strategy: instance of HindsightReplayStrategy()
+        capacity (int): Capacity of the replay buffer
+        future_k (int): number of future goals to sample per true sample
+        swap_list (list): a list of tuples of keys to swap in the
+            observation. E.g. [(("desired_x", "achieved_x"))] This is used
+            to replace a transition's "desired_x" with a goal transition's
+            "achieved_x"
+    """
+
+    def __init__(self,
+                 reward_fn,
+                 replay_strategy,
+                 capacity=None,
+                 is_null_goal=None,
+                 future_k=0,
+                 swap_list=[('desired_goal', 'achieved_goal')]):
+
+        assert replay_strategy in ["future", "final", "none"]
+        if ignore_null_goals:
+            assert is_null_goal is not None, "is_null_goal to detect when no\
+                goal was reached is required when ignore_null_goals=True"
+        self.reward_fn = reward_fn
+        self.replay_strategy = replay_strategy
+        self.is_null_goal = is_null_goal
+        self.swap_keys_list = swap_list
+        assert ('desired_goal', 'achieved_goal') in self.swap_keys_list
+
+        super(HindsightReplayBuffer, self).__init__(capacity)
+        # probability of sampling a future goal instead of a true goal
+        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
+
+
+    def sample(self, n):
+        # Sample n transitions from the hindsight replay buffer
+        assert len(self.memory) >= n
+        # Select n episodes
+        episodes = self.sample_episodes(n)
+        batch = self.replay_strategy.apply(episodes,
+                                           self.reward_fn,
+                                           self.swap_keys_list)
+        if self.replay_strategy == "future":
+            batch = self._replay_future(episodes)
+        elif self.replay_strategy == "final":
+            batch = self._replay_final(episodes)
+        else:
+            raise NotImplementedError()
+
+        return batch
+
+    def sample_episodes(self, n_episodes, max_len=None):
+        episodes = self.sample_with_replacement(n_episodes)
+        if max_len is not None:
+            return [random_subseq(ep, max_len) for ep in episodes]
+        else:
+            return episodes
+
+    def sample_with_replacement(self, k):
+        return [self.episodic_memory[i] for i in
+                np.random.randint(0, len(self.episodic_memory), k)]
\ No newline at end of file
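The heart of the new buffer is relabel_transition_goal: it overwrites a transition's desired goal with a goal that was actually achieved later in the episode and recomputes the sparse reward under that substituted goal. The following is a minimal, self-contained sketch of that idea on plain dict transitions; the helper names and toy values are illustrative, not part of the patch.

import numpy as np

def reward_fn(dg, ag):
    # Sparse bit-flip reward: 0 when the achieved goal matches the desired goal, else -1.
    return -1.0 if (ag != dg).any() else 0.0

def relabel(transition, goal_transition):
    # Replace the desired goal in both state dicts with the goal actually
    # achieved in goal_transition, then recompute the reward under that goal.
    new_goal = goal_transition["next_state"]["achieved_goal"]
    for key in ("state", "next_state"):
        transition[key]["desired_goal"] = new_goal
    transition["reward"] = reward_fn(new_goal, transition["next_state"]["achieved_goal"])
    return transition

# A failed 3-bit transition: the agent flipped bit 0, but the original goal was 111.
t = {
    "state": {
        "observation": np.array([0, 0, 0]),
        "achieved_goal": np.array([0, 0, 0]),
        "desired_goal": np.array([1, 1, 1]),
    },
    "next_state": {
        "observation": np.array([1, 0, 0]),
        "achieved_goal": np.array([1, 0, 0]),
        "desired_goal": np.array([1, 1, 1]),
    },
    "reward": -1.0,
}
# Relabeling it with its own achieved goal turns the failure into a success.
print(relabel(t, t)["reward"])  # 0.0

A transition that failed under its original goal becomes a successful example under the relabeled goal, which is what gives hindsight replay a learning signal in sparse-reward tasks.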

From 6859259c35d86c0ee9234aad9056a7dc3abcf797 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sat, 11 Jul 2020 22:13:51 +0900
Subject: [PATCH 02/34] Adds hindsight to bit flip

---
 examples/her/train_dqn_bit_flip.py | 13 ++++++++++++-
 pfrl/replay_buffers/hindsight.py   | 24 ++++++------------------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 2a8b702ff..a3bd888f4 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -103,6 +103,7 @@ def main():
         default=10,
         help="Number of bits for BitFlipping environment",
     )
+    parser.add_argument("--use-hindsight", type=bool, default=True)
     parser.add_argument("--eval-n-steps", type=int, default=125000)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=30)
@@ -150,7 +151,17 @@ def make_env(test):
         centered=True,
     )
 
-    rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    def reward_fn(dg, ag):
+        return -1.0 if (ag != dg).any() else 0.0
+
+    if args.use_hindsight:
+        rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
+            reward_fn=reward_fn,
+            replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+            capacity=10 ** 6
+            )
+    else:
+        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
 
     explorer = explorers.LinearDecayEpsilonGreedy(
         start_epsilon=1.0,
diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index 820bc8801..4ff693e75 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -26,7 +26,7 @@ class HindsightReplayStrategy():
     def __init__(self, reward_fn):
         self.reward_fn = reward_fn
 
-    def apply(self, episodes):
+    def apply(self, episodes,  reward_fn, swap_keys_list):
         return episodes
 
 class ReplayFinalGoal(HindsightReplayStrategy):
@@ -37,7 +37,7 @@ def __init__(self, ignore_null_goals=True, is_null_goal=None):
         self.ignore_null_goals = ignore_null_goals
         self.is_null_goal = is_null_goal  
 
-    def apply(self, episodes, reward_fn):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
 
@@ -68,17 +68,14 @@ class ReplayFutureGoal(HindsightReplayStrategy):
         Args:
             ignore_null_goals (bool): no replace with goal when nothing achieved
             future_k (int): number of future goals to sample per true sample
-            swap_list (list): a list of tuples of keys to swap in the
-                observation. E.g. [(("desired_x", "achieved_x"))] This is used
-                to replace a transition's "desired_x" with a goal transition's
-                "achieved_x"
     """
 
-    def __init__(self, ignore_null_goals=True, is_null_goal=None):
+    def __init__(self, ignore_null_goals=True, is_null_goal=None, future_k=4):
         self.ignore_null_goals = ignore_null_goals
         self.is_null_goal = is_null_goal
+        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
 
-    def apply(self, episodes, reward_fn):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         """Sample with the future strategy
         """
         batch_size = len(episodes)
@@ -122,7 +119,6 @@ class HindsightReplayBuffer(EpisodicReplayBuffer):
         reward_fn(fn): Calculate reward from achieved & observed goals
         replay_strategy: instance of HindsightReplayStrategy()
         capacity (int): Capacity of the replay buffer
-        future_k (int): number of future goals to sample per true sample
         swap_list (list): a list of tuples of keys to swap in the
             observation. E.g. [(("desired_x", "achieved_x"))] This is used
             to replace a transition's "desired_x" with a goal transition's
@@ -137,7 +133,7 @@ def __init__(self,
                  future_k=0,
                  swap_list=[('desired_goal', 'achieved_goal')]):
 
-        assert replay_strategy in ["future", "final", "none"]
+        assert replay_strategy is not None
         if ignore_null_goals:
             assert is_null_goal is not None, "is_null_goal to detect when no\
                 goal was reached is required when ignore_null_goals=True"
@@ -149,7 +145,6 @@ def __init__(self,
 
         super(HindsightReplayBuffer, self).__init__(capacity)
         # probability of sampling a future goal instead of a true goal
-        self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
 
 
     def sample(self, n):
@@ -160,13 +155,6 @@ def sample(self, n):
         batch = self.replay_strategy.apply(episodes,
                                            self.reward_fn,
                                            self.swap_keys_list)
-        if self.replay_strategy == "future":
-            batch = self._replay_future(episodes)
-        elif self.replay_strategy == "final":
-            batch = self._replay_final(episodes)
-        else:
-            raise NotImplementedError()
-
         return batch
 
     def sample_episodes(self, n_episodes, max_len=None):
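With the default future_k=4 introduced in this patch, future_prob = 1 - 1/(4 + 1) = 0.8, so ReplayFutureGoal relabels roughly 80% of the sampled transitions with a goal achieved later in the same episode and leaves the remaining 20% with their original goal.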

From 120cfa58eb25c1538de9ad268147cb5c03bb9d18 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sat, 11 Jul 2020 22:30:56 +0900
Subject: [PATCH 03/34] removes null_goals

---
 pfrl/replay_buffers/__init__.py  |  4 ++++
 pfrl/replay_buffers/hindsight.py | 31 +++++++++----------------------
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/pfrl/replay_buffers/__init__.py b/pfrl/replay_buffers/__init__.py
index 1c5b0ef2b..3319041f8 100644
--- a/pfrl/replay_buffers/__init__.py
+++ b/pfrl/replay_buffers/__init__.py
@@ -1,4 +1,6 @@
 from pfrl.replay_buffers.episodic import EpisodicReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentEpisodicReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentReplayBuffer  # NOQA
 from pfrl.replay_buffers.prioritized import PrioritizedReplayBuffer  # NOQA
@@ -7,3 +9,5 @@
     PrioritizedEpisodicReplayBuffer,
 )
 from pfrl.replay_buffers.replay_buffer import ReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index 4ff693e75..83cdad084 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -2,11 +2,11 @@
 
 import numpy as np
 
-from pfrl.replay_buffer import EpisodicReplayBuffer
+from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 from pfrl.replay_buffer import random_subseq
 
 
-def relabel_transition_goal(self, transition, goal_transition,
+def relabel_transition_goal(transition, goal_transition,
                             reward_fn, swap_keys_list):
     # Relabel/replace the desired goal for the transition with new_goal
     for desired_obs_key, achieved_obs_key in swap_keys_list:
@@ -33,10 +33,6 @@ class ReplayFinalGoal(HindsightReplayStrategy):
     """Replay final goal.
     """
 
-    def __init__(self, ignore_null_goals=True, is_null_goal=None):
-        self.ignore_null_goals = ignore_null_goals
-        self.is_null_goal = is_null_goal  
-
     def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
@@ -54,11 +50,9 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             if apply_her:
                 final_transition = episode[-1]
                 final_goal = final_transition["next_state"]["achieved_goal"]
-                if not (self.ignore_null_goals and
-                        self.is_null_goal(final_goal)):
-                    transition = copy.deepcopy(transition)
-                    transition = relabel_transition_goal(
-                        transition, final_transition, reward_fn, swap_keys_list)
+                transition = copy.deepcopy(transition)
+                transition = relabel_transition_goal(
+                    transition, final_transition, reward_fn, swap_keys_list)
             batch.append([transition])
         return batch
 
@@ -70,9 +64,7 @@ class ReplayFutureGoal(HindsightReplayStrategy):
             future_k (int): number of future goals to sample per true sample
     """
 
-    def __init__(self, ignore_null_goals=True, is_null_goal=None, future_k=4):
-        self.ignore_null_goals = ignore_null_goals
-        self.is_null_goal = is_null_goal
+    def __init__(self, future_k=4):
         self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
 
     def apply(self, episodes, reward_fn, swap_keys_list):
@@ -101,11 +93,9 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             if apply_her:
                 future_transition = episode[future_t]
                 future_goal = future_transition["next_state"]["achieved_goal"]
-                if not (self.ignore_null_goals and
-                        self.is_null_goal(future_goal)):
-                    transition = copy.deepcopy(transition)
-                    transition = relabel_transition_goal(
-                        transition, future_transition, reward_fn, swap_keys_list)
+                transition = copy.deepcopy(transition)
+                transition = relabel_transition_goal(
+                    transition, future_transition, reward_fn, swap_keys_list)
             batch.append([transition])
         return batch
 
@@ -134,9 +124,6 @@ def __init__(self,
                  swap_list=[('desired_goal', 'achieved_goal')]):
 
         assert replay_strategy is not None
-        if ignore_null_goals:
-            assert is_null_goal is not None, "is_null_goal to detect when no\
-                goal was reached is required when ignore_null_goals=True"
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
         self.is_null_goal = is_null_goal
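With the classes now exported from pfrl.replay_buffers and the strategy passed as an instance rather than a string, the intended construction mirrors the bit-flip example script. A short usage sketch, assuming a PFRL checkout that includes these patches:

from pfrl import replay_buffers

def reward_fn(dg, ag):
    # Same sparse goal-conditioned reward as in the bit-flip example.
    return -1.0 if (ag != dg).any() else 0.0

rbuf = replay_buffers.HindsightReplayBuffer(
    reward_fn=reward_fn,
    replay_strategy=replay_buffers.ReplayFutureGoal(),  # or replay_buffers.ReplayFinalGoal()
    capacity=10 ** 6,
)
# Transitions appended by the agent carry dict observations with
# "observation", "achieved_goal" and "desired_goal" keys; sample(n)
# draws n episodes and relabels a subset of them in hindsight.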

From 3360fafd7312db1ea80b17c746c4743f4fdeab0a Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sun, 12 Jul 2020 00:15:25 +0900
Subject: [PATCH 04/34] modifies total steps

---
 examples/her/train_dqn_bit_flip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index a3bd888f4..288af1e1b 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -88,7 +88,7 @@ def main():
     parser.add_argument(
         "--steps",
         type=int,
-        default=5 * 10 ** 7,
+        default=10 ** 7,
         help="Total number of timesteps to train the agent.",
     )
     parser.add_argument(

From 1e572055a889033aa6bd2241c7316ae5568005d9 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sun, 12 Jul 2020 00:44:35 +0900
Subject: [PATCH 05/34] Updates space sampling

---
 examples/her/train_dqn_bit_flip.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 288af1e1b..beb702d63 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -49,10 +49,11 @@ def step(self, action):
         return self.observation, reward, done, {}
 
     def reset(self):
-        state = self.observation_space['desired_goal'].sample()
-        goal = self.observation_space['desired_goal'].sample()
+        sample_obs = self.observation_space.sample()
+        state, goal = sample_obs['observation'], sample_obs['desired_goal']
         while (state == goal).all():
-            goal = self.observation_space['desired_goal'].sample()
+            sample_obs = self.observation_space.sample()
+            state, goal = sample_obs['observation'], sample_obs['desired_goal']
         self.observation = dict()
         self.observation["desired_goal"] = goal
         self.observation["achieved_goal"] = state
@@ -61,7 +62,6 @@ def reset(self):
         return self.observation
 
 
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(

From a60e1f5e982b523a85723d5add72b36610e3eb01 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Fri, 17 Jul 2020 17:43:34 +0900
Subject: [PATCH 06/34] Cleans hindsight buffer code

---
 pfrl/replay_buffers/hindsight.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index 83cdad084..72fbde3aa 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -119,14 +119,11 @@ def __init__(self,
                  reward_fn,
                  replay_strategy,
                  capacity=None,
-                 is_null_goal=None,
-                 future_k=0,
                  swap_list=[('desired_goal', 'achieved_goal')]):
 
         assert replay_strategy is not None
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
-        self.is_null_goal = is_null_goal
         self.swap_keys_list = swap_list
         assert ('desired_goal', 'achieved_goal') in self.swap_keys_list
 

From eece248a6be7458ce64d4ffafc6a010dbf669bb9 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sun, 19 Jul 2020 23:31:06 +0900
Subject: [PATCH 07/34] Modifies experiment params

---
 examples/her/train_dqn_bit_flip.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index beb702d63..304ce98ab 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -104,9 +104,9 @@ def main():
         help="Number of bits for BitFlipping environment",
     )
     parser.add_argument("--use-hindsight", type=bool, default=True)
-    parser.add_argument("--eval-n-steps", type=int, default=125000)
+    parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
-    parser.add_argument("--n-best-episodes", type=int, default=30)
+    parser.add_argument("--n-best-episodes", type=int, default=100)
     args = parser.parse_args()
 
     import logging
@@ -213,8 +213,8 @@ def phi(observation):
             agent=agent,
             env=env,
             steps=args.steps,
-            eval_n_steps=args.eval_n_steps,
-            eval_n_episodes=None,
+            eval_n_steps=None,
+            eval_n_episodes=args.eval_n_episodes,
             eval_interval=args.eval_interval,
             outdir=args.outdir,
             save_best_so_far_agent=True,

From e38e0d019fc649879a5f4bf4c39590716d589d3b Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 01:54:40 +0900
Subject: [PATCH 08/34] Applies black to pfrl

---
 pfrl/replay_buffers/hindsight.py | 87 ++++++++++++++++----------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index 72fbde3aa..f7750ce25 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -6,8 +6,7 @@
 from pfrl.replay_buffer import random_subseq
 
 
-def relabel_transition_goal(transition, goal_transition,
-                            reward_fn, swap_keys_list):
+def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_list):
     # Relabel/replace the desired goal for the transition with new_goal
     for desired_obs_key, achieved_obs_key in swap_keys_list:
         replacement = goal_transition["next_state"][achieved_obs_key]
@@ -19,19 +18,18 @@ def relabel_transition_goal(transition, goal_transition,
     return transition
 
 
-class HindsightReplayStrategy():
-    """ReplayStrategy for Hindsight experience replay
-    """
+class HindsightReplayStrategy:
+    """ReplayStrategy for Hindsight experience replay"""
 
     def __init__(self, reward_fn):
         self.reward_fn = reward_fn
 
-    def apply(self, episodes,  reward_fn, swap_keys_list):
+    def apply(self, episodes, reward_fn, swap_keys_list):
         return episodes
 
+
 class ReplayFinalGoal(HindsightReplayStrategy):
-    """Replay final goal.
-    """
+    """Replay final goal."""
 
     def apply(self, episodes, reward_fn, swap_keys_list):
         batch_size = len(episodes)
@@ -52,24 +50,25 @@ def apply(self, episodes, reward_fn, swap_keys_list):
                 final_goal = final_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
-                    transition, final_transition, reward_fn, swap_keys_list)
+                    transition, final_transition, reward_fn, swap_keys_list
+                )
             batch.append([transition])
         return batch
 
+
 class ReplayFutureGoal(HindsightReplayStrategy):
     """Replay random future goal.
 
-        Args:
-            ignore_null_goals (bool): no replace with goal when nothing achieved
-            future_k (int): number of future goals to sample per true sample
+    Args:
+        ignore_null_goals (bool): no replace with goal when nothing achieved
+        future_k (int): number of future goals to sample per true sample
     """
 
     def __init__(self, future_k=4):
         self.future_prob = 1.0 - 1.0 / (float(future_k) + 1)
 
     def apply(self, episodes, reward_fn, swap_keys_list):
-        """Sample with the future strategy
-        """
+        """Sample with the future strategy"""
         batch_size = len(episodes)
         episode_lens = np.array([len(episode) for episode in episodes])
 
@@ -81,64 +80,64 @@ def apply(self, episodes, reward_fn, swap_keys_list):
         apply_hers = np.random.uniform(size=batch_size) < self.future_prob
 
         # Randomly select offsets for future goals
-        future_offset = np.random.uniform(
-            size=batch_size) * (episode_lens - ts)
+        future_offset = np.random.uniform(size=batch_size) * (episode_lens - ts)
         future_offset = future_offset.astype(int)
         future_ts = ts + future_offset
         batch = []
-        for episode, apply_her, t, future_t in zip(episodes,
-                                                   apply_hers,
-                                                   ts, future_ts):
+        for episode, apply_her, t, future_t in zip(episodes, apply_hers, ts, future_ts):
             transition = episode[t]
             if apply_her:
                 future_transition = episode[future_t]
                 future_goal = future_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
-                    transition, future_transition, reward_fn, swap_keys_list)
+                    transition, future_transition, reward_fn, swap_keys_list
+                )
             batch.append([transition])
         return batch
 
+
 class HindsightReplayBuffer(EpisodicReplayBuffer):
     """Hindsight Replay Buffer
 
-     https://arxiv.org/abs/1707.01495
-     We currently do not support N-step transitions for the
-     Hindsight Buffer.
-     Args:
-        reward_fn(fn): Calculate reward from achieved & observed goals
-        replay_strategy: instance of HindsightReplayStrategy()
-        capacity (int): Capacity of the replay buffer
-        swap_list (list): a list of tuples of keys to swap in the
-            observation. E.g. [(("desired_x", "achieved_x"))] This is used
-            to replace a transition's "desired_x" with a goal transition's
-            "achieved_x"
+    https://arxiv.org/abs/1707.01495
+    We currently do not support N-step transitions for the
+    Hindsight Buffer.
+    Args:
+       reward_fn(fn): Calculate reward from achieved & observed goals
+       replay_strategy: instance of HindsightReplayStrategy()
+       capacity (int): Capacity of the replay buffer
+       swap_list (list): a list of tuples of keys to swap in the
+           observation. E.g. [(("desired_x", "achieved_x"))] This is used
+           to replace a transition's "desired_x" with a goal transition's
+           "achieved_x"
     """
 
-    def __init__(self,
-                 reward_fn,
-                 replay_strategy,
-                 capacity=None,
-                 swap_list=[('desired_goal', 'achieved_goal')]):
+    def __init__(
+        self,
+        reward_fn,
+        replay_strategy,
+        capacity=None,
+        swap_list=[("desired_goal", "achieved_goal")],
+    ):
 
         assert replay_strategy is not None
         self.reward_fn = reward_fn
         self.replay_strategy = replay_strategy
         self.swap_keys_list = swap_list
-        assert ('desired_goal', 'achieved_goal') in self.swap_keys_list
+        assert ("desired_goal", "achieved_goal") in self.swap_keys_list
 
         super(HindsightReplayBuffer, self).__init__(capacity)
         # probability of sampling a future goal instead of a true goal
 
-
     def sample(self, n):
         # Sample n transitions from the hindsight replay buffer
         assert len(self.memory) >= n
         # Select n episodes
         episodes = self.sample_episodes(n)
-        batch = self.replay_strategy.apply(episodes,
-                                           self.reward_fn,
-                                           self.swap_keys_list)
+        batch = self.replay_strategy.apply(
+            episodes, self.reward_fn, self.swap_keys_list
+        )
         return batch
 
     def sample_episodes(self, n_episodes, max_len=None):
@@ -149,5 +148,7 @@ def sample_episodes(self, n_episodes, max_len=None):
             return episodes
 
     def sample_with_replacement(self, k):
-        return [self.episodic_memory[i] for i in
-                np.random.randint(0, len(self.episodic_memory), k)]
\ No newline at end of file
+        return [
+            self.episodic_memory[i]
+            for i in np.random.randint(0, len(self.episodic_memory), k)
+        ]

From d89c788634aa64e7e412a78c9fbb01f64ca7481e Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 02:26:14 +0900
Subject: [PATCH 09/34] Updates docstring

---
 pfrl/replay_buffers/hindsight.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index f7750ce25..dc70b861e 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -104,7 +104,7 @@ class HindsightReplayBuffer(EpisodicReplayBuffer):
     We currently do not support N-step transitions for the
     Hindsight Buffer.
     Args:
-       reward_fn(fn): Calculate reward from achieved & observed goals
+       reward_fn(fn): reward fn with input: (achieved_goal, desired_goal)
        replay_strategy: instance of HindsightReplayStrategy()
        capacity (int): Capacity of the replay buffer
        swap_list (list): a list of tuples of keys to swap in the

From 080916ff970f7a012615c711a41f3afdce8d4fe2 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 02:27:44 +0900
Subject: [PATCH 10/34] Implements step function and success rate calculation

---
 examples/her/train_dqn_bit_flip.py | 45 ++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 304ce98ab..9ef8f7be4 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -16,6 +16,10 @@
 from pfrl.initializers import init_chainer_default
 
 
+def reward_fn(dg, ag):
+    return -1.0 if (ag != dg).any() else 0.0
+
+
 class BitFlip(gym.GoalEnv):
     """BitFlip environment from https://arxiv.org/pdf/1707.01495.pdf
 
@@ -32,20 +36,33 @@ def __init__(self, n):
             achieved_goal=spaces.MultiBinary(n),
             observation=spaces.MultiBinary(n),
         ))
-
-    def compute_reward(self, achieved_goal, desired_goal, info):
-        return -1.0 if (achieved_goal != desired_goal).any() else 0.0
+        self.clear_statistics()
 
     def step(self, action):
-        self.observation["observation"][action] = \
-            int(not self.observation["observation"][action])
-        reward = self.compute_reward(self.observation["achieved_goal"],
-                                     self.observation["desired_goal"], {})
-        done = (self.observation["desired_goal"] == \
+        # Compute action outcome
+        bit_new = int(not self.observation["observation"][action])
+        new_obs = self.observation["observation"].copy()
+        new_obs[action] = bit_new
+        # Set new observation
+        dg = self.observation["desired_goal"]
+        self.observation["desired_goal"] = dg.copy()
+        self.observation["achieved_goal"] = new_obs
+        self.observation["observation"] = new_obs
+
+        reward = reward_fn(self.observation["desired_goal"],
+                           self.observation["achieved_goal"])
+        done_success = (self.observation["desired_goal"] == \
             self.observation["achieved_goal"]).all()
+        done = done_success
         self.steps += 1
         if self.steps == self.n:
             done = True
+        if done:
+            if done_success:
+                assert reward == 0
+                self.results.append(1)
+            else:
+                self.results.append(0)
         return self.observation, reward, done, {}
 
     def reset(self):
@@ -61,6 +78,15 @@ def reset(self):
         self.steps = 0
         return self.observation
 
+    def get_statistics(self):
+        failures =  self.results.count(0)
+        successes = self.results.count(1)
+        assert len(self.results) == failures + successes
+        success_rate = successes/float(self.results)
+        return [("success_rate", success_rate)]
+
+    def clear_statistics(self):
+        self.results = []
 
 def main():
     parser = argparse.ArgumentParser()
@@ -151,9 +177,6 @@ def make_env(test):
         centered=True,
     )
 
-    def reward_fn(dg, ag):
-        return -1.0 if (ag != dg).any() else 0.0
-
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
             reward_fn=reward_fn,
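To make the new step logic concrete with a hypothetical trace (values not from the patch): on a 3-bit instance with observation [0, 1, 0] and desired goal [1, 1, 0], action 0 flips the first bit, so both observation and achieved_goal become [1, 1, 0], compute_reward returns 0, the episode ends with done_success True, and a 1 is appended to self.results; a terminal state that never matches the goal records a 0 instead, and get_statistics aggregates these into a success_rate.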

From c363dc76f49f2b8ae8211153b66dc87499844efe Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 02:30:12 +0900
Subject: [PATCH 11/34] Updates agent, explorer, replay start size, and phi

---
 examples/her/train_dqn_bit_flip.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 9ef8f7be4..b4a25ff04 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -120,7 +120,7 @@ def main():
     parser.add_argument(
         "--replay-start-size",
         type=int,
-        default=5 * 10 ** 4,
+        default=5 * 10 ** 2,
         help="Minimum replay buffer size before " + "performing gradient updates.",
     )
     parser.add_argument(
@@ -188,18 +188,18 @@ def make_env(test):
 
     explorer = explorers.LinearDecayEpsilonGreedy(
         start_epsilon=1.0,
-        end_epsilon=0.1,
-        decay_steps=10 ** 6,
+        end_epsilon=0.0,
+        decay_steps=5 * 10 ** 3,
         random_action_func=lambda: np.random.randint(n_actions),
     )
 
     def phi(observation):
         # Feature extractor
-        obs = np.asarray(observation["observation"], dtype=np.float32) / 255
-        dg = np.asarray(observation["desired_goal"], dtype=np.float32) / 255
+        obs = np.asarray(observation["observation"], dtype=np.float32)
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32)
         return np.concatenate((obs, dg))
 
-    Agent = agents.DQN
+    Agent = agents.DoubleDQN
     agent = Agent(
         q_func,
         opt,

From 4e15a76efeffe5d30dc11864978e3fd57cb15657 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 02:34:22 +0900
Subject: [PATCH 12/34] Applies black

---
 examples/her/train_dqn_bit_flip.py | 36 +++++++++++++++++-------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index b4a25ff04..99c2f2daf 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -31,11 +31,13 @@ def __init__(self, n):
         self.n = n
         self.steps = 0
         self.action_space = spaces.Discrete(n)
-        self.observation_space = spaces.Dict(dict(
-            desired_goal=spaces.MultiBinary(n),
-            achieved_goal=spaces.MultiBinary(n),
-            observation=spaces.MultiBinary(n),
-        ))
+        self.observation_space = spaces.Dict(
+            dict(
+                desired_goal=spaces.MultiBinary(n),
+                achieved_goal=spaces.MultiBinary(n),
+                observation=spaces.MultiBinary(n),
+            )
+        )
         self.clear_statistics()
 
     def step(self, action):
@@ -49,10 +51,12 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
 
-        reward = reward_fn(self.observation["desired_goal"],
-                           self.observation["achieved_goal"])
-        done_success = (self.observation["desired_goal"] == \
-            self.observation["achieved_goal"]).all()
+        reward = reward_fn(
+            self.observation["desired_goal"], self.observation["achieved_goal"]
+        )
+        done_success = (
+            self.observation["desired_goal"] == self.observation["achieved_goal"]
+        ).all()
         done = done_success
         self.steps += 1
         if self.steps == self.n:
@@ -67,10 +71,10 @@ def step(self, action):
 
     def reset(self):
         sample_obs = self.observation_space.sample()
-        state, goal = sample_obs['observation'], sample_obs['desired_goal']
+        state, goal = sample_obs["observation"], sample_obs["desired_goal"]
         while (state == goal).all():
             sample_obs = self.observation_space.sample()
-            state, goal = sample_obs['observation'], sample_obs['desired_goal']
+            state, goal = sample_obs["observation"], sample_obs["desired_goal"]
         self.observation = dict()
         self.observation["desired_goal"] = goal
         self.observation["achieved_goal"] = state
@@ -79,15 +83,16 @@ def reset(self):
         return self.observation
 
     def get_statistics(self):
-        failures =  self.results.count(0)
+        failures = self.results.count(0)
         successes = self.results.count(1)
         assert len(self.results) == failures + successes
-        success_rate = successes/float(self.results)
+        success_rate = successes / float(self.results)
         return [("success_rate", success_rate)]
 
     def clear_statistics(self):
         self.results = []
 
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -181,8 +186,8 @@ def make_env(test):
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
             reward_fn=reward_fn,
             replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
-            capacity=10 ** 6
-            )
+            capacity=10 ** 6,
+        )
     else:
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
 
@@ -218,7 +223,6 @@ def phi(observation):
     if args.load:
         agent.load(args.load)
 
-
     if args.demo:
         eval_stats = experiments.eval_performance(
             env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None

From 71809a532b6d3a376c0bcbb7710551d40516698c Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 02:36:28 +0900
Subject: [PATCH 13/34] Updates optimizer, and target update interval

---
 examples/her/train_dqn_bit_flip.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 99c2f2daf..001e7a05b 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -172,15 +172,7 @@ def make_env(test):
         DiscreteActionValueHead(),
     )
 
-    # Use the same hyperparameters as the Nature paper
-    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
-        q_func.parameters(),
-        lr=2.5e-4,
-        alpha=0.95,
-        momentum=0.0,
-        eps=1e-2,
-        centered=True,
-    )
+    opt = torch.optim.Adam(q_func.parameters(), eps=1e-3)
 
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
@@ -192,7 +184,7 @@ def make_env(test):
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
 
     explorer = explorers.LinearDecayEpsilonGreedy(
-        start_epsilon=1.0,
+        start_epsilon=0.3,
         end_epsilon=0.0,
         decay_steps=5 * 10 ** 3,
         random_action_func=lambda: np.random.randint(n_actions),
@@ -213,7 +205,7 @@ def phi(observation):
         gamma=0.99,
         explorer=explorer,
         replay_start_size=args.replay_start_size,
-        target_update_interval=10 ** 4,
+        target_update_interval=10 ** 3,
         clip_delta=True,
         update_interval=4,
         batch_accumulator="sum",

From 8c616e542ad7e3a72f46a1099f476459c5245081 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 04:00:11 +0900
Subject: [PATCH 14/34] Fixes minor errors

---
 examples/her/train_dqn_bit_flip.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 001e7a05b..ea1278636 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -2,8 +2,9 @@
 
 import gym
 import gym.spaces as spaces
-import torch.nn as nn
 import numpy as np
+import torch
+import torch.nn as nn
 
 import pfrl
 from pfrl.q_functions import DiscreteActionValueHead
@@ -40,6 +41,9 @@ def __init__(self, n):
         )
         self.clear_statistics()
 
+    def compute_reward(self, achieved_goal, desired_goal, info):
+        return reward_fn(desired_goal, achieved_goal)
+
     def step(self, action):
         # Compute action outcome
         bit_new = int(not self.observation["observation"][action])
@@ -51,9 +55,9 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
 
-        reward = reward_fn(
-            self.observation["desired_goal"], self.observation["achieved_goal"]
-        )
+        reward = self.compute_reward(self.observation["achieved_goal"],
+                                     self.observation["desired_goal"],
+                                     {})
         done_success = (
             self.observation["desired_goal"] == self.observation["achieved_goal"]
         ).all()
@@ -86,7 +90,9 @@ def get_statistics(self):
         failures = self.results.count(0)
         successes = self.results.count(1)
         assert len(self.results) == failures + successes
-        success_rate = successes / float(self.results)
+        if not self.results:
+            return [("success_rate", None)]
+        success_rate = successes / float(len(self.results))
         return [("success_rate", success_rate)]
 
     def clear_statistics(self):

From 9721d1afdae9fed11585cd0a58a8cb4a425dc0db Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 14:10:39 +0900
Subject: [PATCH 15/34] Applies black

---
 examples/her/train_dqn_bit_flip.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index ea1278636..ea4bcc915 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -55,9 +55,9 @@ def step(self, action):
         self.observation["achieved_goal"] = new_obs
         self.observation["observation"] = new_obs
 
-        reward = self.compute_reward(self.observation["achieved_goal"],
-                                     self.observation["desired_goal"],
-                                     {})
+        reward = self.compute_reward(
+            self.observation["achieved_goal"], self.observation["desired_goal"], {}
+        )
         done_success = (
             self.observation["desired_goal"] == self.observation["achieved_goal"]
         ).all()

From 8643e0de6ae982dc55ab772d4e6a7cdde4f39eed Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 14:32:30 +0900
Subject: [PATCH 16/34] Addresses flakes

---
 examples/her/train_dqn_bit_flip.py | 1 -
 pfrl/replay_buffers/hindsight.py   | 2 --
 2 files changed, 3 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index ea4bcc915..7e2baa04a 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -6,7 +6,6 @@
 import torch
 import torch.nn as nn
 
-import pfrl
 from pfrl.q_functions import DiscreteActionValueHead
 from pfrl import agents
 from pfrl import experiments
diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index dc70b861e..fa24cdffd 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -47,7 +47,6 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             transition = episode[t]
             if apply_her:
                 final_transition = episode[-1]
-                final_goal = final_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
                     transition, final_transition, reward_fn, swap_keys_list
@@ -88,7 +87,6 @@ def apply(self, episodes, reward_fn, swap_keys_list):
             transition = episode[t]
             if apply_her:
                 future_transition = episode[future_t]
-                future_goal = future_transition["next_state"]["achieved_goal"]
                 transition = copy.deepcopy(transition)
                 transition = relabel_transition_goal(
                     transition, future_transition, reward_fn, swap_keys_list

From 4d34f1e629a9872d6cda4dd854951bed313e7519 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 29 Oct 2020 23:51:58 +0900
Subject: [PATCH 17/34] Cleans up code

---
 examples/her/train_dqn_bit_flip.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 7e2baa04a..868a67278 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -29,7 +29,6 @@ class BitFlip(gym.GoalEnv):
 
     def __init__(self, n):
         self.n = n
-        self.steps = 0
         self.action_space = spaces.Discrete(n)
         self.observation_space = spaces.Dict(
             dict(
@@ -43,6 +42,12 @@ def __init__(self, n):
     def compute_reward(self, achieved_goal, desired_goal, info):
         return reward_fn(desired_goal, achieved_goal)
 
+    def _check_done(self):
+        success = (
+            self.observation["desired_goal"] == self.observation["achieved_goal"]
+        ).all()
+        return (self.steps >= self.n) or success, success
+
     def step(self, action):
         # Compute action outcome
         bit_new = int(not self.observation["observation"][action])
@@ -57,19 +62,12 @@ def step(self, action):
         reward = self.compute_reward(
             self.observation["achieved_goal"], self.observation["desired_goal"], {}
         )
-        done_success = (
-            self.observation["desired_goal"] == self.observation["achieved_goal"]
-        ).all()
-        done = done_success
         self.steps += 1
-        if self.steps == self.n:
-            done = True
+        done, success = self._check_done()
+        assert success == (reward == 0)
         if done:
-            if done_success:
-                assert reward == 0
-                self.results.append(1)
-            else:
-                self.results.append(0)
+            result = 1 if success else 0
+            self.results.append(result)
         return self.observation, reward, done, {}
 
     def reset(self):

From f5a1bfa2f0116a017e3c001302f434d912c39c24 Mon Sep 17 00:00:00 2001
From: Prabhat Nagarajan <prabhat.nagarajan@gmail.com>
Date: Fri, 30 Oct 2020 04:15:43 +0900
Subject: [PATCH 18/34] Update examples/her/train_dqn_bit_flip.py

Co-authored-by: Justin DuJardin <justindujardin@users.noreply.github.com>
---
 examples/her/train_dqn_bit_flip.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 868a67278..76624cf7e 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -55,9 +55,11 @@ def step(self, action):
         new_obs[action] = bit_new
         # Set new observation
         dg = self.observation["desired_goal"]
-        self.observation["desired_goal"] = dg.copy()
-        self.observation["achieved_goal"] = new_obs
-        self.observation["observation"] = new_obs
+        self.observation = {
+            "desired_goal": dg.copy(),
+            "achieved_goal": new_obs,
+            "observation": new_obs,
+        }
 
         reward = self.compute_reward(
             self.observation["achieved_goal"], self.observation["desired_goal"], {}

From 3812384f06960e41ce7990d8f3f8de48467c27a0 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Fri, 30 Oct 2020 12:59:33 +0900
Subject: [PATCH 19/34] experiment and hyperparameter update

---
 examples/her/train_dqn_bit_flip.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 76624cf7e..5c056840e 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -124,7 +124,7 @@ def main():
     parser.add_argument(
         "--steps",
         type=int,
-        default=10 ** 7,
+        default=5 * 10 ** 6,
         help="Total number of timesteps to train the agent.",
     )
     parser.add_argument(
@@ -177,7 +177,7 @@ def make_env(test):
         DiscreteActionValueHead(),
     )
 
-    opt = torch.optim.Adam(q_func.parameters(), eps=1e-3)
+    opt = torch.optim.Adam(q_func.parameters(), eps=1e-4)
 
     if args.use_hindsight:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
@@ -188,10 +188,12 @@ def make_env(test):
     else:
         rbuf = replay_buffers.ReplayBuffer(10 ** 6)
 
+    decay_steps = ((args.num_bits + 5) * 10 ** 3)
+    end_epsilon = min(0.1, 0.5/args.num_bits)
     explorer = explorers.LinearDecayEpsilonGreedy(
-        start_epsilon=0.3,
-        end_epsilon=0.0,
-        decay_steps=5 * 10 ** 3,
+        start_epsilon=0.5,
+        end_epsilon=end_epsilon,
+        decay_steps=decay_steps,
         random_action_func=lambda: np.random.randint(n_actions),
     )
 

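The updated schedule ties exploration to the problem size: more bits means a longer epsilon decay and a smaller final epsilon. A quick sketch of the values the two formulas above produce for a few settings of `--num-bits` (plain arithmetic, not PFRL code):

```
for num_bits in (10, 20, 50):
    decay_steps = (num_bits + 5) * 10 ** 3
    end_epsilon = min(0.1, 0.5 / num_bits)
    print(num_bits, decay_steps, end_epsilon)
# 10 bits -> decay over 15000 steps down to epsilon 0.05
# 20 bits -> decay over 25000 steps down to epsilon 0.025
# 50 bits -> decay over 55000 steps down to epsilon 0.01
```
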
From 5cd21e08e74dc1e0f9b38caba06e34737a1076b4 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Fri, 30 Oct 2020 20:49:11 +0900
Subject: [PATCH 20/34] Switches parse args

---
 examples/her/train_dqn_bit_flip.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 5c056840e..d2d66ccfb 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -139,7 +139,7 @@ def main():
         default=10,
         help="Number of bits for BitFlipping environment",
     )
-    parser.add_argument("--use-hindsight", type=bool, default=True)
+    parser.add_argument("--no-hindsight", action='store_true', default=False)
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)
@@ -179,14 +179,14 @@ def make_env(test):
 
     opt = torch.optim.Adam(q_func.parameters(), eps=1e-4)
 
-    if args.use_hindsight:
+    if args.no_hindsight:
+        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
+    else:
         rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
             reward_fn=reward_fn,
             replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
             capacity=10 ** 6,
         )
-    else:
-        rbuf = replay_buffers.ReplayBuffer(10 ** 6)
 
     decay_steps = ((args.num_bits + 5) * 10 ** 3)
     end_epsilon = min(0.1, 0.5/args.num_bits)

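Besides flipping the default so that HER is on unless `--no-hindsight` is passed, moving away from `type=bool` also sidesteps a common argparse gotcha: any non-empty string, including "False", is truthy, so the old flag could not actually be disabled from the command line. A short sketch of the gotcha and its `store_true` replacement (a minimal parser, not the example's full argument list):

```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use-hindsight", type=bool, default=True)
args = parser.parse_args(["--use-hindsight", "False"])
print(args.use_hindsight)  # True: bool("False") is truthy, so it cannot be turned off

parser = argparse.ArgumentParser()
parser.add_argument("--no-hindsight", action="store_true", default=False)
print(parser.parse_args([]).no_hindsight)                  # False -> HER enabled by default
print(parser.parse_args(["--no-hindsight"]).no_hindsight)  # True  -> HER disabled
```
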
From 035ad635b9539629f2672e9192a1421e7255611c Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Sat, 31 Oct 2020 02:19:08 +0900
Subject: [PATCH 21/34] Applies black

---
 examples/her/train_dqn_bit_flip.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index d2d66ccfb..0354bb344 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -139,7 +139,7 @@ def main():
         default=10,
         help="Number of bits for BitFlipping environment",
     )
-    parser.add_argument("--no-hindsight", action='store_true', default=False)
+    parser.add_argument("--no-hindsight", action="store_true", default=False)
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)
@@ -188,8 +188,8 @@ def make_env(test):
             capacity=10 ** 6,
         )
 
-    decay_steps = ((args.num_bits + 5) * 10 ** 3)
-    end_epsilon = min(0.1, 0.5/args.num_bits)
+    decay_steps = (args.num_bits + 5) * 10 ** 3
+    end_epsilon = min(0.1, 0.5 / args.num_bits)
     explorer = explorers.LinearDecayEpsilonGreedy(
         start_epsilon=0.5,
         end_epsilon=end_epsilon,

From e481b857d5aae7c83a248727ec9b7673921ea204 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 5 Nov 2020 20:49:57 +0900
Subject: [PATCH 22/34] Adds HER to the Repo readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 08f75a8fe..daa070db4 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,8 @@ Following useful techniques have been also implemented in PFRL:
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
 - [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
+- [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495)
+  - examples: [[Bit-flip DQN]](examples/her/train_dqn_bit_flip.py)
 - [Dueling Network](https://arxiv.org/abs/1511.06581)
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
 - [Normalized Advantage Function](https://arxiv.org/abs/1603.00748)

From ed4ae2e01262668ce4ef698aa0796968696a74c8 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Fri, 6 Nov 2020 01:11:28 +0900
Subject: [PATCH 23/34] Applies isort

---
 examples/her/train_dqn_bit_flip.py | 9 ++-------
 pfrl/replay_buffers/__init__.py    | 6 +++---
 pfrl/replay_buffers/hindsight.py   | 2 +-
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 0354bb344..a430a1bef 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -6,14 +6,9 @@
 import torch
 import torch.nn as nn
 
-from pfrl.q_functions import DiscreteActionValueHead
-from pfrl import agents
-from pfrl import experiments
-from pfrl import explorers
-from pfrl import utils
-from pfrl import replay_buffers
-
+from pfrl import agents, experiments, explorers, replay_buffers, utils
 from pfrl.initializers import init_chainer_default
+from pfrl.q_functions import DiscreteActionValueHead
 
 
 def reward_fn(dg, ag):
diff --git a/pfrl/replay_buffers/__init__.py b/pfrl/replay_buffers/__init__.py
index 3319041f8..d33f1bfba 100644
--- a/pfrl/replay_buffers/__init__.py
+++ b/pfrl/replay_buffers/__init__.py
@@ -1,6 +1,8 @@
 from pfrl.replay_buffers.episodic import EpisodicReplayBuffer  # NOQA
-from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
 from pfrl.replay_buffers.hindsight import HindsightReplayBuffer  # NOQA
+from pfrl.replay_buffers.hindsight import HindsightReplayStrategy  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
+from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
 from pfrl.replay_buffers.persistent import PersistentEpisodicReplayBuffer  # NOQA
 from pfrl.replay_buffers.persistent import PersistentReplayBuffer  # NOQA
 from pfrl.replay_buffers.prioritized import PrioritizedReplayBuffer  # NOQA
@@ -9,5 +11,3 @@
     PrioritizedEpisodicReplayBuffer,
 )
 from pfrl.replay_buffers.replay_buffer import ReplayBuffer  # NOQA
-from pfrl.replay_buffers.hindsight import ReplayFinalGoal  # NOQA
-from pfrl.replay_buffers.hindsight import ReplayFutureGoal  # NOQA
diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index fa24cdffd..92b8f1eb2 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 
-from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 from pfrl.replay_buffer import random_subseq
+from pfrl.replay_buffers.episodic import EpisodicReplayBuffer
 
 
 def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_list):

From 9841438cbeb6d92b6c6226747d59430690bdb8ea Mon Sep 17 00:00:00 2001
From: muupan <muupan@gmail.com>
Date: Sat, 7 Nov 2020 00:34:31 +0900
Subject: [PATCH 24/34] Make DDPG HER work for FetchReach-v1

---
 examples/her/train_ddpg_her_fetch.py | 305 +++++++++++++++++++++++++++
 pfrl/agents/ddpg.py                  |  25 +++
 2 files changed, 330 insertions(+)
 create mode 100644 examples/her/train_ddpg_her_fetch.py

diff --git a/examples/her/train_ddpg_her_fetch.py b/examples/her/train_ddpg_her_fetch.py
new file mode 100644
index 000000000..b1078a96b
--- /dev/null
+++ b/examples/her/train_ddpg_her_fetch.py
@@ -0,0 +1,305 @@
+import argparse
+
+import gym
+import gym.spaces
+import numpy as np
+import torch
+import torch.nn as nn
+
+import pfrl
+from pfrl import experiments, replay_buffers, utils
+from pfrl.nn import BoundByTanh, ConcatObsAndAction
+from pfrl.policies import DeterministicHead
+
+
+class ComputeSuccessRate(gym.Wrapper):
+    def __init__(self, env):
+        super().__init__(env)
+        self.success_record = []
+
+    def reset(self):
+        self.success_record.append(None)
+        return self.env.reset()
+
+    def step(self, action):
+        obs, r, done, info = self.env.step(action)
+        assert "is_success" in info
+        self.success_record[-1] = info["is_success"]
+        return obs, r, done, info
+
+    def get_statistics(self):
+        # Ignore episodes with zero steps
+        valid_record = [x for x in self.success_record if x is not None]
+        success_rate = (
+            valid_record.count(True) / len(valid_record) if valid_record else np.nan
+        )
+        return [("success_rate", success_rate)]
+
+    def clear_statistics(self):
+        self.success_record = []
+
+
+class ClipObservation(gym.ObservationWrapper):
+    """Clip observations to a given range.
+
+    Args:
+        env: Env to wrap.
+        low: Lower limit.
+        high: Upper limit.
+
+    Attributes:
+        original_observation: Observation before casting.
+    """
+
+    def __init__(self, env, low, high):
+        super().__init__(env)
+        self.low = low
+        self.high = high
+
+    def observation(self, observation):
+        self.original_observation = observation
+        return np.clip(observation, self.low, self.high)
+
+
+class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
+    """Epsilon-Greedy with Gaussian noise.
+
+    This type of explorer was used in 
+    """
+
+    def __init__(self, epsilon, random_action_func, noise_scale, low=None, high=None):
+        self.epsilon = epsilon
+        self.random_action_func = random_action_func
+        self.noise_scale = noise_scale
+        self.low = low
+        self.high = high
+
+    def select_action(self, t, greedy_action_func, action_value=None):
+        if np.random.rand() < self.epsilon:
+            a = self.random_action_func()
+        else:
+            a = greedy_action_func()
+            noise = np.random.normal(scale=self.noise_scale, size=a.shape).astype(
+                np.float32
+            )
+            a = a + noise
+        if self.low is not None or self.high is not None:
+            return np.clip(a, self.low, self.high)
+        else:
+            return a
+
+    def __repr__(self):
+        return "EpsilonGreedyWithGaussianNoise(epsilon={}, noise_scale={}, low={}, high={})".format(
+            self.epsilon, self.noise_scale, self.low, self.high
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        default="results",
+        help=(
+            "Directory path to save output files."
+            " If it does not exist, it will be created."
+        ),
+    )
+    parser.add_argument(
+        "--env",
+        type=str,
+        default="FetchReach-v1",
+        help="OpenAI Gym MuJoCo env to perform algorithm on.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
+    parser.add_argument(
+        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
+    )
+    parser.add_argument("--demo", action="store_true", default=False)
+    parser.add_argument("--load", type=str, default=None)
+    parser.add_argument(
+        "--log-level",
+        type=int,
+        default=20,
+        help="Logging level. 10:DEBUG, 20:INFO etc.",
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        default=5 * 10 ** 3,
+        help="Total number of timesteps to train the agent.",
+    )
+    parser.add_argument(
+        "--replay-start-size",
+        type=int,
+        default=5 * 10 ** 2,
+        help="Minimum replay buffer size before " + "performing gradient updates.",
+    )
+    parser.add_argument(
+        "--num-bits",
+        type=int,
+        default=10,
+        help="Number of bits for BitFlipping environment",
+    )
+    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--eval-n-episodes", type=int, default=10)
+    parser.add_argument("--eval-interval", type=int, default=500)
+    parser.add_argument(
+        "--render", action="store_true", help="Render env states in a GUI window."
+    )
+    args = parser.parse_args()
+
+    import logging
+
+    logging.basicConfig(level=args.log_level)
+
+    # Set a random seed used in PFRL.
+    utils.set_random_seed(args.seed)
+
+    args.outdir = experiments.prepare_output_dir(args, args.outdir)
+    print("Output files are saved in {}".format(args.outdir))
+
+    def make_env(test):
+        env = gym.make(args.env)
+        # Unwrap TimeLimit wrapper
+        assert isinstance(env, gym.wrappers.TimeLimit)
+        env = env.env
+        # Use different random seeds for train and test envs
+        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
+        env.seed(env_seed)
+        # Cast observations to float32 because our model uses float32
+        if args.render and not test:
+            env = pfrl.wrappers.Render(env)
+        env = ComputeSuccessRate(env)
+        return env
+
+    env = make_env(test=False)
+    timestep_limit = env.spec.max_episode_steps
+    obs_space = env.observation_space
+    action_space = env.action_space
+    print("Observation space:", obs_space)
+    print("Action space:", action_space)
+
+    assert isinstance(obs_space, gym.spaces.Dict)
+    obs_size = obs_space["observation"].low.size + obs_space["desired_goal"].low.size
+    action_size = action_space.low.size
+
+    def reward_fn(dg, ag):
+        return env.compute_reward(ag, dg, None)
+
+    q_func = nn.Sequential(
+        ConcatObsAndAction(),
+        nn.Linear(obs_size + action_size, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 1),
+    )
+    policy = nn.Sequential(
+        nn.Linear(obs_size, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, 256),
+        nn.ReLU(),
+        nn.Linear(256, action_size),
+        BoundByTanh(low=action_space.low, high=action_space.high),
+        DeterministicHead(),
+    )
+
+    def init_xavier_uniform(layer):
+        if isinstance(layer, nn.Linear):
+            nn.init.xavier_uniform_(layer.weight)
+            nn.init.zeros_(layer.bias)
+
+    with torch.no_grad():
+        q_func.apply(init_xavier_uniform)
+        policy.apply(init_xavier_uniform)
+
+    opt_a = torch.optim.Adam(policy.parameters())
+    opt_c = torch.optim.Adam(q_func.parameters())
+
+    rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
+        reward_fn=reward_fn,
+        replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+        capacity=10 ** 6,
+    )
+
+    explorer = EpsilonGreedyWithGaussianNoise(
+        epsilon=0.3,
+        random_action_func=lambda: env.action_space.sample(),
+        noise_scale=0.2,
+    )
+
+    # Normalize observations based on their empirical mean and variance
+    obs_normalizer = pfrl.nn.EmpiricalNormalization(obs_size, clip_threshold=5)
+
+    def phi(observation):
+        # Feature extractor
+        obs = np.asarray(observation["observation"], dtype=np.float32)
+        dg = np.asarray(observation["desired_goal"], dtype=np.float32)
+        return np.concatenate((obs, dg)).clip(-200, 200)
+
+    # 1 eopch = 10 episodes = 500 steps
+    gamma = 1.0 - 1.0 / timestep_limit
+    agent = pfrl.agents.DDPG(
+        policy,
+        q_func,
+        opt_a,
+        opt_c,
+        rbuf,
+        phi=phi,
+        gamma=gamma,
+        explorer=explorer,
+        replay_start_size=500,
+        target_update_method="soft",
+        target_update_interval=50,
+        update_interval=50,
+        soft_update_tau=5e-2,
+        n_times_update=40,
+        gpu=args.gpu,
+        minibatch_size=256,
+        clip_return_range=(-1.0 / (1.0 - gamma), 0.0),
+        action_l2_penalty_coef=1.0,
+        obs_normalizer=obs_normalizer,
+    )
+
+    if args.load:
+        agent.load(args.load)
+
+    eval_env = make_env(test=True)
+    if args.demo:
+        eval_stats = experiments.eval_performance(
+            env=eval_env,
+            agent=agent,
+            n_steps=None,
+            n_episodes=args.eval_n_episodes,
+            max_episode_len=timestep_limit,
+        )
+        print(
+            "n_episodes: {} mean: {} median: {} stdev {}".format(
+                eval_stats["episodes"],
+                eval_stats["mean"],
+                eval_stats["median"],
+                eval_stats["stdev"],
+            )
+        )
+    else:
+        experiments.train_agent_with_evaluation(
+            agent=agent,
+            env=env,
+            steps=args.steps,
+            eval_n_steps=None,
+            eval_n_episodes=args.eval_n_episodes,
+            eval_interval=args.eval_interval,
+            outdir=args.outdir,
+            save_best_so_far_agent=True,
+            eval_env=eval_env,
+            train_max_episode_len=timestep_limit,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pfrl/agents/ddpg.py b/pfrl/agents/ddpg.py
index 9d2d15589..e007f8489 100644
--- a/pfrl/agents/ddpg.py
+++ b/pfrl/agents/ddpg.py
@@ -80,13 +80,19 @@ def __init__(
         logger=getLogger(__name__),
         batch_states=batch_states,
         burnin_action_func=None,
+        clip_return_range=None,
+        action_l2_penalty_coef=None,
+        obs_normalizer=None,
     ):
 
         self.model = nn.ModuleList([policy, q_func])
+        self.obs_normalizer = obs_normalizer
         if gpu is not None and gpu >= 0:
             assert torch.cuda.is_available()
             self.device = torch.device("cuda:{}".format(gpu))
             self.model.to(self.device)
+            if self.obs_normalizer is not None:
+                self.obs_normalizer.to(self.device)
         else:
             self.device = torch.device("cpu")
 
@@ -119,6 +125,8 @@ def __init__(
         )
         self.batch_states = batch_states
         self.burnin_action_func = burnin_action_func
+        self.clip_return_range = clip_return_range
+        self.action_l2_penalty_coef = action_l2_penalty_coef
 
         self.t = 0
         self.last_state = None
@@ -163,6 +171,8 @@ def compute_critic_loss(self, batch):
             target_q = batch_rewards + self.gamma * (
                 1.0 - batch_terminal
             ) * next_q.reshape((batchsize,))
+            if self.clip_return_range is not None:
+                target_q = target_q.clamp(*self.clip_return_range)
 
         predict_q = self.q_function((batch_state, batch_actions)).reshape((batchsize,))
 
@@ -181,6 +191,9 @@ def compute_actor_loss(self, batch):
         q = self.q_function((batch_state, onpolicy_actions))
         loss = -q.mean()
 
+        if self.action_l2_penalty_coef is not None:
+            loss += self.action_l2_penalty_coef * (onpolicy_actions ** 2).mean()
+
         # Update stats
         self.q_record.extend(q.detach().cpu().numpy())
         self.actor_loss_record.append(float(loss.detach().cpu().numpy()))
@@ -192,6 +205,10 @@ def update(self, experiences, errors_out=None):
 
         batch = batch_experiences(experiences, self.device, self.phi, self.gamma)
 
+        if self.obs_normalizer:
+            batch["state"] = self.obs_normalizer(batch["state"], update=False)
+            batch["next_state"] = self.obs_normalizer(batch["next_state"], update=False)
+
         self.critic_optimizer.zero_grad()
         self.compute_critic_loss(batch).backward()
         self.critic_optimizer.step()
@@ -258,6 +275,8 @@ def batch_observe(self, batch_obs, batch_reward, batch_done, batch_reset):
     def _batch_select_greedy_actions(self, batch_obs):
         with torch.no_grad(), evaluating(self.policy):
             batch_xs = self.batch_states(batch_obs, self.device, self.phi)
+            if self.obs_normalizer:
+                batch_xs = self.obs_normalizer(batch_xs, update=False)
             batch_action = self.policy(batch_xs).sample()
             return batch_action.cpu().numpy()
 
@@ -300,6 +319,12 @@ def _batch_observe_train(self, batch_obs, batch_reward, batch_done, batch_reset)
                     is_state_terminal=batch_done[i],
                     env_id=i,
                 )
+                if self.obs_normalizer is not None:
+                    self.obs_normalizer.experience(
+                        self.batch_states(
+                            [self.batch_last_obs[i]], self.device, self.phi
+                        )
+                    )
                 if batch_reset[i] or batch_done[i]:
                     self.batch_last_obs[i] = None
                     self.batch_last_action[i] = None

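The Fetch example chooses `gamma` from the episode length and then clips bootstrapped returns to the range a -1/0 sparse reward can actually produce, which is what the new `clip_return_range` option implements in DDPG; `action_l2_penalty_coef` adds an L2 penalty on the policy's actions to the actor loss. A short sketch of the return-range arithmetic, assuming the standard 50-step time limit of `FetchReach-v1` (the limit itself is read from the env spec and is not stated in the patch):

```
timestep_limit = 50                 # assumed FetchReach-v1 episode length
gamma = 1.0 - 1.0 / timestep_limit  # 0.98, as computed in the example
# With per-step rewards in {-1, 0}, the discounted return lies in
# [-1 / (1 - gamma), 0], i.e. roughly [-50, 0], which is the clipping range
# passed to the agent above.
clip_return_range = (-1.0 / (1.0 - gamma), 0.0)
# Actor loss with the new penalty (coefficient 1.0 in the example):
#     loss = -Q(s, pi(s)).mean() + action_l2_penalty_coef * (pi(s) ** 2).mean()
```
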
From d61d1dc90f584531b74f7b2c3ca9b302f943c309 Mon Sep 17 00:00:00 2001
From: muupan <muupan@gmail.com>
Date: Tue, 10 Nov 2020 00:55:58 +0900
Subject: [PATCH 25/34] Start updates earlier to match performance of baselines

---
 examples/her/train_ddpg_her_fetch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/her/train_ddpg_her_fetch.py b/examples/her/train_ddpg_her_fetch.py
index b1078a96b..0f9599ae2 100644
--- a/examples/her/train_ddpg_her_fetch.py
+++ b/examples/her/train_ddpg_her_fetch.py
@@ -253,7 +253,7 @@ def phi(observation):
         phi=phi,
         gamma=gamma,
         explorer=explorer,
-        replay_start_size=500,
+        replay_start_size=256,
         target_update_method="soft",
         target_update_interval=50,
         update_interval=50,

From 18177a4a3c373ed309f361cb738c00aba7b04cca Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Wed, 11 Nov 2020 00:28:11 +0900
Subject: [PATCH 26/34] Adds Fetch DDPG to readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index daa070db4..448f7d645 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ Following useful techniques have been also implemented in PFRL:
 - [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
 - [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495)
-  - examples: [[Bit-flip DQN]](examples/her/train_dqn_bit_flip.py)
+  - examples: [[Bit-flip DQN]](examples/her/train_dqn_bit_flip.py) [[DDPG on Fetch Envs]](examples/her/train_ddpg_her_fetch.py)
 - [Dueling Network](https://arxiv.org/abs/1511.06581)
   - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
 - [Normalized Advantage Function](https://arxiv.org/abs/1603.00748)

From 383585f9a1dfa257785e29cf29f9e985c286d6be Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Wed, 11 Nov 2020 00:31:39 +0900
Subject: [PATCH 27/34] Updates descriptions for args in bit flip

---
 examples/her/train_dqn_bit_flip.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index a430a1bef..9775d5e42 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -126,15 +126,16 @@ def main():
         "--replay-start-size",
         type=int,
         default=5 * 10 ** 2,
-        help="Minimum replay buffer size before " + "performing gradient updates.",
+        help="Minimum replay buffer size before performing gradient updates.",
     )
     parser.add_argument(
         "--num-bits",
         type=int,
         default=10,
-        help="Number of bits for BitFlipping environment",
+        help="Number of bits for BitFlipping environment.",
     )
-    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--no-hindsight", action="store_true", default=False,
+                        help="Do not use Hindsight Replay.")
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)

From 88380f0ea3ed42de68849d2fcd16561483729610 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Wed, 11 Nov 2020 00:33:15 +0900
Subject: [PATCH 28/34] Updates docs in DDPG Fetch example

---
 examples/her/train_ddpg_her_fetch.py | 32 +++++++++++++++++++---------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/examples/her/train_ddpg_her_fetch.py b/examples/her/train_ddpg_her_fetch.py
index 0f9599ae2..8cc85924a 100644
--- a/examples/her/train_ddpg_her_fetch.py
+++ b/examples/her/train_ddpg_her_fetch.py
@@ -13,6 +13,14 @@
 
 
 class ComputeSuccessRate(gym.Wrapper):
+    """Environment wrapper that computes success rate.
+
+    Args:
+        env: Env to wrap
+
+    Attributes:
+        success_record: list of successes
+    """
     def __init__(self, env):
         super().__init__(env)
         self.success_record = []
@@ -65,6 +73,7 @@ class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
     """Epsilon-Greedy with Gaussian noise.
 
     This type of explorer was used in 
+    https://github.com/openai/baselines/tree/master/baselines/her
     """
 
     def __init__(self, epsilon, random_action_func, noise_scale, low=None, high=None):
@@ -133,15 +142,14 @@ def main():
         "--replay-start-size",
         type=int,
         default=5 * 10 ** 2,
-        help="Minimum replay buffer size before " + "performing gradient updates.",
+        help="Minimum replay buffer size before performing gradient updates.",
     )
-    parser.add_argument(
-        "--num-bits",
-        type=int,
-        default=10,
-        help="Number of bits for BitFlipping environment",
-    )
-    parser.add_argument("--no-hindsight", action="store_true", default=False)
+    parser.add_argument("--replay-strategy",
+                        default="future",
+                        choices=["future", "final"],
+                        help="The replay strategy to use",)
+    parser.add_argument("--no-hindsight", action="store_true", default=False,
+                        help="Do not use Hindsight Replay")
     parser.add_argument("--eval-n-episodes", type=int, default=10)
     parser.add_argument("--eval-interval", type=int, default=500)
     parser.add_argument(
@@ -221,9 +229,13 @@ def init_xavier_uniform(layer):
     opt_a = torch.optim.Adam(policy.parameters())
     opt_c = torch.optim.Adam(q_func.parameters())
 
+    if args.replay_strategy == "future":
+        replay_strategy = replay_buffers.hindsight.ReplayFutureGoal()
+    else:
+        replay_strategy = replay_buffers.hindsight.ReplayFinalGoal()
     rbuf = replay_buffers.hindsight.HindsightReplayBuffer(
         reward_fn=reward_fn,
-        replay_strategy=replay_buffers.hindsight.ReplayFutureGoal(),
+        replay_strategy=replay_strategy,
         capacity=10 ** 6,
     )
 
@@ -242,7 +254,7 @@ def phi(observation):
         dg = np.asarray(observation["desired_goal"], dtype=np.float32)
         return np.concatenate((obs, dg)).clip(-200, 200)
 
-    # 1 eopch = 10 episodes = 500 steps
+    # 1 epoch = 10 episodes = 500 steps
     gamma = 1.0 - 1.0 / timestep_limit
     agent = pfrl.agents.DDPG(
         policy,

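The new `--replay-strategy` option just selects which of the two exported strategies is handed to `HindsightReplayBuffer`. A minimal sketch of that mapping, assuming a PFRL checkout with this series applied (the helper function is hypothetical):

```
from pfrl import replay_buffers


def make_replay_strategy(name):
    # "future": relabel with goals achieved later in the same episode.
    # "final": relabel with the episode's final achieved goal (as the class name suggests).
    if name == "future":
        return replay_buffers.hindsight.ReplayFutureGoal()
    return replay_buffers.hindsight.ReplayFinalGoal()
```
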
From 453b04bdbd36589a781bdf9b76ae22ceceb4638c Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Wed, 11 Nov 2020 00:38:20 +0900
Subject: [PATCH 29/34] Minor cleanup of hindsight replay strategies

---
 pfrl/replay_buffers/hindsight.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pfrl/replay_buffers/hindsight.py b/pfrl/replay_buffers/hindsight.py
index 92b8f1eb2..284e96985 100644
--- a/pfrl/replay_buffers/hindsight.py
+++ b/pfrl/replay_buffers/hindsight.py
@@ -19,10 +19,7 @@ def relabel_transition_goal(transition, goal_transition, reward_fn, swap_keys_li
 
 
 class HindsightReplayStrategy:
-    """ReplayStrategy for Hindsight experience replay"""
-
-    def __init__(self, reward_fn):
-        self.reward_fn = reward_fn
+    """ReplayStrategy for Hindsight experience replay."""
 
     def apply(self, episodes, reward_fn, swap_keys_list):
         return episodes
@@ -59,7 +56,6 @@ class ReplayFutureGoal(HindsightReplayStrategy):
     """Replay random future goal.
 
     Args:
-        ignore_null_goals (bool): no replace with goal when nothing achieved
         future_k (int): number of future goals to sample per true sample
     """
 

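After this cleanup a strategy no longer stores `reward_fn` itself; it receives `reward_fn` and `swap_keys_list` through `apply()`. A hedged sketch of what a custom strategy would look like against that interface (the no-op subclass below is hypothetical and only illustrates the signature):

```
from pfrl.replay_buffers.hindsight import HindsightReplayStrategy


class NoRelabelStrategy(HindsightReplayStrategy):
    """Hypothetical strategy that performs no hindsight relabeling."""

    def apply(self, episodes, reward_fn, swap_keys_list):
        # Same signature as the base class above; returns episodes unchanged.
        return episodes
```
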
From 0a2efc612e6fe83c860ecc0732b55a58a63f0f06 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 12 Nov 2020 20:41:26 +0900
Subject: [PATCH 30/34] Adds bit flip to examples tests

---
 examples/her/train_dqn_bit_flip.py      |  2 +-
 examples_tests/her/test_dqn_bit_flip.sh | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100755 examples_tests/her/test_dqn_bit_flip.sh

diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index 9775d5e42..a077dfc62 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -220,7 +220,7 @@ def phi(observation):
 
     if args.demo:
         eval_stats = experiments.eval_performance(
-            env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
+            env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_episodes
         )
         print(
             "n_episodes: {} mean: {} median: {} stdev {}".format(
diff --git a/examples_tests/her/test_dqn_bit_flip.sh b/examples_tests/her/test_dqn_bit_flip.sh
new file mode 100755
index 000000000..d89208e63
--- /dev/null
+++ b/examples_tests/her/test_dqn_bit_flip.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -Ceu
+
+outdir=$(mktemp -d)
+
+gpu="$1"
+
+# her/dqn_bit_flip
+python examples/her/train_dqn_bit_flip.py --gpu $gpu --steps 100 --outdir $outdir/her/bit_flip
+model=$(find $outdir/her/bit_flip -name "*_finish")
+python examples/her/train_dqn_bit_flip.py --demo --load $model --eval-n-episodes 1 --outdir $outdir/temp --gpu $gpu
\ No newline at end of file

From eaa01e442ae560b6926a91ce8da3274843c2c12d Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Thu, 12 Nov 2020 20:46:33 +0900
Subject: [PATCH 31/34] Applies black

---
 examples/her/train_ddpg_her_fetch.py | 7 ++++---
 examples/her/train_dqn_bit_flip.py   | 8 ++++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/her/train_ddpg_her_fetch.py b/examples/her/train_ddpg_her_fetch.py
index 0f9599ae2..6ca2868fa 100644
--- a/examples/her/train_ddpg_her_fetch.py
+++ b/examples/her/train_ddpg_her_fetch.py
@@ -64,7 +64,7 @@ def observation(self, observation):
 class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
     """Epsilon-Greedy with Gaussian noise.
 
-    This type of explorer was used in 
+    This type of explorer was used in
     """
 
     def __init__(self, epsilon, random_action_func, noise_scale, low=None, high=None):
@@ -89,8 +89,9 @@ def select_action(self, t, greedy_action_func, action_value=None):
             return a
 
     def __repr__(self):
-        return "EpsilonGreedyWithGaussianNoise(epsilon={}, noise_scale={}, low={}, high={})".format(
-            self.epsilon, self.noise_scale, self.low, self.high
+        return (
+            "EpsilonGreedyWithGaussianNoise(epsilon={}, noise_scale={}, low={},"
+            " high={})".format(self.epsilon, self.noise_scale, self.low, self.high)
         )
 
 
diff --git a/examples/her/train_dqn_bit_flip.py b/examples/her/train_dqn_bit_flip.py
index a077dfc62..e640986a0 100644
--- a/examples/her/train_dqn_bit_flip.py
+++ b/examples/her/train_dqn_bit_flip.py
@@ -134,8 +134,12 @@ def main():
         default=10,
         help="Number of bits for BitFlipping environment.",
     )
-    parser.add_argument("--no-hindsight", action="store_true", default=False,
-                        help="Do not use Hindsight Replay.")
+    parser.add_argument(
+        "--no-hindsight",
+        action="store_true",
+        default=False,
+        help="Do not use Hindsight Replay.",
+    )
     parser.add_argument("--eval-n-episodes", type=int, default=100)
     parser.add_argument("--eval-interval", type=int, default=250000)
     parser.add_argument("--n-best-episodes", type=int, default=100)

From ef230a477dbc1c1f3ba3087f72c90e19a9797f35 Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Mon, 22 Mar 2021 19:08:50 +0900
Subject: [PATCH 32/34] Adds HER readme and tests

---
 examples/her/README.md                |  23 +
 tests/replay_buffers_test/test_her.py | 865 ++++++++++++++++++++++++++
 2 files changed, 888 insertions(+)
 create mode 100644 examples/her/README.md
 create mode 100644 tests/replay_buffers_test/test_her.py

diff --git a/examples/her/README.md b/examples/her/README.md
new file mode 100644
index 000000000..28113c9ec
--- /dev/null
+++ b/examples/her/README.md
@@ -0,0 +1,23 @@
+# Hindsight Experience Replay
+These two examples train agents using [Hindsight Experience Replay (HER)](https://arxiv.org/abs/1707.01495). The first example, `train_dqn_bit_flip.py`, trains a DoubleDQN agent in the BitFlip environment described in the HER paper. The second example, `train_ddpg_her_fetch.py`, trains DDPG agents in the robotic Fetch environments, also described in the HER paper.
+
+## To Run:
+
+To run the bitflip example:
+```
+python train_dqn_bit_flip.py --num-bits <number of bits>
+```
+
+To run DDPG with HER on fetch tasks, run:
+```
+python train_ddpg_her_fetch.py --env <Gym environment name>
+```
+
+## Options
+- `--gpu`: Set to -1 if you have no GPU.
+
+## Results and Reproducibility
+The BitFlip environment was implemented as per the description in the paper. The DQN configuration for the bitflip environment is not taken directly from the paper (to our knowledge, there is no publicly released implementation).
+
+For the Fetch environments, we added an action penalty, return clipping, and observation normalization to DDPG as done by the [OpenAI baselines implementation](https://github.com/openai/baselines/tree/master/baselines/her).
+
diff --git a/tests/replay_buffers_test/test_her.py b/tests/replay_buffers_test/test_her.py
new file mode 100644
index 000000000..f8e9963e3
--- /dev/null
+++ b/tests/replay_buffers_test/test_her.py
@@ -0,0 +1,865 @@
+import collections
+import copy
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+import torch
+
+from pfrl import replay_buffer, replay_buffers
+
+
+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("num_steps", [1, 3])
+class TestHindsightReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, num_steps):
+        self.capacity = capacity
+        self.num_steps = num_steps
+
+    def test_append_and_sample(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        assert s1[0] == list(correct_item)
+
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        assert len(s2) == 2
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[1] == list(correct_item)
+            assert s2[0] == list(correct_item2)
+
+    def test_append_and_terminate(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+
+        # Add two and sample two, which must be unique
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        rbuf.append(**trans2)
+        assert len(rbuf) == self.num_steps + 1
+        s2 = rbuf.sample(self.num_steps + 1)
+        assert len(s2) == self.num_steps + 1
+        if self.num_steps == 1:
+            if s2[0][0]["state"] == 0:
+                assert s2[1][0]["state"] == 1
+            else:
+                assert s2[1][0]["state"] == 0
+        else:
+            for item in s2:
+                # e.g. if states are 0,0,0,1 then buffer looks like:
+                # [[0,0,0], [0, 0, 1], [0, 1], [1]]
+                if len(item) < self.num_steps:
+                    assert item[len(item) - 1]["state"] == 1
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+                else:
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+
+    def test_stop_current_episode(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        for _ in range(num_steps - 1):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        # we haven't experienced n transitions yet
+        assert len(rbuf) == 0
+        # episode ends
+        rbuf.stop_current_episode()
+        # episode ends, so we should add n-1 transitions
+        assert len(rbuf) == self.num_steps - 1
+
+    def test_save_and_load(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+
+        tempdir = tempfile.mkdtemp()
+
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        correct_item = collections.deque([], maxlen=num_steps)
+        # Add two transitions
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+
+        # Now it has two transitions
+        assert len(rbuf) == 2
+
+        # Save
+        filename = os.path.join(tempdir, "rbuf.pkl")
+        rbuf.save(filename)
+
+        # Initialize rbuf
+        rbuf = replay_buffers.ReplayBuffer(capacity)
+
+        # Of course it has no transition yet
+        assert len(rbuf) == 0
+
+        # Load the previously saved buffer
+        rbuf.load(filename)
+
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+
+
+@pytest.mark.parametrize("capacity", [100, None])
+class TestEpisodicReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity):
+        self.capacity = capacity
+
+    def test_append_and_sample(self):
+        capacity = self.capacity
+        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+
+        for n in [10, 15, 5] * 3:
+            transs = [
+                dict(
+                    state=i,
+                    action=100 + i,
+                    reward=200 + i,
+                    next_state=i + 1,
+                    next_action=101 + i,
+                    is_state_terminal=(i == n - 1),
+                )
+                for i in range(n)
+            ]
+            for trans in transs:
+                rbuf.append(**trans)
+
+        assert len(rbuf) == 90
+        assert rbuf.n_episodes == 9
+
+        for k in [10, 30, 90]:
+            s = rbuf.sample(k)
+            assert len(s) == k
+
+        for k in [1, 3, 9]:
+            s = rbuf.sample_episodes(k)
+            assert len(s) == k
+
+            s = rbuf.sample_episodes(k, max_len=10)
+            for ep in s:
+                assert len(ep) <= 10
+                for t0, t1 in zip(ep, ep[1:]):
+                    assert t0["next_state"] == t1["state"]
+                    assert t0["next_action"] == t1["action"]
+
+    def test_save_and_load(self):
+        capacity = self.capacity
+
+        tempdir = tempfile.mkdtemp()
+
+        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+
+        transs = [
+            dict(
+                state=n,
+                action=n + 10,
+                reward=n + 20,
+                next_state=n + 1,
+                next_action=n + 11,
+                is_state_terminal=False,
+            )
+            for n in range(5)
+        ]
+
+        # Add two episodes
+        rbuf.append(**transs[0])
+        rbuf.append(**transs[1])
+        rbuf.stop_current_episode()
+
+        rbuf.append(**transs[2])
+        rbuf.append(**transs[3])
+        rbuf.append(**transs[4])
+        rbuf.stop_current_episode()
+
+        assert len(rbuf) == 5
+        assert rbuf.n_episodes == 2
+
+        # Save
+        filename = os.path.join(tempdir, "rbuf.pkl")
+        rbuf.save(filename)
+
+        # Initialize rbuf
+        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
+
+        # Of course it has no transition yet
+        assert len(rbuf) == 0
+
+        # Load the previously saved buffer
+        rbuf.load(filename)
+
+        # Sampled transitions are exactly what I added!
+        s5 = rbuf.sample(5)
+        assert len(s5) == 5
+        for t in s5:
+            assert len(t) == 1
+            n = t[0]["state"]
+            assert n in range(5)
+            assert t[0] == transs[n]
+
+        # And sampled episodes are exactly what I added!
+        s2e = rbuf.sample_episodes(2)
+        assert len(s2e) == 2
+        if s2e[0][0]["state"] == 0:
+            assert s2e[0] == [transs[0], transs[1]]
+            assert s2e[1] == [transs[2], transs[3], transs[4]]
+        else:
+            assert s2e[0] == [transs[2], transs[3], transs[4]]
+            assert s2e[1] == [transs[0], transs[1]]
+
+        # Sizes are correct!
+        assert len(rbuf) == 5
+        assert rbuf.n_episodes == 2
+
+
+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
+class TestPrioritizedReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, normalize_by_max):
+        self.capacity = capacity
+        self.normalize_by_max = normalize_by_max
+        self.num_steps = 1
+
+    def test_append_and_sample(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.PrioritizedReplayBuffer(
+            capacity,
+            normalize_by_max=self.normalize_by_max,
+            error_max=5,
+            num_steps=num_steps,
+        )
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        rbuf.update_errors([3.14])
+        assert len(s1) == 1
+        np.testing.assert_allclose(s1[0][0]["weight"], 1.0)
+        del s1[0][0]["weight"]
+        assert s1[0] == list(correct_item)
+
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        rbuf.update_errors([3.14, 2.71])
+        assert len(s2) == 2
+        del s2[0][0]["weight"]
+        del s2[1][0]["weight"]
+        if s2[0][num_steps - 1]["state"] == 1:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+        else:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+
+        # Weights should be different for different TD-errors
+        s3 = rbuf.sample(2)
+        assert not np.allclose(s3[0][0]["weight"], s3[1][0]["weight"])
+
+        # Weights should be equal for different but clipped TD-errors
+        rbuf.update_errors([5, 10])
+        s3 = rbuf.sample(2)
+        np.testing.assert_allclose(s3[0][0]["weight"], s3[1][0]["weight"])
+
+        # Weights should be equal for the same TD-errors
+        rbuf.update_errors([3.14, 3.14])
+        s4 = rbuf.sample(2)
+        np.testing.assert_allclose(s4[0][0]["weight"], s4[1][0]["weight"])
+
+    def test_normalize_by_max(self):
+
+        rbuf = replay_buffers.PrioritizedReplayBuffer(
+            self.capacity,
+            normalize_by_max=self.normalize_by_max,
+            error_max=1000,
+            num_steps=self.num_steps,
+        )
+
+        # Add 100 transitions
+        for i in range(100):
+            trans = dict(
+                state=i,
+                action=1,
+                reward=2,
+                next_state=i + 1,
+                next_action=1,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans)
+        assert len(rbuf) == 100
+
+        def set_errors_based_on_state(rbuf, samples):
+            # Use the value of 'state' as an error, so that state 0 will have
+            # the smallest error, thus the largest weight
+            errors = [s[0]["state"] for s in samples]
+            rbuf.update_errors(errors)
+
+        # Assign different errors to all the transitions first
+        samples = rbuf.sample(100)
+        set_errors_based_on_state(rbuf, samples)
+
+        # Repeatedly check how weights are normalized
+        for i in range(100):
+            samples = rbuf.sample(i + 1)
+            # All the weights must be unique
+            assert len(set(s[0]["weight"] for s in samples)) == len(samples)
+            # Now check the maximum weight in a minibatch
+            max_w = max([s[0]["weight"] for s in samples])
+            if self.normalize_by_max == "batch":
+                # Maximum weight in a minibatch must be 1
+                np.testing.assert_allclose(max_w, 1)
+            elif self.normalize_by_max == "memory":
+                # Maximum weight in a minibatch must be less than 1 unless
+                # the minibatch contains the transition of least error.
+                if any(s[0]["state"] == 0 for s in samples):
+                    np.testing.assert_allclose(max_w, 1)
+                else:
+                    assert max_w < 1
+            set_errors_based_on_state(rbuf, samples)
+
+    def test_capacity(self):
+        capacity = self.capacity
+        if capacity is None:
+            return
+
+        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity)
+        # Fill the buffer
+        for _ in range(capacity):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=True,
+            )
+            rbuf.append(**trans1)
+        assert len(rbuf) == capacity
+
+        # Add a new transition
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        rbuf.append(**trans2)
+        # The size should not change
+        assert len(rbuf) == capacity
+
+    def test_save_and_load(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+
+        tempdir = tempfile.mkdtemp()
+
+        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity, num_steps=num_steps)
+
+        # Add two transitions
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+
+        # Now it has two transitions
+        assert len(rbuf) == 2
+
+        # Save
+        filename = os.path.join(tempdir, "rbuf.pkl")
+        rbuf.save(filename)
+
+        # Initialize rbuf
+        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity, num_steps=num_steps)
+
+        # Of course it has no transition yet
+        assert len(rbuf) == 0
+
+        # Load the previously saved buffer
+        rbuf.load(filename)
+
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        del s2[0][0]["weight"]
+        del s2[1][0]["weight"]
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+
+
+def exp_return_of_episode(episode):
+    return sum(np.exp(x["reward"]) for x in episode)
+
+
+@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
+@pytest.mark.parametrize(
+    "wait_priority_after_sampling,default_priority_func",
+    [(True, None), (True, exp_return_of_episode), (False, exp_return_of_episode)],
+)
+@pytest.mark.parametrize("uniform_ratio", [0, 0.1, 1.0])
+@pytest.mark.parametrize("return_sample_weights", [True, False])
+class TestPrioritizedEpisodicReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(
+        self,
+        normalize_by_max,
+        wait_priority_after_sampling,
+        default_priority_func,
+        uniform_ratio,
+        return_sample_weights,
+    ):
+        self.capacity = 100
+        self.normalize_by_max = normalize_by_max
+        self.wait_priority_after_sampling = wait_priority_after_sampling
+        self.default_priority_func = default_priority_func
+        self.uniform_ratio = uniform_ratio
+        self.return_sample_weights = return_sample_weights
+
+    def test_append_and_sample(self):
+        rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(
+            capacity=self.capacity,
+            normalize_by_max=self.normalize_by_max,
+            default_priority_func=self.default_priority_func,
+            uniform_ratio=self.uniform_ratio,
+            wait_priority_after_sampling=self.wait_priority_after_sampling,
+            return_sample_weights=self.return_sample_weights,
+        )
+
+        for n in [10, 15, 5] * 3:
+            transs = [
+                dict(
+                    state=i,
+                    action=100 + i,
+                    reward=200 + i,
+                    next_state=i + 1,
+                    next_action=101 + i,
+                    is_state_terminal=(i == n - 1),
+                )
+                for i in range(n)
+            ]
+            for trans in transs:
+                rbuf.append(**trans)
+
+        assert len(rbuf) == 90
+        assert rbuf.n_episodes == 9
+
+        for k in [10, 30, 90]:
+            s = rbuf.sample(k)
+            assert len(s) == k
+
+        for k in [1, 3, 9]:
+            ret = rbuf.sample_episodes(k)
+            if self.return_sample_weights:
+                s, wt = ret
+                assert len(s) == k
+                assert len(wt) == k
+            else:
+                s = ret
+                assert len(s) == k
+            if self.wait_priority_after_sampling:
+                rbuf.update_errors([1.0] * k)
+
+            ret = rbuf.sample_episodes(k, max_len=10)
+            if self.return_sample_weights:
+                s, wt = ret
+                assert len(s) == k
+                assert len(wt) == k
+            else:
+                s = ret
+            if self.wait_priority_after_sampling:
+                rbuf.update_errors([1.0] * k)
+
+            for ep in s:
+                assert len(ep) <= 10
+                for t0, t1 in zip(ep, ep[1:]):
+                    assert t0["next_state"] == t1["state"]
+                    assert t0["next_action"] == t1["action"]
+
+
+@pytest.mark.parametrize(
+    "replay_buffer_type", ["ReplayBuffer", "PrioritizedReplayBuffer"]
+)
+class TestReplayBufferWithEnvID:
+    @pytest.fixture(autouse=True)
+    def setUp(self, replay_buffer_type):
+        self.replay_buffer_type = replay_buffer_type
+
+    def test(self):
+        n = 5
+        if self.replay_buffer_type == "ReplayBuffer":
+            rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=n)
+        elif self.replay_buffer_type == "PrioritizedReplayBuffer":
+            rbuf = replay_buffers.PrioritizedReplayBuffer(capacity=None, num_steps=n)
+        else:
+            assert False
+
+        # 2 transitions for env_id=0
+        for _ in range(2):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(env_id=0, **trans1)
+        # 4 transitions for env_id=1 with a terminal state
+        for i in range(4):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=(i == 3),
+            )
+            rbuf.append(env_id=1, **trans1)
+        # 9 transitions for env_id=2
+        for _ in range(9):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(env_id=2, **trans1)
+
+        # It should have:
+        #   - 4 transitions from env_id=1
+        #   - 5 transitions from env_id=2
+        assert len(rbuf) == 9
+
+        # env_id=0 episode ends
+        rbuf.stop_current_episode(env_id=0)
+
+        # Now it should have 9 + 2 = 11 transitions
+        assert len(rbuf) == 11
+
+        # env_id=2 episode ends
+        rbuf.stop_current_episode(env_id=2)
+
+        # Finally it should have 9 + 2 + 4 = 15 transitions
+        assert len(rbuf) == 15
+
+
+@pytest.mark.parametrize(
+    "replay_buffer_type", ["EpisodicReplayBuffer", "PrioritizedEpisodicReplayBuffer"]
+)
+class TestEpisodicReplayBufferWithEnvID:
+    @pytest.fixture(autouse=True)
+    def setUp(self, replay_buffer_type):
+        self.replay_buffer_type = replay_buffer_type
+
+    def test(self):
+        if self.replay_buffer_type == "EpisodicReplayBuffer":
+            rbuf = replay_buffers.EpisodicReplayBuffer(capacity=None)
+        elif self.replay_buffer_type == "PrioritizedEpisodicReplayBuffer":
+            rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(capacity=None)
+        else:
+            assert False
+
+        # 2 transitions for env_id=0
+        for _ in range(2):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(env_id=0, **trans1)
+        # 4 transitions for env_id=1 with a terminal state
+        for i in range(4):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=(i == 3),
+            )
+            rbuf.append(env_id=1, **trans1)
+        # 9 transitions for env_id=2
+        for _ in range(9):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(env_id=2, **trans1)
+
+        # It should have 4 transitions from env_id=1
+        assert len(rbuf) == 4
+
+        # env_id=0 episode ends
+        rbuf.stop_current_episode(env_id=0)
+
+        # Now it should have 4 + 2 = 6 transitions
+        assert len(rbuf) == 6
+
+        # env_id=2 episode ends
+        rbuf.stop_current_episode(env_id=2)
+
+        # Finally it should have 4 + 2 + 9 = 15 transitions
+        assert len(rbuf) == 15
+
+
+class TestReplayBufferFail(unittest.TestCase):
+    def setUp(self):
+        self.rbuf = replay_buffers.PrioritizedReplayBuffer(100)
+        self.trans1 = dict(
+            state=0,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        self.rbuf.append(**self.trans1)
+
+    def _sample1(self):
+        self.rbuf.sample(1)
+
+    def _set1(self):
+        self.rbuf.update_errors([1.0])
+
+    def test_fail_noupdate(self):
+        self._sample1()
+        self.assertRaises(AssertionError, self._sample1)
+
+    def test_fail_update_first(self):
+        self.assertRaises(AssertionError, self._set1)
+
+    def test_fail_doubleupdate(self):
+        self._sample1()
+        self._set1()
+        self.assertRaises(AssertionError, self._set1)
+
+
+class TestBatchExperiences(unittest.TestCase):
+    def test_batch_experiences(self):
+        experiences = []
+        experiences.append(
+            [
+                dict(
+                    state=1,
+                    action=1,
+                    reward=1,
+                    next_state=i,
+                    next_action=1,
+                    is_state_terminal=False,
+                )
+                for i in range(3)
+            ]
+        )
+        experiences.append(
+            [
+                dict(
+                    state=1,
+                    action=1,
+                    reward=1,
+                    next_state=1,
+                    next_action=1,
+                    is_state_terminal=False,
+                )
+            ]
+        )
+        four_step_transition = [
+            dict(
+                state=1,
+                action=1,
+                reward=1,
+                next_state=1,
+                next_action=1,
+                is_state_terminal=False,
+            )
+        ] * 3
+        four_step_transition.append(
+            dict(
+                state=1,
+                action=1,
+                reward=1,
+                next_state=5,
+                next_action=1,
+                is_state_terminal=True,
+            )
+        )
+        experiences.append(four_step_transition)
+        batch = replay_buffer.batch_experiences(
+            experiences, torch.device("cpu"), lambda x: x, 0.99
+        )
+        self.assertEqual(batch["state"][0], 1)
+        self.assertSequenceEqual(
+            list(batch["is_state_terminal"]),
+            list(np.asarray([0.0, 0.0, 1.0], dtype=np.float32)),
+        )
+        self.assertSequenceEqual(
+            list(batch["discount"]),
+            list(np.asarray([0.99 ** 3, 0.99 ** 1, 0.99 ** 4], dtype=np.float32)),
+        )
+        self.assertSequenceEqual(list(batch["next_state"]), list(np.asarray([2, 1, 5])))

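The assertions in TestBatchExperiences above pin down the n-step bookkeeping expected of batching: for each transition sequence the batched discount is gamma raised to the sequence length, while is_state_terminal and next_state come from the last transition. The following is a minimal standalone sketch of that bookkeeping (it is not pfrl's batch_experiences, only an illustration of the values the test checks):

# Minimal sketch (not pfrl's batch_experiences) of the n-step bookkeeping
# the assertions above rely on: discount is gamma ** len(seq), and
# is_state_terminal / next_state are taken from each sequence's last transition.
import numpy as np

def summarize(sequences, gamma):
    discount = np.asarray([gamma ** len(seq) for seq in sequences], dtype=np.float32)
    next_state = np.asarray([seq[-1]["next_state"] for seq in sequences])
    terminal = np.asarray(
        [float(seq[-1]["is_state_terminal"]) for seq in sequences], dtype=np.float32
    )
    return discount, next_state, terminal

sequences = [
    [dict(next_state=i, is_state_terminal=False) for i in range(3)],   # 3-step
    [dict(next_state=1, is_state_terminal=False)],                     # 1-step
    [dict(next_state=1, is_state_terminal=False)] * 3
    + [dict(next_state=5, is_state_terminal=True)],                    # 4-step
]
discount, next_state, terminal = summarize(sequences, 0.99)
assert np.allclose(discount, [0.99 ** 3, 0.99, 0.99 ** 4])
assert list(next_state) == [2, 1, 5]
assert list(terminal) == [0.0, 0.0, 1.0]
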
From ac952f372b79c582cb3e923350f06a02e24e351d Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Tue, 30 Mar 2021 15:44:46 +0900
Subject: [PATCH 33/34] Applies black

---
 examples/her/train_ddpg_her_fetch.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/examples/her/train_ddpg_her_fetch.py b/examples/her/train_ddpg_her_fetch.py
index 9c993950d..d0a825aa1 100644
--- a/examples/her/train_ddpg_her_fetch.py
+++ b/examples/her/train_ddpg_her_fetch.py
@@ -21,6 +21,7 @@ class ComputeSuccessRate(gym.Wrapper):
     Attributes:
         success_record: list of successes
     """
+
     def __init__(self, env):
         super().__init__(env)
         self.success_record = []
@@ -72,7 +73,7 @@ def observation(self, observation):
 class EpsilonGreedyWithGaussianNoise(pfrl.explorer.Explorer):
     """Epsilon-Greedy with Gaussian noise.
 
-    This type of explorer was used in 
+    This type of explorer was used in
     https://github.com/openai/baselines/tree/master/baselines/her
     """
 
@@ -145,12 +146,18 @@ def main():
         default=5 * 10 ** 2,
         help="Minimum replay buffer size before performing gradient updates.",
     )
-    parser.add_argument("--replay-strategy",
-                        default="future",
-                        choices=["future", "final"],
-                        help="The replay strategy to use",)
-    parser.add_argument("--no-hindsight", action="store_true", default=False,
-                        help="Do not use Hindsight Replay")
+    parser.add_argument(
+        "--replay-strategy",
+        default="future",
+        choices=["future", "final"],
+        help="The replay strategy to use",
+    )
+    parser.add_argument(
+        "--no-hindsight",
+        action="store_true",
+        default=False,
+        help="Do not use Hindsight Replay",
+    )
     parser.add_argument("--eval-n-episodes", type=int, default=10)
     parser.add_argument("--eval-interval", type=int, default=500)
     parser.add_argument(

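The EpsilonGreedyWithGaussianNoise docstring reformatted by this patch names the exploration scheme used in the OpenAI baselines HER code: with some probability act uniformly at random, otherwise add Gaussian noise to the greedy action and clip to the action bounds. A rough sketch of that rule follows; the parameter names are illustrative and are not taken from the example script:

# Rough sketch of epsilon-greedy exploration with additive Gaussian noise for
# continuous actions; parameter names here are illustrative, not the script's.
import numpy as np

def noisy_epsilon_greedy(greedy_action, low, high, epsilon=0.3, noise_scale=0.2,
                         rng=np.random):
    if rng.rand() < epsilon:
        # With probability epsilon, act uniformly at random within the bounds.
        return rng.uniform(low, high)
    # Otherwise perturb the greedy action with Gaussian noise and clip.
    noise = noise_scale * rng.standard_normal(np.shape(greedy_action))
    return np.clip(greedy_action + noise, low, high)
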
From 35f085c5925cf2fad7fb9810212453ee2e7d635b Mon Sep 17 00:00:00 2001
From: Prabhat <prabhat.nagarajan@gmail.com>
Date: Tue, 30 Mar 2021 19:26:52 +0900
Subject: [PATCH 34/34] Adds HER to replay buffer tests
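
The relocated tests below exercise the replay buffers that hindsight replay builds on. For reference, 'final'-strategy hindsight relabeling replaces each transition's desired goal with the goal achieved at the end of the episode and recomputes the reward against it; the sketch below is conceptual only and is not the buffer implementation added in this series.

# Conceptual sketch of 'final'-strategy hindsight relabeling, not the actual
# implementation of the hindsight replay buffer added in this series: each
# transition's desired goal is replaced by the goal achieved at the end of
# the episode, and its reward is recomputed against that goal.
import copy

def relabel_final(episode, compute_reward):
    final_goal = episode[-1]["next_state"]["achieved_goal"]
    relabeled = []
    for transition in episode:
        transition = copy.deepcopy(transition)
        transition["state"]["desired_goal"] = final_goal
        transition["next_state"]["desired_goal"] = final_goal
        transition["reward"] = compute_reward(
            transition["next_state"]["achieved_goal"], final_goal, {}
        )
        relabeled.append(transition)
    return relabeled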

---
 tests/replay_buffers_test/test_her.py         | 865 ------------------
 .../replay_buffers_test/test_replay_buffer.py | 194 ++++
 2 files changed, 194 insertions(+), 865 deletions(-)
 delete mode 100644 tests/replay_buffers_test/test_her.py

diff --git a/tests/replay_buffers_test/test_her.py b/tests/replay_buffers_test/test_her.py
deleted file mode 100644
index f8e9963e3..000000000
--- a/tests/replay_buffers_test/test_her.py
+++ /dev/null
@@ -1,865 +0,0 @@
-import collections
-import copy
-import os
-import tempfile
-import unittest
-
-import numpy as np
-import pytest
-import torch
-
-from pfrl import replay_buffer, replay_buffers
-
-
-@pytest.mark.parametrize("capacity", [100, None])
-@pytest.mark.parametrize("num_steps", [1, 3])
-class TestHindsightReplayBuffer:
-    @pytest.fixture(autouse=True)
-    def setUp(self, capacity, num_steps):
-        self.capacity = capacity
-        self.num_steps = num_steps
-
-    def test_append_and_sample(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
-
-        assert len(rbuf) == 0
-
-        # Add one and sample one
-        correct_item = collections.deque([], maxlen=num_steps)
-        for _ in range(num_steps):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            correct_item.append(trans1)
-            rbuf.append(**trans1)
-        assert len(rbuf) == 1
-        s1 = rbuf.sample(1)
-        assert len(s1) == 1
-        assert s1[0] == list(correct_item)
-
-        # Add two and sample two, which must be unique
-        correct_item2 = copy.deepcopy(correct_item)
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=False,
-        )
-        correct_item2.append(trans2)
-        rbuf.append(**trans2)
-        assert len(rbuf) == 2
-        s2 = rbuf.sample(2)
-        assert len(s2) == 2
-        if s2[0][num_steps - 1]["state"] == 0:
-            assert s2[0] == list(correct_item)
-            assert s2[1] == list(correct_item2)
-        else:
-            assert s2[1] == list(correct_item)
-            assert s2[0] == list(correct_item2)
-
-    def test_append_and_terminate(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
-
-        assert len(rbuf) == 0
-
-        # Add one and sample one
-        for _ in range(num_steps):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(**trans1)
-        assert len(rbuf) == 1
-        s1 = rbuf.sample(1)
-        assert len(s1) == 1
-
-        # Add two and sample two, which must be unique
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=True,
-        )
-        rbuf.append(**trans2)
-        assert len(rbuf) == self.num_steps + 1
-        s2 = rbuf.sample(self.num_steps + 1)
-        assert len(s2) == self.num_steps + 1
-        if self.num_steps == 1:
-            if s2[0][0]["state"] == 0:
-                assert s2[1][0]["state"] == 1
-            else:
-                assert s2[1][0]["state"] == 0
-        else:
-            for item in s2:
-                # e.g. if states are 0,0,0,1 then buffer looks like:
-                # [[0,0,0], [0, 0, 1], [0, 1], [1]]
-                if len(item) < self.num_steps:
-                    assert item[len(item) - 1]["state"] == 1
-                    for i in range(len(item) - 1):
-                        assert item[i]["state"] == 0
-                else:
-                    for i in range(len(item) - 1):
-                        assert item[i]["state"] == 0
-
-    def test_stop_current_episode(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
-
-        assert len(rbuf) == 0
-
-        # Add one and sample one
-        for _ in range(num_steps - 1):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(**trans1)
-        # we haven't experienced n transitions yet
-        assert len(rbuf) == 0
-        # episode ends
-        rbuf.stop_current_episode()
-        # episode ends, so we should add n-1 transitions
-        assert len(rbuf) == self.num_steps - 1
-
-    def test_save_and_load(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-
-        tempdir = tempfile.mkdtemp()
-
-        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
-
-        correct_item = collections.deque([], maxlen=num_steps)
-        # Add two transitions
-        for _ in range(num_steps):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            correct_item.append(trans1)
-            rbuf.append(**trans1)
-        correct_item2 = copy.deepcopy(correct_item)
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=False,
-        )
-        correct_item2.append(trans2)
-        rbuf.append(**trans2)
-
-        # Now it has two transitions
-        assert len(rbuf) == 2
-
-        # Save
-        filename = os.path.join(tempdir, "rbuf.pkl")
-        rbuf.save(filename)
-
-        # Initialize rbuf
-        rbuf = replay_buffers.ReplayBuffer(capacity)
-
-        # Of course it has no transition yet
-        assert len(rbuf) == 0
-
-        # Load the previously saved buffer
-        rbuf.load(filename)
-
-        # Now it has two transitions again
-        assert len(rbuf) == 2
-
-        # And sampled transitions are exactly what I added!
-        s2 = rbuf.sample(2)
-        if s2[0][num_steps - 1]["state"] == 0:
-            assert s2[0] == list(correct_item)
-            assert s2[1] == list(correct_item2)
-        else:
-            assert s2[0] == list(correct_item2)
-            assert s2[1] == list(correct_item)
-
-
-@pytest.mark.parametrize("capacity", [100, None])
-class TestEpisodicReplayBuffer:
-    @pytest.fixture(autouse=True)
-    def setUp(self, capacity):
-        self.capacity = capacity
-
-    def test_append_and_sample(self):
-        capacity = self.capacity
-        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
-
-        for n in [10, 15, 5] * 3:
-            transs = [
-                dict(
-                    state=i,
-                    action=100 + i,
-                    reward=200 + i,
-                    next_state=i + 1,
-                    next_action=101 + i,
-                    is_state_terminal=(i == n - 1),
-                )
-                for i in range(n)
-            ]
-            for trans in transs:
-                rbuf.append(**trans)
-
-        assert len(rbuf) == 90
-        assert rbuf.n_episodes == 9
-
-        for k in [10, 30, 90]:
-            s = rbuf.sample(k)
-            assert len(s) == k
-
-        for k in [1, 3, 9]:
-            s = rbuf.sample_episodes(k)
-            assert len(s) == k
-
-            s = rbuf.sample_episodes(k, max_len=10)
-            for ep in s:
-                assert len(ep) <= 10
-                for t0, t1 in zip(ep, ep[1:]):
-                    assert t0["next_state"] == t1["state"]
-                    assert t0["next_action"] == t1["action"]
-
-    def test_save_and_load(self):
-        capacity = self.capacity
-
-        tempdir = tempfile.mkdtemp()
-
-        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
-
-        transs = [
-            dict(
-                state=n,
-                action=n + 10,
-                reward=n + 20,
-                next_state=n + 1,
-                next_action=n + 11,
-                is_state_terminal=False,
-            )
-            for n in range(5)
-        ]
-
-        # Add two episodes
-        rbuf.append(**transs[0])
-        rbuf.append(**transs[1])
-        rbuf.stop_current_episode()
-
-        rbuf.append(**transs[2])
-        rbuf.append(**transs[3])
-        rbuf.append(**transs[4])
-        rbuf.stop_current_episode()
-
-        assert len(rbuf) == 5
-        assert rbuf.n_episodes == 2
-
-        # Save
-        filename = os.path.join(tempdir, "rbuf.pkl")
-        rbuf.save(filename)
-
-        # Initialize rbuf
-        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)
-
-        # Of course it has no transition yet
-        assert len(rbuf) == 0
-
-        # Load the previously saved buffer
-        rbuf.load(filename)
-
-        # Sampled transitions are exactly what I added!
-        s5 = rbuf.sample(5)
-        assert len(s5) == 5
-        for t in s5:
-            assert len(t) == 1
-            n = t[0]["state"]
-            assert n in range(5)
-            assert t[0] == transs[n]
-
-        # And sampled episodes are exactly what I added!
-        s2e = rbuf.sample_episodes(2)
-        assert len(s2e) == 2
-        if s2e[0][0]["state"] == 0:
-            assert s2e[0] == [transs[0], transs[1]]
-            assert s2e[1] == [transs[2], transs[3], transs[4]]
-        else:
-            assert s2e[0] == [transs[2], transs[3], transs[4]]
-            assert s2e[1] == [transs[0], transs[1]]
-
-        # Sizes are correct!
-        assert len(rbuf) == 5
-        assert rbuf.n_episodes == 2
-
-
-@pytest.mark.parametrize("capacity", [100, None])
-@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
-class TestPrioritizedReplayBuffer:
-    @pytest.fixture(autouse=True)
-    def setUp(self, capacity, normalize_by_max):
-        self.capacity = capacity
-        self.normalize_by_max = normalize_by_max
-        self.num_steps = 1
-
-    def test_append_and_sample(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-        rbuf = replay_buffers.PrioritizedReplayBuffer(
-            capacity,
-            normalize_by_max=self.normalize_by_max,
-            error_max=5,
-            num_steps=num_steps,
-        )
-
-        assert len(rbuf) == 0
-
-        # Add one and sample one
-        correct_item = collections.deque([], maxlen=num_steps)
-        for _ in range(num_steps):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            correct_item.append(trans1)
-            rbuf.append(**trans1)
-        assert len(rbuf) == 1
-        s1 = rbuf.sample(1)
-        rbuf.update_errors([3.14])
-        assert len(s1) == 1
-        np.testing.assert_allclose(s1[0][0]["weight"], 1.0)
-        del s1[0][0]["weight"]
-        assert s1[0] == list(correct_item)
-
-        # Add two and sample two, which must be unique
-        correct_item2 = copy.deepcopy(correct_item)
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=True,
-        )
-        correct_item2.append(trans2)
-        rbuf.append(**trans2)
-        assert len(rbuf) == 2
-        s2 = rbuf.sample(2)
-        rbuf.update_errors([3.14, 2.71])
-        assert len(s2) == 2
-        del s2[0][0]["weight"]
-        del s2[1][0]["weight"]
-        if s2[0][num_steps - 1]["state"] == 1:
-            assert s2[0] == list(correct_item2)
-            assert s2[1] == list(correct_item)
-        else:
-            assert s2[0] == list(correct_item)
-            assert s2[1] == list(correct_item2)
-
-        # Weights should be different for different TD-errors
-        s3 = rbuf.sample(2)
-        assert not np.allclose(s3[0][0]["weight"], s3[1][0]["weight"])
-
-        # Weights should be equal for different but clipped TD-errors
-        rbuf.update_errors([5, 10])
-        s3 = rbuf.sample(2)
-        np.testing.assert_allclose(s3[0][0]["weight"], s3[1][0]["weight"])
-
-        # Weights should be equal for the same TD-errors
-        rbuf.update_errors([3.14, 3.14])
-        s4 = rbuf.sample(2)
-        np.testing.assert_allclose(s4[0][0]["weight"], s4[1][0]["weight"])
-
-    def test_normalize_by_max(self):
-
-        rbuf = replay_buffers.PrioritizedReplayBuffer(
-            self.capacity,
-            normalize_by_max=self.normalize_by_max,
-            error_max=1000,
-            num_steps=self.num_steps,
-        )
-
-        # Add 100 transitions
-        for i in range(100):
-            trans = dict(
-                state=i,
-                action=1,
-                reward=2,
-                next_state=i + 1,
-                next_action=1,
-                is_state_terminal=False,
-            )
-            rbuf.append(**trans)
-        assert len(rbuf) == 100
-
-        def set_errors_based_on_state(rbuf, samples):
-            # Use the value of 'state' as an error, so that state 0 will have
-            # the smallest error, thus the largest weight
-            errors = [s[0]["state"] for s in samples]
-            rbuf.update_errors(errors)
-
-        # Assign different errors to all the transitions first
-        samples = rbuf.sample(100)
-        set_errors_based_on_state(rbuf, samples)
-
-        # Repeatedly check how weights are normalized
-        for i in range(100):
-            samples = rbuf.sample(i + 1)
-            # All the weights must be unique
-            assert len(set(s[0]["weight"] for s in samples)) == len(samples)
-            # Now check the maximum weight in a minibatch
-            max_w = max([s[0]["weight"] for s in samples])
-            if self.normalize_by_max == "batch":
-                # Maximum weight in a minibatch must be 1
-                np.testing.assert_allclose(max_w, 1)
-            elif self.normalize_by_max == "memory":
-                # Maximum weight in a minibatch must be less than 1 unless
-                # the minibatch contains the transition of least error.
-                if any(s[0]["state"] == 0 for s in samples):
-                    np.testing.assert_allclose(max_w, 1)
-                else:
-                    assert max_w < 1
-            set_errors_based_on_state(rbuf, samples)
-
-    def test_capacity(self):
-        capacity = self.capacity
-        if capacity is None:
-            return
-
-        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity)
-        # Fill the buffer
-        for _ in range(capacity):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=True,
-            )
-            rbuf.append(**trans1)
-        assert len(rbuf) == capacity
-
-        # Add a new transition
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=True,
-        )
-        rbuf.append(**trans2)
-        # The size should not change
-        assert len(rbuf) == capacity
-
-    def test_save_and_load(self):
-        capacity = self.capacity
-        num_steps = self.num_steps
-
-        tempdir = tempfile.mkdtemp()
-
-        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity, num_steps=num_steps)
-
-        # Add two transitions
-        correct_item = collections.deque([], maxlen=num_steps)
-        for _ in range(num_steps):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            correct_item.append(trans1)
-            rbuf.append(**trans1)
-        correct_item2 = copy.deepcopy(correct_item)
-        trans2 = dict(
-            state=1,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=True,
-        )
-        correct_item2.append(trans2)
-        rbuf.append(**trans2)
-
-        # Now it has two transitions
-        assert len(rbuf) == 2
-
-        # Save
-        filename = os.path.join(tempdir, "rbuf.pkl")
-        rbuf.save(filename)
-
-        # Initialize rbuf
-        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity, num_steps=num_steps)
-
-        # Of course it has no transition yet
-        assert len(rbuf) == 0
-
-        # Load the previously saved buffer
-        rbuf.load(filename)
-
-        # Now it has two transitions again
-        assert len(rbuf) == 2
-
-        # And sampled transitions are exactly what I added!
-        s2 = rbuf.sample(2)
-        del s2[0][0]["weight"]
-        del s2[1][0]["weight"]
-        if s2[0][num_steps - 1]["state"] == 0:
-            assert s2[0] == list(correct_item)
-            assert s2[1] == list(correct_item2)
-        else:
-            assert s2[0] == list(correct_item2)
-            assert s2[1] == list(correct_item)
-
-
-def exp_return_of_episode(episode):
-    return sum(np.exp(x["reward"]) for x in episode)
-
-
-@pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
-@pytest.mark.parametrize(
-    "wait_priority_after_sampling,default_priority_func",
-    [(True, None), (True, exp_return_of_episode), (False, exp_return_of_episode)],
-)
-@pytest.mark.parametrize("uniform_ratio", [0, 0.1, 1.0])
-@pytest.mark.parametrize("return_sample_weights", [True, False])
-class TestPrioritizedEpisodicReplayBuffer:
-    @pytest.fixture(autouse=True)
-    def setUp(
-        self,
-        normalize_by_max,
-        wait_priority_after_sampling,
-        default_priority_func,
-        uniform_ratio,
-        return_sample_weights,
-    ):
-        self.capacity = 100
-        self.normalize_by_max = normalize_by_max
-        self.wait_priority_after_sampling = wait_priority_after_sampling
-        self.default_priority_func = default_priority_func
-        self.uniform_ratio = uniform_ratio
-        self.return_sample_weights = return_sample_weights
-
-    def test_append_and_sample(self):
-        rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(
-            capacity=self.capacity,
-            normalize_by_max=self.normalize_by_max,
-            default_priority_func=self.default_priority_func,
-            uniform_ratio=self.uniform_ratio,
-            wait_priority_after_sampling=self.wait_priority_after_sampling,
-            return_sample_weights=self.return_sample_weights,
-        )
-
-        for n in [10, 15, 5] * 3:
-            transs = [
-                dict(
-                    state=i,
-                    action=100 + i,
-                    reward=200 + i,
-                    next_state=i + 1,
-                    next_action=101 + i,
-                    is_state_terminal=(i == n - 1),
-                )
-                for i in range(n)
-            ]
-            for trans in transs:
-                rbuf.append(**trans)
-
-        assert len(rbuf) == 90
-        assert rbuf.n_episodes == 9
-
-        for k in [10, 30, 90]:
-            s = rbuf.sample(k)
-            assert len(s) == k
-
-        for k in [1, 3, 9]:
-            ret = rbuf.sample_episodes(k)
-            if self.return_sample_weights:
-                s, wt = ret
-                assert len(s) == k
-                assert len(wt) == k
-            else:
-                s = ret
-                assert len(s) == k
-            if self.wait_priority_after_sampling:
-                rbuf.update_errors([1.0] * k)
-
-            ret = rbuf.sample_episodes(k, max_len=10)
-            if self.return_sample_weights:
-                s, wt = ret
-                assert len(s) == k
-                assert len(wt) == k
-            else:
-                s = ret
-            if self.wait_priority_after_sampling:
-                rbuf.update_errors([1.0] * k)
-
-            for ep in s:
-                assert len(ep) <= 10
-                for t0, t1 in zip(ep, ep[1:]):
-                    assert t0["next_state"] == t1["state"]
-                    assert t0["next_action"] == t1["action"]
-
-
-@pytest.mark.parametrize(
-    "replay_buffer_type", ["ReplayBuffer", "PrioritizedReplayBuffer"]
-)
-class TestReplayBufferWithEnvID:
-    @pytest.fixture(autouse=True)
-    def setUp(self, replay_buffer_type):
-        self.replay_buffer_type = replay_buffer_type
-
-    def test(self):
-        n = 5
-        if self.replay_buffer_type == "ReplayBuffer":
-            rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=n)
-        elif self.replay_buffer_type == "PrioritizedReplayBuffer":
-            rbuf = replay_buffers.PrioritizedReplayBuffer(capacity=None, num_steps=n)
-        else:
-            assert False
-
-        # 2 transitions for env_id=0
-        for _ in range(2):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(env_id=0, **trans1)
-        # 4 transitions for env_id=1 with a terminal state
-        for i in range(4):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=(i == 3),
-            )
-            rbuf.append(env_id=1, **trans1)
-        # 9 transitions for env_id=2
-        for _ in range(9):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(env_id=2, **trans1)
-
-        # It should have:
-        #   - 4 transitions from env_id=1
-        #   - 5 transitions from env_id=2
-        assert len(rbuf) == 9
-
-        # env_id=0 episode ends
-        rbuf.stop_current_episode(env_id=0)
-
-        # Now it should have 9 + 2 = 11 transitions
-        assert len(rbuf) == 11
-
-        # env_id=2 episode ends
-        rbuf.stop_current_episode(env_id=2)
-
-        # Finally it should have 9 + 2 + 4 = 15 transitions
-        assert len(rbuf) == 15
-
-
-@pytest.mark.parametrize(
-    "replay_buffer_type", ["EpisodicReplayBuffer", "PrioritizedEpisodicReplayBuffer"]
-)
-class TestEpisodicReplayBufferWithEnvID:
-    @pytest.fixture(autouse=True)
-    def setUp(self, replay_buffer_type):
-        self.replay_buffer_type = replay_buffer_type
-
-    def test(self):
-        if self.replay_buffer_type == "EpisodicReplayBuffer":
-            rbuf = replay_buffers.EpisodicReplayBuffer(capacity=None)
-        elif self.replay_buffer_type == "PrioritizedEpisodicReplayBuffer":
-            rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(capacity=None)
-        else:
-            assert False
-
-        # 2 transitions for env_id=0
-        for _ in range(2):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(env_id=0, **trans1)
-        # 4 transitions for env_id=1 with a terminal state
-        for i in range(4):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=(i == 3),
-            )
-            rbuf.append(env_id=1, **trans1)
-        # 9 transitions for env_id=2
-        for _ in range(9):
-            trans1 = dict(
-                state=0,
-                action=1,
-                reward=2,
-                next_state=3,
-                next_action=4,
-                is_state_terminal=False,
-            )
-            rbuf.append(env_id=2, **trans1)
-
-        # It should have 4 transitions from env_id=1
-        assert len(rbuf) == 4
-
-        # env_id=0 episode ends
-        rbuf.stop_current_episode(env_id=0)
-
-        # Now it should have 4 + 2 = 6 transitions
-        assert len(rbuf) == 6
-
-        # env_id=2 episode ends
-        rbuf.stop_current_episode(env_id=2)
-
-        # Finally it should have 4 + 2 + 9 = 15 transitions
-        assert len(rbuf) == 15
-
-
-class TestReplayBufferFail(unittest.TestCase):
-    def setUp(self):
-        self.rbuf = replay_buffers.PrioritizedReplayBuffer(100)
-        self.trans1 = dict(
-            state=0,
-            action=1,
-            reward=2,
-            next_state=3,
-            next_action=4,
-            is_state_terminal=True,
-        )
-        self.rbuf.append(**self.trans1)
-
-    def _sample1(self):
-        self.rbuf.sample(1)
-
-    def _set1(self):
-        self.rbuf.update_errors([1.0])
-
-    def test_fail_noupdate(self):
-        self._sample1()
-        self.assertRaises(AssertionError, self._sample1)
-
-    def test_fail_update_first(self):
-        self.assertRaises(AssertionError, self._set1)
-
-    def test_fail_doubleupdate(self):
-        self._sample1()
-        self._set1()
-        self.assertRaises(AssertionError, self._set1)
-
-
-class TestBatchExperiences(unittest.TestCase):
-    def test_batch_experiences(self):
-        experiences = []
-        experiences.append(
-            [
-                dict(
-                    state=1,
-                    action=1,
-                    reward=1,
-                    next_state=i,
-                    next_action=1,
-                    is_state_terminal=False,
-                )
-                for i in range(3)
-            ]
-        )
-        experiences.append(
-            [
-                dict(
-                    state=1,
-                    action=1,
-                    reward=1,
-                    next_state=1,
-                    next_action=1,
-                    is_state_terminal=False,
-                )
-            ]
-        )
-        four_step_transition = [
-            dict(
-                state=1,
-                action=1,
-                reward=1,
-                next_state=1,
-                next_action=1,
-                is_state_terminal=False,
-            )
-        ] * 3
-        four_step_transition.append(
-            dict(
-                state=1,
-                action=1,
-                reward=1,
-                next_state=5,
-                next_action=1,
-                is_state_terminal=True,
-            )
-        )
-        experiences.append(four_step_transition)
-        batch = replay_buffer.batch_experiences(
-            experiences, torch.device("cpu"), lambda x: x, 0.99
-        )
-        self.assertEqual(batch["state"][0], 1)
-        self.assertSequenceEqual(
-            list(batch["is_state_terminal"]),
-            list(np.asarray([0.0, 0.0, 1.0], dtype=np.float32)),
-        )
-        self.assertSequenceEqual(
-            list(batch["discount"]),
-            list(np.asarray([0.99 ** 3, 0.99 ** 1, 0.99 ** 4], dtype=np.float32)),
-        )
-        self.assertSequenceEqual(list(batch["next_state"]), list(np.asarray([2, 1, 5])))
diff --git a/tests/replay_buffers_test/test_replay_buffer.py b/tests/replay_buffers_test/test_replay_buffer.py
index bf2b2b037..0eceb11e7 100644
--- a/tests/replay_buffers_test/test_replay_buffer.py
+++ b/tests/replay_buffers_test/test_replay_buffer.py
@@ -317,6 +317,200 @@ def test_save_and_load(self):
         assert rbuf.n_episodes == 2
 
 
+@pytest.mark.parametrize("capacity", [100, None])
+@pytest.mark.parametrize("num_steps", [1, 3])
+class TestHindsightReplayBuffer:
+    @pytest.fixture(autouse=True)
+    def setUp(self, capacity, num_steps):
+        self.capacity = capacity
+        self.num_steps = num_steps
+
+    def test_append_and_sample(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        correct_item = collections.deque([], maxlen=num_steps)
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+        assert s1[0] == list(correct_item)
+
+        # Add two and sample two, which must be unique
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+        assert len(rbuf) == 2
+        s2 = rbuf.sample(2)
+        assert len(s2) == 2
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[1] == list(correct_item)
+            assert s2[0] == list(correct_item2)
+
+    def test_append_and_terminate(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add one and sample one
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        assert len(rbuf) == 1
+        s1 = rbuf.sample(1)
+        assert len(s1) == 1
+
+        # Add a terminal transition, then sample every stored item, which must be unique
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=True,
+        )
+        rbuf.append(**trans2)
+        assert len(rbuf) == self.num_steps + 1
+        s2 = rbuf.sample(self.num_steps + 1)
+        assert len(s2) == self.num_steps + 1
+        if self.num_steps == 1:
+            if s2[0][0]["state"] == 0:
+                assert s2[1][0]["state"] == 1
+            else:
+                assert s2[1][0]["state"] == 0
+        else:
+            for item in s2:
+                # e.g. if states are 0,0,0,1 then buffer looks like:
+                # [[0,0,0], [0, 0, 1], [0, 1], [1]]
+                if len(item) < self.num_steps:
+                    assert item[len(item) - 1]["state"] == 1
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+                else:
+                    for i in range(len(item) - 1):
+                        assert item[i]["state"] == 0
+
+    def test_stop_current_episode(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        assert len(rbuf) == 0
+
+        # Add num_steps - 1 transitions, not enough to store an item yet
+        for _ in range(num_steps - 1):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            rbuf.append(**trans1)
+        # We haven't experienced num_steps transitions yet
+        assert len(rbuf) == 0
+        # Episode ends
+        rbuf.stop_current_episode()
+        # Ending the episode flushes num_steps - 1 partial items into the buffer
+        assert len(rbuf) == self.num_steps - 1
+
+    def test_save_and_load(self):
+        capacity = self.capacity
+        num_steps = self.num_steps
+
+        tempdir = tempfile.mkdtemp()
+
+        rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
+
+        correct_item = collections.deque([], maxlen=num_steps)
+        # Add enough transitions to store two items
+        for _ in range(num_steps):
+            trans1 = dict(
+                state=0,
+                action=1,
+                reward=2,
+                next_state=3,
+                next_action=4,
+                is_state_terminal=False,
+            )
+            correct_item.append(trans1)
+            rbuf.append(**trans1)
+        correct_item2 = copy.deepcopy(correct_item)
+        trans2 = dict(
+            state=1,
+            action=1,
+            reward=2,
+            next_state=3,
+            next_action=4,
+            is_state_terminal=False,
+        )
+        correct_item2.append(trans2)
+        rbuf.append(**trans2)
+
+        # Now it has two transitions
+        assert len(rbuf) == 2
+
+        # Save
+        filename = os.path.join(tempdir, "rbuf.pkl")
+        rbuf.save(filename)
+
+        # Initialize rbuf
+        rbuf = replay_buffers.ReplayBuffer(capacity)
+
+        # Of course it has no transition yet
+        assert len(rbuf) == 0
+
+        # Load the previously saved buffer
+        rbuf.load(filename)
+
+        # Now it has two transitions again
+        assert len(rbuf) == 2
+
+        # And sampled transitions are exactly what I added!
+        s2 = rbuf.sample(2)
+        if s2[0][num_steps - 1]["state"] == 0:
+            assert s2[0] == list(correct_item)
+            assert s2[1] == list(correct_item2)
+        else:
+            assert s2[0] == list(correct_item2)
+            assert s2[1] == list(correct_item)
+
+
 @pytest.mark.parametrize("capacity", [100, None])
 @pytest.mark.parametrize("normalize_by_max", ["batch", "memory"])
 class TestPrioritizedReplayBuffer: