From f89a19c6e765c8909fa8e19c049de33ff92b6e43 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 28 Mar 2023 20:42:54 -0400
Subject: [PATCH 001/206] rename plane -> cube

---
 config/env/{plane.yaml => cube.yaml} |  8 ++++----
 gflownet/envs/{plane.py => cube.py}  | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)
 rename config/env/{plane.yaml => cube.yaml} (63%)
 rename gflownet/envs/{plane.py => cube.py} (98%)

diff --git a/config/env/plane.yaml b/config/env/cube.yaml
similarity index 63%
rename from config/env/plane.yaml
rename to config/env/cube.yaml
index 16da93974..09af97393 100644
--- a/config/env/plane.yaml
+++ b/config/env/cube.yaml
@@ -1,11 +1,11 @@
 defaults:
   - base
 
-_target_: gflownet.envs.plane.Plane
+_target_: gflownet.envs.cube.Cube
 
-id: plane
+id: cube
 func: corners
-# Dimensions of hyperplane
+# Dimensions of hypercube
 n_dim: 2
 # Maximum length of trajecotry
 max_traj_length: 10
@@ -16,4 +16,4 @@ buffer:
   test:
     type: grid
     n: 1000
-    output_csv: plane_test.csv
+    output_csv: cube_test.csv
diff --git a/gflownet/envs/plane.py b/gflownet/envs/cube.py
similarity index 98%
rename from gflownet/envs/plane.py
rename to gflownet/envs/cube.py
index f0c065db8..db4170558 100644
--- a/gflownet/envs/plane.py
+++ b/gflownet/envs/cube.py
@@ -1,5 +1,5 @@
 """
-Classes to represent hyperplane environments
+Classes to represent hypercube environments
 """
 import itertools
 from typing import List, Tuple
@@ -14,9 +14,9 @@
 from gflownet.envs.base import GFlowNetEnv
 
 
-class Plane(GFlowNetEnv):
+class Cube(GFlowNetEnv):
     """
-    Hyperplane environment (continuous version of a hypergrid) in which the action
+    Hypercube environment (continuous version of a hypergrid) in which the action
     space consists of the increment of dimension d, modelled by a beta distribution.
 
     The states space is the value of each dimension. If the value of a dimension gets
@@ -25,7 +25,7 @@ class Plane(GFlowNetEnv):
     Attributes
     ----------
     n_dim : int
-        Dimensionality of the hyperplane
+        Dimensionality of the hypercube
 
     length_traj : int
        Fixed length of the trajectory.
@@ -49,7 +49,7 @@ def __init__(
         oracle=None,
         **kwargs,
     ):
-        super(Plane, self).__init__(
+        super(Cube, self).__init__(
             env_id,
             reward_beta,
             reward_norm,

From abd3fee2414054e96d6c866d837986326dba7cd2 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 28 Mar 2023 20:48:47 -0400
Subject: [PATCH 002/206] rename to hcube

---
 config/env/{cube.yaml => hcube.yaml} | 4 ++--
 gflownet/envs/cube.py                | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename config/env/{cube.yaml => hcube.yaml} (82%)

diff --git a/config/env/cube.yaml b/config/env/hcube.yaml
similarity index 82%
rename from config/env/cube.yaml
rename to config/env/hcube.yaml
index 09af97393..01471535f 100644
--- a/config/env/cube.yaml
+++ b/config/env/hcube.yaml
@@ -1,9 +1,9 @@
 defaults:
   - base
 
-_target_: gflownet.envs.cube.Cube
+_target_: gflownet.envs.cube.HybridCube
 
-id: cube
+id: hcube
 func: corners
 # Dimensions of hypercube
 n_dim: 2
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index db4170558..02444eb5c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -14,7 +14,7 @@
 from gflownet.envs.base import GFlowNetEnv
 
 
-class Cube(GFlowNetEnv):
+class HybridCube(GFlowNetEnv):
     """
     Hypercube environment (continuous version of a hypergrid) in which the action
     space consists of the increment of dimension d, modelled by a beta distribution.

From 0d834f8403245adde6addb4bbf402a930e2bbe9f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 28 Mar 2023 21:05:20 -0400
Subject: [PATCH 003/206] wip: updating cube env

---
 gflownet/envs/cube.py | 135 +++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 88 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 02444eb5c..675123a8c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1,5 +1,5 @@
 """
-Classes to represent hypercube environments
+Classes to represent hyper-cube environments
 """
 import itertools
 from typing import List, Tuple
@@ -16,115 +16,74 @@
 
 class HybridCube(GFlowNetEnv):
     """
-    Hypercube environment (continuous version of a hypergrid) in which the action
-    space consists of the increment of dimension d, modelled by a beta distribution.
+    Continuous (hybrid: discrete and continuous) hyper-cube environment (continuous
+    version of a hyper-grid) in which the action space consists of the increment of
+    dimension d, modelled by a beta distribution.
 
     The states space is the value of each dimension. If the value of a dimension gets
-    larger than max_val, then the trajectory is ended and the reward is 0.
+    larger than max_val, then the trajectory is ended.
 
     Attributes
     ----------
     n_dim : int
-        Dimensionality of the hypercube
-
-    length_traj : int
-       Fixed length of the trajectory.
+        Dimensionality of the hyper-cube
     """
 
     def __init__(
         self,
-        n_dim=2,
-        max_val=1.0,
-        max_traj_length=1.0,
-        distr_alpha=2.0,
-        distr_beta=5.0,
-        env_id=None,
-        reward_beta=1,
-        reward_norm=1.0,
-        reward_norm_std_mult=0,
-        reward_func="boltzmann",
-        denorm_proxy=False,
-        energies_stats=None,
-        proxy=None,
-        oracle=None,
+        n_dim: int = 2,
+        max_val: float = 1.0,
+        n_comp: int = 1,
+        do_nonzero_source_prob: bool = True,
+        fixed_distribution: dict = {
+            "beta_alpha": 2.0,
+            "beta_alpha": 5.0,
+        },
+        random_distribution: dict = {
+            "beta_alpha": 1.0,
+            "beta_beta": 1.0,
+        },
         **kwargs,
     ):
-        super(Cube, self).__init__(
-            env_id,
-            reward_beta,
-            reward_norm,
-            reward_norm_std_mult,
-            reward_func,
-            energies_stats,
-            denorm_proxy,
-            proxy,
-            oracle,
-            **kwargs,
-        )
+        assert n_dim > 0
+        assert max_val > 1.0
+        assert n_comp > 0
         # Main properties
         self.continuous = True
         self.n_dim = n_dim
         self.eos = self.n_dim
         self.max_val = max_val
-        self.max_traj_length = max_traj_length
         # Parameters of fixed policy distribution
-        self.distr_alpha = distr_alpha
-        self.distr_beta = distr_beta
-        # Initialize angles and state attributes
-        self.source = [0.0 for _ in range(self.n_dim)]
-        self.reset()
-        self.action_space = self.get_action_space()
-        self.fixed_policy_output = self.get_fixed_policy_output()
-        self.policy_output_dim = len(self.fixed_policy_output)
-        self.policy_input_dim = len(self.state2policy())
-        # Set up proxy
-        self.setup_proxy()
-        # Oracle
-        self.state2oracle = self.state2proxy
-        self.statebatch2oracle = self.statebatch2proxy
-
-    def reward(self, state=None, done=None):
-        """
-        Sets the reward to min_reward if any value of the state is larger than max_val.
-        """
-        if done is None:
-            done = self.done
-        if done:
-            return np.array(0.0)
-        if state is None:
-            state = self.state.copy()
-        if any([s > self.max_val for s in self.state]):
-            return np.array(self.min_reward)
+        self.n_comp = n_comp
+        if do_nonzero_source_prob:
+            self.n_params_per_dim = 4
         else:
-            return super().reward(state)
-
-    def reward_batch(self, states, done):
-        """
-        Sets the reward to min_reward if any value of the state is larger than max_val.
-        """
-        states_super = []
-        done_super = []
-        within_plane = []
-        for state, d in zip(states, done):
-            if d and any([s > self.max_val for s in state]):
-                within_plane.append(False)
-            else:
-                within_plane.append(True)
-                states_super.append(state)
-                done_super.append(d)
-        reward = self.min_reward * np.ones(len(within_plane))
-        reward[within_plane] = super().reward_batch(states_super, done_super)
-        return reward
+            self.n_params_per_dim = 3
+        # Source state: position 0 at all dimensions
+        self.source = [0.0 for _ in range(self.n_dim)]
+        # End-of-sequence action: (n_dim, 0)
+        self.eos = (self.n_dim, 0)
+        # Base class init
+        super().__init__(
+            fixed_distribution=fixed_distribution,
+            random_distribution=random_distribution,
+            **kwargs,
+        )
 
     def get_action_space(self):
         """
-        Constructs list with all possible actions. The actions are tuples with two
-        values: (dimension, increment) where dimension indicates the index of the
-        dimension on which the action is to be performed and increment indicates the
-        increment of the dimension value.
+        Since this is a hybrid (continuous/discrete) environment, this method
+        constructs a list with the discrete actions.
+
+        The actions are tuples with two values: (dimension, increment) where dimension
+        indicates the index of the dimension on which the action is to be performed and
+        increment indicates the increment of the dimension.
+
+        The (discrete) action space is then one tuple per dimension (with 0 increment),
+        plus the EOS action.
         """
-        actions = [(d, None) for d in range(self.n_dim)]
-        actions += [(self.eos, None)]
+        actions = [(d, 0) for d in range(self.n_dim)]
+        actions.append(self.eos)
         return actions
 
     def get_fixed_policy_output(self):
@@ -133,7 +92,7 @@ def get_fixed_policy_output(self):
         action is to be determined or sampled, by returning a vector with a fixed
         random policy.
 
-        For each dimension of the hyper-plane, the output of the policy should return
+        For each dimension of the hyper-cube, the output of the policy should return
         1) a logit, for the categorical distribution over dimensions and 2) the alpha
         and 3) beta parameters of a beta distribution to sample the increment.
         Therefore, the output of the policy model has dimensionality D x 3 + 1, where D

From ceefeeef6fcf63dcd7aba949cedf90d3e38e5358 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 28 Mar 2023 21:05:41 -0400
Subject: [PATCH 004/206] minor: comments

---
 gflownet/envs/htorus.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 3430d7f38..4438c4dfd 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -22,15 +22,15 @@
 class HybridTorus(GFlowNetEnv):
     """
     Continuous (hybrid: discrete and continuous) hyper-torus environment in which the
-    action space consists of the selection of which dimension d to increment increment
-    and of the angle of dimension d. The trajectory is of fixed length length_traj.
+    action space consists of the selection of which dimension d to increment and of the
+    angle of dimension d. The trajectory is of fixed length length_traj.
 
     The states space is the concatenation of the angle (in radians and within [0, 2 *
     pi]) at each dimension and the number of actions.
 
     Attributes
     ----------
-    ndim : int
+    n_dim : int
         Dimensionality of the torus
 
     length_traj : int
@@ -72,7 +72,7 @@ def __init__(
         # Source state: position 0 at all dimensions and number of actions 0
         self.source_angles = [0.0 for _ in range(self.n_dim)]
         self.source = self.source_angles + [0]
-        # End-of-sequence action: (n_dim, None)
+        # End-of-sequence action: (n_dim, 0)
         self.eos = (self.n_dim, 0)
         # TODO: assess if really needed
         self.state2oracle = self.state2proxy

From 34a6006e279899d9918fd8e838fa06ca8cdcb95c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 6 Apr 2023 15:54:18 -0400
Subject: [PATCH 005/206] wip

---
 gflownet/envs/ctorus.py |  2 +-
 gflownet/envs/cube.py   | 30 ++++++++++++++++++++----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index a504a5f22..c7998c474 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -57,7 +57,7 @@ def get_policy_output(self, params: dict):
         mixture, 2) the location of the von Mises distribution and 3) the concentration
         of the von Mises distribution to sample the increment of the angle.
 
-        Therefore, the output of the policy model has dimensionality D x C x 1, where D
+        Therefore, the output of the policy model has dimensionality D x C x 3, where D
         is the number of dimensions (self.n_dim) and C is the number of components
         (self.n_comp). In sum, the entries of the entries of the policy output are:
 
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 675123a8c..127be162a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -92,17 +92,27 @@ def get_fixed_policy_output(self):
         action is to be determined or sampled, by returning a vector with a fixed
         random policy.
 
-        For each dimension of the hyper-cube, the output of the policy should return
-        1) a logit, for the categorical distribution over dimensions and 2) the alpha
-        and 3) beta parameters of a beta distribution to sample the increment.
-        Therefore, the output of the policy model has dimensionality D x 3 + 1, where D
-        is the number of dimensions, and the elements of the output vector are:
-        - d * 3: logit of dimension d
-        - d * 3 + 1: log(alpha) of beta distribution for dimension d
-        - d * 3 + 2: log(beta) of a beta distribution for dimension d
-        with d in [0, ..., D]
+        For each dimension d of the hyper-cube and component c of the mixture, the
+        output of the policy should return 
+          1) the weight of the component in the mixture
+          2) a logit, for the categorical distribution over dimensions
+          3) the alpha parameter of the Beta distribution to sample the increment
+          4) the beta parameter of the Beta distribution to sample the increment
+
+        Therefore, the output of the policy model has dimensionality D x C x 4 + 1,
+        where D is the number of dimensions (self.n_dim) and C is the number of
+        components (self.n_comp). The additional dimension (+ 1) is to include the
+        logit of the EOS action. In sum, the entries of the entries of the policy
+        output are:
+
+        - d * c * 4 + 0: logit of dimension d, component c.
+        - d * c * 4 + 1: weight of component c in the mixture for dimension d
+        - d * c * 4 + 2: log(alpha) of the Beta distribution for dim. d, comp. c
+        - d * c * 4 + 3: log(beta) of the Beta distribution for dim. d, comp. c
+        TODO
+        - D * C * 4 + 3 + 1: log(beta) of the Beta distribution for dim. d, comp. c
         """
-        policy_output_fixed = np.ones(self.n_dim * 3 + 1)
+        policy_output_fixed = np.ones(self.n_dim * self.n_comp * 3 + 1)
         policy_output_fixed[1::3] = self.distr_alpha
         policy_output_fixed[2::3] = self.distr_beta
         return policy_output_fixed

From c7b47a8ffd66c28a4f7a832055aa72ddf5eecea5 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 6 Apr 2023 17:54:38 -0400
Subject: [PATCH 006/206] finish get_policy_output

---
 gflownet/envs/ctorus.py | 15 +++++++--------
 gflownet/envs/cube.py   | 36 ++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index c7998c474..190c09787 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -53,17 +53,16 @@ def get_policy_output(self, params: dict):
         random policy.
 
         For each dimension d of the hyper-torus and component c of the mixture, the
-        output of the policy should return 1) the weight of the component in the
-        mixture, 2) the location of the von Mises distribution and 3) the concentration
-        of the von Mises distribution to sample the increment of the angle.
+        output of the policy should return 
+          1) the weight of the component in the mixture
+          2) the location of the von Mises distribution to sample the angle increment
+          3) the log concentration of the von Mises distribution to sample the angle
+          increment
 
         Therefore, the output of the policy model has dimensionality D x C x 3, where D
         is the number of dimensions (self.n_dim) and C is the number of components
-        (self.n_comp). In sum, the entries of the entries of the policy output are:
-
-        - d * c * 3 + 0: weight of component c in the mixture for dim. d
-        - d * c * 3 + 1: location of Von Mises distribution for dim. d, comp. c
-        - d * c * 3 + 2: log concentration of Von Mises distribution for dim. d, comp. c
+        (self.n_comp). The first 3 x C entries in the policy output correspond to the
+        first dimension, and so on.
         """
         policy_output = np.ones(self.n_dim * self.n_comp * 3)
         policy_output[1::3] = params["vonmises_mean"]
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 127be162a..beb5b9c30 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -37,7 +37,7 @@ def __init__(
         do_nonzero_source_prob: bool = True,
         fixed_distribution: dict = {
             "beta_alpha": 2.0,
-            "beta_alpha": 5.0,
+            "beta_beta": 5.0,
         },
         random_distribution: dict = {
             "beta_alpha": 1.0,
@@ -86,35 +86,31 @@ def get_action_space(self):
         actions.append(self.eos)
         return actions
 
-    def get_fixed_policy_output(self):
+    def get_fixed_policy_output(self, params: dict):
         """
         Defines the structure of the output of the policy model, from which an
         action is to be determined or sampled, by returning a vector with a fixed
         random policy.
 
         For each dimension d of the hyper-cube and component c of the mixture, the
-        output of the policy should return 
+        output of the policy should return
           1) the weight of the component in the mixture
-          2) a logit, for the categorical distribution over dimensions
-          3) the alpha parameter of the Beta distribution to sample the increment
-          4) the beta parameter of the Beta distribution to sample the increment
+          2) the log(alpha) parameter of the Beta distribution to sample the increment
+          3) the log(beta) parameter of the Beta distribution to sample the increment
 
-        Therefore, the output of the policy model has dimensionality D x C x 4 + 1,
+        Additionally, the policy output contains one logit per dimension plus one logit
+        for the EOS action, for the categorical distribution over dimensions.
+
+        Therefore, the output of the policy model has dimensionality D x C x 3 + D + 1,
         where D is the number of dimensions (self.n_dim) and C is the number of
-        components (self.n_comp). The additional dimension (+ 1) is to include the
-        logit of the EOS action. In sum, the entries of the entries of the policy
-        output are:
-
-        - d * c * 4 + 0: logit of dimension d, component c.
-        - d * c * 4 + 1: weight of component c in the mixture for dimension d
-        - d * c * 4 + 2: log(alpha) of the Beta distribution for dim. d, comp. c
-        - d * c * 4 + 3: log(beta) of the Beta distribution for dim. d, comp. c
-        TODO
-        - D * C * 4 + 3 + 1: log(beta) of the Beta distribution for dim. d, comp. c
+        components (self.n_comp). The first D + 1 entries in the policy output
+        correspond to the categorical logits. Then, the next 3 x C entries in the
+        policy output correspond to the first dimension, and so on.
         """
-        policy_output_fixed = np.ones(self.n_dim * self.n_comp * 3 + 1)
-        policy_output_fixed[1::3] = self.distr_alpha
-        policy_output_fixed[2::3] = self.distr_beta
+        self.n_logits = self.n_dim + 1
+        policy_output_fixed = np.ones(self.n_dim * self.n_comp * 3 + self.n_logits)
+        policy_output_fixed[self.n_logits + 1 :: 3] = params["beta_alpha"]
+        policy_output_fixed[self.n_logits + 2 :: 3] = params["beta_beta"]
         return policy_output_fixed
 
     def get_mask_invalid_actions_forward(self, state=None, done=None):

From 11081e3600452cb5671d244e4c34b8ec30d50f77 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 6 Apr 2023 17:56:25 -0400
Subject: [PATCH 007/206] fix name

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index beb5b9c30..f2150a24e 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -86,7 +86,7 @@ def get_action_space(self):
         actions.append(self.eos)
         return actions
 
-    def get_fixed_policy_output(self, params: dict):
+    def get_policy_output(self, params: dict):
         """
         Defines the structure of the output of the policy model, from which an
         action is to be determined or sampled, by returning a vector with a fixed

From 7a8c53f5a88ec7029c305abae8732ca8b973482d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 7 Apr 2023 17:42:11 -0400
Subject: [PATCH 008/206] progress with cube

---
 gflownet/envs/cube.py            | 97 +++++++++++++++++++-------------
 tests/gflownet/envs/test_cube.py | 97 ++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+), 39 deletions(-)
 create mode 100644 tests/gflownet/envs/test_cube.py

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index f2150a24e..759f110c3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -2,7 +2,7 @@
 Classes to represent hyper-cube environments
 """
 import itertools
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import numpy.typing as npt
@@ -26,7 +26,11 @@ class HybridCube(GFlowNetEnv):
     Attributes
     ----------
     n_dim : int
-        Dimensionality of the hyper-cube
+        Dimensionality of the hyper-cube.
+
+    max_val : float
+        Max length of the hyper-cube.
+
     """
 
     def __init__(
@@ -35,18 +39,18 @@ def __init__(
         max_val: float = 1.0,
         n_comp: int = 1,
         do_nonzero_source_prob: bool = True,
-        fixed_distribution: dict = {
+        fixed_distr_params: dict = {
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
         },
-        random_distribution: dict = {
+        random_distr_params: dict = {
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
         },
         **kwargs,
     ):
         assert n_dim > 0
-        assert max_val > 1.0
+        assert max_val > 0.0
         assert n_comp > 0
         # Main properties
         self.continuous = True
@@ -63,10 +67,18 @@ def __init__(
         self.source = [0.0 for _ in range(self.n_dim)]
         # End-of-sequence action: (n_dim, 0)
         self.eos = (self.n_dim, 0)
+        # Conversions: only conversions to policy are implemented and the rest are the
+        # same
+        self.state2proxy = self.state2policy
+        self.statebatch2proxy = self.statebatch2policy
+        self.statetorch2proxy = self.statetorch2policy
+        self.state2oracle = self.state2proxy
+        self.statebatch2oracle = self.statebatch2proxy
+        self.statetorch2oracle = self.statetorch2proxy
         # Base class init
         super().__init__(
-            fixed_distribution=fixed_distribution,
-            random_distribution=random_distribution,
+            fixed_distr_params=fixed_distr_params,
+            random_distr_params=random_distr_params,
             **kwargs,
         )
 
@@ -86,7 +98,7 @@ def get_action_space(self):
         actions.append(self.eos)
         return actions
 
-    def get_policy_output(self, params: dict):
+    def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
         Defines the structure of the output of the policy model, from which an
         action is to be determined or sampled, by returning a vector with a fixed
@@ -107,13 +119,20 @@ def get_policy_output(self, params: dict):
         correspond to the categorical logits. Then, the next 3 x C entries in the
         policy output correspond to the first dimension, and so on.
         """
-        self.n_logits = self.n_dim + 1
-        policy_output_fixed = np.ones(self.n_dim * self.n_comp * 3 + self.n_logits)
-        policy_output_fixed[self.n_logits + 1 :: 3] = params["beta_alpha"]
-        policy_output_fixed[self.n_logits + 2 :: 3] = params["beta_beta"]
+        policy_output_fixed = torch.ones(
+            self.n_dim * self.n_comp * 3 + self.n_dim + 1,
+            device=self.device,
+            dtype=self.float,
+        )
+        policy_output_fixed[self.n_dim + 2 :: 3] = params["beta_alpha"]
+        policy_output_fixed[self.n_dim + 3 :: 3] = params["beta_beta"]
         return policy_output_fixed
 
-    def get_mask_invalid_actions_forward(self, state=None, done=None):
+    def get_mask_invalid_actions_forward(
+        self,
+        state: Optional[List] = None,
+        done: Optional[bool] = None,
+    ) -> List:
         """
         Returns a vector with the length of the discrete part of the action space + 1:
         True if action is invalid going forward given the current state, False
@@ -128,10 +147,9 @@ def get_mask_invalid_actions_forward(self, state=None, done=None):
             done = self.done
         if done:
             return [True for _ in range(self.action_space_dim)]
-        if (
-            any([s > self.max_val for s in self.state])
-            or self.n_actions >= self.max_traj_length
-        ):
+        # If the value of any dimension is greater than max_val, then next action can
+        # only be EOS.
+        if any([s > self.max_val for s in self.state]):
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
         else:
@@ -151,35 +169,46 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         if done:
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
+        # If the value of any dimension is smaller than 0.0, then next action can
+        # only be EOS (return to source).
+        if any([s < 0.0 for s in self.state]):
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-1] = False
         else:
             mask = [False for _ in range(self.action_space_dim)]
-        # TODO: review: anything to do with max_value?
         return mask
 
-    def statebatch2proxy(self, states: List[List] = None) -> npt.NDArray[np.float32]:
+    def statetorch2policy(
+        self, states: TensorType["batch", "state_dim"] = None
+    ) -> TensorType["batch", "policy_input_dim"]:
         """
-        Scales the states into [0, max_val]
+        Clips the states into [0, max_val]
 
         Args
         ----
         state : list
             State
         """
-        return -1.0 + np.array(states) * 2 / self.max_val
+        return torch.clip(states, min=0.0, max=self.max_val)
 
-    def state2policy(self, state: List = None) -> List:
+    def statebatch2policy(self, states: List[List] = None) -> npt.NDArray[np.float32]:
         """
-        Returns the state as is.
+        Clips the states into [0, max_val]
+
+        Args
+        ----
+        state : list
+            State
         """
-        if state is None:
-            state = self.state.copy()
-        return state
+        return np.clip(np.array(states), a_min=0.0, a_max=self.max_val)
 
-    def policy2state(self, state_policy: List) -> List:
+    def state2policy(self, state: List = None) -> List:
         """
-        Returns the input as is.
+        Clips the state into [0, max_val]
         """
-        return state_policy
+        if state is None:
+            state = self.state.copy()
+        return [min(max(0.0, s), self.max_val) for s in state]
 
     def state2readable(self, state: List) -> str:
         """
@@ -195,16 +224,6 @@ def readable2state(self, readable: str) -> List:
         """
         return [el for el in readable.strip("[]").split(" ")]
 
-    def reset(self, env_id=None):
-        """
-        Resets the environment.
-        """
-        self.state = self.source.copy()
-        self.n_actions = 0
-        self.done = False
-        self.id = env_id
-        return self
-
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None
     ) -> Tuple[List[List], List[Tuple[int, float]]]:
diff --git a/tests/gflownet/envs/test_cube.py b/tests/gflownet/envs/test_cube.py
new file mode 100644
index 000000000..df7812cd8
--- /dev/null
+++ b/tests/gflownet/envs/test_cube.py
@@ -0,0 +1,97 @@
+import common
+import numpy as np
+import pytest
+import torch
+
+from gflownet.envs.cube import HybridCube
+
+
+@pytest.fixture
+def env():
+    return HybridCube(n_dim=2, n_comp=3)
+
+
+@pytest.mark.parametrize(
+    "action_space",
+    [
+        [
+            (0, 0.0),
+            (1, 0.0),
+            (2, 0.0),
+        ],
+    ],
+)
+def test__get_action_space__returns_expected(env, action_space):
+    assert set(action_space) == set(env.action_space)
+
+
+def test__get_policy_output__returns_expected(env):
+    assert env.policy_output_dim == env.n_dim * env.n_comp * 3 + env.n_dim + 1
+    fixed_policy_output = env.fixed_policy_output
+    random_policy_output = env.random_policy_output
+    assert torch.all(fixed_policy_output[: env.n_dim + 1] == 1)
+    assert torch.all(random_policy_output[: env.n_dim + 1] == 1)
+    assert torch.all(fixed_policy_output[env.n_dim + 1 :: 3] == 1)
+    assert torch.all(
+        fixed_policy_output[env.n_dim + 2 :: 3] == env.fixed_distr_params["beta_alpha"]
+    )
+    assert torch.all(
+        fixed_policy_output[env.n_dim + 3 :: 3] == env.fixed_distr_params["beta_beta"]
+    )
+    assert torch.all(random_policy_output[env.n_dim + 1 :: 3] == 1)
+    assert torch.all(
+        random_policy_output[env.n_dim + 2 :: 3]
+        == env.random_distr_params["beta_alpha"]
+    )
+    assert torch.all(
+        random_policy_output[env.n_dim + 3 :: 3] == env.random_distr_params["beta_beta"]
+    )
+
+
+@pytest.mark.parametrize(
+    "state, expected",
+    [
+        (
+            [0.0, 0.0],
+            [0.0, 0.0],
+        ),
+        (
+            [1.0, 1.0],
+            [1.0, 1.0],
+        ),
+        (
+            [1.1, 1.00001],
+            [1.0, 1.0],
+        ),
+        (
+            [-0.1, 1.00001],
+            [0.0, 1.0],
+        ),
+        (
+            [0.1, 0.21],
+            [0.1, 0.21],
+        ),
+    ],
+)
+def test__state2policy_returns_expected(env, state, expected):
+    assert env.state2policy(state) == expected
+
+
+@pytest.mark.parametrize(
+    "states, expected",
+    [
+        (
+            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
+            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
+        ),
+    ],
+)
+def test__statebatch_torch2policy_returns_expected(env, states, expected):
+    assert np.equal(env.statebatch2policy(states), np.array(expected)).all()
+    assert torch.equal(
+        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
+    )
+
+
+# def test__continuous_env_common(env):
+#     return common.test__continuous_env_common(env)

From 55d19b553eb81d099e76b7419fa5e97769ed643f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 7 Apr 2023 17:43:22 -0400
Subject: [PATCH 009/206] mypy and minor changes

---
 gflownet/envs/base.py   | 10 ++++++----
 gflownet/envs/ctorus.py |  8 ++++++--
 gflownet/envs/htorus.py | 22 +++++++++++++---------
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index be536e76b..fb1140a30 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -34,8 +34,8 @@ def __init__(
         proxy=None,
         oracle=None,
         proxy_state_format: str = "oracle",
-        fixed_distribution: Optional[dict] = None,
-        random_distribution: Optional[dict] = None,
+        fixed_distr_params: Optional[dict] = None,
+        random_distr_params: Optional[dict] = None,
         **kwargs,
     ):
         # Call reset() to set initial state, done, n_actions
@@ -78,8 +78,10 @@ def __init__(
         # Max trajectory length
         self.max_traj_length = self.get_max_traj_length()
         # Policy outputs
-        self.fixed_policy_output = self.get_policy_output(fixed_distribution)
-        self.random_policy_output = self.get_policy_output(random_distribution)
+        self.fixed_distr_params = fixed_distr_params
+        self.random_distr_params = random_distr_params
+        self.fixed_policy_output = self.get_policy_output(self.fixed_distr_params)
+        self.random_policy_output = self.get_policy_output(self.random_distr_params)
         self.policy_output_dim = len(self.fixed_policy_output)
         self.policy_input_dim = len(self.state2policy())
 
diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 190c09787..663788f9a 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -53,7 +53,7 @@ def get_policy_output(self, params: dict):
         random policy.
 
         For each dimension d of the hyper-torus and component c of the mixture, the
-        output of the policy should return 
+        output of the policy should return
           1) the weight of the component in the mixture
           2) the location of the von Mises distribution to sample the angle increment
           3) the log concentration of the von Mises distribution to sample the angle
@@ -69,7 +69,11 @@ def get_policy_output(self, params: dict):
         policy_output[2::3] = params["vonmises_concentration"]
         return policy_output
 
-    def get_mask_invalid_actions_forward(self, state=None, done=None):
+    def get_mask_invalid_actions_forward(
+        self,
+        state: Optional[List] = None,
+        done: Optional[bool] = None,
+    ) -> List:
         """
         Returns [True] if the only possible action is eos, [False] otherwise.
         """
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 6ef8b0d3a..7b4741ae5 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -4,7 +4,7 @@
 import itertools
 import re
 from copy import deepcopy
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -45,11 +45,11 @@ def __init__(
         policy_encoding_dim_per_angle: int = None,
         do_nonzero_source_prob: bool = True,
         vonmises_min_concentration: float = 1e-3,
-        fixed_distribution: dict = {
+        fixed_distr_params: dict = {
             "vonmises_mean": 0.0,
             "vonmises_concentration": 0.5,
         },
-        random_distribution: dict = {
+        random_distr_params: dict = {
             "vonmises_mean": 0.0,
             "vonmises_concentration": 0.001,
         },
@@ -79,8 +79,8 @@ def __init__(
         self.statebatch2oracle = self.statebatch2proxy
         # Base class init
         super().__init__(
-            fixed_distribution=fixed_distribution,
-            random_distribution=random_distribution,
+            fixed_distr_params=fixed_distr_params,
+            random_distr_params=random_distr_params,
             **kwargs,
         )
 
@@ -131,7 +131,11 @@ def get_policy_output(self, params: dict):
         policy_output[2 :: self.n_params_per_dim] = params["vonmises_concentration"]
         return policy_output
 
-    def get_mask_invalid_actions_forward(self, state=None, done=None):
+    def get_mask_invalid_actions_forward(
+        self,
+        state: Optional[List] = None,
+        done: Optional[bool] = None,
+    ) -> List:
         """
         Returns a vector with the length of the discrete part of the action space:
         True if action is invalid going forward given the current state, False
@@ -168,10 +172,10 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
             mask = [False for _ in range(self.action_space_dim)]
             mask[-1] = True
         # Catch cases where it would not be possible to reach the initial state
-        noninit_states = [s for s, ss in zip(state[:-1], self.source_angles) if s != ss]
-        if len(noninit_states) > state[-1]:
+        noninit_dims = [s for s, ss in zip(state[:-1], self.source_angles) if s != ss]
+        if len(noninit_dims) > state[-1]:
             raise ValueError("This point in the code should never be reached!")
-        elif len(noninit_states) == state[-1] and len(noninit_states) >= state[-1] - 1:
+        elif len(noninit_dims) == state[-1] and len(noninit_dims) >= state[-1] - 1:
             mask = [
                 True if s == ss else m
                 for m, s, ss in zip(mask, state[:-1], self.source_angles)

From 9f9cd2d19be3e56ffbd673074fc7dabbc435f168 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 7 Apr 2023 19:30:44 -0400
Subject: [PATCH 010/206] wip cube classes

---
 gflownet/envs/cube.py | 627 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 571 insertions(+), 56 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 759f110c3..9b27bc77a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1,6 +1,7 @@
 """
 Classes to represent hyper-cube environments
 """
+from abc import ABC, abstractmethod
 import itertools
 from typing import List, Optional, Tuple
 
@@ -14,7 +15,7 @@
 from gflownet.envs.base import GFlowNetEnv
 
 
-class HybridCube(GFlowNetEnv):
+class Cube(GFlowNetEnv, ABC):
     """
     Continuous (hybrid: discrete and continuous) hyper-cube environment (continuous
     version of a hyper-grid) in which the action space consists of the increment of
@@ -31,12 +32,235 @@ class HybridCube(GFlowNetEnv):
     max_val : float
         Max length of the hyper-cube.
 
+    min_incr : float
+        Minimum increment in the actions, expressed as the fraction of max_val. This is
+        necessary to ensure coverage of the state space.
     """
 
     def __init__(
         self,
         n_dim: int = 2,
         max_val: float = 1.0,
+        min_incr: float = 0.1,
+        n_comp: int = 1,
+        fixed_distr_params: dict = {
+            "beta_alpha": 2.0,
+            "beta_beta": 5.0,
+        },
+        random_distr_params: dict = {
+            "beta_alpha": 1.0,
+            "beta_beta": 1.0,
+        },
+        **kwargs,
+    ):
+        assert n_dim > 0
+        assert max_val > 0.0
+        assert n_comp > 0
+        # Main properties
+        self.continuous = True
+        self.n_dim = n_dim
+        self.eos = self.n_dim
+        self.max_val = max_val
+        self.min_incr = min_incr * self.max_val
+        # Parameters of fixed policy distribution
+        self.n_comp = n_comp
+        # Source state: position 0 at all dimensions
+        self.source = [0.0 for _ in range(self.n_dim)]
+        # Action from source: (n_dim, 0)
+        self.action_source = (self.n_dim, 0)
+        # End-of-sequence action: (n_dim + 1, 0)
+        self.eos = (self.n_dim + 1, 0)
+        # Conversions: only conversions to policy are implemented and the rest are the
+        # same
+        self.state2proxy = self.state2policy
+        self.statebatch2proxy = self.statebatch2policy
+        self.statetorch2proxy = self.statetorch2policy
+        self.state2oracle = self.state2proxy
+        self.statebatch2oracle = self.statebatch2proxy
+        self.statetorch2oracle = self.statetorch2proxy
+        # Base class init
+        super().__init__(
+            fixed_distr_params=fixed_distr_params,
+            random_distr_params=random_distr_params,
+            **kwargs,
+        )
+
+    @abstractmethod
+    def get_action_space(self):
+        pass
+
+    @abstractmethod
+    def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
+        pass
+
+    @abstractmethod
+    def get_mask_invalid_actions_forward(
+        self,
+        state: Optional[List] = None,
+        done: Optional[bool] = None,
+    ) -> List:
+        pass
+
+    @abstractmethod
+    def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
+        pass
+
+    def statetorch2policy(
+        self, states: TensorType["batch", "state_dim"] = None
+    ) -> TensorType["batch", "policy_input_dim"]:
+        """
+        Clips the states into [0, max_val]
+
+        Args
+        ----
+        state : list
+            State
+        """
+        return torch.clip(states, min=0.0, max=self.max_val)
+
+    def statebatch2policy(self, states: List[List] = None) -> npt.NDArray[np.float32]:
+        """
+        Clips the states into [0, max_val]
+
+        Args
+        ----
+        state : list
+            State
+        """
+        return np.clip(np.array(states), a_min=0.0, a_max=self.max_val)
+
+    def state2policy(self, state: List = None) -> List:
+        """
+        Clips the state into [0, max_val]
+        """
+        if state is None:
+            state = self.state.copy()
+        return [min(max(0.0, s), self.max_val) for s in state]
+
+    def state2readable(self, state: List) -> str:
+        """
+        Converts a state (a list of positions) into a human-readable string
+        representing a state.
+        """
+        return str(state).replace("(", "[").replace(")", "]").replace(",", "")
+
+    def readable2state(self, readable: str) -> List:
+        """
+        Converts a human-readable string representing a state into a state as a list of
+        positions.
+        """
+        return [el for el in readable.strip("[]").split(" ")]
+
+    @abstractmethod
+    def get_parents(
+        self, state: List = None, done: bool = None, action: Tuple[int, float] = None
+    ) -> Tuple[List[List], List[Tuple[int, float]]]:
+        """
+        Determines all parents and actions that lead to state.
+
+        Args
+        ----
+        state : list
+            Representation of a state
+
+        done : bool
+            Whether the trajectory is done. If None, done is taken from instance.
+
+        action : int
+            Last action performed
+
+        Returns
+        -------
+        parents : list
+            List of parents in state format
+
+        actions : list
+            List of actions that lead to state for each parent in parents
+        """
+        pass
+
+    @abstractmethod
+    def sample_actions(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        sampling_method: str = "policy",
+        mask_invalid_actions: TensorType["n_states", "1"] = None,
+        temperature_logits: float = 1.0,
+        loginf: float = 1000,
+    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
+        """
+        Samples a batch of actions from a batch of policy outputs.
+        """
+        pass
+
+    def get_logprobs(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        is_forward: bool,
+        actions: TensorType["n_states", 2],
+        states_target: TensorType["n_states", "policy_input_dim"],
+        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
+        loginf: float = 1000,
+    ) -> TensorType["batch_size"]:
+        """
+        Computes log probabilities of actions given policy outputs and actions.
+        """
+        pass
+
+    def step(
+        self, action: Tuple[int, float]
+    ) -> Tuple[List[float], Tuple[int, float], bool]:
+        """
+        Executes step given an action.
+
+        Args
+        ----
+        action : tuple
+            Action to be executed. An action is a tuple with two values:
+            (dimension, increment).
+
+        Returns
+        -------
+        self.state : list
+            The sequence after executing the action
+
+        action : int
+            Action executed
+
+        valid : bool
+            False, if the action is not allowed for the current state, e.g. stop at the
+            root state
+        """
+        pass
+
+
+class HybridCube(Cube):
+    """
+    Continuous (hybrid: discrete and continuous) hyper-cube environment (continuous
+    version of a hyper-grid) in which the action space consists of the increment of
+    dimension d, modelled by a beta distribution.
+
+    The states space is the value of each dimension. If the value of a dimension gets
+    larger than max_val, then the trajectory is ended.
+
+    Attributes
+    ----------
+    n_dim : int
+        Dimensionality of the hyper-cube.
+
+    max_val : float
+        Max length of the hyper-cube.
+
+    min_incr : float
+        Minimum increment in the actions, expressed as the fraction of max_val. This is
+        necessary to ensure coverage of the state space.
+    """
+
+    def __init__(
+        self,
+        n_dim: int = 2,
+        max_val: float = 1.0,
+        min_incr: float = 0.1,
         n_comp: int = 1,
         do_nonzero_source_prob: bool = True,
         fixed_distr_params: dict = {
@@ -57,6 +281,7 @@ def __init__(
         self.n_dim = n_dim
         self.eos = self.n_dim
         self.max_val = max_val
+        self.min_incr = min_incr * self.max_val
         # Parameters of fixed policy distribution
         self.n_comp = n_comp
         if do_nonzero_source_prob:
@@ -65,8 +290,10 @@ def __init__(
             self.n_params_per_dim = 3
         # Source state: position 0 at all dimensions
         self.source = [0.0 for _ in range(self.n_dim)]
-        # End-of-sequence action: (n_dim, 0)
-        self.eos = (self.n_dim, 0)
+        # Action from source: (n_dim, 0)
+        self.action_source = (self.n_dim, 0)
+        # End-of-sequence action: (n_dim + 1, 0)
+        self.eos = (self.n_dim + 1, 0)
         # Conversions: only conversions to policy are implemented and the rest are the
         # same
         self.state2proxy = self.state2policy
@@ -91,10 +318,15 @@ def get_action_space(self):
         indicates the index of the dimension on which the action is to be performed and
         increment indicates the increment of the dimension.
 
+        Additionally, there are two special discrete actions:
+            - Sample an increment for all dimensions. Only valid from the source state.
+            - EOS action
+
         The (discrete) action space is then one tuple per dimension (with 0 increment),
         plus the EOS action.
         """
         actions = [(d, 0) for d in range(self.n_dim)]
+        actions.append(self.action_source)
         actions.append(self.eos)
         return actions
 
@@ -147,9 +379,13 @@ def get_mask_invalid_actions_forward(
             done = self.done
         if done:
             return [True for _ in range(self.action_space_dim)]
+        # If state is source, then next action can only be the action from source.
+        if all([s == ss for s in zip(self.state, self.source)]):
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-2] = False
         # If the value of any dimension is greater than max_val, then next action can
         # only be EOS.
-        if any([s > self.max_val for s in self.state]):
+        elif any([s > self.max_val for s in self.state]):
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
         else:
@@ -170,59 +406,329 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
         # If the value of any dimension is smaller than 0.0, then next action can
-        # only be EOS (return to source).
+        # return to source.
         if any([s < 0.0 for s in self.state]):
             mask = [True for _ in range(self.action_space_dim)]
-            mask[-1] = False
+            mask[-2] = False
         else:
             mask = [False for _ in range(self.action_space_dim)]
         return mask
 
-    def statetorch2policy(
-        self, states: TensorType["batch", "state_dim"] = None
-    ) -> TensorType["batch", "policy_input_dim"]:
+    def get_parents(
+        self, state: List = None, done: bool = None, action: Tuple[int, float] = None
+    ) -> Tuple[List[List], List[Tuple[int, float]]]:
         """
-        Clips the states into [0, max_val]
+        Determines all parents and actions that lead to state.
 
         Args
         ----
         state : list
-            State
+            Representation of a state
+
+        done : bool
+            Whether the trajectory is done. If None, done is taken from instance.
+
+        action : int
+            Last action performed
+
+        Returns
+        -------
+        parents : list
+            List of parents in state format
+
+        actions : list
+            List of actions that lead to state for each parent in parents
         """
-        return torch.clip(states, min=0.0, max=self.max_val)
+        if state is None:
+            state = self.state.copy()
+        if done is None:
+            done = self.done
+        if done:
+            return [state], [self.eos]
+        # If source state
+        elif state[-1] == 0:
+            return [], []
+        else:
+            dim, incr = action
+            state[dim] -= incr
+            parents = [state]
+            return parents, [action]
 
-    def statebatch2policy(self, states: List[List] = None) -> npt.NDArray[np.float32]:
+    def sample_actions(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        sampling_method: str = "policy",
+        mask_invalid_actions: TensorType["n_states", "1"] = None,
+        temperature_logits: float = 1.0,
+        loginf: float = 1000,
+    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
         """
-        Clips the states into [0, max_val]
+        Samples a batch of actions from a batch of policy outputs.
+        """
+        device = policy_outputs.device
+        n_states = policy_outputs.shape[0]
+        ns_range = torch.arange(n_states).to(device)
+        # Sample dimensions
+        if sampling_method == "uniform":
+            logits_dims = torch.ones(n_states, self.policy_output_dim).to(device)
+        elif sampling_method == "policy":
+            logits_dims = policy_outputs[:, 0 : self.n_dim + 1]
+            logits_dims /= temperature_logits
+        if mask_invalid_actions is not None:
+            logits_dims[mask_invalid_actions] = -loginf
+        dimensions = Categorical(logits=logits_dims).sample()
+        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
+        # Sample increments
+        ns_range_noeos = ns_range[dimensions != self.eos[0]]
+        dimensions_noeos = dimensions[dimensions != self.eos[0]]
+        increments = torch.zeros(n_states).to(device)
+        logprobs_increments = torch.zeros(n_states).to(device)
+        if len(dimensions_noeos) > 0:
+            if sampling_method == "uniform":
+                distr_increments = Uniform(
+                    torch.zeros(len(ns_range_noeos)),
+                    self.max_val * torch.ones(len(ns_range_noeos)),
+                )
+            elif sampling_method == "policy":
+                alphas = policy_outputs[:, self.n_dim + 2 :: 3][
+                    ns_range_noeos, dimensions_noeos
+                ]
+                betas = policy_outputs[:, self.n_dim + 3 :: 3][
+                    ns_range_noeos, dimensions_noeos
+                ]
+                distr_increments = Beta(torch.exp(alphas), torch.exp(betas))
+            increments[ns_range_noeos] = distr_increments.sample()
+            logprobs_increments[ns_range_noeos] = distr_increments.log_prob(
+                increments[ns_range_noeos]
+            )
+            # Apply minimum increment
+            increments[ns_range_noeos] = torch.min(
+                increments[ns_range_noeos],
+                self.min_incr * torch.ones(ns_range_noeos.shape[0]),
+            )
+        # Combined probabilities
+        logprobs = logprobs_dim + logprobs_increments
+        # Build actions
+        actions = [
+            (dimension, incr)
+            for dimension, incr in zip(dimensions.tolist(), increments.tolist())
+        ]
+        return actions, logprobs
+
+    def get_logprobs(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        is_forward: bool,
+        actions: TensorType["n_states", 2],
+        states_target: TensorType["n_states", "policy_input_dim"],
+        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
+        loginf: float = 1000,
+    ) -> TensorType["batch_size"]:
+        """
+        Computes log probabilities of actions given policy outputs and actions.
+        """
+        device = policy_outputs.device
+        dimensions, steps = zip(*actions)
+        dimensions = torch.LongTensor([d.long() for d in dimensions]).to(device)
+        steps = torch.FloatTensor(steps).to(device)
+        n_states = policy_outputs.shape[0]
+        ns_range = torch.arange(n_states).to(device)
+        # Dimensions
+        logits_dims = policy_outputs[:, 0::3]
+        if mask_invalid_actions is not None:
+            logits_dims[mask_invalid_actions] = -loginf
+        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
+        # Steps
+        ns_range_noeos = ns_range[dimensions != self.eos]
+        dimensions_noeos = dimensions[dimensions != self.eos]
+        logprobs_steps = torch.zeros(n_states).to(device)
+        if len(dimensions_noeos) > 0:
+            alphas = policy_outputs[:, 1::3][ns_range_noeos, dimensions_noeos]
+            betas = policy_outputs[:, 2::3][ns_range_noeos, dimensions_noeos]
+            distr_steps = Beta(torch.exp(alphas), torch.exp(betas))
+            logprobs_steps[ns_range_noeos] = distr_steps.log_prob(steps[ns_range_noeos])
+        # Combined probabilities
+        logprobs = logprobs_dim + logprobs_steps
+        return logprobs
+
+    def step(
+        self, action: Tuple[int, float]
+    ) -> Tuple[List[float], Tuple[int, float], bool]:
+        """
+        Executes step given an action.
 
         Args
         ----
-        state : list
-            State
+        action : tuple
+            Action to be executed. An action is a tuple with two values:
+            (dimension, increment).
+
+        Returns
+        -------
+        self.state : list
+            The sequence after executing the action
+
+        action : int
+            Action executed
+
+        valid : bool
+            False, if the action is not allowed for the current state, e.g. stop at the
+            root state
         """
-        return np.clip(np.array(states), a_min=0.0, a_max=self.max_val)
+        if self.done:
+            return self.state, action, False
+        # If action is eos or any dimension is beyond max_val or n_actions has reached
+        # max_traj_length, then force eos
+        elif (
+            action[0] == self.eos
+            or any([s > self.max_val for s in self.state])
+            or self.n_actions >= self.max_traj_length
+        ):
+            self.done = True
+            self.n_actions += 1
+            return self.state, (self.eos, 0.0), True
+        # If action is not eos, then perform action
+        elif action[0] != self.eos:
+            self.n_actions += 1
+            self.state[action[0]] += action[1]
+            return self.state, action, True
+        # Otherwise (unreachable?) it is invalid
+        else:
+            return self.state, action, False
 
-    def state2policy(self, state: List = None) -> List:
+    def get_grid_terminating_states(self, n_states: int) -> List[List]:
+        n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
+        linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
+        states = list(itertools.product(*linspaces))
+        # TODO: check if necessary
+        states = [list(el) for el in states]
+        return states
+
+
+class ContinuousCube(Cube):
+    """
+    Continuous hyper-cube environment (continuous
+    version of a hyper-grid) in which the action space consists of the increment of
+    each dimension d, modelled by a mixture of Beta distributions.
+
+    The states space is the value of each dimension. If the value of any dimension gets
+    larger than max_val, then the trajectory is ended.
+
+    Attributes
+    ----------
+    n_dim : int
+        Dimensionality of the hyper-cube.
+
+    max_val : float
+        Max length of the hyper-cube.
+
+    min_incr : float
+        Minimum increment in the actions, expressed as the fraction of max_val. This is
+        necessary to ensure coverage of the state space.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def get_action_space(self):
         """
-        Clips the state into [0, max_val]
+        The actions are tuples of length n_dim, where the value at position d indicates
+        the (positive) increment of dimension d.
+
+        Additionally, there are two special discrete actions:
+            - Action from the source state, with no minimum increment. Only valid from
+              the source state. Indicated by -1 for all dimensions.
+            - EOS action. Indicated by np.inf for all dimensions.
         """
-        if state is None:
-            state = self.state.copy()
-        return [min(max(0.0, s), self.max_val) for s in state]
+        generic_action = tuple([0.0 for _ in range(self.n_dim)])
+        self.action_source = tuple([-1.0 for _ in range(self.n_dim)])
+        self.eos = tuple([np.inf for _ in range(self.n_dim)])
+        actions = [generic_action, self.action_source, self.eos]
+        return actions
 
-    def state2readable(self, state: List) -> str:
+    def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
-        Converts a state (a list of positions) into a human-readable string
-        representing a state.
+        Defines the structure of the output of the policy model, from which an
+        action is to be determined or sampled, by returning a vector with a fixed
+        random policy.
+
+        For each dimension d of the hyper-cube and component c of the mixture, the
+        output of the policy should return
+          1) the weight of the component in the mixture
+          2) the log(alpha) parameter of the Beta distribution to sample the increment
+          3) the log(beta) parameter of the Beta distribution to sample the increment
+
+        Additionally, the policy output contains one logit per dimension plus one logit
+        for the EOS action, for the categorical distribution over dimensions.
+
+        Therefore, the output of the policy model has dimensionality D x C x 3 + D + 1,
+        where D is the number of dimensions (self.n_dim) and C is the number of
+        components (self.n_comp). The first D + 1 entries in the policy output
+        correspond to the categorical logits. Then, the next 3 x C entries in the
+        policy output correspond to the first dimension, and so on.
         """
-        return str(state).replace("(", "[").replace(")", "]").replace(",", "")
+        policy_output_fixed = torch.ones(
+            self.n_dim * self.n_comp * 3 + self.n_dim + 1,
+            device=self.device,
+            dtype=self.float,
+        )
+        policy_output_fixed[self.n_dim + 2 :: 3] = params["beta_alpha"]
+        policy_output_fixed[self.n_dim + 3 :: 3] = params["beta_beta"]
+        return policy_output_fixed
 
-    def readable2state(self, readable: str) -> List:
+    def get_mask_invalid_actions_forward(
+        self,
+        state: Optional[List] = None,
+        done: Optional[bool] = None,
+    ) -> List:
         """
-        Converts a human-readable string representing a state into a state as a list of
-        positions.
+        Returns a vector with the length of the discrete part of the action space + 1:
+        True if action is invalid going forward given the current state, False
+        otherwise.
+
+        All discrete actions are valid, including eos, except if the value of any
+        dimension has excedded max_val, in which case the only valid action is eos.
         """
-        return [el for el in readable.strip("[]").split(" ")]
+        if state is None:
+            state = self.state.copy()
+        if done is None:
+            done = self.done
+        if done:
+            return [True for _ in range(self.action_space_dim)]
+        # If state is source, then next action can only be the action from source.
+        if all([s == ss for s in zip(self.state, self.source)]):
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-2] = False
+        # If the value of any dimension is greater than max_val, then next action can
+        # only be EOS.
+        elif any([s > self.max_val for s in self.state]):
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-1] = False
+        else:
+            mask = [False for _ in range(self.action_space_dim)]
+        return mask
+
+    def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
+        """
+        Returns a vector with the length of the discrete part of the action space + 1:
+        True if action is invalid going backward given the current state, False
+        otherwise.
+        """
+        if state is None:
+            state = self.state.copy()
+        if done is None:
+            done = self.done
+        if done:
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-1] = False
+        # If the value of any dimension is smaller than 0.0, then next action can
+        # return to source.
+        if any([s < 0.0 for s in self.state]):
+            mask = [True for _ in range(self.action_space_dim)]
+            mask[-2] = False
+        else:
+            mask = [False for _ in range(self.action_space_dim)]
+        return mask
 
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None
@@ -254,9 +760,13 @@ def get_parents(
         if done is None:
             done = self.done
         if done:
-            return [state], [(self.eos, 0.0)]
+            return [state], [self.eos]
+        # If source state
+        elif state[-1] == 0:
+            return [], []
         else:
-            state[action[0]] -= action[1]
+            dim, incr = action
+            state[dim] -= incr
             parents = [state]
             return parents, [action]
 
@@ -264,9 +774,8 @@ def sample_actions(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         sampling_method: str = "policy",
-        mask_invalid_actions: TensorType["n_states", "policy_output_dim"] = None,
+        mask_invalid_actions: TensorType["n_states", "1"] = None,
         temperature_logits: float = 1.0,
-        random_action_prob=0.0,
         loginf: float = 1000,
     ) -> Tuple[List[Tuple], TensorType["n_states"]]:
         """
@@ -275,51 +784,57 @@ def sample_actions(
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
-        # Random actions
-        n_random = int(n_states * random_action_prob)
-        idx_random = torch.randint(high=n_states, size=(n_random,))
-        policy_outputs[idx_random, :] = torch.tensor(self.fixed_policy_output).to(
-            policy_outputs
-        )
         # Sample dimensions
         if sampling_method == "uniform":
-            logits_dims = torch.zeros(n_states, self.n_dim).to(device)
+            logits_dims = torch.ones(n_states, self.policy_output_dim).to(device)
         elif sampling_method == "policy":
-            logits_dims = policy_outputs[:, 0::3]
+            logits_dims = policy_outputs[:, 0 : self.n_dim + 1]
             logits_dims /= temperature_logits
         if mask_invalid_actions is not None:
             logits_dims[mask_invalid_actions] = -loginf
         dimensions = Categorical(logits=logits_dims).sample()
         logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
-        # Sample steps
-        ns_range_noeos = ns_range[dimensions != self.eos]
-        dimensions_noeos = dimensions[dimensions != self.eos]
-        steps = torch.zeros(n_states).to(device)
-        logprobs_steps = torch.zeros(n_states).to(device)
+        # Sample increments
+        ns_range_noeos = ns_range[dimensions != self.eos[0]]
+        dimensions_noeos = dimensions[dimensions != self.eos[0]]
+        increments = torch.zeros(n_states).to(device)
+        logprobs_increments = torch.zeros(n_states).to(device)
         if len(dimensions_noeos) > 0:
             if sampling_method == "uniform":
-                distr_steps = Uniform(
+                distr_increments = Uniform(
                     torch.zeros(len(ns_range_noeos)),
                     self.max_val * torch.ones(len(ns_range_noeos)),
                 )
             elif sampling_method == "policy":
-                alphas = policy_outputs[:, 1::3][ns_range_noeos, dimensions_noeos]
-                betas = policy_outputs[:, 2::3][ns_range_noeos, dimensions_noeos]
-                distr_steps = Beta(torch.exp(alphas), torch.exp(betas))
-            steps[ns_range_noeos] = distr_steps.sample()
-            logprobs_steps[ns_range_noeos] = distr_steps.log_prob(steps[ns_range_noeos])
+                alphas = policy_outputs[:, self.n_dim + 2 :: 3][
+                    ns_range_noeos, dimensions_noeos
+                ]
+                betas = policy_outputs[:, self.n_dim + 3 :: 3][
+                    ns_range_noeos, dimensions_noeos
+                ]
+                distr_increments = Beta(torch.exp(alphas), torch.exp(betas))
+            increments[ns_range_noeos] = distr_increments.sample()
+            logprobs_increments[ns_range_noeos] = distr_increments.log_prob(
+                increments[ns_range_noeos]
+            )
+            # Apply minimum increment
+            increments[ns_range_noeos] = torch.min(
+                increments[ns_range_noeos],
+                self.min_incr * torch.ones(ns_range_noeos.shape[0]),
+            )
         # Combined probabilities
-        logprobs = logprobs_dim + logprobs_steps
+        logprobs = logprobs_dim + logprobs_increments
         # Build actions
         actions = [
-            (dimension, step)
-            for dimension, step in zip(dimensions.tolist(), steps.tolist())
+            (dimension, incr)
+            for dimension, incr in zip(dimensions.tolist(), increments.tolist())
         ]
         return actions, logprobs
 
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
+        is_forward: bool,
         actions: TensorType["n_states", 2],
         states_target: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,

From c46fb38f281eaef3b640af6cafda74e41a07e3d1 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 8 Apr 2023 12:45:50 -0400
Subject: [PATCH 011/206] progress in ccube

---
 gflownet/envs/ctorus.py |   2 +-
 gflownet/envs/cube.py   | 180 +++++++++++++++++++---------------------
 2 files changed, 88 insertions(+), 94 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 663788f9a..458b1425f 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -46,7 +46,7 @@ def get_action_space(self):
         actions = [generic_action, self.eos]
         return actions
 
-    def get_policy_output(self, params: dict):
+    def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
         Defines the structure of the output of the policy model, from which an
         action is to be determined or sampled, by returning a vector with a fixed
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9b27bc77a..1c84a324a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -351,14 +351,14 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         correspond to the categorical logits. Then, the next 3 x C entries in the
         policy output correspond to the first dimension, and so on.
         """
-        policy_output_fixed = torch.ones(
+        policy_output = torch.ones(
             self.n_dim * self.n_comp * 3 + self.n_dim + 1,
             device=self.device,
             dtype=self.float,
         )
-        policy_output_fixed[self.n_dim + 2 :: 3] = params["beta_alpha"]
-        policy_output_fixed[self.n_dim + 3 :: 3] = params["beta_beta"]
-        return policy_output_fixed
+        policy_output[self.n_dim + 2 :: 3] = params["beta_alpha"]
+        policy_output[self.n_dim + 3 :: 3] = params["beta_beta"]
+        return policy_output
 
     def get_mask_invalid_actions_forward(
         self,
@@ -658,23 +658,22 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
           2) the log(alpha) parameter of the Beta distribution to sample the increment
           3) the log(beta) parameter of the Beta distribution to sample the increment
 
-        Additionally, the policy output contains one logit per dimension plus one logit
-        for the EOS action, for the categorical distribution over dimensions.
+        Additionally, the policy output contains one logit of a Bernoulli distribution
+        to model the (discrete) forward probability of selecting the EOS action and the
+        (discrete) backward probability of returning to the source node.
 
-        Therefore, the output of the policy model has dimensionality D x C x 3 + D + 1,
+        Therefore, the output of the policy model has dimensionality D x C x 3 + 1,
         where D is the number of dimensions (self.n_dim) and C is the number of
-        components (self.n_comp). The first D + 1 entries in the policy output
-        correspond to the categorical logits. Then, the next 3 x C entries in the
-        policy output correspond to the first dimension, and so on.
+        components (self.n_comp).
         """
-        policy_output_fixed = torch.ones(
-            self.n_dim * self.n_comp * 3 + self.n_dim + 1,
+        policy_output = torch.ones(
+            self.n_dim * self.n_comp * 3 + 1,
             device=self.device,
             dtype=self.float,
         )
-        policy_output_fixed[self.n_dim + 2 :: 3] = params["beta_alpha"]
-        policy_output_fixed[self.n_dim + 3 :: 3] = params["beta_beta"]
-        return policy_output_fixed
+        policy_output[1::3] = params["beta_alpha"]
+        policy_output[2::3] = params["beta_beta"]
+        return policy_output
 
     def get_mask_invalid_actions_forward(
         self,
@@ -682,12 +681,13 @@ def get_mask_invalid_actions_forward(
         done: Optional[bool] = None,
     ) -> List:
         """
-        Returns a vector with the length of the discrete part of the action space + 1:
+        Returns a vector with the length of the discrete part of the action space:
         True if action is invalid going forward given the current state, False
         otherwise.
 
-        All discrete actions are valid, including eos, except if the value of any
-        dimension has excedded max_val, in which case the only valid action is eos.
+        If the state is the source state, the only valid action is action_source. EOS
+        is valid valid from any state (including the source state) and EOS is the only
+        possible action if the value of any dimension has excedded max_val.
         """
         if state is None:
             state = self.state.copy()
@@ -695,24 +695,30 @@ def get_mask_invalid_actions_forward(
             done = self.done
         if done:
             return [True for _ in range(self.action_space_dim)]
-        # If state is source, then next action can only be the action from source.
-        if all([s == ss for s in zip(self.state, self.source)]):
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-2] = False
+        # If state is source, the generic action is not valid.
+        if all([s == ss for s, ss in zip(state, self.source)]):
+            mask = [False for _ in range(self.action_space_dim)]
+            mask[0] = True
         # If the value of any dimension is greater than max_val, then next action can
         # only be EOS.
-        elif any([s > self.max_val for s in self.state]):
+        elif any([s > self.max_val for s in state]):
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
+        # Otherwise, only the action_source is not valid
         else:
             mask = [False for _ in range(self.action_space_dim)]
+            mask[-2] = True
         return mask
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
-        Returns a vector with the length of the discrete part of the action space + 1:
+        Returns a vector with the length of the discrete part of the action space:
         True if action is invalid going backward given the current state, False
         otherwise.
+
+        The EOS action (returning to the source state for backward actions) is valid
+        from any state. The source action is ignored (invalid) for backward actions. If
+        any dimension is smaller than 0, then the only valid action is EOS.
         """
         if state is None:
             state = self.state.copy()
@@ -721,13 +727,15 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         if done:
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
-        # If the value of any dimension is smaller than 0.0, then next action can
-        # return to source.
-        if any([s < 0.0 for s in self.state]):
+        # If the value of any dimension is smaller than 0.0, then next action can only
+        # be return to source (EOS)
+        if any([s < 0.0 for s in state]):
             mask = [True for _ in range(self.action_space_dim)]
-            mask[-2] = False
+            mask[-1] = False
         else:
             mask = [False for _ in range(self.action_space_dim)]
+        # action_source is ignored going backwards, thus always invalid.
+        mask[-2] = True
         return mask
 
     def get_parents(
@@ -762,11 +770,11 @@ def get_parents(
         if done:
             return [state], [self.eos]
         # If source state
-        elif state[-1] == 0:
+        if all([s == ss for s, ss in zip(state, self.source)]):
             return [], []
         else:
-            dim, incr = action
-            state[dim] -= incr
+            for dim, incr in enumerate(action):
+                state[dim] -= incr
             parents = [state]
             return parents, [action]
 
@@ -782,53 +790,47 @@ def sample_actions(
         Samples a batch of actions from a batch of policy outputs.
         """
         device = policy_outputs.device
+        import ipdb
+
+        ipdb.set_trace()
+        mask_states_sample = ~mask_invalid_actions.flatten()
         n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        # Sample dimensions
-        if sampling_method == "uniform":
-            logits_dims = torch.ones(n_states, self.policy_output_dim).to(device)
-        elif sampling_method == "policy":
-            logits_dims = policy_outputs[:, 0 : self.n_dim + 1]
-            logits_dims /= temperature_logits
-        if mask_invalid_actions is not None:
-            logits_dims[mask_invalid_actions] = -loginf
-        dimensions = Categorical(logits=logits_dims).sample()
-        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
-        # Sample increments
-        ns_range_noeos = ns_range[dimensions != self.eos[0]]
-        dimensions_noeos = dimensions[dimensions != self.eos[0]]
-        increments = torch.zeros(n_states).to(device)
-        logprobs_increments = torch.zeros(n_states).to(device)
-        if len(dimensions_noeos) > 0:
+        # Sample angle increments
+        angles = torch.zeros(n_states, self.n_dim).to(device)
+        logprobs = torch.zeros(n_states, self.n_dim).to(device)
+        if torch.any(mask_states_sample):
             if sampling_method == "uniform":
-                distr_increments = Uniform(
+                distr_angles = Uniform(
                     torch.zeros(len(ns_range_noeos)),
-                    self.max_val * torch.ones(len(ns_range_noeos)),
+                    2 * torch.pi * torch.ones(len(ns_range_noeos)),
                 )
             elif sampling_method == "policy":
-                alphas = policy_outputs[:, self.n_dim + 2 :: 3][
-                    ns_range_noeos, dimensions_noeos
-                ]
-                betas = policy_outputs[:, self.n_dim + 3 :: 3][
-                    ns_range_noeos, dimensions_noeos
-                ]
-                distr_increments = Beta(torch.exp(alphas), torch.exp(betas))
-            increments[ns_range_noeos] = distr_increments.sample()
-            logprobs_increments[ns_range_noeos] = distr_increments.log_prob(
-                increments[ns_range_noeos]
-            )
-            # Apply minimum increment
-            increments[ns_range_noeos] = torch.min(
-                increments[ns_range_noeos],
-                self.min_incr * torch.ones(ns_range_noeos.shape[0]),
+                mix_logits = policy_outputs[mask_states_sample, 0::3].reshape(
+                    -1, self.n_dim, self.n_comp
+                )
+                mix = Categorical(logits=mix_logits)
+                locations = policy_outputs[mask_states_sample, 1::3].reshape(
+                    -1, self.n_dim, self.n_comp
+                )
+                concentrations = policy_outputs[mask_states_sample, 2::3].reshape(
+                    -1, self.n_dim, self.n_comp
+                )
+                vonmises = VonMises(
+                    locations,
+                    torch.exp(concentrations) + self.vonmises_min_concentration,
+                )
+                distr_angles = MixtureSameFamily(mix, vonmises)
+            angles[mask_states_sample] = distr_angles.sample()
+            logprobs[mask_states_sample] = distr_angles.log_prob(
+                angles[mask_states_sample]
             )
-        # Combined probabilities
-        logprobs = logprobs_dim + logprobs_increments
+        logprobs = torch.sum(logprobs, axis=1)
         # Build actions
-        actions = [
-            (dimension, incr)
-            for dimension, incr in zip(dimensions.tolist(), increments.tolist())
-        ]
+        actions_tensor = torch.inf * torch.ones(
+            angles.shape, dtype=self.float, device=device
+        )
+        actions_tensor[mask_states_sample, :] = angles[mask_states_sample]
+        actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, logprobs
 
     def get_logprobs(
@@ -893,29 +895,21 @@ def step(
         """
         if self.done:
             return self.state, action, False
-        # If action is eos or any dimension is beyond max_val or n_actions has reached
-        # max_traj_length, then force eos
-        elif (
-            action[0] == self.eos
-            or any([s > self.max_val for s in self.state])
-            or self.n_actions >= self.max_traj_length
-        ):
+        # If action is eos or any dimension is beyond max_val, then force eos
+        elif action == self.eos or any([s > self.max_val for s in self.state]):
             self.done = True
-            self.n_actions += 1
-            return self.state, (self.eos, 0.0), True
+            return self.state, self.eos, True
         # If action is not eos, then perform action
-        elif action[0] != self.eos:
-            self.n_actions += 1
-            self.state[action[0]] += action[1]
-            return self.state, action, True
-        # Otherwise (unreachable?) it is invalid
         else:
-            return self.state, action, False
+            for dim, incr in enumerate(action):
+                self.state[dim] += incr
+            return self.state, action, True
 
-    def get_grid_terminating_states(self, n_states: int) -> List[List]:
-        n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
-        linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
-        states = list(itertools.product(*linspaces))
-        # TODO: check if necessary
-        states = [list(el) for el in states]
-        return states
+
+#     def get_grid_terminating_states(self, n_states: int) -> List[List]:
+#         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
+#         linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
+#         states = list(itertools.product(*linspaces))
+#         # TODO: check if necessary
+#         states = [list(el) for el in states]
+#         return states

From ddf2350b7fa25ada2e4d1080127afcfa6fc02e6e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 10 Apr 2023 19:53:25 -0400
Subject: [PATCH 012/206] wip: progress in sample_actions and get_logprobs -
 unfinsihed

---
 gflownet/envs/cube.py | 114 +++++++++++++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 30 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 1c84a324a..05fe8b843 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -9,7 +9,7 @@
 import numpy.typing as npt
 import pandas as pd
 import torch
-from torch.distributions import Beta, Categorical, Uniform
+from torch.distributions import Bernoulli, Beta, Categorical, Uniform,  MixtureSameFamily
 from torchtyping import TensorType
 
 from gflownet.envs.base import GFlowNetEnv
@@ -671,8 +671,8 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
             device=self.device,
             dtype=self.float,
         )
-        policy_output[1::3] = params["beta_alpha"]
-        policy_output[2::3] = params["beta_beta"]
+        policy_output[1:-1:3] = params["beta_alpha"]
+        policy_output[2:-1:3] = params["beta_beta"]
         return policy_output
 
     def get_mask_invalid_actions_forward(
@@ -790,54 +790,58 @@ def sample_actions(
         Samples a batch of actions from a batch of policy outputs.
         """
         device = policy_outputs.device
-        import ipdb
-
-        ipdb.set_trace()
-        mask_states_sample = ~mask_invalid_actions.flatten()
         n_states = policy_outputs.shape[0]
+        ns_range = torch.arange(n_states).to(device)
+        # EOS
+        mask_eos = torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
+        distr_eos = Bernoulli(logits=policy_outputs[:, -1])
+        states_eos = distr_eos.sample().to(torch.bool)
+        mask_eos[states_eos] = True
+        mask_sample = ~mask_eos
         # Sample angle increments
-        angles = torch.zeros(n_states, self.n_dim).to(device)
-        logprobs = torch.zeros(n_states, self.n_dim).to(device)
-        if torch.any(mask_states_sample):
+        ns_range_sample = ns_range[mask_sample]
+        n_states_sample = len(ns_range_sample)
+        increments = torch.inf * torch.ones((n_states, self.n_dim), device=device, dtype=self.float)
+        logprobs = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
+        if torch.any(mask_sample):
             if sampling_method == "uniform":
-                distr_angles = Uniform(
-                    torch.zeros(len(ns_range_noeos)),
-                    2 * torch.pi * torch.ones(len(ns_range_noeos)),
+                distr_increments = Uniform(
+                    torch.zeros(len(ns_range_sample)),
+                    torch.ones(len(ns_range_sample)),
                 )
             elif sampling_method == "policy":
-                mix_logits = policy_outputs[mask_states_sample, 0::3].reshape(
+                mix_logits = policy_outputs[mask_sample, 0:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 mix = Categorical(logits=mix_logits)
-                locations = policy_outputs[mask_states_sample, 1::3].reshape(
+                alphas = policy_outputs[mask_sample, 1:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
-                concentrations = policy_outputs[mask_states_sample, 2::3].reshape(
+                betas = policy_outputs[mask_sample, 2:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
-                vonmises = VonMises(
-                    locations,
-                    torch.exp(concentrations) + self.vonmises_min_concentration,
-                )
-                distr_angles = MixtureSameFamily(mix, vonmises)
-            angles[mask_states_sample] = distr_angles.sample()
-            logprobs[mask_states_sample] = distr_angles.log_prob(
-                angles[mask_states_sample]
+                beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
+                distr_increments = MixtureSameFamily(mix, beta_distr)
+            increments[mask_sample] = distr_increments.sample()
+            logprobs[mask_sample] = distr_increments.log_prob(increments[mask_sample])
+            # Apply minimum increment to generic (not from source) actions
+            # TODO: before or after computing logprob?
+            mask_action_generic = ~mask_invalid_actions[:, 0]
+            increments[mask_action_generic] = torch.max(
+                increments[mask_action_generic],
+                self.min_incr * torch.ones(increments[mask_action_generic].shape, device=device),
             )
+        # TODO: Consider Bernoulli logprobs.
         logprobs = torch.sum(logprobs, axis=1)
         # Build actions
-        actions_tensor = torch.inf * torch.ones(
-            angles.shape, dtype=self.float, device=device
-        )
-        actions_tensor[mask_states_sample, :] = angles[mask_states_sample]
-        actions = [tuple(a.tolist()) for a in actions_tensor]
+        actions = [tuple(a.tolist()) for a in increments]
         return actions, logprobs
 
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
-        actions: TensorType["n_states", 2],
+        actions: TensorType["n_states", "n_dim"],
         states_target: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
@@ -845,6 +849,56 @@ def get_logprobs(
         """
         Computes log probabilities of actions given policy outputs and actions.
         """
+        device = policy_outputs.device
+        n_states = policy_outputs.shape[0]
+        ns_range = torch.arange(n_states).to(device)
+        # EOS actions
+        mask_actions_eos = torch.all(actions == torch.inf, axis=1)
+        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
+        distr_eos = Bernoulli(logits=policy_outputs[:, -1])
+        logprobs_eos = distr_eos.log_prob(mask_actions_eos.to(self.float))
+        mask_force_eos = torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
+        logprobs_eos[mask_force_eos] = -loginf
+        import ipdb; ipdb.set_trace()
+        # Increments
+        mask_sample = torch.logical_and(~mask_actions_eos, ~mask_force_eos)
+        ns_range_sample = ns_range[mask_sample]
+        n_states_sample = len(ns_range_sample)
+        increments = torch.inf * torch.ones((n_states, self.n_dim), device=device, dtype=self.float)
+        logprobs = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
+        if torch.any(mask_sample):
+            increments = actions[mask_sample, :]
+            mix_logits = policy_outputs[mask_sample, 0:-1:3].reshape(
+                -1, self.n_dim, self.n_comp
+            )
+            mix = Categorical(logits=mix_logits)
+            alphas = policy_outputs[mask_sample, 1:-1:3].reshape(
+                -1, self.n_dim, self.n_comp
+            )
+            betas = policy_outputs[mask_sample, 2:-1:3].reshape(
+                -1, self.n_dim, self.n_comp
+            )
+            beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
+            distr_increments = MixtureSameFamily(mix, beta_distr)
+            # TODO: what to do with the minimum increments, since the logprob will not
+            # reflect the tru probability of sampling that increment.
+            logprobs_sample = distr_increments.log_prob(increments)
+            increments[mask_sample] = distr_increments.sample()
+            logprobs[mask_sample] = distr_increments.log_prob(increments[mask_sample])
+            # Apply minimum increment to generic (not from source) actions
+            # TODO: before or after computing logprob?
+            mask_action_generic = ~mask_invalid_actions[:, 0]
+            increments[mask_action_generic] = torch.max(
+                increments[mask_action_generic],
+                self.min_incr * torch.ones(increments[mask_action_generic].shape, device=device),
+            )
+        logprobs = torch.sum(logprobs, axis=1)
+        # Build actions
+        actions = [tuple(a.tolist()) for a in increments]
+        import ipdb; ipdb.set_trace()
+        return actions, logprobs
+
+
         device = policy_outputs.device
         dimensions, steps = zip(*actions)
         dimensions = torch.LongTensor([d.long() for d in dimensions]).to(device)

From 5334590d0cc03a12382bd9a2b278094190af8703 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 12:17:20 -0400
Subject: [PATCH 013/206] minor progress

---
 gflownet/envs/cube.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 05fe8b843..2e7c9d1a7 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -858,14 +858,14 @@ def get_logprobs(
         distr_eos = Bernoulli(logits=policy_outputs[:, -1])
         logprobs_eos = distr_eos.log_prob(mask_actions_eos.to(self.float))
         mask_force_eos = torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
-        logprobs_eos[mask_force_eos] = -loginf
+        logprobs_eos[mask_force_eos] = 0.0
         import ipdb; ipdb.set_trace()
         # Increments
         mask_sample = torch.logical_and(~mask_actions_eos, ~mask_force_eos)
         ns_range_sample = ns_range[mask_sample]
         n_states_sample = len(ns_range_sample)
         increments = torch.inf * torch.ones((n_states, self.n_dim), device=device, dtype=self.float)
-        logprobs = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
+        logprobs_sample = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
         if torch.any(mask_sample):
             increments = actions[mask_sample, :]
             mix_logits = policy_outputs[mask_sample, 0:-1:3].reshape(
@@ -884,7 +884,7 @@ def get_logprobs(
             # reflect the tru probability of sampling that increment.
             logprobs_sample = distr_increments.log_prob(increments)
             increments[mask_sample] = distr_increments.sample()
-            logprobs[mask_sample] = distr_increments.log_prob(increments[mask_sample])
+            logprobs_sample[mask_sample] = distr_increments.log_prob(increments[mask_sample])
             # Apply minimum increment to generic (not from source) actions
             # TODO: before or after computing logprob?
             mask_action_generic = ~mask_invalid_actions[:, 0]

From 7ff17a65bc089982a40c3553d61263dbce084798 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 23:41:36 -0400
Subject: [PATCH 014/206] add missing import

---
 gflownet/envs/ctorus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 458b1425f..06faacd09 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -2,7 +2,7 @@
 Classes to represent hyper-torus environments
 """
 import itertools
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import numpy.typing as npt

From 0a0c9209c305e95c60dd85cd57c2e53a870b76f5 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 23:42:36 -0400
Subject: [PATCH 015/206] finish first version of cube implementation

---
 gflownet/envs/cube.py | 131 +++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 79 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2e7c9d1a7..6f2b6c57b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1,15 +1,15 @@
 """
 Classes to represent hyper-cube environments
 """
-from abc import ABC, abstractmethod
 import itertools
+from abc import ABC, abstractmethod
 from typing import List, Optional, Tuple
 
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import torch
-from torch.distributions import Bernoulli, Beta, Categorical, Uniform,  MixtureSameFamily
+from torch.distributions import Bernoulli, Beta, Categorical, MixtureSameFamily, Uniform
 from torchtyping import TensorType
 
 from gflownet.envs.base import GFlowNetEnv
@@ -793,46 +793,53 @@ def sample_actions(
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
         # EOS
-        mask_eos = torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
-        distr_eos = Bernoulli(logits=policy_outputs[:, -1])
-        states_eos = distr_eos.sample().to(torch.bool)
-        mask_eos[states_eos] = True
-        mask_sample = ~mask_eos
+        idx_nofix = ns_range[
+            ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
+        ]
+        distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
+        mask_sampled_eos = distr_eos.sample().to(torch.bool)
+        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
+        logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Sample angle increments
-        ns_range_sample = ns_range[mask_sample]
-        n_states_sample = len(ns_range_sample)
-        increments = torch.inf * torch.ones((n_states, self.n_dim), device=device, dtype=self.float)
-        logprobs = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
-        if torch.any(mask_sample):
+        idx_sample = idx_nofix[~mask_sampled_eos]
+        n_sample = idx_sample.shape[0]
+        logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
+        increments = torch.inf * torch.ones(
+            (n_states, self.n_dim), device=device, dtype=self.float
+        )
+        if torch.any(idx_sample):
             if sampling_method == "uniform":
                 distr_increments = Uniform(
-                    torch.zeros(len(ns_range_sample)),
-                    torch.ones(len(ns_range_sample)),
+                    torch.zeros(n_sample),
+                    torch.ones(n_sample),
                 )
             elif sampling_method == "policy":
-                mix_logits = policy_outputs[mask_sample, 0:-1:3].reshape(
+                mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 mix = Categorical(logits=mix_logits)
-                alphas = policy_outputs[mask_sample, 1:-1:3].reshape(
+                alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
-                betas = policy_outputs[mask_sample, 2:-1:3].reshape(
+                betas = policy_outputs[idx_sample, 2:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
                 distr_increments = MixtureSameFamily(mix, beta_distr)
-            increments[mask_sample] = distr_increments.sample()
-            logprobs[mask_sample] = distr_increments.log_prob(increments[mask_sample])
+            increments[idx_sample, :] = distr_increments.sample()
+            logprobs_sample[idx_sample] = distr_increments.log_prob(
+                increments[idx_sample, :]
+            ).sum(axis=1)
             # Apply minimum increment to generic (not from source) actions
             # TODO: before or after computing logprob?
             mask_action_generic = ~mask_invalid_actions[:, 0]
             increments[mask_action_generic] = torch.max(
                 increments[mask_action_generic],
-                self.min_incr * torch.ones(increments[mask_action_generic].shape, device=device),
+                self.min_incr
+                * torch.ones(increments[mask_action_generic].shape, device=device),
             )
-        # TODO: Consider Bernoulli logprobs.
-        logprobs = torch.sum(logprobs, axis=1)
+        # Combined probabilities
+        logprobs = logprobs_eos + logprobs_sample
         # Build actions
         actions = [tuple(a.tolist()) for a in increments]
         return actions, logprobs
@@ -848,79 +855,45 @@ def get_logprobs(
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
+
+        At every state, the EOS action can be sampled with probability p(EOS).
+        Otherwise, an increment incr is sampled with probablility
+        p(incr) * (1 - p(EOS)).
         """
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
-        # EOS actions
-        mask_actions_eos = torch.all(actions == torch.inf, axis=1)
+        # Log probs of EOS actions
+        idx_nofix = ns_range[
+            ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
+        ]
+        distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
+        mask_sampled_eos = torch.all(actions == torch.inf, axis=1)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
-        distr_eos = Bernoulli(logits=policy_outputs[:, -1])
-        logprobs_eos = distr_eos.log_prob(mask_actions_eos.to(self.float))
-        mask_force_eos = torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
-        logprobs_eos[mask_force_eos] = 0.0
-        import ipdb; ipdb.set_trace()
-        # Increments
-        mask_sample = torch.logical_and(~mask_actions_eos, ~mask_force_eos)
-        ns_range_sample = ns_range[mask_sample]
-        n_states_sample = len(ns_range_sample)
-        increments = torch.inf * torch.ones((n_states, self.n_dim), device=device, dtype=self.float)
-        logprobs_sample = torch.zeros((n_states, self.n_dim), device=device, dtype=self.float)
-        if torch.any(mask_sample):
-            increments = actions[mask_sample, :]
-            mix_logits = policy_outputs[mask_sample, 0:-1:3].reshape(
+        logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
+        # Log probs of sampled increments
+        idx_sample = idx_nofix[~mask_sampled_eos]
+        logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
+        if torch.any(idx_sample):
+            mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             mix = Categorical(logits=mix_logits)
-            alphas = policy_outputs[mask_sample, 1:-1:3].reshape(
+            alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
-            betas = policy_outputs[mask_sample, 2:-1:3].reshape(
+            betas = policy_outputs[idx_sample, 2:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
             distr_increments = MixtureSameFamily(mix, beta_distr)
             # TODO: what to do with the minimum increments, since the logprob will not
-            # reflect the tru probability of sampling that increment.
-            logprobs_sample = distr_increments.log_prob(increments)
-            increments[mask_sample] = distr_increments.sample()
-            logprobs_sample[mask_sample] = distr_increments.log_prob(increments[mask_sample])
-            # Apply minimum increment to generic (not from source) actions
-            # TODO: before or after computing logprob?
-            mask_action_generic = ~mask_invalid_actions[:, 0]
-            increments[mask_action_generic] = torch.max(
-                increments[mask_action_generic],
-                self.min_incr * torch.ones(increments[mask_action_generic].shape, device=device),
-            )
-        logprobs = torch.sum(logprobs, axis=1)
-        # Build actions
-        actions = [tuple(a.tolist()) for a in increments]
-        import ipdb; ipdb.set_trace()
-        return actions, logprobs
-
-
-        device = policy_outputs.device
-        dimensions, steps = zip(*actions)
-        dimensions = torch.LongTensor([d.long() for d in dimensions]).to(device)
-        steps = torch.FloatTensor(steps).to(device)
-        n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        # Dimensions
-        logits_dims = policy_outputs[:, 0::3]
-        if mask_invalid_actions is not None:
-            logits_dims[mask_invalid_actions] = -loginf
-        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
-        # Steps
-        ns_range_noeos = ns_range[dimensions != self.eos]
-        dimensions_noeos = dimensions[dimensions != self.eos]
-        logprobs_steps = torch.zeros(n_states).to(device)
-        if len(dimensions_noeos) > 0:
-            alphas = policy_outputs[:, 1::3][ns_range_noeos, dimensions_noeos]
-            betas = policy_outputs[:, 2::3][ns_range_noeos, dimensions_noeos]
-            distr_steps = Beta(torch.exp(alphas), torch.exp(betas))
-            logprobs_steps[ns_range_noeos] = distr_steps.log_prob(steps[ns_range_noeos])
+            # reflect the true probability of sampling that increment.
+            logprobs_sample[idx_sample] = distr_increments.log_prob(
+                actions[idx_sample, :]
+            ).sum(axis=1)
         # Combined probabilities
-        logprobs = logprobs_dim + logprobs_steps
+        logprobs = logprobs_eos + logprobs_sample
         return logprobs
 
     def step(

From ae9e3e13989b553e9213f593f838c02982e9fc06 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 23:47:01 -0400
Subject: [PATCH 016/206] do not convert to tensor if already tensor

---
 gflownet/gflownet.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 3effa8c67..804921822 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -987,12 +987,14 @@ def __init__(self, config, env, device, float_precision, base=None):
         self.float = float_precision
         # Input and output dimensions
         self.state_dim = env.policy_input_dim
-        self.fixed_output = torch.tensor(env.fixed_policy_output).to(
-            dtype=self.float, device=self.device
-        )
-        self.random_output = torch.tensor(env.random_policy_output).to(
-            dtype=self.float, device=self.device
-        )
+        if not torch.is_tensor(env.fixed_policy_output):
+            self.fixed_output = torch.tensor(env.fixed_policy_output).to(
+                dtype=self.float, device=self.device
+            )
+        if not torch.is_tensor(env.random_policy_output):
+            self.random_output = torch.tensor(env.random_policy_output).to(
+                dtype=self.float, device=self.device
+            )
         self.output_dim = len(self.fixed_output)
         if "shared_weights" in config:
             self.shared_weights = config.shared_weights

From 784063ab28bacce5bbad5c7cef6a5578d5543f84 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 23:50:32 -0400
Subject: [PATCH 017/206] statebatch2polocy returns tensor

---
 gflownet/envs/cube.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 6f2b6c57b..2f3468851 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -118,7 +118,9 @@ def statetorch2policy(
         """
         return torch.clip(states, min=0.0, max=self.max_val)
 
-    def statebatch2policy(self, states: List[List] = None) -> npt.NDArray[np.float32]:
+    def statebatch2policy(
+        self, states: List[List]
+    ) -> TensorType["batch", "state_proxy_dim"]:
         """
         Clips the states into [0, max_val]
 
@@ -127,7 +129,7 @@ def statebatch2policy(self, states: List[List] = None) -> npt.NDArray[np.float32
         state : list
             State
         """
-        return np.clip(np.array(states), a_min=0.0, a_max=self.max_val)
+        return self.statetorch2policy(torch.tensor(states, device=self.device))
 
     def state2policy(self, state: List = None) -> List:
         """

From f7c117849d45a02173d767253e2441ecd1aad974 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 11 Apr 2023 23:52:42 -0400
Subject: [PATCH 018/206] fix

---
 gflownet/gflownet.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 804921822..f1f069713 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -991,10 +991,14 @@ def __init__(self, config, env, device, float_precision, base=None):
             self.fixed_output = torch.tensor(env.fixed_policy_output).to(
                 dtype=self.float, device=self.device
             )
+        else:
+            self.fixed_output = env.fixed_policy_output
         if not torch.is_tensor(env.random_policy_output):
             self.random_output = torch.tensor(env.random_policy_output).to(
                 dtype=self.float, device=self.device
             )
+        else:
+            self.random_output = env.random_policy_output
         self.output_dim = len(self.fixed_output)
         if "shared_weights" in config:
             self.shared_weights = config.shared_weights

From 9e6dc4ce82947a4be214070aed6fb685caccc088 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 12 Apr 2023 00:01:11 -0400
Subject: [PATCH 019/206] handle torch policy_output in tests

---
 tests/gflownet/envs/common.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index ea456506c..9692902c3 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -216,9 +216,11 @@ def test__step__returns_same_state_action_and_invalid_if_done(env):
     mask_invalid = torch.unsqueeze(
         torch.BoolTensor(env.get_mask_invalid_actions_forward()), 0
     )
-    random_policy = torch.unsqueeze(
-        torch.tensor(env.random_policy_output, dtype=env.float), 0
-    )
+    if not torch.is_tensor(env.random_policy_output):
+        random_policy = torch.tensor(env.random_policy_output, dtype=env.float)
+    else:
+        random_policy = env.random_policy_output
+    random_policy = torch.unsqueeze(random_policy, 0)
     actions, _ = env.sample_actions(
         policy_outputs=random_policy, mask_invalid_actions=mask_invalid
     )

From 85014af35ed81e9997153aca6be9170443d8945e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 12 Apr 2023 10:36:59 -0400
Subject: [PATCH 020/206] uncomment terminating states method

---
 gflownet/envs/cube.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2f3468851..81d7d6278 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -935,10 +935,10 @@ def step(
             return self.state, action, True
 
 
-#     def get_grid_terminating_states(self, n_states: int) -> List[List]:
-#         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
-#         linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
-#         states = list(itertools.product(*linspaces))
-#         # TODO: check if necessary
-#         states = [list(el) for el in states]
-#         return states
+    def get_grid_terminating_states(self, n_states: int) -> List[List]:
+        n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
+        linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
+        states = list(itertools.product(*linspaces))
+        # TODO: check if necessary
+        states = [list(el) for el in states]
+        return states

From b21846d384315a94a378e144b5417b0993c786d5 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 12 Apr 2023 15:23:16 -0400
Subject: [PATCH 021/206] copy paste kde and plot methods from htorus

---
 gflownet/envs/cube.py | 139 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 81d7d6278..388021aa1 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -942,3 +942,142 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
         # TODO: check if necessary
         states = [list(el) for el in states]
         return states
+
+    # TODO: make generic for all environments
+    def sample_from_reward(
+        self, n_samples: int, epsilon=1e-4
+    ) -> TensorType["n_samples", "state_dim"]:
+        """
+        Rejection sampling  with proposal the uniform distribution in 
+        [0, max_val]]^n_dim.
+
+        Returns a tensor in GFloNet (state) format.
+        """
+        samples_final = []
+        max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
+        while len(samples_final) < n_samples:
+            angles_uniform = (
+                torch.rand(
+                    (n_samples, self.n_dim), dtype=self.float, device=self.device
+                )
+                * 2
+                * np.pi
+            )
+            samples = torch.cat(
+                (
+                    angles_uniform,
+                    torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
+                ),
+                axis=1,
+            )
+            rewards = self.reward_torchbatch(samples)
+            mask = (
+                torch.rand(n_samples, dtype=self.float, device=self.device)
+                * (max_reward + epsilon)
+                < rewards
+            )
+            samples_accepted = samples[mask, :]
+            samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
+        return torch.vstack(samples_final)
+
+    def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
+        aug_samples = []
+        for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+            for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+                aug_samples.append(
+                    np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
+                )
+        aug_samples = np.concatenate(aug_samples)
+        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
+        return kde
+
+    def plot_reward_samples(
+        self,
+        samples,
+        alpha=0.5,
+        low=-np.pi * 0.5,
+        high=2.5 * np.pi,
+        dpi=150,
+        limit_n_samples=500,
+        **kwargs,
+    ):
+        x = np.linspace(low, high, 201)
+        y = np.linspace(low, high, 201)
+        xx, yy = np.meshgrid(x, y)
+        X = np.stack([xx, yy], axis=-1)
+        samples_mesh = torch.tensor(X.reshape(-1, 2), dtype=self.float)
+        states_mesh = torch.cat(
+            [samples_mesh, torch.ones(samples_mesh.shape[0], 1)], 1
+        ).to(self.device)
+        rewards = torch2np(
+            self.proxy2reward(self.proxy(self.statetorch2proxy(states_mesh)))
+        )
+        # Init figure
+        fig, ax = plt.subplots()
+        fig.set_dpi(dpi)
+        # Plot reward contour
+        h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
+        ax.axis("scaled")
+        fig.colorbar(h, ax=ax)
+        ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
+        ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
+        ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
+        ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
+        # Plot samples
+        extra_samples = []
+        for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+            for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+                if not (add_0 == add_1 == 0):
+                    extra_samples.append(
+                        np.stack(
+                            [
+                                samples[:limit_n_samples, 0] + add_0,
+                                samples[:limit_n_samples, 1] + add_1,
+                            ],
+                            axis=1,
+                        )
+                    )
+        extra_samples = np.concatenate(extra_samples)
+        ax.scatter(
+            samples[:limit_n_samples, 0], samples[:limit_n_samples, 1], alpha=alpha
+        )
+        ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
+        ax.grid()
+        # Set tight layout
+        plt.tight_layout()
+        return fig
+
+    def plot_kde(
+        self,
+        kde,
+        alpha=0.5,
+        low=-np.pi * 0.5,
+        high=2.5 * np.pi,
+        dpi=150,
+        colorbar=True,
+        **kwargs,
+    ):
+        x = np.linspace(0, 2 * np.pi, 101)
+        y = np.linspace(0, 2 * np.pi, 101)
+        xx, yy = np.meshgrid(x, y)
+        X = np.stack([xx, yy], axis=-1)
+        Z = np.exp(kde.score_samples(X.reshape(-1, 2))).reshape(xx.shape)
+        # Init figure
+        fig, ax = plt.subplots()
+        fig.set_dpi(dpi)
+        # Plot KDE
+        h = ax.contourf(xx, yy, Z, alpha=alpha)
+        ax.axis("scaled")
+        if colorbar:
+            fig.colorbar(h, ax=ax)
+        ax.set_xticks([])
+        ax.set_yticks([])
+        ax.text(0, -0.3, r"$0$", fontsize=15)
+        ax.text(-0.28, 0, r"$0$", fontsize=15)
+        ax.text(2 * np.pi - 0.4, -0.3, r"$2\pi$", fontsize=15)
+        ax.text(-0.45, 2 * np.pi - 0.3, r"$2\pi$", fontsize=15)
+        for spine in ax.spines.values():
+            spine.set_visible(False)
+        # Set tight layout
+        plt.tight_layout()
+        return fig

From f33caf583b74c5e197adae9fdd2e21e108a35ae1 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Wed, 12 Apr 2023 21:39:09 -0400
Subject: [PATCH 022/206] remove blank line

---
 gflownet/gflownet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index f1f069713..5976463b7 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -893,7 +893,6 @@ def test(self, **plot_kwargs):
         jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
 
         # Plots
-
         if hasattr(self.env, "plot_reward_samples"):
             fig_reward_samples = self.env.plot_reward_samples(x_sampled, **plot_kwargs)
         else:

From 9ae4e8b6552db94add629b10bc780d2ff02da98c Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Wed, 12 Apr 2023 21:39:53 -0400
Subject: [PATCH 023/206] plot_reward_samples; comment out temp methods

---
 gflownet/envs/cube.py | 219 +++++++++++++++++++++---------------------
 1 file changed, 107 insertions(+), 112 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 388021aa1..23a9bdb28 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -943,60 +943,60 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
         states = [list(el) for el in states]
         return states
 
-    # TODO: make generic for all environments
-    def sample_from_reward(
-        self, n_samples: int, epsilon=1e-4
-    ) -> TensorType["n_samples", "state_dim"]:
-        """
-        Rejection sampling  with proposal the uniform distribution in 
-        [0, max_val]]^n_dim.
-
-        Returns a tensor in GFloNet (state) format.
-        """
-        samples_final = []
-        max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
-        while len(samples_final) < n_samples:
-            angles_uniform = (
-                torch.rand(
-                    (n_samples, self.n_dim), dtype=self.float, device=self.device
-                )
-                * 2
-                * np.pi
-            )
-            samples = torch.cat(
-                (
-                    angles_uniform,
-                    torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
-                ),
-                axis=1,
-            )
-            rewards = self.reward_torchbatch(samples)
-            mask = (
-                torch.rand(n_samples, dtype=self.float, device=self.device)
-                * (max_reward + epsilon)
-                < rewards
-            )
-            samples_accepted = samples[mask, :]
-            samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
-        return torch.vstack(samples_final)
-
-    def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
-        aug_samples = []
-        for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-            for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-                aug_samples.append(
-                    np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
-                )
-        aug_samples = np.concatenate(aug_samples)
-        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
-        return kde
+#     # TODO: make generic for all environments
+#     def sample_from_reward(
+#         self, n_samples: int, epsilon=1e-4
+#     ) -> TensorType["n_samples", "state_dim"]:
+#         """
+#         Rejection sampling  with proposal the uniform distribution in 
+#         [0, max_val]]^n_dim.
+# 
+#         Returns a tensor in GFloNet (state) format.
+#         """
+#         samples_final = []
+#         max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
+#         while len(samples_final) < n_samples:
+#             angles_uniform = (
+#                 torch.rand(
+#                     (n_samples, self.n_dim), dtype=self.float, device=self.device
+#                 )
+#                 * 2
+#                 * np.pi
+#             )
+#             samples = torch.cat(
+#                 (
+#                     angles_uniform,
+#                     torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
+#                 ),
+#                 axis=1,
+#             )
+#             rewards = self.reward_torchbatch(samples)
+#             mask = (
+#                 torch.rand(n_samples, dtype=self.float, device=self.device)
+#                 * (max_reward + epsilon)
+#                 < rewards
+#             )
+#             samples_accepted = samples[mask, :]
+#             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
+#         return torch.vstack(samples_final)
+# 
+#     def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
+#         aug_samples = []
+#         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+#             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+#                 aug_samples.append(
+#                     np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
+#                 )
+#         aug_samples = np.concatenate(aug_samples)
+#         kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
+#         return kde
 
     def plot_reward_samples(
         self,
         samples,
         alpha=0.5,
-        low=-np.pi * 0.5,
-        high=2.5 * np.pi,
+        low=0.0,
+        high=1.0,
         dpi=150,
         limit_n_samples=500,
         **kwargs,
@@ -1005,79 +1005,74 @@ def plot_reward_samples(
         y = np.linspace(low, high, 201)
         xx, yy = np.meshgrid(x, y)
         X = np.stack([xx, yy], axis=-1)
-        samples_mesh = torch.tensor(X.reshape(-1, 2), dtype=self.float)
-        states_mesh = torch.cat(
-            [samples_mesh, torch.ones(samples_mesh.shape[0], 1)], 1
-        ).to(self.device)
-        rewards = torch2np(
-            self.proxy2reward(self.proxy(self.statetorch2proxy(states_mesh)))
-        )
+        states_mesh = torch.tensor(X.reshape(-1, 2), device=self.device, dtype=self.float)
+        rewards = self.reward_torchbatch(states_mesh)
         # Init figure
         fig, ax = plt.subplots()
         fig.set_dpi(dpi)
         # Plot reward contour
-        h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
-        ax.axis("scaled")
-        fig.colorbar(h, ax=ax)
-        ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
-        ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
-        ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
-        ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
+#         h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
+#         ax.axis("scaled")
+#         fig.colorbar(h, ax=ax)
+#         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
+#         ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
+#         ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
+#         ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
         # Plot samples
-        extra_samples = []
-        for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-            for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-                if not (add_0 == add_1 == 0):
-                    extra_samples.append(
-                        np.stack(
-                            [
-                                samples[:limit_n_samples, 0] + add_0,
-                                samples[:limit_n_samples, 1] + add_1,
-                            ],
-                            axis=1,
-                        )
-                    )
-        extra_samples = np.concatenate(extra_samples)
+#         extra_samples = []
+#         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+#             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+#                 if not (add_0 == add_1 == 0):
+#                     extra_samples.append(
+#                         np.stack(
+#                             [
+#                                 samples[:limit_n_samples, 0] + add_0,
+#                                 samples[:limit_n_samples, 1] + add_1,
+#                             ],
+#                             axis=1,
+#                         )
+#                     )
+#         extra_samples = np.concatenate(extra_samples)
         ax.scatter(
             samples[:limit_n_samples, 0], samples[:limit_n_samples, 1], alpha=alpha
         )
-        ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
+#         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
         ax.grid()
         # Set tight layout
         plt.tight_layout()
         return fig
 
-    def plot_kde(
-        self,
-        kde,
-        alpha=0.5,
-        low=-np.pi * 0.5,
-        high=2.5 * np.pi,
-        dpi=150,
-        colorbar=True,
-        **kwargs,
-    ):
-        x = np.linspace(0, 2 * np.pi, 101)
-        y = np.linspace(0, 2 * np.pi, 101)
-        xx, yy = np.meshgrid(x, y)
-        X = np.stack([xx, yy], axis=-1)
-        Z = np.exp(kde.score_samples(X.reshape(-1, 2))).reshape(xx.shape)
-        # Init figure
-        fig, ax = plt.subplots()
-        fig.set_dpi(dpi)
-        # Plot KDE
-        h = ax.contourf(xx, yy, Z, alpha=alpha)
-        ax.axis("scaled")
-        if colorbar:
-            fig.colorbar(h, ax=ax)
-        ax.set_xticks([])
-        ax.set_yticks([])
-        ax.text(0, -0.3, r"$0$", fontsize=15)
-        ax.text(-0.28, 0, r"$0$", fontsize=15)
-        ax.text(2 * np.pi - 0.4, -0.3, r"$2\pi$", fontsize=15)
-        ax.text(-0.45, 2 * np.pi - 0.3, r"$2\pi$", fontsize=15)
-        for spine in ax.spines.values():
-            spine.set_visible(False)
-        # Set tight layout
-        plt.tight_layout()
-        return fig
+#     def plot_kde(
+#         self,
+#         kde,
+#         alpha=0.5,
+#         low=-np.pi * 0.5,
+#         high=2.5 * np.pi,
+#         dpi=150,
+#         colorbar=True,
+#         **kwargs,
+#     ):
+#         x = np.linspace(0, 2 * np.pi, 101)
+#         y = np.linspace(0, 2 * np.pi, 101)
+#         xx, yy = np.meshgrid(x, y)
+#         X = np.stack([xx, yy], axis=-1)
+#         Z = np.exp(kde.score_samples(X.reshape(-1, 2))).reshape(xx.shape)
+#         # Init figure
+#         fig, ax = plt.subplots()
+#         fig.set_dpi(dpi)
+#         # Plot KDE
+#         h = ax.contourf(xx, yy, Z, alpha=alpha)
+#         ax.axis("scaled")
+#         if colorbar:
+#             fig.colorbar(h, ax=ax)
+#         ax.set_xticks([])
+#         ax.set_yticks([])
+#         ax.text(0, -0.3, r"$0$", fontsize=15)
+#         ax.text(-0.28, 0, r"$0$", fontsize=15)
+#         ax.text(2 * np.pi - 0.4, -0.3, r"$2\pi$", fontsize=15)
+#         ax.text(-0.45, 2 * np.pi - 0.3, r"$2\pi$", fontsize=15)
+#         for spine in ax.spines.values():
+#             spine.set_visible(False)
+#         # Set tight layout
+#         plt.tight_layout()
+#         return fig

From e69248da37e729a2745d9a0ac0b76da9155d78c2 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 12 Apr 2023 22:09:59 -0400
Subject: [PATCH 024/206] minor progress2

---
 gflownet/envs/cube.py |  17 +++---
 gflownet/gflownet.py  | 138 ++++++++++++++++++++++--------------------
 2 files changed, 82 insertions(+), 73 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 23a9bdb28..41c84feaf 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -5,6 +5,7 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
@@ -998,7 +999,7 @@ def plot_reward_samples(
         low=0.0,
         high=1.0,
         dpi=150,
-        limit_n_samples=500,
+        max_samples=500,
         **kwargs,
     ):
         x = np.linspace(low, high, 201)
@@ -1011,9 +1012,9 @@ def plot_reward_samples(
         fig, ax = plt.subplots()
         fig.set_dpi(dpi)
         # Plot reward contour
-#         h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
-#         ax.axis("scaled")
-#         fig.colorbar(h, ax=ax)
+        h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
+        ax.axis("scaled")
+        fig.colorbar(h, ax=ax)
 #         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
 #         ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
 #         ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
@@ -1026,18 +1027,20 @@ def plot_reward_samples(
 #                     extra_samples.append(
 #                         np.stack(
 #                             [
-#                                 samples[:limit_n_samples, 0] + add_0,
-#                                 samples[:limit_n_samples, 1] + add_1,
+#                                 samples[:max_samples, 0] + add_0,
+#                                 samples[:max_samples, 1] + add_1,
 #                             ],
 #                             axis=1,
 #                         )
 #                     )
 #         extra_samples = np.concatenate(extra_samples)
         ax.scatter(
-            samples[:limit_n_samples, 0], samples[:limit_n_samples, 1], alpha=alpha
+            samples[:max_samples, 0], samples[:max_samples, 1], alpha=alpha
         )
 #         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
         ax.grid()
+        ax.set_xlim([low, high])
+        ax.set_ylim([low, high])
         # Set tight layout
         plt.tight_layout()
         return fig
diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 5976463b7..ca86977c7 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -821,79 +821,85 @@ def test(self, **plot_kwargs):
         Computes metrics by sampling trajectories from the forward policy.
         """
         if self.buffer.test_pkl is None:
-            return self.l1, self.kl, self.jsd, (None,)
-        with open(self.buffer.test_pkl, "rb") as f:
-            dict_tt = pickle.load(f)
-            x_tt = dict_tt["x"]
-        x_sampled, _ = self.sample_batch(self.env, self.logger.test.n, train=False)
-        if self.buffer.test_type is not None and self.buffer.test_type == "all":
-            if "density_true" in dict_tt:
-                density_true = dict_tt["density_true"]
-            else:
-                rewards = self.env.reward_batch(x_tt)
-                z_true = rewards.sum()
-                density_true = rewards / z_true
-                with open(self.buffer.test_pkl, "wb") as f:
-                    dict_tt["density_true"] = density_true
-                    pickle.dump(dict_tt, f)
-            hist = defaultdict(int)
-            for x in x_sampled:
-                hist[tuple(x)] += 1
-            z_pred = sum([hist[tuple(x)] for x in x_tt]) + 1e-9
-            density_pred = np.array([hist[tuple(x)] / z_pred for x in x_tt])
-            log_density_true = np.log(density_true + 1e-8)
-            log_density_pred = np.log(density_pred + 1e-8)
-        elif self.continuous:
-            x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
-            x_tt = torch2np(self.env.statebatch2proxy(x_tt))
-            kde_pred = self.env.fit_kde(
-                x_sampled,
-                kernel=self.logger.test.kde.kernel,
-                bandwidth=self.logger.test.kde.bandwidth,
-            )
-            if "log_density_true" in dict_tt and "kde_true" in dict_tt:
-                log_density_true = dict_tt["log_density_true"]
-                kde_true = dict_tt["kde_true"]
-            else:
-                # Sample from reward via rejection sampling
-                x_from_reward = self.env.sample_from_reward(
-                    n_samples=self.logger.test.n
-                )
-                x_from_reward = torch2np(self.env.statetorch2proxy(x_from_reward))
-                # Fit KDE with samples from reward
-                kde_true = self.env.fit_kde(
-                    x_from_reward,
+            l1, kl, jsd = self.l1, self.kl, self.jsd
+            # TODO: Improve conditions where x_sampled is obtained
+            x_sampled = None
+        else:
+            with open(self.buffer.test_pkl, "rb") as f:
+                dict_tt = pickle.load(f)
+                x_tt = dict_tt["x"]
+            x_sampled, _ = self.sample_batch(self.env, self.logger.test.n, train=False)
+            if self.buffer.test_type is not None and self.buffer.test_type == "all":
+                if "density_true" in dict_tt:
+                    density_true = dict_tt["density_true"]
+                else:
+                    rewards = self.env.reward_batch(x_tt)
+                    z_true = rewards.sum()
+                    density_true = rewards / z_true
+                    with open(self.buffer.test_pkl, "wb") as f:
+                        dict_tt["density_true"] = density_true
+                        pickle.dump(dict_tt, f)
+                hist = defaultdict(int)
+                for x in x_sampled:
+                    hist[tuple(x)] += 1
+                z_pred = sum([hist[tuple(x)] for x in x_tt]) + 1e-9
+                density_pred = np.array([hist[tuple(x)] / z_pred for x in x_tt])
+                log_density_true = np.log(density_true + 1e-8)
+                log_density_pred = np.log(density_pred + 1e-8)
+            elif self.continuous:
+                x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
+                x_tt = torch2np(self.env.statebatch2proxy(x_tt))
+                kde_pred = self.env.fit_kde(
+                    x_sampled,
                     kernel=self.logger.test.kde.kernel,
                     bandwidth=self.logger.test.kde.bandwidth,
                 )
-                # Estimate true log density using test samples
+                if "log_density_true" in dict_tt and "kde_true" in dict_tt:
+                    log_density_true = dict_tt["log_density_true"]
+                    kde_true = dict_tt["kde_true"]
+                else:
+                    # Sample from reward via rejection sampling
+                    x_from_reward = self.env.sample_from_reward(
+                        n_samples=self.logger.test.n
+                    )
+                    x_from_reward = torch2np(self.env.statetorch2proxy(x_from_reward))
+                    # Fit KDE with samples from reward
+                    kde_true = self.env.fit_kde(
+                        x_from_reward,
+                        kernel=self.logger.test.kde.kernel,
+                        bandwidth=self.logger.test.kde.bandwidth,
+                    )
+                    # Estimate true log density using test samples
+                    # TODO: this may be specific-ish for the torus or not
+                    scores_true = kde_true.score_samples(x_tt)
+                    log_density_true = scores_true - logsumexp(scores_true, axis=0)
+                    # Add log_density_true and kde_true to pickled test dict
+                    with open(self.buffer.test_pkl, "wb") as f:
+                        dict_tt["log_density_true"] = log_density_true
+                        dict_tt["kde_true"] = kde_true
+                        pickle.dump(dict_tt, f)
+                # Estimate pred log density using test samples
                 # TODO: this may be specific-ish for the torus or not
-                scores_true = kde_true.score_samples(x_tt)
-                log_density_true = scores_true - logsumexp(scores_true, axis=0)
-                # Add log_density_true and kde_true to pickled test dict
-                with open(self.buffer.test_pkl, "wb") as f:
-                    dict_tt["log_density_true"] = log_density_true
-                    dict_tt["kde_true"] = kde_true
-                    pickle.dump(dict_tt, f)
-            # Estimate pred log density using test samples
-            # TODO: this may be specific-ish for the torus or not
-            scores_pred = kde_pred.score_samples(x_tt)
-            log_density_pred = scores_pred - logsumexp(scores_pred, axis=0)
-            density_true = np.exp(log_density_true)
-            density_pred = np.exp(log_density_pred)
-        else:
-            raise NotImplementedError
-        # L1 error
-        l1 = np.abs(density_pred - density_true).mean()
-        # KL divergence
-        kl = (density_true * (log_density_true - log_density_pred)).mean()
-        # Jensen-Shannon divergence
-        log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(0.5)
-        jsd = 0.5 * np.sum(density_true * (log_density_true - log_mean_dens))
-        jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
+                scores_pred = kde_pred.score_samples(x_tt)
+                log_density_pred = scores_pred - logsumexp(scores_pred, axis=0)
+                density_true = np.exp(log_density_true)
+                density_pred = np.exp(log_density_pred)
+            else:
+                raise NotImplementedError
+            # L1 error
+            l1 = np.abs(density_pred - density_true).mean()
+            # KL divergence
+            kl = (density_true * (log_density_true - log_density_pred)).mean()
+            # Jensen-Shannon divergence
+            log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(0.5)
+            jsd = 0.5 * np.sum(density_true * (log_density_true - log_mean_dens))
+            jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
 
         # Plots
         if hasattr(self.env, "plot_reward_samples"):
+            if x_sampled is None:
+                x_sampled, _ = self.sample_batch(self.env, self.logger.test.n, train=False)
+                x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
             fig_reward_samples = self.env.plot_reward_samples(x_sampled, **plot_kwargs)
         else:
             fig_reward_samples = None

From 8e4a79f468f7760878b1d4e7c432d7e3350ddd0c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 17 Apr 2023 06:50:14 -0400
Subject: [PATCH 025/206] add ccube test

---
 tests/gflownet/envs/test_ccube.py | 122 ++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 tests/gflownet/envs/test_ccube.py

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
new file mode 100644
index 000000000..6e2d4495b
--- /dev/null
+++ b/tests/gflownet/envs/test_ccube.py
@@ -0,0 +1,122 @@
+import common
+import numpy as np
+import pytest
+import torch
+
+from gflownet.envs.cube import ContinuousCube
+
+
+@pytest.fixture
+def env():
+    return ContinuousCube(n_dim=2, n_comp=3)
+
+
+@pytest.mark.parametrize(
+    "action_space",
+    [
+        [
+            (0.0, 0.0),
+            (-1.0, -1.0),
+            (np.inf, np.inf),
+        ],
+    ],
+)
+def test__get_action_space__returns_expected(env, action_space):
+    assert set(action_space) == set(env.action_space)
+
+
+def test__get_policy_output__returns_expected(env):
+    assert env.policy_output_dim == env.n_dim * env.n_comp * 3 + 1
+    fixed_policy_output = env.fixed_policy_output
+    random_policy_output = env.random_policy_output
+    assert torch.all(fixed_policy_output[0:-1:3] == 1)
+    assert torch.all(
+        fixed_policy_output[1:-1:3] == env.fixed_distr_params["beta_alpha"]
+    )
+    assert torch.all(fixed_policy_output[2:-1:3] == env.fixed_distr_params["beta_beta"])
+    assert torch.all(random_policy_output[0:-1:3] == 1)
+    assert torch.all(
+        random_policy_output[1:-1:3] == env.random_distr_params["beta_alpha"]
+    )
+    assert torch.all(
+        random_policy_output[2:-1:3] == env.random_distr_params["beta_beta"]
+    )
+
+
+@pytest.mark.parametrize(
+    "state, expected",
+    [
+        (
+            [0.0, 0.0],
+            [0.0, 0.0],
+        ),
+        (
+            [1.0, 1.0],
+            [1.0, 1.0],
+        ),
+        (
+            [1.1, 1.00001],
+            [1.0, 1.0],
+        ),
+        (
+            [-0.1, 1.00001],
+            [0.0, 1.0],
+        ),
+        (
+            [0.1, 0.21],
+            [0.1, 0.21],
+        ),
+    ],
+)
+def test__state2policy_returns_expected(env, state, expected):
+    assert env.state2policy(state) == expected
+
+
+@pytest.mark.parametrize(
+    "states, expected",
+    [
+        (
+            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
+            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
+        ),
+    ],
+)
+def test__statetorch2policy_returns_expected(env, states, expected):
+    assert torch.equal(
+        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
+    )
+
+
+@pytest.mark.parametrize(
+    "state, expected",
+    [
+        (
+            [0.0, 0.0],
+            [True, False, False],
+        ),
+        (
+            [0.1, 0.1],
+            [False, True, False],
+        ),
+        (
+            [1.0, 0.0],
+            [False, True, False],
+        ),
+        (
+            [1.1, 0.0],
+            [True, True, False],
+        ),
+        (
+            [0.1, 1.1],
+            [True, True, False],
+        ),
+    ],
+)
+def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
+    assert env.get_mask_invalid_actions_forward(state) == expected, print(
+        state, expected, env.get_mask_invalid_actions_forward(state)
+    )
+
+
+def test__continuous_env_common(env):
+    return common.test__continuous_env_common(env)

From 0c121055a8aad9a4d74d71e03575197f65f83f93 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 18 Apr 2023 10:27:24 -0400
Subject: [PATCH 026/206] proxy and policy states are mapped to [-1, 1]; fix
 plotting of reward distribution

---
 gflownet/envs/cube.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 41c84feaf..338eb16c4 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -63,7 +63,7 @@ def __init__(
         self.eos = self.n_dim
         self.max_val = max_val
         self.min_incr = min_incr * self.max_val
-        # Parameters of fixed policy distribution
+        # Parameters of the policy distribution
         self.n_comp = n_comp
         # Source state: position 0 at all dimensions
         self.source = [0.0 for _ in range(self.n_dim)]
@@ -110,20 +110,20 @@ def statetorch2policy(
         self, states: TensorType["batch", "state_dim"] = None
     ) -> TensorType["batch", "policy_input_dim"]:
         """
-        Clips the states into [0, max_val]
+        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
 
         Args
         ----
         state : list
             State
         """
-        return torch.clip(states, min=0.0, max=self.max_val)
+        return 2.0 * torch.clip(states, min=0.0, max=self.max_val) - 1.0
 
     def statebatch2policy(
         self, states: List[List]
     ) -> TensorType["batch", "state_proxy_dim"]:
         """
-        Clips the states into [0, max_val]
+        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
 
         Args
         ----
@@ -134,11 +134,11 @@ def statebatch2policy(
 
     def state2policy(self, state: List = None) -> List:
         """
-        Clips the state into [0, max_val]
+        Clips the state into [0, max_val] and maps it to [-1.0, 1.0]
         """
         if state is None:
             state = self.state.copy()
-        return [min(max(0.0, s), self.max_val) for s in state]
+        return [2.0 * min(max(0.0, s), self.max_val) - 1.0 for s in state]
 
     def state2readable(self, state: List) -> str:
         """
@@ -996,23 +996,23 @@ def plot_reward_samples(
         self,
         samples,
         alpha=0.5,
-        low=0.0,
-        high=1.0,
+        cell_min=-1.0,
+        cell_max=1.0,
         dpi=150,
         max_samples=500,
         **kwargs,
     ):
-        x = np.linspace(low, high, 201)
-        y = np.linspace(low, high, 201)
+        x = np.linspace(cell_min, cell_max, 201)
+        y = np.linspace(cell_min, cell_max, 201)
         xx, yy = np.meshgrid(x, y)
         X = np.stack([xx, yy], axis=-1)
         states_mesh = torch.tensor(X.reshape(-1, 2), device=self.device, dtype=self.float)
-        rewards = self.reward_torchbatch(states_mesh)
+        rewards = self.proxy2reward(self.proxy(states_mesh))
         # Init figure
         fig, ax = plt.subplots()
         fig.set_dpi(dpi)
         # Plot reward contour
-        h = ax.contourf(xx, yy, rewards.reshape(xx.shape), alpha=alpha)
+        h = ax.contourf(xx, yy, rewards.reshape(xx.shape).cpu().numpy(), alpha=alpha)
         ax.axis("scaled")
         fig.colorbar(h, ax=ax)
 #         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
@@ -1039,8 +1039,9 @@ def plot_reward_samples(
         )
 #         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
         ax.grid()
-        ax.set_xlim([low, high])
-        ax.set_ylim([low, high])
+        padding = 0.05 * (cell_max - cell_min)
+        ax.set_xlim([cell_min - padding, cell_max + padding])
+        ax.set_ylim([cell_min - padding, cell_max + padding])
         # Set tight layout
         plt.tight_layout()
         return fig

From bb46a513fd46067701566da380fb8a6b43588e33 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 18 Apr 2023 17:49:51 -0400
Subject: [PATCH 027/206] wip: min and max values of beta params; different way
 of handling min increment; but not working properly

---
 gflownet/envs/cube.py | 195 +++++++++++++++++++++++-------------------
 1 file changed, 105 insertions(+), 90 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 338eb16c4..7575dc38d 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -44,6 +44,8 @@ def __init__(
         max_val: float = 1.0,
         min_incr: float = 0.1,
         n_comp: int = 1,
+        beta_params_min: float = 0.1,
+        beta_params_max: float = 2.0,
         fixed_distr_params: dict = {
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
@@ -65,6 +67,8 @@ def __init__(
         self.min_incr = min_incr * self.max_val
         # Parameters of the policy distribution
         self.n_comp = n_comp
+        self.beta_params_min = beta_params_min
+        self.beta_params_max = beta_params_max
         # Source state: position 0 at all dimensions
         self.source = [0.0 for _ in range(self.n_dim)]
         # Action from source: (n_dim, 0)
@@ -342,8 +346,8 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         For each dimension d of the hyper-cube and component c of the mixture, the
         output of the policy should return
           1) the weight of the component in the mixture
-          2) the log(alpha) parameter of the Beta distribution to sample the increment
-          3) the log(beta) parameter of the Beta distribution to sample the increment
+          2) the logit(alpha) parameter of the Beta distribution to sample the increment
+          3) the logit(beta) parameter of the Beta distribution to sample the increment
 
         Additionally, the policy output contains one logit per dimension plus one logit
         for the EOS action, for the categorical distribution over dimensions.
@@ -658,8 +662,8 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         For each dimension d of the hyper-cube and component c of the mixture, the
         output of the policy should return
           1) the weight of the component in the mixture
-          2) the log(alpha) parameter of the Beta distribution to sample the increment
-          3) the log(beta) parameter of the Beta distribution to sample the increment
+          2) the logit(alpha) parameter of the Beta distribution to sample the increment
+          3) the logit(beta) parameter of the Beta distribution to sample the increment
 
         Additionally, the policy output contains one logit of a Bernoulli distribution
         to model the (discrete) forward probability of selecting the EOS action and the
@@ -805,6 +809,7 @@ def sample_actions(
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Sample angle increments
         idx_sample = idx_nofix[~mask_sampled_eos]
+        idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
         n_sample = idx_sample.shape[0]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         increments = torch.inf * torch.ones(
@@ -824,23 +829,28 @@ def sample_actions(
                 alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
+                alphas = (
+                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+                )
                 betas = policy_outputs[idx_sample, 2:-1:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
-                beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
+                betas = (
+                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+                )
+                beta_distr = Beta(alphas, betas)
                 distr_increments = MixtureSameFamily(mix, beta_distr)
             increments[idx_sample, :] = distr_increments.sample()
             logprobs_sample[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample, :]
             ).sum(axis=1)
-            # Apply minimum increment to generic (not from source) actions
+            # Map increments of generic (not from source) actions to [min_incr, 1.0]
             # TODO: before or after computing logprob?
-            mask_action_generic = ~mask_invalid_actions[:, 0]
-            increments[mask_action_generic] = torch.max(
-                increments[mask_action_generic],
-                self.min_incr
-                * torch.ones(increments[mask_action_generic].shape, device=device),
-            )
+#             increments[idx_generic] = (
+#                 increments[idx_generic] * (1 - self.min_incr) + self.min_incr
+#             )
+#             assert torch.all(increments[idx_generic] >= self.min_incr)
+#             assert torch.all(increments[idx_generic] <= 1.0)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         # Build actions
@@ -871,11 +881,12 @@ def get_logprobs(
             ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
         ]
         distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
-        mask_sampled_eos = torch.all(actions == torch.inf, axis=1)
+        mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Log probs of sampled increments
         idx_sample = idx_nofix[~mask_sampled_eos]
+        idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         if torch.any(idx_sample):
             mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
@@ -885,16 +896,20 @@ def get_logprobs(
             alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
+            alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
             betas = policy_outputs[idx_sample, 2:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
-            beta_distr = Beta(torch.exp(alphas), torch.exp(betas))
+            betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+            beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
-            # TODO: what to do with the minimum increments, since the logprob will not
-            # reflect the true probability of sampling that increment.
-            logprobs_sample[idx_sample] = distr_increments.log_prob(
-                actions[idx_sample, :]
-            ).sum(axis=1)
+            increments = actions.clone().detach()
+            # Remap increments of generic actions to [0, 1] to obtain correct probs.
+#             if len(idx_generic) > 0:
+#                 increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (1 - self.min_incr)
+#                 assert torch.all(increments[idx_sample] >= 0.0)
+#                 assert torch.all(increments[idx_sample] <= 1.0)
+            logprobs_sample[idx_sample] = distr_increments.log_prob(increments[idx_sample]).sum(axis=1)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         return logprobs
@@ -935,7 +950,6 @@ def step(
                 self.state[dim] += incr
             return self.state, action, True
 
-
     def get_grid_terminating_states(self, n_states: int) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
         linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
@@ -944,53 +958,53 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
         states = [list(el) for el in states]
         return states
 
-#     # TODO: make generic for all environments
-#     def sample_from_reward(
-#         self, n_samples: int, epsilon=1e-4
-#     ) -> TensorType["n_samples", "state_dim"]:
-#         """
-#         Rejection sampling  with proposal the uniform distribution in 
-#         [0, max_val]]^n_dim.
-# 
-#         Returns a tensor in GFloNet (state) format.
-#         """
-#         samples_final = []
-#         max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
-#         while len(samples_final) < n_samples:
-#             angles_uniform = (
-#                 torch.rand(
-#                     (n_samples, self.n_dim), dtype=self.float, device=self.device
-#                 )
-#                 * 2
-#                 * np.pi
-#             )
-#             samples = torch.cat(
-#                 (
-#                     angles_uniform,
-#                     torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
-#                 ),
-#                 axis=1,
-#             )
-#             rewards = self.reward_torchbatch(samples)
-#             mask = (
-#                 torch.rand(n_samples, dtype=self.float, device=self.device)
-#                 * (max_reward + epsilon)
-#                 < rewards
-#             )
-#             samples_accepted = samples[mask, :]
-#             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
-#         return torch.vstack(samples_final)
-# 
-#     def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
-#         aug_samples = []
-#         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-#             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-#                 aug_samples.append(
-#                     np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
-#                 )
-#         aug_samples = np.concatenate(aug_samples)
-#         kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
-#         return kde
+    #     # TODO: make generic for all environments
+    #     def sample_from_reward(
+    #         self, n_samples: int, epsilon=1e-4
+    #     ) -> TensorType["n_samples", "state_dim"]:
+    #         """
+    #         Rejection sampling  with proposal the uniform distribution in
+    #         [0, max_val]]^n_dim.
+    #
+    #         Returns a tensor in GFloNet (state) format.
+    #         """
+    #         samples_final = []
+    #         max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
+    #         while len(samples_final) < n_samples:
+    #             angles_uniform = (
+    #                 torch.rand(
+    #                     (n_samples, self.n_dim), dtype=self.float, device=self.device
+    #                 )
+    #                 * 2
+    #                 * np.pi
+    #             )
+    #             samples = torch.cat(
+    #                 (
+    #                     angles_uniform,
+    #                     torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
+    #                 ),
+    #                 axis=1,
+    #             )
+    #             rewards = self.reward_torchbatch(samples)
+    #             mask = (
+    #                 torch.rand(n_samples, dtype=self.float, device=self.device)
+    #                 * (max_reward + epsilon)
+    #                 < rewards
+    #             )
+    #             samples_accepted = samples[mask, :]
+    #             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
+    #         return torch.vstack(samples_final)
+    #
+    #     def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
+    #         aug_samples = []
+    #         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+    #             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+    #                 aug_samples.append(
+    #                     np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
+    #                 )
+    #         aug_samples = np.concatenate(aug_samples)
+    #         kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
+    #         return kde
 
     def plot_reward_samples(
         self,
@@ -1006,7 +1020,9 @@ def plot_reward_samples(
         y = np.linspace(cell_min, cell_max, 201)
         xx, yy = np.meshgrid(x, y)
         X = np.stack([xx, yy], axis=-1)
-        states_mesh = torch.tensor(X.reshape(-1, 2), device=self.device, dtype=self.float)
+        states_mesh = torch.tensor(
+            X.reshape(-1, 2), device=self.device, dtype=self.float
+        )
         rewards = self.proxy2reward(self.proxy(states_mesh))
         # Init figure
         fig, ax = plt.subplots()
@@ -1015,29 +1031,27 @@ def plot_reward_samples(
         h = ax.contourf(xx, yy, rewards.reshape(xx.shape).cpu().numpy(), alpha=alpha)
         ax.axis("scaled")
         fig.colorbar(h, ax=ax)
-#         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
-#         ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
-#         ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
-#         ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
+        #         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
+        #         ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
+        #         ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
+        #         ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
         # Plot samples
-#         extra_samples = []
-#         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-#             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-#                 if not (add_0 == add_1 == 0):
-#                     extra_samples.append(
-#                         np.stack(
-#                             [
-#                                 samples[:max_samples, 0] + add_0,
-#                                 samples[:max_samples, 1] + add_1,
-#                             ],
-#                             axis=1,
-#                         )
-#                     )
-#         extra_samples = np.concatenate(extra_samples)
-        ax.scatter(
-            samples[:max_samples, 0], samples[:max_samples, 1], alpha=alpha
-        )
-#         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
+        #         extra_samples = []
+        #         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
+        #             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
+        #                 if not (add_0 == add_1 == 0):
+        #                     extra_samples.append(
+        #                         np.stack(
+        #                             [
+        #                                 samples[:max_samples, 0] + add_0,
+        #                                 samples[:max_samples, 1] + add_1,
+        #                             ],
+        #                             axis=1,
+        #                         )
+        #                     )
+        #         extra_samples = np.concatenate(extra_samples)
+        ax.scatter(samples[:max_samples, 0], samples[:max_samples, 1], alpha=alpha)
+        #         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
         ax.grid()
         padding = 0.05 * (cell_max - cell_min)
         ax.set_xlim([cell_min - padding, cell_max + padding])
@@ -1046,6 +1060,7 @@ def plot_reward_samples(
         plt.tight_layout()
         return fig
 
+
 #     def plot_kde(
 #         self,
 #         kde,

From 916b685060a9055abcd915aa1663c6518e721efd Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 05:23:39 -0400
Subject: [PATCH 028/206] ccube config

---
 config/env/ccube.yaml | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 config/env/ccube.yaml

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
new file mode 100644
index 000000000..a26825f06
--- /dev/null
+++ b/config/env/ccube.yaml
@@ -0,0 +1,30 @@
+defaults:
+  - base
+
+_target_: gflownet.envs.cube.ContinuousCube
+
+id: ccube
+func: corners
+# Dimensions of hypercube
+n_dim: 2
+max_val: 1.0
+# Policy
+beta_params_min: 0.1
+beta_params_max: 2.0
+min_incr: 0.1
+n_comp: 1
+fixed_distribution:
+  beta_alpha: 2.0
+  beta_beta: 5.0
+random_distribution:
+  beta_alpha: 1.0
+  beta_beta: 1.0
+# Buffer
+buffer:
+  data_path: null
+  train: null
+  test:
+    type: grid
+    n: 1000
+    output_csv: ccube_test.csv
+    output_pkl: ccube_test.pkl

From 174c8f0c6e1194ca45029c56bdb43f8e03f1e36f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 05:24:40 -0400
Subject: [PATCH 029/206] wip: correct things around min_increment, when to
 stop, etc.

---
 gflownet/envs/cube.py | 67 +++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7575dc38d..d8adec6e3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -614,12 +614,14 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
 
 class ContinuousCube(Cube):
     """
-    Continuous hyper-cube environment (continuous
-    version of a hyper-grid) in which the action space consists of the increment of
-    each dimension d, modelled by a mixture of Beta distributions.
-
-    The states space is the value of each dimension. If the value of any dimension gets
-    larger than max_val, then the trajectory is ended.
+    Continuous hyper-cube environment (continuous version of a hyper-grid) in which the
+    action space consists of the increment of each dimension d, modelled by a mixture
+    of Beta distributions. The states space is the value of each dimension. In order to
+    ensure that all trajectories are of finite length, actions have a minimum increment
+    for all dimensions determined by min_incr. If the value of any dimension is larger
+    than 1 - min_incr, then the trajectory is ended (the only next valid action is
+    EOS). In order to ensure the coverage of the state space, the first action (from
+    the source state) is not constrained by the minimum increment.
 
     Attributes
     ----------
@@ -706,9 +708,9 @@ def get_mask_invalid_actions_forward(
         if all([s == ss for s, ss in zip(state, self.source)]):
             mask = [False for _ in range(self.action_space_dim)]
             mask[0] = True
-        # If the value of any dimension is greater than max_val, then next action can
-        # only be EOS.
-        elif any([s > self.max_val for s in state]):
+        # If the value of any dimension is greater than 1 - min_incr, then next action
+        # can only be EOS.
+        elif any([s > (1 - self.min_incr) for s in state]):
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
         # Otherwise, only the action_source is not valid
@@ -736,6 +738,7 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
             mask[-1] = False
         # If the value of any dimension is smaller than 0.0, then next action can only
         # be return to source (EOS)
+        # TODO: make sure this is correct
         if any([s < 0.0 for s in state]):
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
@@ -845,12 +848,13 @@ def sample_actions(
                 increments[idx_sample, :]
             ).sum(axis=1)
             # Map increments of generic (not from source) actions to [min_incr, 1.0]
-            # TODO: before or after computing logprob?
-#             increments[idx_generic] = (
-#                 increments[idx_generic] * (1 - self.min_incr) + self.min_incr
-#             )
-#             assert torch.all(increments[idx_generic] >= self.min_incr)
-#             assert torch.all(increments[idx_generic] <= 1.0)
+            if len(idx_generic) > 0:
+                increments[idx_generic] = (
+                    increments[idx_generic] * (1 - self.min_incr) + self.min_incr
+                )
+            assert torch.all(increments[idx_sample] >= 0.0)
+            assert torch.all(increments[idx_generic] >= self.min_incr)
+            assert torch.all(increments[idx_generic] <= 1.0)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         # Build actions
@@ -876,12 +880,19 @@ def get_logprobs(
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
-        # Log probs of EOS actions
+        # Log probs of EOS (back to source if backwards) actions
         idx_nofix = ns_range[
             ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
         ]
         distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
-        mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+        if is_forward:
+            mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+        else:
+            mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+#             mask_sampled_eos = torch.logical_or(
+#                 torch.all(states_target[idx_nofix] == 0.0, axis=1),
+#                 torch.all(actions[idx_nofix] == torch.inf, axis=1),
+#             )
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Log probs of sampled increments
@@ -904,12 +915,19 @@ def get_logprobs(
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
             increments = actions.clone().detach()
-            # Remap increments of generic actions to [0, 1] to obtain correct probs.
-#             if len(idx_generic) > 0:
-#                 increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (1 - self.min_incr)
-#                 assert torch.all(increments[idx_sample] >= 0.0)
-#                 assert torch.all(increments[idx_sample] <= 1.0)
-            logprobs_sample[idx_sample] = distr_increments.log_prob(increments[idx_sample]).sum(axis=1)
+            # Remap increments of generic actions to [0, 1] to obtain correct
+            # probabilities, in the case where the actions are not from the source
+            # state (generic) and the transitions are forward.
+            if len(idx_generic) > 0 and is_forward:
+                increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (
+                    1 - self.min_incr
+                )
+                assert torch.all(increments[idx_sample] >= 0.0)
+                assert torch.all(increments[idx_sample] <= 1.0)
+            # TODO: do something with the logprob of returning to source (backwards)?
+            logprobs_sample[idx_sample] = distr_increments.log_prob(
+                increments[idx_sample]
+            ).sum(axis=1)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         return logprobs
@@ -941,13 +959,14 @@ def step(
         if self.done:
             return self.state, action, False
         # If action is eos or any dimension is beyond max_val, then force eos
-        elif action == self.eos or any([s > self.max_val for s in self.state]):
+        elif action == self.eos or any([s > (1 - self.min_incr) for s in self.state]):
             self.done = True
             return self.state, self.eos, True
         # If action is not eos, then perform action
         else:
             for dim, incr in enumerate(action):
                 self.state[dim] += incr
+                assert all([s <= self.max_val for s in self.state])
             return self.state, action, True
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:

From fe72b94fae9b225c26e01252b48f36f4f78d1cf2 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 07:41:52 -0400
Subject: [PATCH 030/206] actions are relative increments within the distance
 to the edge (max_val); action includes min_incr as last value

---
 gflownet/envs/cube.py | 90 +++++++++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 34 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d8adec6e3..f9b3f22b5 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -618,11 +618,18 @@ class ContinuousCube(Cube):
     action space consists of the increment of each dimension d, modelled by a mixture
     of Beta distributions. The states space is the value of each dimension. In order to
     ensure that all trajectories are of finite length, actions have a minimum increment
-    for all dimensions determined by min_incr. If the value of any dimension is larger
+    for all dimensions determined by min_incr.  If the value of any dimension is larger
     than 1 - min_incr, then the trajectory is ended (the only next valid action is
     EOS). In order to ensure the coverage of the state space, the first action (from
     the source state) is not constrained by the minimum increment.
 
+    Actions do not represent absolute increments but rather the relative increment with
+    respect to the distance to the edges of the hyper-cube, from the minimum increment.
+    That is, if dimension d of a state has value 0.3, the minimum increment (min_incr)
+    is 0.1 and the maximum value (max_val) is 1.0, an action of 0.5 will increment the
+    value of the dimension in 0.5 * (1.0 - 0.3 - 0.1) = 0.5 * 0.6 = 0.3. Therefore, the
+    value of d in the next state will be 0.3 + 0.3 = 0.6.
+
     Attributes
     ----------
     n_dim : int
@@ -641,18 +648,20 @@ def __init__(self, **kwargs):
 
     def get_action_space(self):
         """
-        The actions are tuples of length n_dim, where the value at position d indicates
-        the (positive) increment of dimension d.
+        The actions are tuples of length n_dim + 1, where the value at position d indicates
+        the (positive, relative) increment of dimension d. The value at the last
+        position indicates the minimum increment: 0.0 if the transition is from the
+        source state, min_incr otherwise.
 
         Additionally, there are two special discrete actions:
             - Action from the source state, with no minimum increment. Only valid from
               the source state. Indicated by -1 for all dimensions.
             - EOS action. Indicated by np.inf for all dimensions.
         """
-        generic_action = tuple([0.0 for _ in range(self.n_dim)])
-        self.action_source = tuple([-1.0 for _ in range(self.n_dim)])
-        self.eos = tuple([np.inf for _ in range(self.n_dim)])
-        actions = [generic_action, self.action_source, self.eos]
+        generic_action = tuple([0.0 for _ in range(self.n_dim)] + [self.min_incr])
+        action_source = tuple([0.0 for _ in range(self.n_dim)] + [0.0])
+        self.eos = tuple([np.inf for _ in range(self.n_dim + 1)])
+        actions = [generic_action, action_source, self.eos]
         return actions
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
@@ -783,10 +792,11 @@ def get_parents(
         if all([s == ss for s, ss in zip(state, self.source)]):
             return [], []
         else:
-            for dim, incr in enumerate(action):
+            min_incr = action[-1]
+            for dim, incr_rel in enumerate(action[:-1]):
+                incr = (incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
                 state[dim] -= incr
-            parents = [state]
-            return parents, [action]
+            return [state], [action]
 
     def sample_actions(
         self,
@@ -810,15 +820,21 @@ def sample_actions(
         mask_sampled_eos = distr_eos.sample().to(torch.bool)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
-        # Sample angle increments
+        # Sample increments
         idx_sample = idx_nofix[~mask_sampled_eos]
         idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
+        idx_source = idx_sample[~mask_invalid_actions[idx_sample, 1]]
         n_sample = idx_sample.shape[0]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         increments = torch.inf * torch.ones(
             (n_states, self.n_dim), device=device, dtype=self.float
         )
-        if torch.any(idx_sample):
+        min_increments = torch.inf * torch.ones(
+            n_states, device=device, dtype=self.float
+        )
+        min_increments[idx_generic] = self.min_incr
+        min_increments[idx_source] = 0.0
+        if len(idx_sample) > 0:
             if sampling_method == "uniform":
                 distr_increments = Uniform(
                     torch.zeros(n_sample),
@@ -848,17 +864,19 @@ def sample_actions(
                 increments[idx_sample, :]
             ).sum(axis=1)
             # Map increments of generic (not from source) actions to [min_incr, 1.0]
-            if len(idx_generic) > 0:
-                increments[idx_generic] = (
-                    increments[idx_generic] * (1 - self.min_incr) + self.min_incr
-                )
-            assert torch.all(increments[idx_sample] >= 0.0)
-            assert torch.all(increments[idx_generic] >= self.min_incr)
-            assert torch.all(increments[idx_generic] <= 1.0)
+        #             if len(idx_generic) > 0:
+        #                 increments[idx_generic] = (
+        #                     increments[idx_generic] * (1 - self.min_incr) + self.min_incr
+        #                 )
+        #             assert torch.all(increments[idx_sample] >= 0.0)
+        #             assert torch.all(increments[idx_generic] >= self.min_incr)
+        #             assert torch.all(increments[idx_generic] <= 1.0)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         # Build actions
-        actions = [tuple(a.tolist()) for a in increments]
+        actions = [
+            tuple(a.tolist() + [m.item()]) for a, m in zip(increments, min_increments)
+        ]
         return actions, logprobs
 
     def get_logprobs(
@@ -889,17 +907,17 @@ def get_logprobs(
             mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
         else:
             mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
-#             mask_sampled_eos = torch.logical_or(
-#                 torch.all(states_target[idx_nofix] == 0.0, axis=1),
-#                 torch.all(actions[idx_nofix] == torch.inf, axis=1),
-#             )
+        #             mask_sampled_eos = torch.logical_or(
+        #                 torch.all(states_target[idx_nofix] == 0.0, axis=1),
+        #                 torch.all(actions[idx_nofix] == torch.inf, axis=1),
+        #             )
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Log probs of sampled increments
         idx_sample = idx_nofix[~mask_sampled_eos]
         idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
-        if torch.any(idx_sample):
+        if len(idx_sample) > 0:
             mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
@@ -914,16 +932,16 @@ def get_logprobs(
             betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
-            increments = actions.clone().detach()
+            increments = actions[:, :-1].clone().detach()
             # Remap increments of generic actions to [0, 1] to obtain correct
             # probabilities, in the case where the actions are not from the source
             # state (generic) and the transitions are forward.
-            if len(idx_generic) > 0 and is_forward:
-                increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (
-                    1 - self.min_incr
-                )
-                assert torch.all(increments[idx_sample] >= 0.0)
-                assert torch.all(increments[idx_sample] <= 1.0)
+            #             if len(idx_generic) > 0 and is_forward:
+            #                 increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (
+            #                     1 - self.min_incr
+            #                 )
+            #                 assert torch.all(increments[idx_sample] >= 0.0)
+            #                 assert torch.all(increments[idx_sample] <= 1.0)
             # TODO: do something with the logprob of returning to source (backwards)?
             logprobs_sample[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample]
@@ -964,9 +982,13 @@ def step(
             return self.state, self.eos, True
         # If action is not eos, then perform action
         else:
-            for dim, incr in enumerate(action):
+            min_incr = action[-1]
+            for dim, incr_rel in enumerate(action[:-1]):
+                incr = incr_rel * (1.0 - self.state[dim] - min_incr)
                 self.state[dim] += incr
-                assert all([s <= self.max_val for s in self.state])
+            assert all([s <= self.max_val for s in self.state]), print(
+                self.state, action
+            )
             return self.state, action, True
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:

From 0cc9f6698d92dcbf1941ebdfa3e676de098ba391 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 07:44:15 -0400
Subject: [PATCH 031/206] remove commented code from previous version

---
 gflownet/envs/cube.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index f9b3f22b5..9967a95f3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -863,14 +863,6 @@ def sample_actions(
             logprobs_sample[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample, :]
             ).sum(axis=1)
-            # Map increments of generic (not from source) actions to [min_incr, 1.0]
-        #             if len(idx_generic) > 0:
-        #                 increments[idx_generic] = (
-        #                     increments[idx_generic] * (1 - self.min_incr) + self.min_incr
-        #                 )
-        #             assert torch.all(increments[idx_sample] >= 0.0)
-        #             assert torch.all(increments[idx_generic] >= self.min_incr)
-        #             assert torch.all(increments[idx_generic] <= 1.0)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         # Build actions
@@ -933,15 +925,6 @@ def get_logprobs(
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
             increments = actions[:, :-1].clone().detach()
-            # Remap increments of generic actions to [0, 1] to obtain correct
-            # probabilities, in the case where the actions are not from the source
-            # state (generic) and the transitions are forward.
-            #             if len(idx_generic) > 0 and is_forward:
-            #                 increments[idx_generic] = (increments[idx_generic] - self.min_incr) / (
-            #                     1 - self.min_incr
-            #                 )
-            #                 assert torch.all(increments[idx_sample] >= 0.0)
-            #                 assert torch.all(increments[idx_sample] <= 1.0)
             # TODO: do something with the logprob of returning to source (backwards)?
             logprobs_sample[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample]

From f1bf0b7c29de9b9b0924b97e460a60653aeb4b17 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 09:50:42 -0400
Subject: [PATCH 032/206] fix bug about state_id being negative

---
 gflownet/gflownet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index ca86977c7..6af5556fe 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -638,7 +638,7 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         )
         # Shift state_id to [1, 2, ...]
         for tid in traj_id.unique():
-            state_id[traj_id == tid] -= state_id[traj_id == tid].min() + 1
+            state_id[traj_id == tid] = state_id[traj_id == tid] - state_id[traj_id == tid].min() + 1
         # Compute rewards
         rewards = self.env.reward_torchbatch(states, done)
         # Build parents forward masks from state masks

From d1dff37c758aa404e416625fd889349da6928d9c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 19 Apr 2023 09:51:30 -0400
Subject: [PATCH 033/206] increment n_actions; catch backward transitions to
 source via target state

---
 gflownet/envs/cube.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9967a95f3..38c333f1f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -898,16 +898,15 @@ def get_logprobs(
         if is_forward:
             mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
         else:
-            mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
-        #             mask_sampled_eos = torch.logical_or(
-        #                 torch.all(states_target[idx_nofix] == 0.0, axis=1),
-        #                 torch.all(actions[idx_nofix] == torch.inf, axis=1),
-        #             )
+#             mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+            mask_sampled_eos = torch.logical_or(
+                torch.all(states_target[idx_nofix] == 0.0, axis=1),
+                torch.all(actions[idx_nofix] == torch.inf, axis=1),
+            )
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Log probs of sampled increments
         idx_sample = idx_nofix[~mask_sampled_eos]
-        idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         if len(idx_sample) > 0:
             mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
@@ -962,6 +961,7 @@ def step(
         # If action is eos or any dimension is beyond max_val, then force eos
         elif action == self.eos or any([s > (1 - self.min_incr) for s in self.state]):
             self.done = True
+            self.n_actions += 1
             return self.state, self.eos, True
         # If action is not eos, then perform action
         else:
@@ -972,6 +972,7 @@ def step(
             assert all([s <= self.max_val for s in self.state]), print(
                 self.state, action
             )
+            self.n_actions += 1
             return self.state, action, True
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:

From db552718e5dbe706eb574321ebaa0790c681cc60 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 20 Apr 2023 06:58:41 -0400
Subject: [PATCH 034/206] udpate alex config paths

---
 config/user/alex.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/user/alex.yaml b/config/user/alex.yaml
index dd528d6e6..a6d1f044e 100644
--- a/config/user/alex.yaml
+++ b/config/user/alex.yaml
@@ -1,4 +1,4 @@
 logdir:
-  root: /network/scratch/a/alex.hernandez-garcia/logs/gflownet
+  root: /network/scratch/h/hernanga/logs/gflownet
 data:
-  alanine_dipeptide: /home/mila/a/alex.hernandez-garcia/gflownet/data/alanine_dipeptide_conformers_1.npy
+  alanine_dipeptide: /home/mila/h/hernanga/gflownet/data/alanine_dipeptide_conformers_1.npy

From bda8c3cf297a54adc121f6b644b87ea55148185c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 20 Apr 2023 07:45:06 -0400
Subject: [PATCH 035/206] randomise plotted samples

---
 gflownet/envs/cube.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 38c333f1f..3dccdf20d 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -749,6 +749,7 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         # be return to source (EOS)
         # TODO: make sure this is correct
         if any([s < 0.0 for s in state]):
+            import ipdb; ipdb.set_trace()
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
         else:
@@ -1075,7 +1076,8 @@ def plot_reward_samples(
         #                         )
         #                     )
         #         extra_samples = np.concatenate(extra_samples)
-        ax.scatter(samples[:max_samples, 0], samples[:max_samples, 1], alpha=alpha)
+        random_indices = np.random.permutation(samples.shape[0])[:max_samples]
+        ax.scatter(samples[random_indices, 0], samples[random_indices, 1], alpha=alpha)
         #         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
         ax.grid()
         padding = 0.05 * (cell_max - cell_min)

From d0e92bdb8c2a8b2336202fd9895cc7a52310e1ef Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 20 Apr 2023 07:45:30 -0400
Subject: [PATCH 036/206] add install six to setup

---
 setup_gflownet.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup_gflownet.sh b/setup_gflownet.sh
index d7ce20b16..958fb65a4 100644
--- a/setup_gflownet.sh
+++ b/setup_gflownet.sh
@@ -11,6 +11,8 @@ python -m virtualenv $1
 source $1/bin/activate
 # Update pip
 python -m pip install --upgrade pip
+# Force reinstall six to avoid issues with existing installations
+python -m pip install --upgrade --force-reinstall six
 # Install PyTorch family
 python -m pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
 # Requirements to run

From 5bc88b2ced1dd6e29b2a01f444d5d9160ce240a7 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 20 Apr 2023 11:40:07 -0400
Subject: [PATCH 037/206] model back to source with different logit

---
 gflownet/envs/cube.py | 63 ++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 3dccdf20d..c78fc99b1 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -676,21 +676,22 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
           2) the logit(alpha) parameter of the Beta distribution to sample the increment
           3) the logit(beta) parameter of the Beta distribution to sample the increment
 
-        Additionally, the policy output contains one logit of a Bernoulli distribution
-        to model the (discrete) forward probability of selecting the EOS action and the
-        (discrete) backward probability of returning to the source node.
+        Additionally, the policy output contains one logit (pos [-1]) of a Bernoulli
+        distribution to model the (discrete) forward probability of selecting the EOS
+        action and another logit (pos [-2]) for the (discrete) backward probability of
+        returning to the source node.
 
-        Therefore, the output of the policy model has dimensionality D x C x 3 + 1,
+        Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
         """
         policy_output = torch.ones(
-            self.n_dim * self.n_comp * 3 + 1,
+            self.n_dim * self.n_comp * 3 + 2,
             device=self.device,
             dtype=self.float,
         )
-        policy_output[1:-1:3] = params["beta_alpha"]
-        policy_output[2:-1:3] = params["beta_beta"]
+        policy_output[1:-2:3] = params["beta_alpha"]
+        policy_output[2:-2:3] = params["beta_beta"]
         return policy_output
 
     def get_mask_invalid_actions_forward(
@@ -745,6 +746,7 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         if done:
             mask = [True for _ in range(self.action_space_dim)]
             mask[-1] = False
+            return mask
         # If the value of any dimension is smaller than 0.0, then next action can only
         # be return to source (EOS)
         # TODO: make sure this is correct
@@ -814,9 +816,7 @@ def sample_actions(
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
         # EOS
-        idx_nofix = ns_range[
-            ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
-        ]
+        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :-1], axis=1)]
         distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
         mask_sampled_eos = distr_eos.sample().to(torch.bool)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
@@ -842,17 +842,17 @@ def sample_actions(
                     torch.ones(n_sample),
                 )
             elif sampling_method == "policy":
-                mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
+                mix_logits = policy_outputs[idx_sample, 0:-2:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 mix = Categorical(logits=mix_logits)
-                alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
+                alphas = policy_outputs[idx_sample, 1:-2:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 alphas = (
                     self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
                 )
-                betas = policy_outputs[idx_sample, 2:-1:3].reshape(
+                betas = policy_outputs[idx_sample, 2:-2:3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 betas = (
@@ -891,34 +891,34 @@ def get_logprobs(
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
-        # Log probs of EOS (back to source if backwards) actions
-        idx_nofix = ns_range[
-            ~torch.logical_and(mask_invalid_actions[:, 0], mask_invalid_actions[:, 1])
-        ]
-        distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
+        # Log probs of EOS and source (backwards) actions
+        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :-1], axis=1)]
+        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
+        logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)
         if is_forward:
-            mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+            mask_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
+            distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
+            logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_eos.to(self.float))
+            mask_sample = ~mask_eos
         else:
-#             mask_sampled_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
-            mask_sampled_eos = torch.logical_or(
-                torch.all(states_target[idx_nofix] == 0.0, axis=1),
-                torch.all(actions[idx_nofix] == torch.inf, axis=1),
-            )
-        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
-        logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
+            source = torch.tensor(self.source, device=device)
+            mask_source = torch.all(states_target[idx_nofix] == source, axis=1)
+            distr_source = Bernoulli(logits=policy_outputs[idx_nofix, -2])
+            logprobs_source[idx_nofix] = distr_source.log_prob(mask_source.to(self.float))
+            mask_sample = ~mask_source
         # Log probs of sampled increments
-        idx_sample = idx_nofix[~mask_sampled_eos]
+        idx_sample = idx_nofix[mask_sample]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         if len(idx_sample) > 0:
-            mix_logits = policy_outputs[idx_sample, 0:-1:3].reshape(
+            mix_logits = policy_outputs[idx_sample, 0:-2:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             mix = Categorical(logits=mix_logits)
-            alphas = policy_outputs[idx_sample, 1:-1:3].reshape(
+            alphas = policy_outputs[idx_sample, 1:-2:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-            betas = policy_outputs[idx_sample, 2:-1:3].reshape(
+            betas = policy_outputs[idx_sample, 2:-2:3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
@@ -930,7 +930,7 @@ def get_logprobs(
                 increments[idx_sample]
             ).sum(axis=1)
         # Combined probabilities
-        logprobs = logprobs_eos + logprobs_sample
+        logprobs = logprobs_eos + logprobs_source + logprobs_sample
         return logprobs
 
     def step(
@@ -959,6 +959,7 @@ def step(
         """
         if self.done:
             return self.state, action, False
+        # TODO: remove condition
         # If action is eos or any dimension is beyond max_val, then force eos
         elif action == self.eos or any([s > (1 - self.min_incr) for s in self.state]):
             self.done = True

From 48033b54265c2a79f45de511bd833365bd781a4c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 03:00:17 -0400
Subject: [PATCH 038/206] action space and mask include back-to-zero action

---
 gflownet/envs/cube.py | 54 +++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index c78fc99b1..8e0ea2bd9 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -654,14 +654,15 @@ def get_action_space(self):
         source state, min_incr otherwise.
 
         Additionally, there are two special discrete actions:
-            - Action from the source state, with no minimum increment. Only valid from
-              the source state. Indicated by -1 for all dimensions.
-            - EOS action. Indicated by np.inf for all dimensions.
+            - EOS action. Indicated by np.inf for all dimensions. Only valid forwards.
+            - Back-to-source action. Indicated by -1 for all dimensions. Only valid
+              backwards.
         """
         generic_action = tuple([0.0 for _ in range(self.n_dim)] + [self.min_incr])
-        action_source = tuple([0.0 for _ in range(self.n_dim)] + [0.0])
+        from_source = tuple([0.0 for _ in range(self.n_dim)] + [0.0])
+        to_source = tuple([-1.0 for _ in range(self.n_dim + 1)])
         self.eos = tuple([np.inf for _ in range(self.n_dim + 1)])
-        actions = [generic_action, action_source, self.eos]
+        actions = [generic_action, from_source, to_source, self.eos]
         return actions
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
@@ -704,9 +705,9 @@ def get_mask_invalid_actions_forward(
         True if action is invalid going forward given the current state, False
         otherwise.
 
-        If the state is the source state, the only valid action is action_source. EOS
-        is valid valid from any state (including the source state) and EOS is the only
-        possible action if the value of any dimension has excedded max_val.
+        If the state is the source state, the generic action is not valid. EOS is valid
+        valid from any state (including the source state). The back-to-source action is
+        ignored (invalid) going forward.
         """
         if state is None:
             state = self.state.copy()
@@ -716,18 +717,14 @@ def get_mask_invalid_actions_forward(
             return [True for _ in range(self.action_space_dim)]
         # If state is source, the generic action is not valid.
         if all([s == ss for s, ss in zip(state, self.source)]):
-            mask = [False for _ in range(self.action_space_dim)]
-            mask[0] = True
+            return [True, False, True, False]
         # If the value of any dimension is greater than 1 - min_incr, then next action
         # can only be EOS.
         elif any([s > (1 - self.min_incr) for s in state]):
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-1] = False
+            return [True, True, True, False]
         # Otherwise, only the action_source is not valid
         else:
-            mask = [False for _ in range(self.action_space_dim)]
-            mask[-2] = True
-        return mask
+            return [False, True, True, False]
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
@@ -735,30 +732,17 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         True if action is invalid going backward given the current state, False
         otherwise.
 
-        The EOS action (returning to the source state for backward actions) is valid
-        from any state. The source action is ignored (invalid) for backward actions. If
-        any dimension is smaller than 0, then the only valid action is EOS.
+        The back-to-source action (returning to the source state for backward actions)
+        is valid from any state. The source action is ignored (invalid) for backward
+        actions.
         """
         if state is None:
             state = self.state.copy()
         if done is None:
             done = self.done
         if done:
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-1] = False
-            return mask
-        # If the value of any dimension is smaller than 0.0, then next action can only
-        # be return to source (EOS)
-        # TODO: make sure this is correct
-        if any([s < 0.0 for s in state]):
-            import ipdb; ipdb.set_trace()
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-1] = False
-        else:
-            mask = [False for _ in range(self.action_space_dim)]
-        # action_source is ignored going backwards, thus always invalid.
-        mask[-2] = True
-        return mask
+            return [True, True, True, False]
+        return [False, True, False, True]
 
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None
@@ -816,7 +800,7 @@ def sample_actions(
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
         # EOS
-        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :-1], axis=1)]
+        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :2], axis=1)]
         distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
         mask_sampled_eos = distr_eos.sample().to(torch.bool)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
@@ -892,7 +876,7 @@ def get_logprobs(
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
         # Log probs of EOS and source (backwards) actions
-        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :-1], axis=1)]
+        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :2], axis=1)]
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)
         if is_forward:

From 3d0232f422f16943487aad03dacacddd5e451bb9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 04:02:29 -0400
Subject: [PATCH 039/206] add min property to corners proxy

---
 gflownet/proxy/corners.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/gflownet/proxy/corners.py b/gflownet/proxy/corners.py
index a577f647e..e5e4e0a57 100644
--- a/gflownet/proxy/corners.py
+++ b/gflownet/proxy/corners.py
@@ -30,6 +30,15 @@ def setup(self, env=None):
             self.cov_inv = torch.linalg.inv(cov)
             self.mulnormal_norm = 1.0 / ((2 * torch.pi) ** 2 * cov_det) ** 0.5
 
+    @property
+    def min(self):
+        if not hasattr(self, "_min"):
+            mode = self.mu * torch.ones(
+                self.n_dim, device=self.device, dtype=self.float
+            )
+            self._min = self(torch.unsqueeze(mode, 0))[0]
+        return self._min
+
     def __call__(self, states: TensorType["batch", "state_dim"]) -> TensorType["batch"]:
         return (
             -1.0

From 0cbf2b17c549d4b5d337da9e629fd6a5cf1adb15 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:06:21 -0400
Subject: [PATCH 040/206] add get_uniform_terminating_states to htorus and
 rewrite sample_from_reward by using that method

---
 gflownet/envs/htorus.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 7b4741ae5..9a3dbd700 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -534,6 +534,14 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
         states = [list(el) + [self.length_traj] for el in angles]
         return states
 
+    def get_uniform_terminating_states(
+        self, n_states: int, seed: int = None
+    ) -> List[List]:
+        rng = np.random.default_rng(seed)
+        angles = rng.uniform(low=0.0, high=(2 * np.pi), size=(n_states, self.n_dim))
+        states = np.concatenate((angles, np.ones((n_states, 1))), axis=1)
+        return states.tolist()
+
     # TODO: make generic for all environments
     def sample_from_reward(
         self, n_samples: int, epsilon=1e-4
@@ -546,27 +554,16 @@ def sample_from_reward(
         samples_final = []
         max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
         while len(samples_final) < n_samples:
-            angles_uniform = (
-                torch.rand(
-                    (n_samples, self.n_dim), dtype=self.float, device=self.device
-                )
-                * 2
-                * np.pi
-            )
-            samples = torch.cat(
-                (
-                    angles_uniform,
-                    torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
-                ),
-                axis=1,
+            samples_uniform = self.statebatch2proxy(
+                self.get_uniform_terminating_states(n_samples)
             )
-            rewards = self.reward_torchbatch(samples)
+            rewards = self.proxy2reward(self.proxy(samples_uniform))
             mask = (
                 torch.rand(n_samples, dtype=self.float, device=self.device)
                 * (max_reward + epsilon)
                 < rewards
             )
-            samples_accepted = samples[mask, :]
+            samples_accepted = samples_uniform[mask]
             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
         return torch.vstack(samples_final)
 

From d384df91ee9b0b67432b97727a3c75cf0c2b5caa Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:07:25 -0400
Subject: [PATCH 041/206] blisort

---
 gflownet/gflownet.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 6af5556fe..f8aa11219 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -638,7 +638,9 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         )
         # Shift state_id to [1, 2, ...]
         for tid in traj_id.unique():
-            state_id[traj_id == tid] = state_id[traj_id == tid] - state_id[traj_id == tid].min() + 1
+            state_id[traj_id == tid] = (
+                state_id[traj_id == tid] - state_id[traj_id == tid].min() + 1
+            )
         # Compute rewards
         rewards = self.env.reward_torchbatch(states, done)
         # Build parents forward masks from state masks
@@ -891,14 +893,18 @@ def test(self, **plot_kwargs):
             # KL divergence
             kl = (density_true * (log_density_true - log_density_pred)).mean()
             # Jensen-Shannon divergence
-            log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(0.5)
+            log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(
+                0.5
+            )
             jsd = 0.5 * np.sum(density_true * (log_density_true - log_mean_dens))
             jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
 
         # Plots
         if hasattr(self.env, "plot_reward_samples"):
             if x_sampled is None:
-                x_sampled, _ = self.sample_batch(self.env, self.logger.test.n, train=False)
+                x_sampled, _ = self.sample_batch(
+                    self.env, self.logger.test.n, train=False
+                )
                 x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
             fig_reward_samples = self.env.plot_reward_samples(x_sampled, **plot_kwargs)
         else:

From a4ef411eed0b97bcee20d67aff8a524992b500ab Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:08:59 -0400
Subject: [PATCH 042/206] x_from_reward are not reconverted to proxy because
 they come now in proxy format from sample_from_reward

---
 gflownet/gflownet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index f8aa11219..cb7fa7d9f 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -864,7 +864,7 @@ def test(self, **plot_kwargs):
                     x_from_reward = self.env.sample_from_reward(
                         n_samples=self.logger.test.n
                     )
-                    x_from_reward = torch2np(self.env.statetorch2proxy(x_from_reward))
+                    x_from_reward = torch2np(x_from_reward)
                     # Fit KDE with samples from reward
                     kde_true = self.env.fit_kde(
                         x_from_reward,

From 1c40a84bbc9e5c520aa52adbacbdf6c04d875cbd Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:23:13 -0400
Subject: [PATCH 043/206] make norm and min of torus proxy tensors and avoid
 recomputing

---
 gflownet/envs/htorus.py |  2 +-
 gflownet/proxy/torus.py | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 9a3dbd700..97cb73f0a 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -552,7 +552,7 @@ def sample_from_reward(
         Returns a tensor in GFloNet (state) format.
         """
         samples_final = []
-        max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
+        max_reward = self.proxy2reward(self.proxy.min)
         while len(samples_final) < n_samples:
             samples_uniform = self.statebatch2proxy(
                 self.get_uniform_terminating_states(n_samples)
diff --git a/gflownet/proxy/torus.py b/gflownet/proxy/torus.py
index 4fa521835..89d3ba44c 100644
--- a/gflownet/proxy/torus.py
+++ b/gflownet/proxy/torus.py
@@ -17,17 +17,25 @@ def setup(self, env=None):
 
     @property
     def min(self):
-        if self.normalize:
-            return -1.0
-        else:
-            return -((self.n_dim * 2) ** 3)
+        if not hasattr(self, "_min"):
+            if self.normalize:
+                self._min = torch.tensor(-1.0, device=self.device, dtype=self.float)
+            else:
+                self._min = torch.tensor(
+                    -((self.n_dim * 2) ** 3), device=self.device, dtype=self.float
+                )
+        return self._min
 
     @property
     def norm(self):
-        if self.normalize:
-            return -((self.n_dim * 2) ** 3)
-        else:
-            return -1.0
+        if not hasattr(self, "_norm"):
+            if self.normalize:
+                self._norm = torch.tensor(
+                    -((self.n_dim * 2) ** 3), device=self.device, dtype=self.float
+                )
+            else:
+                self._norm = torch.tensor(-1.0, device=self.device, dtype=self.float)
+        return self._norm
 
     def __call__(self, states: TensorType["batch", "state_dim"]) -> TensorType["batch"]:
         """

From 1c95e16d4f99ee5dd8ace4614e356b5dc3c2a7a4 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:42:34 -0400
Subject: [PATCH 044/206] add bernoulli logit param

---
 config/env/ccube.yaml |   2 +
 gflownet/envs/cube.py | 183 ++++++++++++++++++------------------------
 2 files changed, 82 insertions(+), 103 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index a26825f06..8349a5f7c 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -16,9 +16,11 @@ n_comp: 1
 fixed_distribution:
   beta_alpha: 2.0
   beta_beta: 5.0
+  bernoulli_logit: -2.3
 random_distribution:
   beta_alpha: 1.0
   beta_beta: 1.0
+  bernoulli_logit: -0.693
 # Buffer
 buffer:
   data_path: null
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 8e0ea2bd9..9f62d90ec 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -10,6 +10,7 @@
 import numpy.typing as npt
 import pandas as pd
 import torch
+from sklearn.neighbors import KernelDensity
 from torch.distributions import Bernoulli, Beta, Categorical, MixtureSameFamily, Uniform
 from torchtyping import TensorType
 
@@ -49,10 +50,12 @@ def __init__(
         fixed_distr_params: dict = {
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
+            "bernoulli_logit": -2.3,
         },
         random_distr_params: dict = {
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
+            "bernoulli_logit": -0.693,
         },
         **kwargs,
     ):
@@ -134,7 +137,9 @@ def statebatch2policy(
         state : list
             State
         """
-        return self.statetorch2policy(torch.tensor(states, device=self.device))
+        return self.statetorch2policy(
+            torch.tensor(states, device=self.device, dtype=self.float)
+        )
 
     def state2policy(self, state: List = None) -> List:
         """
@@ -693,6 +698,8 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         )
         policy_output[1:-2:3] = params["beta_alpha"]
         policy_output[2:-2:3] = params["beta_beta"]
+        policy_output[-2] = params["bernoulli_logit"]
+        policy_output[-1] = params["bernoulli_logit"]
         return policy_output
 
     def get_mask_invalid_actions_forward(
@@ -888,7 +895,9 @@ def get_logprobs(
             source = torch.tensor(self.source, device=device)
             mask_source = torch.all(states_target[idx_nofix] == source, axis=1)
             distr_source = Bernoulli(logits=policy_outputs[idx_nofix, -2])
-            logprobs_source[idx_nofix] = distr_source.log_prob(mask_source.to(self.float))
+            logprobs_source[idx_nofix] = distr_source.log_prob(
+                mask_source.to(self.float)
+            )
             mask_sample = ~mask_source
         # Log probs of sampled increments
         idx_sample = idx_nofix[mask_sample]
@@ -969,53 +978,42 @@ def get_grid_terminating_states(self, n_states: int) -> List[List]:
         states = [list(el) for el in states]
         return states
 
+    def get_uniform_terminating_states(
+        self, n_states: int, seed: int = None
+    ) -> List[List]:
+        rng = np.random.default_rng(seed)
+        states = rng.uniform(low=0.0, high=self.max_val, size=(n_states, self.n_dim))
+        return states.tolist()
+
     #     # TODO: make generic for all environments
-    #     def sample_from_reward(
-    #         self, n_samples: int, epsilon=1e-4
-    #     ) -> TensorType["n_samples", "state_dim"]:
-    #         """
-    #         Rejection sampling  with proposal the uniform distribution in
-    #         [0, max_val]]^n_dim.
-    #
-    #         Returns a tensor in GFloNet (state) format.
-    #         """
-    #         samples_final = []
-    #         max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
-    #         while len(samples_final) < n_samples:
-    #             angles_uniform = (
-    #                 torch.rand(
-    #                     (n_samples, self.n_dim), dtype=self.float, device=self.device
-    #                 )
-    #                 * 2
-    #                 * np.pi
-    #             )
-    #             samples = torch.cat(
-    #                 (
-    #                     angles_uniform,
-    #                     torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
-    #                 ),
-    #                 axis=1,
-    #             )
-    #             rewards = self.reward_torchbatch(samples)
-    #             mask = (
-    #                 torch.rand(n_samples, dtype=self.float, device=self.device)
-    #                 * (max_reward + epsilon)
-    #                 < rewards
-    #             )
-    #             samples_accepted = samples[mask, :]
-    #             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
-    #         return torch.vstack(samples_final)
-    #
-    #     def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
-    #         aug_samples = []
-    #         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-    #             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-    #                 aug_samples.append(
-    #                     np.stack([samples[:, 0] + add_0, samples[:, 1] + add_1], axis=1)
-    #                 )
-    #         aug_samples = np.concatenate(aug_samples)
-    #         kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(aug_samples)
-    #         return kde
+    def sample_from_reward(
+        self, n_samples: int, epsilon=1e-4
+    ) -> TensorType["n_samples", "state_dim"]:
+        """
+        Rejection sampling  with proposal the uniform distribution in
+        [0, max_val]]^n_dim.
+
+        Returns a tensor in GFloNet (state) format.
+        """
+        samples_final = []
+        max_reward = self.proxy2reward(self.proxy.min)
+        while len(samples_final) < n_samples:
+            samples_uniform = self.statebatch2proxy(
+                self.get_uniform_terminating_states(n_samples)
+            )
+            rewards = self.proxy2reward(self.proxy(samples_uniform))
+            mask = (
+                torch.rand(n_samples, dtype=self.float, device=self.device)
+                * (max_reward + epsilon)
+                < rewards
+            )
+            samples_accepted = samples_uniform[mask]
+            samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
+        return torch.vstack(samples_final)
+
+    # TODO: make generic for all envs
+    def fit_kde(self, samples, kernel="gaussian", bandwidth=0.1):
+        return KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(samples)
 
     def plot_reward_samples(
         self,
@@ -1027,6 +1025,7 @@ def plot_reward_samples(
         max_samples=500,
         **kwargs,
     ):
+        # Sample a grid of points in the state space and obtain the rewards
         x = np.linspace(cell_min, cell_max, 201)
         y = np.linspace(cell_min, cell_max, 201)
         xx, yy = np.meshgrid(x, y)
@@ -1042,68 +1041,46 @@ def plot_reward_samples(
         h = ax.contourf(xx, yy, rewards.reshape(xx.shape).cpu().numpy(), alpha=alpha)
         ax.axis("scaled")
         fig.colorbar(h, ax=ax)
-        #         ax.plot([0, 0], [0, 2 * np.pi], "-w", alpha=alpha)
-        #         ax.plot([0, 2 * np.pi], [0, 0], "-w", alpha=alpha)
-        #         ax.plot([2 * np.pi, 2 * np.pi], [2 * np.pi, 0], "-w", alpha=alpha)
-        #         ax.plot([2 * np.pi, 0], [2 * np.pi, 2 * np.pi], "-w", alpha=alpha)
         # Plot samples
-        #         extra_samples = []
-        #         for add_0 in [0, -2 * np.pi, 2 * np.pi]:
-        #             for add_1 in [0, -2 * np.pi, 2 * np.pi]:
-        #                 if not (add_0 == add_1 == 0):
-        #                     extra_samples.append(
-        #                         np.stack(
-        #                             [
-        #                                 samples[:max_samples, 0] + add_0,
-        #                                 samples[:max_samples, 1] + add_1,
-        #                             ],
-        #                             axis=1,
-        #                         )
-        #                     )
-        #         extra_samples = np.concatenate(extra_samples)
         random_indices = np.random.permutation(samples.shape[0])[:max_samples]
         ax.scatter(samples[random_indices, 0], samples[random_indices, 1], alpha=alpha)
-        #         ax.scatter(extra_samples[:, 0], extra_samples[:, 1], alpha=alpha, color="white")
+        # Figure settings
         ax.grid()
         padding = 0.05 * (cell_max - cell_min)
         ax.set_xlim([cell_min - padding, cell_max + padding])
         ax.set_ylim([cell_min - padding, cell_max + padding])
-        # Set tight layout
         plt.tight_layout()
         return fig
 
-
-#     def plot_kde(
-#         self,
-#         kde,
-#         alpha=0.5,
-#         low=-np.pi * 0.5,
-#         high=2.5 * np.pi,
-#         dpi=150,
-#         colorbar=True,
-#         **kwargs,
-#     ):
-#         x = np.linspace(0, 2 * np.pi, 101)
-#         y = np.linspace(0, 2 * np.pi, 101)
-#         xx, yy = np.meshgrid(x, y)
-#         X = np.stack([xx, yy], axis=-1)
-#         Z = np.exp(kde.score_samples(X.reshape(-1, 2))).reshape(xx.shape)
-#         # Init figure
-#         fig, ax = plt.subplots()
-#         fig.set_dpi(dpi)
-#         # Plot KDE
-#         h = ax.contourf(xx, yy, Z, alpha=alpha)
-#         ax.axis("scaled")
-#         if colorbar:
-#             fig.colorbar(h, ax=ax)
-#         ax.set_xticks([])
-#         ax.set_yticks([])
-#         ax.text(0, -0.3, r"$0$", fontsize=15)
-#         ax.text(-0.28, 0, r"$0$", fontsize=15)
-#         ax.text(2 * np.pi - 0.4, -0.3, r"$2\pi$", fontsize=15)
-#         ax.text(-0.45, 2 * np.pi - 0.3, r"$2\pi$", fontsize=15)
-#         for spine in ax.spines.values():
-#             spine.set_visible(False)
-#         # Set tight layout
-#         plt.tight_layout()
-#         return fig
+    # TODO: make generic for all envs
+    def plot_kde(
+        self,
+        kde,
+        alpha=0.5,
+        cell_min=-1.0,
+        cell_max=1.0,
+        dpi=150,
+        colorbar=True,
+        **kwargs,
+    ):
+        # Sample a grid of points in the state space and score them with the KDE
+        x = np.linspace(cell_min, cell_max, 201)
+        y = np.linspace(cell_min, cell_max, 201)
+        xx, yy = np.meshgrid(x, y)
+        X = np.stack([xx, yy], axis=-1)
+        Z = np.exp(kde.score_samples(X.reshape(-1, 2))).reshape(xx.shape)
+        # Init figure
+        fig, ax = plt.subplots()
+        fig.set_dpi(dpi)
+        # Plot KDE
+        h = ax.contourf(xx, yy, Z, alpha=alpha)
+        ax.axis("scaled")
+        if colorbar:
+            fig.colorbar(h, ax=ax)
+        ax.set_xticks([])
+        ax.set_yticks([])
+        for spine in ax.spines.values():
+            spine.set_visible(False)
+        # Set tight layout
+        plt.tight_layout()
+        return fig

From f01e47504a373e6a356a970f6127629a1a0a7ac0 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 05:43:08 -0400
Subject: [PATCH 045/206] make norm and min of unfirom proxy tensors and avoid
 recomputing

---
 gflownet/proxy/uniform.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gflownet/proxy/uniform.py b/gflownet/proxy/uniform.py
index 436a4d6f3..651172aa0 100644
--- a/gflownet/proxy/uniform.py
+++ b/gflownet/proxy/uniform.py
@@ -13,4 +13,6 @@ def __call__(self, states: TensorType["batch", "state_dim"]) -> TensorType["batc
 
     @property
     def min(self):
-        return -1.0
+        if not hasattr(self, "_min"):
+            self._min = torch.tensor(-1.0, device=self.device, dtype=self.float)
+        return self._min

From 95d6d617000032fc520df8abd253d8b6579f334f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 21 Apr 2023 06:53:41 -0400
Subject: [PATCH 046/206] move lines down

---
 gflownet/envs/cube.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9f62d90ec..7d204c15c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -821,11 +821,6 @@ def sample_actions(
         increments = torch.inf * torch.ones(
             (n_states, self.n_dim), device=device, dtype=self.float
         )
-        min_increments = torch.inf * torch.ones(
-            n_states, device=device, dtype=self.float
-        )
-        min_increments[idx_generic] = self.min_incr
-        min_increments[idx_source] = 0.0
         if len(idx_sample) > 0:
             if sampling_method == "uniform":
                 distr_increments = Uniform(
@@ -858,6 +853,11 @@ def sample_actions(
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
         # Build actions
+        min_increments = torch.inf * torch.ones(
+            n_states, device=device, dtype=self.float
+        )
+        min_increments[idx_generic] = self.min_incr
+        min_increments[idx_source] = 0.0
         actions = [
             tuple(a.tolist() + [m.item()]) for a, m in zip(increments, min_increments)
         ]

From e609e8e7806d91c471486f930333ef896d7ec58b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 24 Apr 2023 04:41:02 -0400
Subject: [PATCH 047/206] add assert incr >= min_incr

---
 gflownet/envs/cube.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7d204c15c..e11d23134 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -789,6 +789,7 @@ def get_parents(
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = (incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
+                assert incr >= min_incr
                 state[dim] -= incr
             return [state], [action]
 
@@ -963,6 +964,7 @@ def step(
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = incr_rel * (1.0 - self.state[dim] - min_incr)
+                assert incr >= min_incr
                 self.state[dim] += incr
             assert all([s <= self.max_val for s in self.state]), print(
                 self.state, action

From f18aff48cda17f8a90287e4b06205a236e7baca0 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 24 Apr 2023 05:28:56 -0400
Subject: [PATCH 048/206] wip: new policy_output, new masks

---
 gflownet/envs/cube.py | 73 ++++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e11d23134..2589b3d70 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -409,21 +409,32 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         Returns a vector with the length of the discrete part of the action space + 1:
         True if action is invalid going backward given the current state, False
         otherwise.
+
+        The backward mask has the following structure:
+
+        - 0:n_dim : whether keeping a dimension as is, that is sampling a decrement of
+          0, can have zero probability. True if the value at the dimension is smaller
+          than or equal to 1 - min_incr.
+        - n_dim : whether going to source is invalid. Always valid, hence always False,
+          except if done.
+        - n_dim + 1 : whether sampling EOS is invalid. Only valid if done.
         """
         if state is None:
             state = self.state.copy()
         if done is None:
             done = self.done
+        mask_dim = self.n_dim + 2
+        # If done, only valid action is EOS.
         if done:
-            mask = [True for _ in range(self.action_space_dim)]
+            mask = [True for _ in range(mask_dim)]
             mask[-1] = False
-        # If the value of any dimension is smaller than 0.0, then next action can
-        # return to source.
-        if any([s < 0.0 for s in self.state]):
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-2] = False
-        else:
-            mask = [False for _ in range(self.action_space_dim)]
+        mask = [True for _ in range(mask_dim)]
+        mask[-2] = False
+        # Dimensions whose value is greater than 1 - min_incr must have non-zero
+        # probability of sampling a decrement of exactly zero.
+        for dim, s in enumerate(state):
+            if s > 1 - self.min_incr:
+                mask[dim] = False
         return mask
 
     def get_parents(
@@ -687,17 +698,22 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         action and another logit (pos [-2]) for the (discrete) backward probability of
         returning to the source node.
 
+        Finally, the backward distribution requires a discrete probability distribution
+        (Bernoulli) for each dimension, to model the probability of sampling an
+        increment equal to zero when the value at the dimension is larger than
+        1 - min_incr. These are stored at [0:n_dim].
+
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
         """
         policy_output = torch.ones(
-            self.n_dim * self.n_comp * 3 + 2,
+            self.n_dim + self.n_dim * self.n_comp * 3 + 2,
             device=self.device,
             dtype=self.float,
         )
-        policy_output[1:-2:3] = params["beta_alpha"]
-        policy_output[2:-2:3] = params["beta_beta"]
+        policy_output[self.n_dim + 1 : -2 : 3] = params["beta_alpha"]
+        policy_output[self.n_dim + 2 : -2 : 3] = params["beta_beta"]
         policy_output[-2] = params["bernoulli_logit"]
         policy_output[-1] = params["bernoulli_logit"]
         return policy_output
@@ -712,26 +728,33 @@ def get_mask_invalid_actions_forward(
         True if action is invalid going forward given the current state, False
         otherwise.
 
-        If the state is the source state, the generic action is not valid. EOS is valid
-        valid from any state (including the source state). The back-to-source action is
-        ignored (invalid) going forward.
+        The forward mask has the following structure:
+
+        - 0:n_dim : whether sampling each dimension is invalid. Invalid (True) if the
+          value at the dimension is larger than 1 - min_incr.
+        - n_dim : whether sampling from source is invalid. Invalid except when when the
+          state is the source state.
+        - n_dim + 1 : whether sampling EOS is invalid. EOS is valid from any state
+          (including the source state), hence always False.
         """
         if state is None:
             state = self.state.copy()
         if done is None:
             done = self.done
+        mask_dim = self.n_dim + 2
+        # If done, no action is valid
         if done:
-            return [True for _ in range(self.action_space_dim)]
-        # If state is source, the generic action is not valid.
-        if all([s == ss for s, ss in zip(state, self.source)]):
-            return [True, False, True, False]
-        # If the value of any dimension is greater than 1 - min_incr, then next action
-        # can only be EOS.
-        elif any([s > (1 - self.min_incr) for s in state]):
-            return [True, True, True, False]
-        # Otherwise, only the action_source is not valid
-        else:
-            return [False, True, True, False]
+            return [True for _ in range(mask_dim)]
+        mask = [False for _ in range(mask_dim)]
+        # If state is not source, sampling from source is invalid.
+        if state != self.source:
+            mask[-2] = True
+        # Dimensions whose value is greater than 1 - min_incr cannot be further
+        # incremented
+        for dim, s in enumerate(state):
+            if s > 1 - self.min_incr:
+                mask[dim] = True
+        return mask
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """

From 0d23ba3abd1236b331f784579dfcf8a7d1898a55 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 24 Apr 2023 05:33:16 -0400
Subject: [PATCH 049/206] fix that previous changes had been done in HybridCube

---
 gflownet/envs/cube.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2589b3d70..a007daf12 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -724,9 +724,7 @@ def get_mask_invalid_actions_forward(
         done: Optional[bool] = None,
     ) -> List:
         """
-        Returns a vector with the length of the discrete part of the action space:
-        True if action is invalid going forward given the current state, False
-        otherwise.
+        Returns a vector indicating which backward actions are invalid.
 
         The forward mask has the following structure:
 
@@ -758,21 +756,34 @@ def get_mask_invalid_actions_forward(
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
-        Returns a vector with the length of the discrete part of the action space:
-        True if action is invalid going backward given the current state, False
-        otherwise.
+        Returns a vector indicating which backward actions are invalid.
+
+        The backward mask has the following structure:
 
-        The back-to-source action (returning to the source state for backward actions)
-        is valid from any state. The source action is ignored (invalid) for backward
-        actions.
+        - 0:n_dim : whether keeping a dimension as is, that is sampling a decrement of
+          0, can have zero probability. True if the value at the dimension is smaller
+          than or equal to 1 - min_incr.
+        - n_dim : whether going to source is invalid. Always valid, hence always False,
+          except if done.
+        - n_dim + 1 : whether sampling EOS is invalid. Only valid if done.
         """
         if state is None:
             state = self.state.copy()
         if done is None:
             done = self.done
+        mask_dim = self.n_dim + 2
+        # If done, only valid action is EOS.
         if done:
-            return [True, True, True, False]
-        return [False, True, False, True]
+            mask = [True for _ in range(mask_dim)]
+            mask[-1] = False
+        mask = [True for _ in range(mask_dim)]
+        mask[-2] = False
+        # Dimensions whose value is greater than 1 - min_incr must have non-zero
+        # probability of sampling a decrement of exactly zero.
+        for dim, s in enumerate(state):
+            if s > 1 - self.min_incr:
+                mask[dim] = False
+        return mask
 
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None

From ac952c437e3cb009e8329a9b0286c8e76cf75480 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 04:53:06 -0400
Subject: [PATCH 050/206] big update of get_logprobs and sample_actions with
 new actions

---
 gflownet/envs/cube.py | 165 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 23 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index a007daf12..be88d595a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -776,6 +776,10 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         if done:
             mask = [True for _ in range(mask_dim)]
             mask[-1] = False
+            return mask
+        # If state is source, all actions are invalid.
+        if state == self.source:
+            return [True for _ in range(mask_dim)]
         mask = [True for _ in range(mask_dim)]
         mask[-2] = False
         # Dimensions whose value is greater than 1 - min_incr must have non-zero
@@ -842,15 +846,23 @@ def sample_actions(
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
         # EOS
-        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :2], axis=1)]
+        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)]
         distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
         mask_sampled_eos = distr_eos.sample().to(torch.bool)
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Sample increments
         idx_sample = idx_nofix[~mask_sampled_eos]
-        idx_generic = idx_sample[~mask_invalid_actions[idx_sample, 0]]
-        idx_source = idx_sample[~mask_invalid_actions[idx_sample, 1]]
+        mask_idx_sample = torch.zeros(n_states, device=device, dtype=torch.bool)
+        mask_idx_sample[idx_sample] = True
+        mask_source_sample = torch.logical_and(
+            ~mask_invalid_actions[:, self.n_dim], mask_idx_sample
+        )
+        mask_generic_sample = torch.logical_and(
+            mask_invalid_actions[:, self.n_dim], mask_idx_sample
+        )
+        idx_source = ns_range[mask_source_sample]
+        idx_generic = ns_range[mask_generic_sample]
         n_sample = idx_sample.shape[0]
         logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
         increments = torch.inf * torch.ones(
@@ -863,17 +875,17 @@ def sample_actions(
                     torch.ones(n_sample),
                 )
             elif sampling_method == "policy":
-                mix_logits = policy_outputs[idx_sample, 0:-2:3].reshape(
+                mix_logits = policy_outputs[idx_sample, self.n_dim : -2 : 3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 mix = Categorical(logits=mix_logits)
-                alphas = policy_outputs[idx_sample, 1:-2:3].reshape(
+                alphas = policy_outputs[idx_sample, self.n_dim + 1 : -2 : 3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 alphas = (
                     self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
                 )
-                betas = policy_outputs[idx_sample, 2:-2:3].reshape(
+                betas = policy_outputs[idx_sample, self.n_dim + 2 : -2 : 3].reshape(
                     -1, self.n_dim, self.n_comp
                 )
                 betas = (
@@ -887,15 +899,25 @@ def sample_actions(
             ).sum(axis=1)
         # Combined probabilities
         logprobs = logprobs_eos + logprobs_sample
-        # Build actions
+        # Set minimum increments
         min_increments = torch.inf * torch.ones(
             n_states, device=device, dtype=self.float
         )
         min_increments[idx_generic] = self.min_incr
         min_increments[idx_source] = 0.0
+        # Make increments of near-edge dims 0
+        mask_nearedge_dims = mask_invalid_actions[:, : self.n_dim]
+        mask_idx_sample = torch.zeros(
+            mask_nearedge_dims.shape, device=device, dtype=torch.bool
+        )
+        mask_idx_sample[idx_sample, :] = True
+        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
+        increments[mask_nearedge_dims] = 0.0
+        # Build actions
         actions = [
             tuple(a.tolist() + [m.item()]) for a, m in zip(increments, min_increments)
         ]
+        # TODO: implement logprobs here too
         return actions, logprobs
 
     def get_logprobs(
@@ -910,15 +932,55 @@ def get_logprobs(
         """
         Computes log probabilities of actions given policy outputs and actions.
 
-        At every state, the EOS action can be sampled with probability p(EOS).
-        Otherwise, an increment incr is sampled with probablility
-        p(incr) * (1 - p(EOS)).
+        For forward transitons, at every state, the probability of the EOS action is
+        p(EOS). Otherwise, the probability of an increment incr is p(incr) * (1 -
+        p(EOS)). When a dimension is larger than 1 - min_incr, the probabililty of
+        incrementing that dimension by 0 is 1.
+
+        For backward transitons, at every state, the probability of the back-to-source
+        action is p(back-to-source). Otherwise, the probability of an increment
+        (decrement) incr is p(incr) * (1 - p(back-to-source)). When a dimension is
+        larger than 1 - min_incr, the probabililty of incrementing that dimension by 0
+        must be non-zero and is p(zeroincr). In turn, the probability of sampling a
+        non-zero increment incr is (1 - p(zeroincr)) * p(incr).
+
+        Overall, we compute the log probabilities as follows:
+
+        log p = logprobs_eos + logprobs_source + logprobs_increments + logprobs_zeroincr
+
+        - logprobs_eos:
+            - 0, that is p(~EOS) = 1 for backward transitions.
+            - forward, the log p of the sampled event (EOS or not EOS)
+
+        - logprobs_source:
+            - 0, that is p(~source) = 1 for forward transitions.
+            - backward, the log p of the sampled event (source or not source)
+
+        - logprobs_increments:
+            - 0, that is p(~increment) = 1 for EOS or source events.
+            - otherwise, the log p of sampling the increment.
+
+        - logprobs_zeroincr:
+            - 0, that is p(~zeroincr) = 1 for forward transitions.
+            - 0, that is p(~zeroincr) = 1 for for dimensions that are smaller than or
+              equal to 1 - min_incr, backwards.
+            - otherwise, the log p of the sampled event (sampled 0 or not).
         """
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
+        # Determine which states have non-deterministic actions
+        if is_forward:
+            # EOS is the only valid action if all dimensions are invalid. That is, the
+            # action is non-deterministic if any dimension is valid (i.e. mask = False).
+            idx_nofix = ns_range[
+                torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
+            ]
+        else:
+            # The action is non-deterministic if sampling EOS (last value of mask) is
+            # invalid (True).
+            idx_nofix = ns_range[mask_invalid_actions[:, -1]]
         # Log probs of EOS and source (backwards) actions
-        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, :2], axis=1)]
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)
         if is_forward:
@@ -936,29 +998,78 @@ def get_logprobs(
             mask_sample = ~mask_source
         # Log probs of sampled increments
         idx_sample = idx_nofix[mask_sample]
-        logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
+        logprobs_increments = torch.zeros(
+            (n_states, self.n_dim), device=device, dtype=self.float
+        )
+        logprobs_zeroincr = torch.zeros(
+            (n_states, self.n_dim), device=device, dtype=self.float
+        )
         if len(idx_sample) > 0:
-            mix_logits = policy_outputs[idx_sample, 0:-2:3].reshape(
+            mix_logits = policy_outputs[idx_sample, self.n_dim : -2 : 3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             mix = Categorical(logits=mix_logits)
-            alphas = policy_outputs[idx_sample, 1:-2:3].reshape(
+            alphas = policy_outputs[idx_sample, self.n_dim + 1 : -2 : 3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-            betas = policy_outputs[idx_sample, 2:-2:3].reshape(
+            betas = policy_outputs[idx_sample, self.n_dim + 2 : -2 : 3].reshape(
                 -1, self.n_dim, self.n_comp
             )
             betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
             increments = actions[:, :-1].clone().detach()
-            # TODO: do something with the logprob of returning to source (backwards)?
-            logprobs_sample[idx_sample] = distr_increments.log_prob(
+            logprobs_increments[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample]
-            ).sum(axis=1)
+            )
+            # Make logprobs of "invalid" dimensions (value larger than 1 - mincr) 0.
+            # TODO: indexing can be done more efficiently to avoid sampling from the
+            # distribution above.
+            mask_nearedge_dims = ~mask_invalid_actions[:, : self.n_dim]
+            mask_idx_sample = torch.zeros(
+                mask_nearedge_dims.shape, device=device, dtype=torch.bool
+            )
+            mask_idx_sample[idx_sample, :] = True
+            mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
+            logprobs_increments[mask_nearedge_dims] = 0.0
+            # Log probs of sampling zero increments
+            if not is_forward:
+                mask_zeroincr = increments[mask_nearedge_dims] == 0.0
+                logits_zeroincr = policy_outputs[idx_sample, : self.n_dim][
+                    mask_nearedge_dims
+                ]
+                distr_zeroincr = Bernoulli(logits=logits_zeroincr)
+                logprobs_zeroincr[mask_nearedge_dims] = distr_zeroincr.log_prob(
+                    mask_zeroincr.to(self.float)
+                )
+                # TODO: make logprobs_increments = 0 if increment was zero and
+                # near-edge. Already done?
         # Combined probabilities
-        logprobs = logprobs_eos + logprobs_source + logprobs_sample
+        sumlogprobs_increments = logprobs_increments.sum(axis=1)
+        sumlogprobs_zeroincr = logprobs_zeroincr.sum(axis=1)
+        logprobs = (
+            logprobs_eos
+            + logprobs_source
+            + sumlogprobs_increments
+            + sumlogprobs_zeroincr
+        )
+        # Sanity checks
+        if is_forward:
+            mask_fix = torch.all(mask_invalid_actions[:, : self.n_dim], axis=1)
+            assert torch.all(logprobs_source == 0.0)
+            assert torch.all(logprobs_zeroincr == 0.0)
+            assert torch.all(sumlogprobs_increments[idx_nofix][mask_eos] == 0.0)
+            mask_fixdim = mask_invalid_actions[:, self.n_dim]
+            assert torch.all(logprobs_increments[mask_fixdim] == 0.0)
+        else:
+            mask_fix = ~mask_invalid_actions[:, -1]
+            assert torch.all(logprobs_eos == 0.0)
+            assert torch.all(sumlogprobs_increments[idx_nofix][mask_source] == 0.0)
+            assert torch.all(sumlogprobs_zeroincr[idx_nofix][mask_source] == 0.0)
+            mask_nozeroincr = mask_invalid_actions[:, self.n_dim]
+            assert torch.all(logprobs_zeroincr[mask_nozeroincr] == 0.0)
+        assert torch.all(logprobs[mask_fix] == 0.0)
         return logprobs
 
     def step(
@@ -998,11 +1109,19 @@ def step(
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = incr_rel * (1.0 - self.state[dim] - min_incr)
-                assert incr >= min_incr
+                assert (
+                    incr >= min_incr
+                ), f"""
+                Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
+                \nState:\n{self.state}\nAction:\n{action}
+                """
                 self.state[dim] += incr
-            assert all([s <= self.max_val for s in self.state]), print(
-                self.state, action
-            )
+            assert all(
+                [s <= self.max_val for s in self.state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+            """
             self.n_actions += 1
             return self.state, action, True
 

From b49fbd841474282866f59b4fda77a4af21266532 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 05:06:19 -0400
Subject: [PATCH 051/206] fix bug in computation of increments from relative
 increment and add assertion

---
 gflownet/envs/cube.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index be88d595a..aa5a68222 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -826,9 +826,26 @@ def get_parents(
         else:
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = (incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
-                assert incr >= min_incr
+                incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
+                assert (
+                    incr >= min_incr
+                ), f"""
+                Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
+                \nState:\n{state}\nAction:\n{action}
+                """
                 state[dim] -= incr
+                assert all(
+                    [s <= self.max_val for s in state]
+                ), f"""
+                State is out of cube bounds.
+                \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
+                """
+                assert all(
+                    [s >= 0.0 for s in state]
+                ), f"""
+                State is out of cube bounds.
+                \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
+                """
             return [state], [action]
 
     def sample_actions(
@@ -1108,7 +1125,7 @@ def step(
         else:
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = incr_rel * (1.0 - self.state[dim] - min_incr)
+                incr = min_incr + incr_rel * (1.0 - self.state[dim] - min_incr)
                 assert (
                     incr >= min_incr
                 ), f"""
@@ -1122,6 +1139,12 @@ def step(
             State is out of cube bounds.
             \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
             """
+            assert all(
+                [s >= 0.0 for s in self.state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+            """
             self.n_actions += 1
             return self.state, action, True
 

From 480f5b4285b73f1c05ecf869f0a80d49ce79271d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 05:09:44 -0400
Subject: [PATCH 052/206] fix bug in indexing

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index aa5a68222..4a6ce468f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1053,7 +1053,7 @@ def get_logprobs(
             # Log probs of sampling zero increments
             if not is_forward:
                 mask_zeroincr = increments[mask_nearedge_dims] == 0.0
-                logits_zeroincr = policy_outputs[idx_sample, : self.n_dim][
+                logits_zeroincr = policy_outputs[:, : self.n_dim][
                     mask_nearedge_dims
                 ]
                 distr_zeroincr = Bernoulli(logits=logits_zeroincr)

From 0cd82ceedf53ac66806291d6a61f16c98334df02 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 05:15:48 -0400
Subject: [PATCH 053/206] fix bug in computation of increments from relative
 increment and add assertion

---
 gflownet/envs/cube.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e11d23134..60eebe90a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -788,7 +788,9 @@ def get_parents(
         else:
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = (incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
+                incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (
+                    1 - incr_rel
+                )
                 assert incr >= min_incr
                 state[dim] -= incr
             return [state], [action]
@@ -963,7 +965,7 @@ def step(
         else:
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = incr_rel * (1.0 - self.state[dim] - min_incr)
+                incr = min_incr + incr_rel * (1.0 - self.state[dim] - min_incr)
                 assert incr >= min_incr
                 self.state[dim] += incr
             assert all([s <= self.max_val for s in self.state]), print(

From d95cdd28203c18148137f0b30880d8437f8e8b19 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 05:16:16 -0400
Subject: [PATCH 054/206] black and isort

---
 gflownet/envs/cube.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 4a6ce468f..f5da4f023 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -826,7 +826,9 @@ def get_parents(
         else:
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (1 - incr_rel)
+                incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (
+                    1 - incr_rel
+                )
                 assert (
                     incr >= min_incr
                 ), f"""
@@ -1053,9 +1055,7 @@ def get_logprobs(
             # Log probs of sampling zero increments
             if not is_forward:
                 mask_zeroincr = increments[mask_nearedge_dims] == 0.0
-                logits_zeroincr = policy_outputs[:, : self.n_dim][
-                    mask_nearedge_dims
-                ]
+                logits_zeroincr = policy_outputs[:, : self.n_dim][mask_nearedge_dims]
                 distr_zeroincr = Bernoulli(logits=logits_zeroincr)
                 logprobs_zeroincr[mask_nearedge_dims] = distr_zeroincr.log_prob(
                     mask_zeroincr.to(self.float)

From 5373100d18a9b141dc06c49b5c6406429cba0c8c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 25 Apr 2023 05:41:42 -0400
Subject: [PATCH 055/206] add epsilon to asserts

---
 gflownet/envs/cube.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index f5da4f023..d546fdc0b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -824,26 +824,27 @@ def get_parents(
         if all([s == ss for s, ss in zip(state, self.source)]):
             return [], []
         else:
+            epsilon = 1e-9
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (
                     1 - incr_rel
                 )
                 assert (
-                    incr >= min_incr
+                    incr >= (min_incr - epsilon)
                 ), f"""
                 Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
                 \nState:\n{state}\nAction:\n{action}
                 """
                 state[dim] -= incr
                 assert all(
-                    [s <= self.max_val for s in state]
+                    [s <= (self.max_val + epsilon) for s in state]
                 ), f"""
                 State is out of cube bounds.
                 \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
                 """
                 assert all(
-                    [s >= 0.0 for s in state]
+                    [s >= (0.0 - epsilon) for s in state]
                 ), f"""
                 State is out of cube bounds.
                 \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
@@ -1123,24 +1124,25 @@ def step(
             return self.state, self.eos, True
         # If action is not eos, then perform action
         else:
+            epsilon = 1e-9
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = min_incr + incr_rel * (1.0 - self.state[dim] - min_incr)
                 assert (
-                    incr >= min_incr
+                    incr >= (min_incr - epsilon)
                 ), f"""
                 Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
                 \nState:\n{self.state}\nAction:\n{action}
                 """
                 self.state[dim] += incr
             assert all(
-                [s <= self.max_val for s in self.state]
+                [s <= (self.max_val + epsilon) for s in self.state]
             ), f"""
             State is out of cube bounds.
             \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
             """
             assert all(
-                [s >= 0.0 for s in self.state]
+                [s >= (0.0 - epsilon) for s in self.state]
             ), f"""
             State is out of cube bounds.
             \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}

From 7ddd2cb33dca4a90848dbdab50490bb0201b507e Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 19:50:37 +0200
Subject: [PATCH 056/206] change increment in get parents

---
 gflownet/envs/cube.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d546fdc0b..67a942aa0 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -827,9 +827,7 @@ def get_parents(
             epsilon = 1e-9
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = (min_incr + incr_rel * (1.0 - state[dim] - min_incr)) / (
-                    1 - incr_rel
-                )
+                incr = min_incr + incr_rel * (state[dim] - min_incr)
                 assert (
                     incr >= (min_incr - epsilon)
                 ), f"""

From 49421cebe32e2f8e0ab3d4d9a6225e4fe4ac49b5 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 20:10:47 +0200
Subject: [PATCH 057/206] update mask backward with condition of s < min_incr

---
 gflownet/envs/cube.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 67a942aa0..9209a9471 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -763,8 +763,8 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         - 0:n_dim : whether keeping a dimension as is, that is sampling a decrement of
           0, can have zero probability. True if the value at the dimension is smaller
           than or equal to 1 - min_incr.
-        - n_dim : whether going to source is invalid. Always valid, hence always False,
-          except if done.
+        - n_dim : whether other actions except back-to-source are invalid. False if any
+          dimension is smaller than min_incr.
         - n_dim + 1 : whether sampling EOS is invalid. Only valid if done.
         """
         if state is None:
@@ -780,8 +780,13 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         # If state is source, all actions are invalid.
         if state == self.source:
             return [True for _ in range(mask_dim)]
+        # If any dimension is smaller than m, then back-to-source is the only valid
+        # action
+        if any([s < self.min_incr for s in state]):
+            mask = [True for _ in range(mask_dim)]
+            mask[-2] = False
+            return mask
         mask = [True for _ in range(mask_dim)]
-        mask[-2] = False
         # Dimensions whose value is greater than 1 - min_incr must have non-zero
         # probability of sampling a decrement of exactly zero.
         for dim, s in enumerate(state):
@@ -828,8 +833,8 @@ def get_parents(
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = min_incr + incr_rel * (state[dim] - min_incr)
-                assert (
-                    incr >= (min_incr - epsilon)
+                assert incr >= (
+                    min_incr - epsilon
                 ), f"""
                 Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
                 \nState:\n{state}\nAction:\n{action}
@@ -1126,8 +1131,8 @@ def step(
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
                 incr = min_incr + incr_rel * (1.0 - self.state[dim] - min_incr)
-                assert (
-                    incr >= (min_incr - epsilon)
+                assert incr >= (
+                    min_incr - epsilon
                 ), f"""
                 Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
                 \nState:\n{self.state}\nAction:\n{action}

From ed3606d3ec3cc414f83355e56b0cf9ed336fc311 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 20:23:22 +0200
Subject: [PATCH 058/206] add condition about mandatory back-to-source in
 get_log_probs

---
 gflownet/envs/cube.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9209a9471..5ff7f5c4b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -977,6 +977,8 @@ def get_logprobs(
 
         - logprobs_source:
             - 0, that is p(~source) = 1 for forward transitions.
+            - 0, that is p(~source) = 1 for backward transitions when any dimension is
+              smaller than min_incr.
             - backward, the log p of the sampled event (source or not source)
 
         - logprobs_increments:
@@ -1001,8 +1003,13 @@ def get_logprobs(
             ]
         else:
             # The action is non-deterministic if sampling EOS (last value of mask) is
-            # invalid (True).
-            idx_nofix = ns_range[mask_invalid_actions[:, -1]]
+            # invalid (True) and back-to-source (second to last) is not the only action
+            # (False).
+            idx_nofix = ns_range[
+                torch.logical_and(
+                    mask_invalid_actions[:, -1], ~mask_invalid_actions[:, -2]
+                )
+            ]
         # Log probs of EOS and source (backwards) actions
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)

From 94b5dcd753c6648eecaace37a0bafa05cc181e01 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 21:35:38 +0200
Subject: [PATCH 059/206] sum log det jacobian to logprobs; make it zero in
 base env; implement log det jacobian in hypercube;

---
 gflownet/envs/base.py | 11 +++++++++++
 gflownet/envs/cube.py | 36 ++++++++++++++++++++++++++++++++++++
 gflownet/gflownet.py  |  2 ++
 3 files changed, 49 insertions(+)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index fb1140a30..4396a2628 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -280,6 +280,17 @@ def get_logprobs(
         logprobs = self.logsoftmax(logits)[ns_range, action_indices]
         return logprobs
 
+    def get_log_det_jacobian(
+        self, states: TensorType["batch_size", "state_dim"], is_forward: bool
+    ):
+        """
+        Computes the logarithm of the determinant of the Jacobian of the sampled
+        actions with respect to the states. In general, the determinant is equal to 1,
+        hence the logarithm is 0. Environments where this is not the case must
+        implement the computation of the Jacobian for forward and backward transitions.
+        """
+        return torch.zeros(states.shape[0], device=states.device, dtype=self.float)
+
     def get_policy_output(self, params: Optional[dict] = None):
         """
         Defines the structure of the output of the policy model, from which an
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 5ff7f5c4b..e75061c3e 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1100,6 +1100,42 @@ def get_logprobs(
         assert torch.all(logprobs[mask_fix] == 0.0)
         return logprobs
 
+    def get_log_det_jacobian(
+        self, states: TensorType["batch_size", "state_dim"], is_forward: bool
+    ):
+        """
+        Computes the logarithm of the determinant of the Jacobian of the sampled
+        actions with respect to the states.
+
+        Forward: the sampled variables are the relative increments r and the state
+        updates (s -> s') are:
+
+        s' = s + m + r(1 - s - m)
+        r = (s' - s - m) / (1 - s - m)
+
+        Therefore, the derivative of r wrt to s' is
+
+        dr/ds' = 1 / (1 - s - m)
+
+        Backward: the sampled variables are the relative decrements r and the state
+        updates (s' -> s) are:
+
+        s = s' - m - r(s' - m)
+        r = (s' - s - m) / (s' - m)
+
+        Therefore, the derivative of r wrt to s is
+
+        dr/ds = -1 / (s' - m)
+
+        The derivatives of the components of r with respect to dimensions of s or s'
+        other than itself are zero. Therefore, the Jacobian is diagonal and the
+        determinant is the product of the diagonal.
+        """
+        if is_forward:
+            return torch.sum(torch.log(1.0 / (1 - states - self.min_incr)), dim=1)
+        else:
+            return torch.sum(torch.log(-1.0 / (states - self.min_incr)), dim=1)
+
     def step(
         self, action: Tuple[int, float]
     ) -> Tuple[List[float], Tuple[int, float], bool]:
diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index cb7fa7d9f..96937fa8d 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -657,6 +657,7 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         logprobs_f = self.env.get_logprobs(
             policy_output_f, True, actions, states, masks_f, loginf
         )
+        logprobs_f = logprobs_f + self.env.get_log_det_jacobian(parents, True)
         sumlogprobs_f = torch.zeros(
             len(torch.unique(traj_id, sorted=True)),
             dtype=self.float,
@@ -667,6 +668,7 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         logprobs_b = self.env.get_logprobs(
             policy_output_b, False, actions, parents, masks_b, loginf
         )
+        logprobs_b = logprobs_b + self.env.get_log_det_jacobian(states, False)
         sumlogprobs_b = torch.zeros(
             len(torch.unique(traj_id, sorted=True)),
             dtype=self.float,

From c7a453024e559a0138c209c4a9dc9a31cebd7532 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 22:03:05 +0200
Subject: [PATCH 060/206] add jacobian to logprobs inside get_logprobs; now
 get_logprobs needs states_from as argument so adapt in other scripts

---
 gflownet/envs/base.py   |  3 ++-
 gflownet/envs/ctorus.py |  3 ++-
 gflownet/envs/cube.py   | 15 +++++++++++----
 gflownet/envs/htorus.py |  9 +++++----
 gflownet/gflownet.py    |  6 ++----
 5 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index 4396a2628..51438b080 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -256,7 +256,8 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "actions_dim"],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 06faacd09..7d8dbbba0 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -203,7 +203,8 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["n_states", "1"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e75061c3e..5606ef01c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -210,7 +210,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -543,7 +543,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -948,7 +948,8 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -1020,7 +1021,7 @@ def get_logprobs(
             mask_sample = ~mask_eos
         else:
             source = torch.tensor(self.source, device=device)
-            mask_source = torch.all(states_target[idx_nofix] == source, axis=1)
+            mask_source = torch.all(states_to[idx_nofix] == source, axis=1)
             distr_source = Bernoulli(logits=policy_outputs[idx_nofix, -2])
             logprobs_source[idx_nofix] = distr_source.log_prob(
                 mask_source.to(self.float)
@@ -1073,6 +1074,11 @@ def get_logprobs(
                 )
                 # TODO: make logprobs_increments = 0 if increment was zero and
                 # near-edge. Already done?
+        # Log determinant of the Jacobian
+        log_det_jacobian = torch.zeros(n_states, device=device, dtype=self.float)
+        log_det_jacobian[idx_sample] = self.get_log_det_jacobian(
+            states_from[idx_sample], is_forward
+        )
         # Combined probabilities
         sumlogprobs_increments = logprobs_increments.sum(axis=1)
         sumlogprobs_zeroincr = logprobs_zeroincr.sum(axis=1)
@@ -1081,6 +1087,7 @@ def get_logprobs(
             + logprobs_source
             + sumlogprobs_increments
             + sumlogprobs_zeroincr
+            + log_det_jacobian
         )
         # Sanity checks
         if is_forward:
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 97cb73f0a..242ad446c 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -406,7 +406,8 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -439,11 +440,11 @@ def get_logprobs(
         source = torch.tensor(self.source_angles, device=device)
         source_aux = torch.tensor(self.source_angles + [-1], device=device)
         nsource_ne_nsteps = torch.ne(
-            torch.sum(torch.ne(states_target[:, :-1], source), axis=1),
-            states_target[:, -1],
+            torch.sum(torch.ne(states_to[:, :-1], source), axis=1),
+            states_to[:, -1],
         )
         angledim_ne_source = torch.ne(
-            states_target[ns_range, dimensions], source_aux[dimensions]
+            states_to[ns_range, dimensions], source_aux[dimensions]
         )
         noeos = torch.ne(dimensions, self.eos[0])
         nofix_indices = torch.logical_and(
diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 96937fa8d..82e14828b 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -655,9 +655,8 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         # Forward trajectories
         policy_output_f = self.forward_policy(self.env.statetorch2policy(parents))
         logprobs_f = self.env.get_logprobs(
-            policy_output_f, True, actions, states, masks_f, loginf
+            policy_output_f, True, actions, parents, states, masks_f, loginf
         )
-        logprobs_f = logprobs_f + self.env.get_log_det_jacobian(parents, True)
         sumlogprobs_f = torch.zeros(
             len(torch.unique(traj_id, sorted=True)),
             dtype=self.float,
@@ -666,9 +665,8 @@ def trajectorybalance_loss(self, it, batch, loginf=1000):
         # Backward trajectories
         policy_output_b = self.backward_policy(self.env.statetorch2policy(states))
         logprobs_b = self.env.get_logprobs(
-            policy_output_b, False, actions, parents, masks_b, loginf
+            policy_output_b, False, actions, states, parents, masks_b, loginf
         )
-        logprobs_b = logprobs_b + self.env.get_log_det_jacobian(states, False)
         sumlogprobs_b = torch.zeros(
             len(torch.unique(traj_id, sorted=True)),
             dtype=self.float,

From 6b32769737c4fde2a7d9f76c3899067539a9c06a Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 22:04:56 +0200
Subject: [PATCH 061/206] refactor get_logprobs in test

---
 tests/gflownet/envs/common.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 9692902c3..356e342e9 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -184,7 +184,8 @@ def test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env):
             policy_outputs=policy_outputs,
             is_forward=True,
             actions=actions_torch,
-            states_target=None,
+            states_from=None,
+            states_to=None,
             mask_invalid_actions=masks_invalid_torch,
         )
         action = actions[0]

From f5398ae1b0d55a74f5bade58e647d4a0fca6b132 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Sun, 30 Apr 2023 22:17:38 +0200
Subject: [PATCH 062/206] add epsilon to jacobian

---
 gflownet/envs/cube.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 5606ef01c..6f5d02dd6 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1138,10 +1138,15 @@ def get_log_det_jacobian(
         other than itself are zero. Therefore, the Jacobian is diagonal and the
         determinant is the product of the diagonal.
         """
+        epsilon = 1e-9
         if is_forward:
-            return torch.sum(torch.log(1.0 / (1 - states - self.min_incr)), dim=1)
+            return torch.sum(
+                torch.log(1.0 / ((1 - states - self.min_incr) + epsilon)), dim=1
+            )
         else:
-            return torch.sum(torch.log(-1.0 / (states - self.min_incr)), dim=1)
+            return torch.sum(
+                torch.log(-1.0 / ((states - self.min_incr) + epsilon)), dim=1
+            )
 
     def step(
         self, action: Tuple[int, float]

From a53d9dac7bcaa781d53f267f70cd9d3d8d69d2ed Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 30 Apr 2023 16:19:51 -0400
Subject: [PATCH 063/206] fix typo

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 5606ef01c..b445e2fd1 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -948,7 +948,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
         states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,

From 1a3e6f97add18995e471efef4f69d7b3748e2c9d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 1 May 2023 03:08:24 -0400
Subject: [PATCH 064/206] change sign of backward jacobian; fix mask nofix in
 get_logprobs; fix asserts in get_logprobs

---
 gflownet/envs/cube.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 1fa59e1fa..add7a7698 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1005,10 +1005,10 @@ def get_logprobs(
         else:
             # The action is non-deterministic if sampling EOS (last value of mask) is
             # invalid (True) and back-to-source (second to last) is not the only action
-            # (False).
+            # (True).
             idx_nofix = ns_range[
                 torch.logical_and(
-                    mask_invalid_actions[:, -1], ~mask_invalid_actions[:, -2]
+                    mask_invalid_actions[:, -1], mask_invalid_actions[:, -2]
                 )
             ]
         # Log probs of EOS and source (backwards) actions
@@ -1090,19 +1090,22 @@ def get_logprobs(
             + log_det_jacobian
         )
         # Sanity checks
+        if torch.any(torch.isnan(logprobs)):
+            import ipdb; ipdb.set_trace()
+        assert not torch.any(torch.isnan(logprobs))
         if is_forward:
             mask_fix = torch.all(mask_invalid_actions[:, : self.n_dim], axis=1)
             assert torch.all(logprobs_source == 0.0)
             assert torch.all(logprobs_zeroincr == 0.0)
             assert torch.all(sumlogprobs_increments[idx_nofix][mask_eos] == 0.0)
-            mask_fixdim = mask_invalid_actions[:, self.n_dim]
+            mask_fixdim = mask_invalid_actions[:, : self.n_dim]
             assert torch.all(logprobs_increments[mask_fixdim] == 0.0)
         else:
             mask_fix = ~mask_invalid_actions[:, -1]
             assert torch.all(logprobs_eos == 0.0)
             assert torch.all(sumlogprobs_increments[idx_nofix][mask_source] == 0.0)
             assert torch.all(sumlogprobs_zeroincr[idx_nofix][mask_source] == 0.0)
-            mask_nozeroincr = mask_invalid_actions[:, self.n_dim]
+            mask_nozeroincr = mask_invalid_actions[:, : self.n_dim]
             assert torch.all(logprobs_zeroincr[mask_nozeroincr] == 0.0)
         assert torch.all(logprobs[mask_fix] == 0.0)
         return logprobs
@@ -1134,6 +1137,9 @@ def get_log_det_jacobian(
 
         dr/ds = -1 / (s' - m)
 
+        We change the sign of the derivative (Jacobian) because r is strictly
+        decreasing in the domain of s.
+
         The derivatives of the components of r with respect to dimensions of s or s'
         other than itself are zero. Therefore, the Jacobian is diagonal and the
         determinant is the product of the diagonal.
@@ -1145,7 +1151,7 @@ def get_log_det_jacobian(
             )
         else:
             return torch.sum(
-                torch.log(-1.0 / ((states - self.min_incr) + epsilon)), dim=1
+                torch.log(1.0 / ((states - self.min_incr) + epsilon)), dim=1
             )
 
     def step(

From 71891562e06b77688fb63ee488fa96c584201871 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 2 May 2023 03:14:46 -0400
Subject: [PATCH 065/206] disable eos action from source

---
 gflownet/envs/cube.py | 45 +++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index add7a7698..8b2e9442d 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -744,6 +744,9 @@ def get_mask_invalid_actions_forward(
         if done:
             return [True for _ in range(mask_dim)]
         mask = [False for _ in range(mask_dim)]
+        # If state is source, EOS is invalid
+        if state == self.source:
+            mask[-1] = True
         # If state is not source, sampling from source is invalid.
         if state != self.source:
             mask[-2] = True
@@ -868,21 +871,25 @@ def sample_actions(
         device = policy_outputs.device
         n_states = policy_outputs.shape[0]
         ns_range = torch.arange(n_states).to(device)
+        mask_nofix = torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
+        idx_nofix = ns_range[mask_nofix]
         # EOS
-        idx_nofix = ns_range[torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)]
-        distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
+        mask_can_eos = torch.logical_and(mask_nofix, ~mask_invalid_actions[:, -1])
+        idx_can_eos = ns_range[mask_can_eos]
+        distr_eos = Bernoulli(logits=policy_outputs[idx_can_eos, -1])
         mask_sampled_eos = distr_eos.sample().to(torch.bool)
+        idx_sampled_eos = idx_can_eos[mask_sampled_eos]
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
-        logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_sampled_eos.to(self.float))
         # Sample increments
-        idx_sample = idx_nofix[~mask_sampled_eos]
-        mask_idx_sample = torch.zeros(n_states, device=device, dtype=torch.bool)
-        mask_idx_sample[idx_sample] = True
+        mask_sample = torch.zeros(n_states, device=device, dtype=torch.bool)
+        mask_sample[idx_nofix] = True
+        mask_sample[idx_sampled_eos] = False
+        idx_sample = ns_range[mask_sample]
         mask_source_sample = torch.logical_and(
-            ~mask_invalid_actions[:, self.n_dim], mask_idx_sample
+            ~mask_invalid_actions[:, self.n_dim], mask_sample
         )
         mask_generic_sample = torch.logical_and(
-            mask_invalid_actions[:, self.n_dim], mask_idx_sample
+            mask_invalid_actions[:, self.n_dim], mask_sample
         )
         idx_source = ns_range[mask_source_sample]
         idx_generic = ns_range[mask_generic_sample]
@@ -930,11 +937,11 @@ def sample_actions(
         min_increments[idx_source] = 0.0
         # Make increments of near-edge dims 0
         mask_nearedge_dims = mask_invalid_actions[:, : self.n_dim]
-        mask_idx_sample = torch.zeros(
+        mask_sample = torch.zeros(
             mask_nearedge_dims.shape, device=device, dtype=torch.bool
         )
-        mask_idx_sample[idx_sample, :] = True
-        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
+        mask_sample[idx_sample, :] = True
+        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_sample)
         increments[mask_nearedge_dims] = 0.0
         # Build actions
         actions = [
@@ -999,18 +1006,16 @@ def get_logprobs(
         if is_forward:
             # EOS is the only valid action if all dimensions are invalid. That is, the
             # action is non-deterministic if any dimension is valid (i.e. mask = False).
-            idx_nofix = ns_range[
-                torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
-            ]
+            mask_nofix = torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
+            idx_nofix = ns_range[mask_nofix]
         else:
             # The action is non-deterministic if sampling EOS (last value of mask) is
             # invalid (True) and back-to-source (second to last) is not the only action
             # (True).
-            idx_nofix = ns_range[
-                torch.logical_and(
-                    mask_invalid_actions[:, -1], mask_invalid_actions[:, -2]
-                )
-            ]
+            mask_nofix = torch.logical_and(
+                mask_invalid_actions[:, -1], mask_invalid_actions[:, -2]
+            )
+            idx_nofix = ns_range[mask_nofix]
         # Log probs of EOS and source (backwards) actions
         logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
         logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)
@@ -1090,8 +1095,6 @@ def get_logprobs(
             + log_det_jacobian
         )
         # Sanity checks
-        if torch.any(torch.isnan(logprobs)):
-            import ipdb; ipdb.set_trace()
         assert not torch.any(torch.isnan(logprobs))
         if is_forward:
             mask_fix = torch.all(mask_invalid_actions[:, : self.n_dim], axis=1)

From 6661c3daafc03d1f62ff5ba1d4a6cd5b7363b39f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 2 May 2023 03:52:09 -0400
Subject: [PATCH 066/206] make jacobian of zero increments zero

---
 gflownet/envs/base.py | 10 +++++-----
 gflownet/envs/cube.py | 18 +++++++++---------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index 51438b080..6dae24a96 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -281,16 +281,16 @@ def get_logprobs(
         logprobs = self.logsoftmax(logits)[ns_range, action_indices]
         return logprobs
 
-    def get_log_det_jacobian(
+    def get_jacobian_diag(
         self, states: TensorType["batch_size", "state_dim"], is_forward: bool
     ):
         """
         Computes the logarithm of the determinant of the Jacobian of the sampled
-        actions with respect to the states. In general, the determinant is equal to 1,
-        hence the logarithm is 0. Environments where this is not the case must
-        implement the computation of the Jacobian for forward and backward transitions.
+        actions with respect to the states. In general, the determinant is equal to 1.
+        Environments where this is not the case must implement the computation of the
+        Jacobian for forward and backward transitions.
         """
-        return torch.zeros(states.shape[0], device=states.device, dtype=self.float)
+        return torch.ones(states.shape, device=states.device, dtype=self.float)
 
     def get_policy_output(self, params: Optional[dict] = None):
         """
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 8b2e9442d..1adea039f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1080,10 +1080,14 @@ def get_logprobs(
                 # TODO: make logprobs_increments = 0 if increment was zero and
                 # near-edge. Already done?
         # Log determinant of the Jacobian
-        log_det_jacobian = torch.zeros(n_states, device=device, dtype=self.float)
-        log_det_jacobian[idx_sample] = self.get_log_det_jacobian(
+        jacobian_diag = torch.ones(
+            (n_states, self.n_dim), device=device, dtype=self.float
+        )
+        jacobian_diag[idx_sample] = self.get_jacobian_diag(
             states_from[idx_sample], is_forward
         )
+        jacobian_diag[mask_nearedge_dims] = 1.0
+        log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
         # Combined probabilities
         sumlogprobs_increments = logprobs_increments.sum(axis=1)
         sumlogprobs_zeroincr = logprobs_zeroincr.sum(axis=1)
@@ -1113,7 +1117,7 @@ def get_logprobs(
         assert torch.all(logprobs[mask_fix] == 0.0)
         return logprobs
 
-    def get_log_det_jacobian(
+    def get_jacobian_diag(
         self, states: TensorType["batch_size", "state_dim"], is_forward: bool
     ):
         """
@@ -1149,13 +1153,9 @@ def get_log_det_jacobian(
         """
         epsilon = 1e-9
         if is_forward:
-            return torch.sum(
-                torch.log(1.0 / ((1 - states - self.min_incr) + epsilon)), dim=1
-            )
+            return 1.0 / ((1 - states - self.min_incr) + epsilon)
         else:
-            return torch.sum(
-                torch.log(1.0 / ((states - self.min_incr) + epsilon)), dim=1
-            )
+            return 1.0 / ((states - self.min_incr) + epsilon)
 
     def step(
         self, action: Tuple[int, float]

From 1d395efca70aa8e4dd0897717501f0fe00a64e0a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 3 May 2023 03:40:01 -0400
Subject: [PATCH 067/206] change equation to obtain the backward transition
 target state from a relative increment so that it matches the forward
 transition

---
 gflownet/envs/cube.py | 54 +++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 36 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 1adea039f..69dbfb104 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -832,29 +832,24 @@ def get_parents(
         if all([s == ss for s, ss in zip(state, self.source)]):
             return [], []
         else:
-            epsilon = 1e-9
             min_incr = action[-1]
             for dim, incr_rel in enumerate(action[:-1]):
-                incr = min_incr + incr_rel * (state[dim] - min_incr)
-                assert incr >= (
-                    min_incr - epsilon
-                ), f"""
-                Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
-                \nState:\n{state}\nAction:\n{action}
-                """
-                state[dim] -= incr
-                assert all(
-                    [s <= (self.max_val + epsilon) for s in state]
-                ), f"""
-                State is out of cube bounds.
-                \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
-                """
-                assert all(
-                    [s >= (0.0 - epsilon) for s in state]
-                ), f"""
-                State is out of cube bounds.
-                \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
-                """
+                state[dim] = (state[dim] - min_incr - incr_rel * (1.0 - min_incr)) / (
+                    1.0 - incr_rel
+                )
+            epsilon = 1e-9
+            assert all(
+                [s <= (self.max_val + epsilon) for s in state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
+            """
+            assert all(
+                [s >= (0.0 - epsilon) for s in state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
+            """
             return [state], [action]
 
     def sample_actions(
@@ -1124,7 +1119,7 @@ def get_jacobian_diag(
         Computes the logarithm of the determinant of the Jacobian of the sampled
         actions with respect to the states.
 
-        Forward: the sampled variables are the relative increments r and the state
+        The sampled variables are the relative increments r and the state
         updates (s -> s') are:
 
         s' = s + m + r(1 - s - m)
@@ -1134,19 +1129,6 @@ def get_jacobian_diag(
 
         dr/ds' = 1 / (1 - s - m)
 
-        Backward: the sampled variables are the relative decrements r and the state
-        updates (s' -> s) are:
-
-        s = s' - m - r(s' - m)
-        r = (s' - s - m) / (s' - m)
-
-        Therefore, the derivative of r wrt to s is
-
-        dr/ds = -1 / (s' - m)
-
-        We change the sign of the derivative (Jacobian) because r is strictly
-        decreasing in the domain of s.
-
         The derivatives of the components of r with respect to dimensions of s or s'
         other than itself are zero. Therefore, the Jacobian is diagonal and the
         determinant is the product of the diagonal.
@@ -1155,7 +1137,7 @@ def get_jacobian_diag(
         if is_forward:
             return 1.0 / ((1 - states - self.min_incr) + epsilon)
         else:
-            return 1.0 / ((states - self.min_incr) + epsilon)
+            return 1.0 / ((1 - states - self.min_incr) + epsilon)
 
     def step(
         self, action: Tuple[int, float]

From c1b651c46dca8520160d7870c79ce792f4983a01 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 4 May 2023 04:54:25 -0400
Subject: [PATCH 068/206] move mask of near-edge dims out of if to avoid error

---
 gflownet/envs/cube.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 69dbfb104..975ec5cf4 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1035,6 +1035,13 @@ def get_logprobs(
         logprobs_zeroincr = torch.zeros(
             (n_states, self.n_dim), device=device, dtype=self.float
         )
+        # Build mask of near-edge values
+        mask_nearedge_dims = ~mask_invalid_actions[:, : self.n_dim]
+        mask_idx_sample = torch.zeros(
+            mask_nearedge_dims.shape, device=device, dtype=torch.bool
+        )
+        mask_idx_sample[idx_sample, :] = True
+        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
         if len(idx_sample) > 0:
             mix_logits = policy_outputs[idx_sample, self.n_dim : -2 : 3].reshape(
                 -1, self.n_dim, self.n_comp
@@ -1057,12 +1064,6 @@ def get_logprobs(
             # Make logprobs of "invalid" dimensions (value larger than 1 - mincr) 0.
             # TODO: indexing can be done more efficiently to avoid sampling from the
             # distribution above.
-            mask_nearedge_dims = ~mask_invalid_actions[:, : self.n_dim]
-            mask_idx_sample = torch.zeros(
-                mask_nearedge_dims.shape, device=device, dtype=torch.bool
-            )
-            mask_idx_sample[idx_sample, :] = True
-            mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
             logprobs_increments[mask_nearedge_dims] = 0.0
             # Log probs of sampling zero increments
             if not is_forward:

From 5d9424fab6a0b5b71c42200511ff20c962ac7072 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 4 May 2023 05:35:54 -0400
Subject: [PATCH 069/206] pass backward relative increment (rb) to backward
 beta distribution instead of rf; revert backward jacobian; pass
 min_increments to jacobian method

---
 gflownet/envs/cube.py | 49 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 975ec5cf4..0f6e5532c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -730,8 +730,8 @@ def get_mask_invalid_actions_forward(
 
         - 0:n_dim : whether sampling each dimension is invalid. Invalid (True) if the
           value at the dimension is larger than 1 - min_incr.
-        - n_dim : whether sampling from source is invalid. Invalid except when when the
-          state is the source state.
+        - n_dim : whether sampling from source is invalid. Invalid (True) except when
+          when the state is the source state.
         - n_dim + 1 : whether sampling EOS is invalid. EOS is valid from any state
           (including the source state), hence always False.
         """
@@ -833,9 +833,9 @@ def get_parents(
             return [], []
         else:
             min_incr = action[-1]
-            for dim, incr_rel in enumerate(action[:-1]):
-                state[dim] = (state[dim] - min_incr - incr_rel * (1.0 - min_incr)) / (
-                    1.0 - incr_rel
+            for dim, incr_rel_f in enumerate(action[:-1]):
+                state[dim] = (state[dim] - min_incr - incr_rel_f * (1.0 - min_incr)) / (
+                    1.0 - incr_rel_f
                 )
             epsilon = 1e-9
             assert all(
@@ -1057,7 +1057,24 @@ def get_logprobs(
             betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
-            increments = actions[:, :-1].clone().detach()
+            increments_f = actions[:, :-1].clone().detach()
+            # Compute backward relative increments (rb) from forward relative
+            # increments (rf)
+            # Forward (s -> s'): s' = s + m + rf * (1 - s - m)
+            # Forward: rf = (s' - s - m) / (1 - s - m)
+            # Backward (s' -> s): s = (s' - m - rf * (1 - m) / (1 - rf)
+            # Backward (s' -> s): s = s' - m - rb * (s' - m)
+            # Backward: rb = (s' - s - m) / (s' - m)
+            # rb = rf (1 - s - m) / (s' - m)
+            if not is_forward:
+                increments_b = (
+                    increments_f
+                    * (1 - states_to - self.min_incr)
+                    / (states_from - self.min_incr)
+                )
+                increments = increments_b
+            else:
+                increments = increments_f
             logprobs_increments[idx_sample] = distr_increments.log_prob(
                 increments[idx_sample]
             )
@@ -1076,11 +1093,19 @@ def get_logprobs(
                 # TODO: make logprobs_increments = 0 if increment was zero and
                 # near-edge. Already done?
         # Log determinant of the Jacobian
+        min_increments = torch.self.min_incr * torch.ones(
+            idx_sample.shape[0], device=device, dtype=self.float
+        )
+        if is_forward:
+            mask_source_sample = torch.logical_and(
+                ~mask_invalid_actions[:, -2], mask_sample
+            )
+            min_increments[mask_source_sample] = 0.0
         jacobian_diag = torch.ones(
             (n_states, self.n_dim), device=device, dtype=self.float
         )
         jacobian_diag[idx_sample] = self.get_jacobian_diag(
-            states_from[idx_sample], is_forward
+            states_from[idx_sample], is_forward, min_increments
         )
         jacobian_diag[mask_nearedge_dims] = 1.0
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
@@ -1113,8 +1138,12 @@ def get_logprobs(
         assert torch.all(logprobs[mask_fix] == 0.0)
         return logprobs
 
+    # TODO: min_incr is zero from source!
     def get_jacobian_diag(
-        self, states: TensorType["batch_size", "state_dim"], is_forward: bool
+        self,
+        states: TensorType["batch_size", "state_dim"],
+        is_forward: bool,
+        min_increments: TensorType["batch_size"],
     ):
         """
         Computes the logarithm of the determinant of the Jacobian of the sampled
@@ -1136,9 +1165,9 @@ def get_jacobian_diag(
         """
         epsilon = 1e-9
         if is_forward:
-            return 1.0 / ((1 - states - self.min_incr) + epsilon)
+            return 1.0 / ((1 - states - min_increments) + epsilon)
         else:
-            return 1.0 / ((1 - states - self.min_incr) + epsilon)
+            return 1.0 / ((states - min_increments) + epsilon)
 
     def step(
         self, action: Tuple[int, float]

From f1e05c261d4bb26e3e60064e1e995a8b86adaece Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 4 May 2023 05:36:48 -0400
Subject: [PATCH 070/206] add kwargs to get_jacobian_diag in base

---
 gflownet/envs/base.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index 6dae24a96..d3d68ddde 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -282,7 +282,10 @@ def get_logprobs(
         return logprobs
 
     def get_jacobian_diag(
-        self, states: TensorType["batch_size", "state_dim"], is_forward: bool
+        self,
+        states: TensorType["batch_size", "state_dim"],
+        is_forward: bool,
+        **kwargs,
     ):
         """
         Computes the logarithm of the determinant of the Jacobian of the sampled

From 159929effdf431e6609a3d34a6a52cc5add911bf Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 4 May 2023 06:24:30 -0400
Subject: [PATCH 071/206] fix errors and update docstring of jacobian method

---
 gflownet/envs/cube.py | 52 ++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0f6e5532c..decf5aa6a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1058,20 +1058,21 @@ def get_logprobs(
             beta_distr = Beta(alphas, betas)
             distr_increments = MixtureSameFamily(mix, beta_distr)
             increments_f = actions[:, :-1].clone().detach()
-            # Compute backward relative increments (rb) from forward relative
-            # increments (rf)
-            # Forward (s -> s'): s' = s + m + rf * (1 - s - m)
-            # Forward: rf = (s' - s - m) / (1 - s - m)
-            # Backward (s' -> s): s = (s' - m - rf * (1 - m) / (1 - rf)
-            # Backward (s' -> s): s = s' - m - rb * (s' - m)
-            # Backward: rb = (s' - s - m) / (s' - m)
-            # rb = rf (1 - s - m) / (s' - m)
+            # Compute backward relative increments (r_b) from forward relative
+            # increments (r_f)
+            # Forward (s -> s'): s' = s + m + r_f * (1 - s - m)
+            # Forward: r_f = (s' - s - m) / (1 - s - m)
+            # Backward (s' -> s): s = (s' - m - r_f * (1 - m) / (1 - r_f)
+            # Backward (s' -> s): s = s' - m - r_b * (s' - m)
+            # Backward: r_b = (s' - s - m) / (s' - m)
+            # r_b = r_f (1 - s - m) / (s' - m)
             if not is_forward:
                 increments_b = (
                     increments_f
                     * (1 - states_to - self.min_incr)
                     / (states_from - self.min_incr)
                 )
+                increments_b = torch.clip(increments_b, min=0.0, max=1.0)
                 increments = increments_b
             else:
                 increments = increments_f
@@ -1093,13 +1094,11 @@ def get_logprobs(
                 # TODO: make logprobs_increments = 0 if increment was zero and
                 # near-edge. Already done?
         # Log determinant of the Jacobian
-        min_increments = torch.self.min_incr * torch.ones(
-            idx_sample.shape[0], device=device, dtype=self.float
+        min_increments = self.min_incr * torch.ones(
+            len(idx_sample), device=device, dtype=self.float
         )
         if is_forward:
-            mask_source_sample = torch.logical_and(
-                ~mask_invalid_actions[:, -2], mask_sample
-            )
+            mask_source_sample = ~mask_invalid_actions[idx_sample, -2]
             min_increments[mask_source_sample] = 0.0
         jacobian_diag = torch.ones(
             (n_states, self.n_dim), device=device, dtype=self.float
@@ -1146,24 +1145,37 @@ def get_jacobian_diag(
         min_increments: TensorType["batch_size"],
     ):
         """
-        Computes the logarithm of the determinant of the Jacobian of the sampled
-        actions with respect to the states.
+        Computes the diagonal of the Jacobian of the sampled actions with respect to
+        the states.
 
-        The sampled variables are the relative increments r and the state
+        Forward: the sampled variables are the relative increments r_f and the state
         updates (s -> s') are:
 
-        s' = s + m + r(1 - s - m)
-        r = (s' - s - m) / (1 - s - m)
+        s' = s + m + r_f(1 - s - m)
+        r_f = (s' - s - m) / (1 - s - m)
+
+        Therefore, the derivative of r_f wrt to s' is
+
+        dr_f/ds' = 1 / (1 - s - m)
+
+        Backward: the sampled variables are the relative decrements r_b and the state
+        updates (s' -> s) are:
+
+        s = s' - m - r_b(s' - m)
+        r_b = (s' - s - m) / (s' - m)
+
+        Therefore, the derivative of r_b wrt to s is
 
-        Therefore, the derivative of r wrt to s' is
+        dr_b/ds = -1 / (s' - m)
 
-        dr/ds' = 1 / (1 - s - m)
+        We take the absolute value of the derivative (Jacobian).
 
         The derivatives of the components of r with respect to dimensions of s or s'
         other than itself are zero. Therefore, the Jacobian is diagonal and the
         determinant is the product of the diagonal.
         """
         epsilon = 1e-9
+        min_increments = min_increments.unsqueeze(-1).repeat(1, states.shape[1])
         if is_forward:
             return 1.0 / ((1 - states - min_increments) + epsilon)
         else:

From a4f9d9f809414cb1564ac67d0d2a7ba25d60af95 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 5 May 2023 05:26:44 -0400
Subject: [PATCH 072/206] assert for inf logprobs and add epsilons to avoid
 infs

---
 gflownet/envs/cube.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index decf5aa6a..7c0171ff3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1072,7 +1072,7 @@ def get_logprobs(
                     * (1 - states_to - self.min_incr)
                     / (states_from - self.min_incr)
                 )
-                increments_b = torch.clip(increments_b, min=0.0, max=1.0)
+                increments_b = torch.clip(increments_b, min=1e-6, max=1.0 - 1e-6)
                 increments = increments_b
             else:
                 increments = increments_f
@@ -1120,6 +1120,7 @@ def get_logprobs(
         )
         # Sanity checks
         assert not torch.any(torch.isnan(logprobs))
+        assert not torch.any(torch.isinf(logprobs))
         if is_forward:
             mask_fix = torch.all(mask_invalid_actions[:, : self.n_dim], axis=1)
             assert torch.all(logprobs_source == 0.0)

From ac247b15c3c9ec66142bde4524bdc2a7e2f977d1 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 12 May 2023 13:04:36 -0400
Subject: [PATCH 073/206] add output dir for multirun exps

---
 config/main.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/config/main.yaml b/config/main.yaml
index 97417b444..5207cecdc 100644
--- a/config/main.yaml
+++ b/config/main.yaml
@@ -20,6 +20,8 @@ hydra:
   # See: https://hydra.cc/docs/configure_hydra/workdir/
   run:
     dir: ${user.logdir.root}/${now:%Y-%m-%d_%H-%M-%S}
+  sweep:
+    dir: ${user.logdir.root}/multirun/${now:%Y-%m-%d_%H-%M-%S}
   job:
     # See: https://hydra.cc/docs/upgrades/1.1_to_1.2/changes_to_job_working_dir/
     # See: https://hydra.cc/docs/tutorials/basic/running_your_app/working_directory/#disable-changing-current-working-dir-to-jobs-output-dir

From 51c9e0c62ac15807c9dccaf409dd220a4bc90c18 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 7 Sep 2023 16:28:41 -0400
Subject: [PATCH 074/206] Resolve remaining conflicts, black and isort.

---
 gflownet/gflownet.py          | 12 +++++++++---
 scripts/dav_mp20_stats.py     | 13 +++++++------
 tests/gflownet/envs/common.py | 17 -----------------
 3 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 85f719456..3352a856a 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -953,7 +953,13 @@ def test(self, **plot_kwargs):
         Computes metrics by sampling trajectories from the forward policy.
         """
         if self.buffer.test_pkl is None:
-            l1, kl, jsd, corr_prob_traj_rewards, nll_tt = self.l1, self.kl, self.jsd, self.corr_prob_traj_rewards, self.nll_tt,
+            l1, kl, jsd, corr_prob_traj_rewards, nll_tt = (
+                self.l1,
+                self.kl,
+                self.jsd,
+                self.corr_prob_traj_rewards,
+                self.nll_tt,
+            )
             # TODO: Improve conditions where x_sampled is obtained
             x_sampled = None
         else:
@@ -1020,8 +1026,8 @@ def test(self, **plot_kwargs):
                     # Fit KDE with samples from reward
                     kde_true = self.env.fit_kde(
                         x_from_reward,
-                    kernel=self.logger.test.kde.kernel,
-                    bandwidth=self.logger.test.kde.bandwidth,
+                        kernel=self.logger.test.kde.kernel,
+                        bandwidth=self.logger.test.kde.bandwidth,
                     )
                     # Estimate true log density using test samples
                     # TODO: this may be specific-ish for the torus or not
diff --git a/scripts/dav_mp20_stats.py b/scripts/dav_mp20_stats.py
index 3153d9b9a..2b3e7ee5d 100644
--- a/scripts/dav_mp20_stats.py
+++ b/scripts/dav_mp20_stats.py
@@ -1,27 +1,28 @@
+import pickle
 import sys
 from argparse import ArgumentParser
+from copy import deepcopy
 from pathlib import Path
-import pickle
-import matplotlib.pyplot as plt
+
 import matplotlib as mpl
+import matplotlib.pyplot as plt
 import numpy as np
 import torch
+from Levenshtein import distance as levenshtein_distance
 from tqdm import tqdm
 from yaml import safe_load
-from copy import deepcopy
-from Levenshtein import distance as levenshtein_distance
 
 ROOT = Path(__file__).resolve().parent.parent
 sys.path.append(str(ROOT))
 sys.path.append(str(ROOT / "external" / "repos" / "ActiveLearningMaterials"))
 CMAP = mpl.colormaps["cividis"]
 
+from collections import Counter
+
 from external.repos.ActiveLearningMaterials.dave.utils.loaders import make_loaders
 from gflownet.proxy.crystals.dave import DAVE
 from gflownet.utils.common import load_gflow_net_from_run_path, resolve_path
 
-from collections import Counter
-
 
 def make_str(v):
     return "".join(["".join([chr(i + 97) for _ in range(k)]) for i, k in enumerate(v)])
diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index f04fd82a7..d1ccf425e 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -261,29 +261,12 @@ def test__get_parents__returns_same_state_and_eos_if_done(env):
 
 @pytest.mark.repeat(10)
 def test__step__returns_same_state_action_and_invalid_if_done(env):
-<<<<<<< HEAD
-    # Sample random action
-    mask_invalid = torch.unsqueeze(
-        torch.BoolTensor(env.get_mask_invalid_actions_forward()), 0
-    )
-    if not torch.is_tensor(env.random_policy_output):
-        random_policy = torch.tensor(env.random_policy_output, dtype=env.float)
-    else:
-        random_policy = env.random_policy_output
-    random_policy = torch.unsqueeze(random_policy, 0)
-    actions, _ = env.sample_actions(
-        policy_outputs=random_policy, mask_invalid_actions=mask_invalid
-    )
-    action = actions[0]
-    env.set_state(env.state, done=True)
-=======
     env.reset()
     # Sample random trajectory
     env.trajectory_random()
     assert env.done
     # Attempt another step
     action = env.action_space[np.random.randint(low=0, high=env.action_space_dim)]
->>>>>>> cube-sep23
     next_state, action_step, valid = env.step(action)
     if torch.is_tensor(env.state):
         assert env.equal(next_state, env.state)

From b9e57eb91422bad83a072a63f5e7721216697442 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 7 Sep 2023 16:44:27 -0400
Subject: [PATCH 075/206] Minor changes of docstring.

---
 gflownet/envs/cube.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7c0171ff3..87b0859cb 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -634,10 +634,10 @@ class ContinuousCube(Cube):
     action space consists of the increment of each dimension d, modelled by a mixture
     of Beta distributions. The states space is the value of each dimension. In order to
     ensure that all trajectories are of finite length, actions have a minimum increment
-    for all dimensions determined by min_incr.  If the value of any dimension is larger
-    than 1 - min_incr, then the trajectory is ended (the only next valid action is
-    EOS). In order to ensure the coverage of the state space, the first action (from
-    the source state) is not constrained by the minimum increment.
+    for all dimensions determined by min_incr. If the value of any dimension is larger
+    than 1 - min_incr, then that dimension can be further incremented. In order to
+    ensure the coverage of the state space, the first action (from the source state) is
+    not constrained by the minimum increment.
 
     Actions do not represent absolute increments but rather the relative increment with
     respect to the distance to the edges of the hyper-cube, from the minimum increment.
@@ -656,7 +656,7 @@ class ContinuousCube(Cube):
 
     min_incr : float
         Minimum increment in the actions, expressed as the fraction of max_val. This is
-        necessary to ensure coverage of the state space.
+        necessary to ensure that trajectories have finite length.
     """
 
     def __init__(self, **kwargs):

From cfc98463bc7a3d57376f01f7af81f64adf5490b1 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 7 Sep 2023 16:49:56 -0400
Subject: [PATCH 076/206] Minor changes of docstring.

---
 gflownet/envs/cube.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 87b0859cb..b70bcd8cd 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -685,7 +685,10 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
         Defines the structure of the output of the policy model, from which an
         action is to be determined or sampled, by returning a vector with a fixed
-        random policy.
+        random policy. The environment consists of both continuous and discrete
+        actions.
+
+        Continuous actions
 
         For each dimension d of the hyper-cube and component c of the mixture, the
         output of the policy should return
@@ -693,10 +696,12 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
           2) the logit(alpha) parameter of the Beta distribution to sample the increment
           3) the logit(beta) parameter of the Beta distribution to sample the increment
 
-        Additionally, the policy output contains one logit (pos [-1]) of a Bernoulli
-        distribution to model the (discrete) forward probability of selecting the EOS
-        action and another logit (pos [-2]) for the (discrete) backward probability of
-        returning to the source node.
+        Discrete actions
+
+        Additionally, the policy output contains one logit of a Bernoulli distribution
+        to model the (discrete) forward probability of selecting the EOS action and
+        another logit for the (discrete) backward probability of returning to the
+        source node.
 
         Finally, the backward distribution requires a discrete probability distribution
         (Bernoulli) for each dimension, to model the probability of sampling an

From b2afaee8622bc445e8f0033fe176c3136dabf54c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 7 Sep 2023 17:57:00 -0400
Subject: [PATCH 077/206] Improve organisation and retrieval of different parts
 of the policy output.

---
 config/env/ccube.yaml |  10 +++-
 gflownet/envs/cube.py | 126 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 121 insertions(+), 15 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 8349a5f7c..220a9fea6 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -14,13 +14,19 @@ beta_params_max: 2.0
 min_incr: 0.1
 n_comp: 1
 fixed_distribution:
+  beta_weights: 1.0
   beta_alpha: 2.0
   beta_beta: 5.0
-  bernoulli_logit: -2.3
+  bernoulli_bw_zero_incr_logits: 1.0
+  bernoulli_source_logit: 1.0
+  bernoulli_eos_logit: 1.0
 random_distribution:
+  beta_weights: 1.0
   beta_alpha: 1.0
   beta_beta: 1.0
-  bernoulli_logit: -0.693
+  bernoulli_bw_zero_incr_logits: 1.0
+  bernoulli_source_logit: 1.0
+  bernoulli_eos_logit: 1.0
 # Buffer
 buffer:
   data_path: null
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index b70bcd8cd..aa9041edd 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -48,14 +48,20 @@ def __init__(
         beta_params_min: float = 0.1,
         beta_params_max: float = 2.0,
         fixed_distr_params: dict = {
+            "beta_weights": 1.0,
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
-            "bernoulli_logit": -2.3,
+            "bernoulli_bw_zero_incr_logits": 1.0,
+            "bernoulli_source_logit": 1.0,
+            "bernoulli_eos_logit": 1.0,
         },
         random_distr_params: dict = {
+            "beta_weights": 1.0,
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
-            "bernoulli_logit": -0.693,
+            "bernoulli_bw_zero_incr_logits": 1.0,
+            "bernoulli_source_logit": 1.0,
+            "bernoulli_eos_logit": 1.0,
         },
         **kwargs,
     ):
@@ -696,33 +702,127 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
           2) the logit(alpha) parameter of the Beta distribution to sample the increment
           3) the logit(beta) parameter of the Beta distribution to sample the increment
 
+        These parameters are the first n_dim * n_comp * 3 of the policy output such
+        that the first 3 x C elements correspond to the first dimension, and so on.
+
         Discrete actions
 
-        Additionally, the policy output contains one logit of a Bernoulli distribution
-        to model the (discrete) forward probability of selecting the EOS action and
-        another logit for the (discrete) backward probability of returning to the
-        source node.
+        Additionally, the policy output contains one logit (pos -1) of a Bernoulli
+        distribution to model the (discrete) forward probability of selecting the EOS
+        action and another logit (pos -2) for the (discrete) backward probability of
+        returning to the source node.
 
         Finally, the backward distribution requires a discrete probability distribution
         (Bernoulli) for each dimension, to model the probability of sampling an
         increment equal to zero when the value at the dimension is larger than
-        1 - min_incr. These are stored at [0:n_dim].
+        1 - min_incr. These are stored after the continuous part.
 
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
         """
-        policy_output = torch.ones(
-            self.n_dim + self.n_dim * self.n_comp * 3 + 2,
+        # Parameters for continuous actions
+        self._len_policy_output_cont = self.n_dim * self.n_comp * 3
+        policy_output_cont = torch.empty(
+            self._len_policy_output_cont,
+            dtype=self.float,
             device=self.device,
+        )
+        policy_output_cont[0::3] = params["beta_weights"]
+        policy_output_cont[1::3] = params["beta_alpha"]
+        policy_output_cont[2::3] = params["beta_beta"]
+        # Logits for Bernouilli distributions to model backward zero increments
+        policy_output_bw_zero_incrs = torch.full(
+            self.n_dim,
+            params["bernoulli_bw_zero_incr_logits"],
             dtype=self.float,
+            device=self.device,
+        )
+        # Logit for Bernoulli distribution to model EOS action
+        policy_output_eos = torch.tensor(
+            [params["bernoulli_eos_logit"]], dtype=self.float, device=self.device
+        )
+        # Logit for Bernoulli distribution to model back-to-source action
+        policy_output_source = torch.tensor(
+            [params["bernoulli_source_logit"]], dtype=self.float, device=self.device
+        )
+        # Concatenate all outputs
+        policy_output = torch.cat(
+            policy_output_cont,
+            policy_output_bw_zero_incrs,
+            policy_output_source,
+            policy_output_eos,
         )
-        policy_output[self.n_dim + 1 : -2 : 3] = params["beta_alpha"]
-        policy_output[self.n_dim + 2 : -2 : 3] = params["beta_beta"]
-        policy_output[-2] = params["bernoulli_logit"]
-        policy_output[-1] = params["bernoulli_logit"]
         return policy_output
 
+    def _get_policy_betas_weights(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "n_dim * n_comp"]:
+        """
+        Reduces a given policy output to the part corresponding to the weights of the
+        mixture of Beta distributions.
+
+        See: get_policy_output()
+        """
+        return policy_output[0 : self._len_policy_output_cont : 3]
+
+    def _get_policy_betas_alpha(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "n_dim * n_comp"]:
+        """
+        Reduces a given policy output to the part corresponding to the alphas of the
+        mixture of Beta distributions.
+
+        See: get_policy_output()
+        """
+        return policy_output[1 : self._len_policy_output_cont : 3]
+
+    def _get_policy_betas_beta(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "n_dim * n_comp"]:
+        """
+        Reduces a given policy output to the part corresponding to the betas of the
+        mixture of Beta distributions.
+
+        See: get_policy_output()
+        """
+        return policy_output[2 : self._len_policy_output_cont : 3]
+
+    def _get_policy_bw_zero_increment_logits(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "n_dim"]:
+        """
+        Reduces a given policy output to the part corresponding to the logits of the
+        Bernoulli distributions to model the backward zero increments of each dimension.
+
+        See: get_policy_output()
+        """
+        return policy_output[
+            self._len_policy_output_cont : self._len_policy_output_cont + self.n_dim
+        ]
+
+    def _get_policy_eos_logit(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "1"]:
+        """
+        Reduces a given policy output to the part corresponding to the logit of the
+        Bernoulli distribution to model the EOS action.
+
+        See: get_policy_output()
+        """
+        return policy_output[-1]
+
+    def _get_policy_source_logit(
+        self, policy_output: TensorType["n_states", "policy_output_dim"]
+    ) -> TensorType["n_states", "1"]:
+        """
+        Reduces a given policy output to the part corresponding to the logit of the
+        Bernoulli distribution to model the back-to-source action.
+
+        See: get_policy_output()
+        """
+        return policy_output[-2]
+
     def get_mask_invalid_actions_forward(
         self,
         state: Optional[List] = None,

From b39b1d41126537961a4156fbac8cf6a384f4d558 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 11:21:13 -0400
Subject: [PATCH 078/206] New version of masks and their tests ready.

---
 gflownet/envs/cube.py             | 119 +++++++++-------
 tests/gflownet/envs/test_ccube.py | 223 ++++++++++++++++++++++++++++--
 2 files changed, 280 insertions(+), 62 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index aa9041edd..f15d7173f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -714,8 +714,9 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
 
         Finally, the backward distribution requires a discrete probability distribution
         (Bernoulli) for each dimension, to model the probability of sampling an
-        increment equal to zero when the value at the dimension is larger than
-        1 - min_incr. These are stored after the continuous part.
+        increment (decrement, since backwards) equal to zero when the value at the
+        dimension is larger than 1 - min_incr. These are stored after the continuous
+        part.
 
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
@@ -733,7 +734,7 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         policy_output_cont[2::3] = params["beta_beta"]
         # Logits for Bernouilli distributions to model backward zero increments
         policy_output_bw_zero_incrs = torch.full(
-            self.n_dim,
+            (self.n_dim,),
             params["bernoulli_bw_zero_incr_logits"],
             dtype=self.float,
             device=self.device,
@@ -748,10 +749,12 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         )
         # Concatenate all outputs
         policy_output = torch.cat(
-            policy_output_cont,
-            policy_output_bw_zero_incrs,
-            policy_output_source,
-            policy_output_eos,
+            (
+                policy_output_cont,
+                policy_output_bw_zero_incrs,
+                policy_output_source,
+                policy_output_eos,
+            )
         )
         return policy_output
 
@@ -829,77 +832,93 @@ def get_mask_invalid_actions_forward(
         done: Optional[bool] = None,
     ) -> List:
         """
-        Returns a vector indicating which backward actions are invalid.
+        The action space is continuous, thus the mask is not only of invalid actions as
+        in discrete environments, but also an indicator of "special cases", for example
+        states from which only certain actions are possible.
+
+        In order to approximately stick to the semantics in discrete environments,
+        where the mask is of "invalid" actions, that is the value is True if an action
+        is invalid, the mask values of special cases are True if the special cases they
+        refer to are "invalid". In other words, the values are False if the state has
+        the special case.
 
         The forward mask has the following structure:
 
-        - 0:n_dim : whether sampling each dimension is invalid. Invalid (True) if the
-          value at the dimension is larger than 1 - min_incr.
-        - n_dim : whether sampling from source is invalid. Invalid (True) except when
-          when the state is the source state.
-        - n_dim + 1 : whether sampling EOS is invalid. EOS is valid from any state
-          (including the source state), hence always False.
+        - 0:n_dim : special case when a dimension cannot be further incremented. False
+          if the value at the dimension is larger than 1 - min_incr, True otherwise.
+        - -2 : special case when the state is the source state. False when the state is
+          the source state, True otherwise.
+        - -1 : whether EOS action is invalid. EOS is valid from any state, except the
+          source state or if done is True.
         """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
+        state = self._get_state(state)
+        done = self._get_done(done)
         mask_dim = self.n_dim + 2
-        # If done, no action is valid
+        mask = [True] * mask_dim
+        # If done, the entire mask is True (all actions are "invalid" and no special
+        # cases)
         if done:
-            return [True for _ in range(mask_dim)]
-        mask = [False for _ in range(mask_dim)]
-        # If state is source, EOS is invalid
+            return mask
+        # If the state is the source state, indicate special case source (False)
         if state == self.source:
-            mask[-1] = True
-        # If state is not source, sampling from source is invalid.
-        if state != self.source:
-            mask[-2] = True
+            mask[-2] = False
+        # If the state is not the source state, EOS is not invalid
+        else:
+            mask[-1] = False
         # Dimensions whose value is greater than 1 - min_incr cannot be further
-        # incremented
+        # incremented (special case, thus False)
         for dim, s in enumerate(state):
             if s > 1 - self.min_incr:
-                mask[dim] = True
+                mask[dim] = False
         return mask
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
-        Returns a vector indicating which backward actions are invalid.
+        The action space is continuous, thus the mask is not only of invalid actions as
+        in discrete environments, but also an indicator of "special cases", for example
+        states from which only certain actions are possible.
+
+        In order to approximately stick to the semantics in discrete environments,
+        where the mask is of "invalid" actions, that is the value is True if an action
+        is invalid, the mask values of special cases are True if the special cases they
+        refer to are "invalid". In other words, the values are False if the state has
+        the special case.
 
         The backward mask has the following structure:
 
-        - 0:n_dim : whether keeping a dimension as is, that is sampling a decrement of
-          0, can have zero probability. True if the value at the dimension is smaller
-          than or equal to 1 - min_incr.
-        - n_dim : whether other actions except back-to-source are invalid. False if any
-          dimension is smaller than min_incr.
-        - n_dim + 1 : whether sampling EOS is invalid. Only valid if done.
-        """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
+        - 0:n_dim : special case when a dimension can remain as is, that is sampling a
+          decrement of exactly 0 is possible. False if the value at the dimension is
+          larger than 1 - min_incr, True otherwise. If the cube is 1D, then this
+          special case never occurs, hence the value is always True.
+        - -2 : special case when back-to-source action is the only possible action.
+          False if any dimension is smaller than min_incr, True otherwise.
+        - -1 : whether EOS action is invalid. False only if done is True, True
+          (invalid) otherwise.
+        """
+        state = self._get_state(state)
+        done = self._get_done(done)
         mask_dim = self.n_dim + 2
+        mask = [True] * mask_dim
+        # If state is source, all actions are invalid and no special cases.
+        if state == self.source:
+            return mask
         # If done, only valid action is EOS.
         if done:
-            mask = [True for _ in range(mask_dim)]
             mask[-1] = False
             return mask
-        # If state is source, all actions are invalid.
-        if state == self.source:
-            return [True for _ in range(mask_dim)]
-        # If any dimension is smaller than m, then back-to-source is the only valid
-        # action
+        # If any dimension is smaller than m, then back-to-source action is not invalid
+        # (False)
         if any([s < self.min_incr for s in state]):
-            mask = [True for _ in range(mask_dim)]
             mask[-2] = False
             return mask
-        mask = [True for _ in range(mask_dim)]
-        # Dimensions whose value is greater than 1 - min_incr must have non-zero
-        # probability of sampling a decrement of exactly zero.
+        # Dimensions whose value is greater than 1 - min_incr can remain as are
+        # (special case, thus False)
+        if self.n_dim == 1:
+            return mask
         for dim, s in enumerate(state):
             if s > 1 - self.min_incr:
                 mask[dim] = False
+        # TODO: if all dims are special cases, at least one should decrease.
         return mask
 
     def get_parents(
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 6e2d4495b..7669dc8a3 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -7,8 +7,13 @@
 
 
 @pytest.fixture
-def env():
-    return ContinuousCube(n_dim=2, n_comp=3)
+def cube1d():
+    return ContinuousCube(n_dim=1, n_comp=3, min_incr=0.1, max_val=1.0)
+
+
+@pytest.fixture
+def cube2d():
+    return ContinuousCube(n_dim=2, n_comp=3, min_incr=0.1, max_val=1.0)
 
 
 @pytest.mark.parametrize(
@@ -21,28 +26,218 @@ def env():
         ],
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__get_action_space__returns_expected(env, action_space):
     assert set(action_space) == set(env.action_space)
 
 
-def test__get_policy_output__returns_expected(env):
-    assert env.policy_output_dim == env.n_dim * env.n_comp * 3 + 1
-    fixed_policy_output = env.fixed_policy_output
-    random_policy_output = env.random_policy_output
-    assert torch.all(fixed_policy_output[0:-1:3] == 1)
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__get_policy_output__fixed_as_expected(env, request):
+    env = request.getfixturevalue(env)
+    policy_output = env.fixed_policy_output
+    params = env.fixed_distr_params
+    policy_output__as_expected(env, policy_output, params)
+
+
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__get_policy_output__random_as_expected(env, request):
+    env = request.getfixturevalue(env)
+    policy_output = env.random_policy_output
+    params = env.random_distr_params
+    policy_output__as_expected(env, policy_output, params)
+
+
+def policy_output__as_expected(env, policy_output, params):
+    assert torch.all(
+        env._get_policy_betas_weights(policy_output) == params["beta_weights"]
+    )
+    assert torch.all(env._get_policy_betas_alpha(policy_output) == params["beta_alpha"])
+    assert torch.all(env._get_policy_betas_beta(policy_output) == params["beta_beta"])
     assert torch.all(
-        fixed_policy_output[1:-1:3] == env.fixed_distr_params["beta_alpha"]
+        env._get_policy_bw_zero_increment_logits(policy_output)
+        == params["bernoulli_bw_zero_incr_logits"]
     )
-    assert torch.all(fixed_policy_output[2:-1:3] == env.fixed_distr_params["beta_beta"])
-    assert torch.all(random_policy_output[0:-1:3] == 1)
     assert torch.all(
-        random_policy_output[1:-1:3] == env.random_distr_params["beta_alpha"]
+        env._get_policy_eos_logit(policy_output) == params["bernoulli_eos_logit"]
     )
     assert torch.all(
-        random_policy_output[2:-1:3] == env.random_distr_params["beta_beta"]
+        env._get_policy_source_logit(policy_output) == params["bernoulli_source_logit"]
     )
 
 
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__mask_forward__returns_all_true_if_done(env, request):
+    env = request.getfixturevalue(env)
+    # Sample states
+    states = env.get_uniform_terminating_states(100)
+    # Iterate over state and test
+    for state in states:
+        env.set_state(state, done=True)
+        mask = env.get_mask_invalid_actions_forward()
+        assert all(mask)
+
+
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
+    env = request.getfixturevalue(env)
+    # Sample states
+    states = env.get_uniform_terminating_states(100)
+    # Iterate over state and test
+    for state in states:
+        env.set_state(state, done=True)
+        mask = env.get_mask_invalid_actions_backward()
+        assert all(mask[:-1])
+        assert mask[-1] is False
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [0.0],
+            [True, False, True],
+        ),
+        (
+            [0.5],
+            [True, True, False],
+        ),
+        (
+            [0.90],
+            [True, True, False],
+        ),
+        (
+            [0.95],
+            [False, True, False],
+        ),
+    ],
+)
+def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
+    env = cube1d
+    mask = env.get_mask_invalid_actions_forward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [0.0, 0.0],
+            [True, True, False, True],
+        ),
+        (
+            [0.5, 0.5],
+            [True, True, True, False],
+        ),
+        (
+            [0.90, 0.5],
+            [True, True, True, False],
+        ),
+        (
+            [0.95, 0.5],
+            [False, True, True, False],
+        ),
+        (
+            [0.5, 0.90],
+            [True, True, True, False],
+        ),
+        (
+            [0.5, 0.95],
+            [True, False, True, False],
+        ),
+    ],
+)
+def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
+    env = cube2d
+    mask = env.get_mask_invalid_actions_forward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [0.0],
+            [True, True, True],
+        ),
+        (
+            [0.1],
+            [True, True, True],
+        ),
+        (
+            [0.05],
+            [True, False, True],
+        ),
+        (
+            [0.5],
+            [True, True, True],
+        ),
+        (
+            [0.90],
+            [True, True, True],
+        ),
+        (
+            [0.95],
+            [True, True, True],
+        ),
+    ],
+)
+def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
+    env = cube1d
+    mask = env.get_mask_invalid_actions_backward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [0.0, 0.0],
+            [True, True, True, True],
+        ),
+        (
+            [0.5, 0.5],
+            [True, True, True, True],
+        ),
+        (
+            [0.05, 0.5],
+            [True, True, False, True],
+        ),
+        (
+            [0.5, 0.05],
+            [True, True, False, True],
+        ),
+        (
+            [0.05, 0.05],
+            [True, True, False, True],
+        ),
+        (
+            [0.90, 0.5],
+            [True, True, True, True],
+        ),
+        (
+            [0.5, 0.90],
+            [True, True, True, True],
+        ),
+        (
+            [0.95, 0.5],
+            [False, True, True, True],
+        ),
+        (
+            [0.5, 0.95],
+            [True, False, True, True],
+        ),
+        (
+            [0.95, 0.95],
+            [False, False, True, True],
+        ),
+    ],
+)
+def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
+    env = cube2d
+    mask = env.get_mask_invalid_actions_backward(state)
+    assert mask == mask_expected
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [
@@ -68,6 +263,7 @@ def test__get_policy_output__returns_expected(env):
         ),
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__state2policy_returns_expected(env, state, expected):
     assert env.state2policy(state) == expected
 
@@ -81,6 +277,7 @@ def test__state2policy_returns_expected(env, state, expected):
         ),
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__statetorch2policy_returns_expected(env, states, expected):
     assert torch.equal(
         env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
@@ -112,11 +309,13 @@ def test__statetorch2policy_returns_expected(env, states, expected):
         ),
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
     assert env.get_mask_invalid_actions_forward(state) == expected, print(
         state, expected, env.get_mask_invalid_actions_forward(state)
     )
 
 
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__continuous_env_common(env):
     return common.test__continuous_env_common(env)

From aa2277585639d5172fe10f92c42d80d96da2a363 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 18:43:58 -0400
Subject: [PATCH 079/206] Forward sampling implemented with extensive tests and
 passed

---
 gflownet/envs/cube.py             | 253 ++++++++++++++++++++------
 tests/gflownet/envs/test_ccube.py | 283 +++++++++++++++++++++++++++---
 2 files changed, 464 insertions(+), 72 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index f15d7173f..079807cf6 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -15,6 +15,7 @@
 from torchtyping import TensorType
 
 from gflownet.envs.base import GFlowNetEnv
+from gflownet.utils.common import tbool, tfloat
 
 
 class Cube(GFlowNetEnv, ABC):
@@ -668,6 +669,7 @@ class ContinuousCube(Cube):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+    # TODO: rewrite docstring
     def get_action_space(self):
         """
         The actions are tuples of length n_dim + 1, where the value at position d indicates
@@ -767,7 +769,7 @@ def _get_policy_betas_weights(
 
         See: get_policy_output()
         """
-        return policy_output[0 : self._len_policy_output_cont : 3]
+        return policy_output[:, 0 : self._len_policy_output_cont : 3]
 
     def _get_policy_betas_alpha(
         self, policy_output: TensorType["n_states", "policy_output_dim"]
@@ -778,7 +780,7 @@ def _get_policy_betas_alpha(
 
         See: get_policy_output()
         """
-        return policy_output[1 : self._len_policy_output_cont : 3]
+        return policy_output[:, 1 : self._len_policy_output_cont : 3]
 
     def _get_policy_betas_beta(
         self, policy_output: TensorType["n_states", "policy_output_dim"]
@@ -789,7 +791,7 @@ def _get_policy_betas_beta(
 
         See: get_policy_output()
         """
-        return policy_output[2 : self._len_policy_output_cont : 3]
+        return policy_output[:, 2 : self._len_policy_output_cont : 3]
 
     def _get_policy_bw_zero_increment_logits(
         self, policy_output: TensorType["n_states", "policy_output_dim"]
@@ -801,7 +803,7 @@ def _get_policy_bw_zero_increment_logits(
         See: get_policy_output()
         """
         return policy_output[
-            self._len_policy_output_cont : self._len_policy_output_cont + self.n_dim
+            :, self._len_policy_output_cont : self._len_policy_output_cont + self.n_dim
         ]
 
     def _get_policy_eos_logit(
@@ -813,7 +815,7 @@ def _get_policy_eos_logit(
 
         See: get_policy_output()
         """
-        return policy_output[-1]
+        return policy_output[:, -1]
 
     def _get_policy_source_logit(
         self, policy_output: TensorType["n_states", "policy_output_dim"]
@@ -824,7 +826,7 @@ def _get_policy_source_logit(
 
         See: get_policy_output()
         """
-        return policy_output[-2]
+        return policy_output[:, -2]
 
     def get_mask_invalid_actions_forward(
         self,
@@ -836,16 +838,20 @@ def get_mask_invalid_actions_forward(
         in discrete environments, but also an indicator of "special cases", for example
         states from which only certain actions are possible.
 
-        In order to approximately stick to the semantics in discrete environments,
-        where the mask is of "invalid" actions, that is the value is True if an action
-        is invalid, the mask values of special cases are True if the special cases they
+        The values of True/False intend to approximately stick to the semantics in
+        discrete environments, where the mask is of "invalid" actions, but it is
+        important to note that a direct interpretation in this sense does not always
+        apply.
+
+        For example, the mask values of special cases are True if the special cases they
         refer to are "invalid". In other words, the values are False if the state has
         the special case.
 
         The forward mask has the following structure:
 
-        - 0:n_dim : special case when a dimension cannot be further incremented. False
-          if the value at the dimension is larger than 1 - min_incr, True otherwise.
+        - 0:n_dim : whether the dimension cannot be further incremented (increment is
+          invalid). True if the value at the dimension is larger than 1 - min_incr,
+          False otherwise.
         - -2 : special case when the state is the source state. False when the state is
           the source state, True otherwise.
         - -1 : whether EOS action is invalid. EOS is valid from any state, except the
@@ -854,22 +860,22 @@ def get_mask_invalid_actions_forward(
         state = self._get_state(state)
         done = self._get_done(done)
         mask_dim = self.n_dim + 2
-        mask = [True] * mask_dim
         # If done, the entire mask is True (all actions are "invalid" and no special
         # cases)
         if done:
-            return mask
-        # If the state is the source state, indicate special case source (False)
+            return [True] * mask_dim
+        mask = [False] * mask_dim
+        # If the state is not the source state, EOS is invalid
         if state == self.source:
-            mask[-2] = False
-        # If the state is not the source state, EOS is not invalid
+            mask[-1] = True
+        # If the state is not the source, indicate not special case (True)
         else:
-            mask[-1] = False
+            mask[-2] = True
         # Dimensions whose value is greater than 1 - min_incr cannot be further
         # incremented (special case, thus False)
         for dim, s in enumerate(state):
             if s > 1 - self.min_incr:
-                mask[dim] = False
+                mask[dim] = True
         return mask
 
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
@@ -921,6 +927,7 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         # TODO: if all dims are special cases, at least one should decrease.
         return mask
 
+    # TODO: remove all together?
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None
     ) -> Tuple[List[List], List[Tuple[int, float]]]:
@@ -976,6 +983,28 @@ def get_parents(
             """
             return [state], [action]
 
+    @staticmethod
+    def relative_to_absolute_increments(
+        states: TensorType["n_states", "n_dim"],
+        increments_rel: TensorType["n_states", "n_dim"],
+        min_increments: TensorType["n_states", "n_dim"],
+        max_val: float,
+    ):
+        """
+        Returns a batch of absolute increments (actions) given a batch of states,
+        relative increments and minimum_increments.
+
+        Given a dimension value x, a relative increment r, a minimum increment m and a
+        maximum value 1, the absolute increment a is given by:
+
+        a = m + r * (1 - x - m)
+        """
+        max_val = torch.full_like(states, max_val)
+        increments_abs = min_increments + increments_rel * (
+            max_val - states - min_increments
+        )
+        return increments_abs
+
     def sample_actions(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1069,6 +1098,136 @@ def sample_actions(
         # TODO: implement logprobs here too
         return actions, logprobs
 
+    def sample_actions_batch(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
+        states_from: Optional[List] = None,
+        is_backward: Optional[bool] = False,
+        sampling_method: Optional[str] = "policy",
+        temperature_logits: Optional[float] = 1.0,
+        max_sampling_attempts: Optional[int] = 10,
+    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
+        """
+        Samples a batch of actions from a batch of policy outputs.
+        """
+        if not is_backward:
+            return self._sample_actions_batch_forward(
+                policy_outputs, mask, states_from, sampling_method, temperature_logits
+            )
+
+    def _sample_actions_batch_forward(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
+        states_from: Optional[List] = None,
+        sampling_method: Optional[str] = "policy",
+        temperature_logits: Optional[float] = 1.0,
+        max_sampling_attempts: Optional[int] = 10,
+    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
+        """
+        Samples a a batch of forward actions from a batch of policy outputs.
+
+        An action indicates, for each dimension, the absolute increment of the
+        dimension value. However, in order to ensure that trajectories have finite
+        length, increments must have a minumum increment (self.min_incr) except if the
+        originating state is the source state (special case, see
+        get_mask_invalid_actions_forward()). Furthermore, absolute increments must also
+        be smaller than the distance from the dimension value to the edge of the cube
+        (self.max_val). In order to accomodate these constraints, first relative
+        increments (in [0, 1]) are sampled from a (mixture of) Beta distribution(s),
+        where 0.0 indicates an absolute increment of min_incr and 1.0 indicates an
+        absolute increment of 1 - x + min_incr (going to the edge).
+
+        Therefore, given a dimension value x, a relative increment r, a minimum
+        increment m and a maximum value 1, the absolute increment a is given by:
+
+        a = m + r * (1 - x - m)
+
+        The continuous distribution to sample the continuous action described above
+        must be mixed with the discrete distribution to model the sampling of the EOS
+        action. The EOS action can be sampled from any state except from the source
+        state or whether the trajectory is done. That the EOS action is invalid is
+        indicated by mask[-1] being False.
+
+        Finally, regarding the constraints on the increments, the following special
+        cases are taken into account:
+
+        - The originating state is the source state: in this case, the minimum
+          increment is 0.0 instead of self.min_incr. This is to ensure that the entire
+          state space can be reached. This is indicated by mask[-2] being False.
+        - The value at a dimension is at a distance from the cube edge smaller than the
+          minimum increment (x > 1 - m). In this case, absolute increment must be 0.0.
+          This is indicated by mask[d] being True.
+        """
+        # Initialize variables
+        n_states = policy_outputs.shape[0]
+        is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        # Determine source states
+        is_source = ~mask[:, -2]
+        # EOS is the only possible action if no dimension can be sampled (mask of all
+        # dimensions is "invalid" i.e. True)
+        is_near_edge = mask[:, : self.n_dim]
+        is_eos_forced = torch.all(is_near_edge, dim=1)
+        is_eos[is_eos_forced] = True
+        # Ensure that is_eos_forced does not include any source state
+        assert not torch.any(torch.logical_and(is_source, is_eos_forced))
+        # Sample EOS from Bernoulli distribution
+        do_eos = torch.logical_and(~is_source, ~is_eos_forced)
+        if torch.any(do_eos):
+            is_eos_sampled = torch.zeros_like(do_eos)
+            logits_eos = self._get_policy_eos_logit(policy_outputs)[do_eos]
+            distr_eos = Bernoulli(logits=logits_eos)
+            is_eos_sampled[do_eos] = tbool(distr_eos.sample(), device=self.device)
+            is_eos[is_eos_sampled] = True
+        # Sample relative increments if EOS is not the sampled or forced action
+        do_increments = ~is_eos
+        if torch.any(do_increments):
+            if sampling_method == "uniform":
+                raise NotImplementedError()
+            elif sampling_method == "policy":
+                mix_logits = self._get_policy_betas_weights(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                mix = Categorical(logits=mix_logits)
+                alphas = self._get_policy_betas_alpha(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                alphas = (
+                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+                )
+                betas = self._get_policy_betas_beta(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                betas = (
+                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+                )
+                beta_distr = Beta(alphas, betas)
+                distr_increments = MixtureSameFamily(mix, beta_distr)
+            # Shape of increments_rel: [n_do_increments, n_dim]
+            increments_rel = distr_increments.sample()
+        # Get minimum increments
+        min_increments = torch.full_like(
+            increments_rel, self.min_incr, dtype=self.float, device=self.device
+        )
+        min_increments[is_source[do_increments]] = 0.0
+        # Compute absolute increments
+        states_from_do_increments = tfloat(
+            states_from, float_type=self.float, device=self.device
+        )[do_increments]
+        increments_abs = self.relative_to_absolute_increments(
+            states_from_do_increments, increments_rel, min_increments, self.max_val
+        )
+        # Set 0 increments in near edge dimensions that cannot be further incremented
+        increments_abs[is_near_edge[do_increments]] = 0.0
+        # Build actions
+        actions_tensor = torch.full(
+            (n_states, self.n_dim), torch.inf, dtype=self.float, device=self.device
+        )
+        actions_tensor[do_increments] = increments_abs
+        actions = [tuple(a.tolist()) for a in actions_tensor]
+        return actions, None
+
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1310,13 +1469,15 @@ def step(
         self, action: Tuple[int, float]
     ) -> Tuple[List[float], Tuple[int, float], bool]:
         """
-        Executes step given an action.
+        Executes step given an action. An action is the absolute increment of each
+        dimension.
 
         Args
         ----
         action : tuple
-            Action to be executed. An action is a tuple with two values:
-            (dimension, increment).
+            Action to be executed. An action is a tuple of length n_dim + 1, with the
+            relative increment for each dimension, and the minumum increment at the
+            last entry of the tuple.
 
         Returns
         -------
@@ -1332,39 +1493,29 @@ def step(
         """
         if self.done:
             return self.state, action, False
-        # TODO: remove condition
-        # If action is eos or any dimension is beyond max_val, then force eos
-        elif action == self.eos or any([s > (1 - self.min_incr) for s in self.state]):
+        if action == self.eos:
+            assert self.state != self.source
             self.done = True
             self.n_actions += 1
             return self.state, self.eos, True
-        # If action is not eos, then perform action
-        else:
-            epsilon = 1e-9
-            min_incr = action[-1]
-            for dim, incr_rel in enumerate(action[:-1]):
-                incr = min_incr + incr_rel * (1.0 - self.state[dim] - min_incr)
-                assert incr >= (
-                    min_incr - epsilon
-                ), f"""
-                Increment {incr} at dim {dim} smaller than minimum increment ({min_incr}).
-                \nState:\n{self.state}\nAction:\n{action}
-                """
-                self.state[dim] += incr
-            assert all(
-                [s <= (self.max_val + epsilon) for s in self.state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-            """
-            assert all(
-                [s >= (0.0 - epsilon) for s in self.state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-            """
-            self.n_actions += 1
-            return self.state, action, True
+        # Generic action
+        epsilon = 1e-9
+        for dim, incr in enumerate(action):
+            self.state[dim] += incr
+        assert all(
+            [s <= (self.max_val + epsilon) for s in self.state]
+        ), f"""
+        State is out of cube bounds.
+        \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+        """
+        assert all(
+            [s >= (0.0 - epsilon) for s in self.state]
+        ), f"""
+        State is out of cube bounds.
+        \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+        """
+        self.n_actions += 1
+        return self.state, action, True
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 7669dc8a3..78f18bbf6 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -2,8 +2,10 @@
 import numpy as np
 import pytest
 import torch
+from torch.distributions import Bernoulli, Beta
 
 from gflownet.envs.cube import ContinuousCube
+from gflownet.utils.common import tbool, tfloat
 
 
 @pytest.fixture
@@ -34,34 +36,36 @@ def test__get_action_space__returns_expected(env, action_space):
 @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
 def test__get_policy_output__fixed_as_expected(env, request):
     env = request.getfixturevalue(env)
-    policy_output = env.fixed_policy_output
+    policy_outputs = torch.unsqueeze(env.fixed_policy_output, 0)
     params = env.fixed_distr_params
-    policy_output__as_expected(env, policy_output, params)
+    policy_output__as_expected(env, policy_outputs, params)
 
 
 @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
 def test__get_policy_output__random_as_expected(env, request):
     env = request.getfixturevalue(env)
-    policy_output = env.random_policy_output
+    policy_outputs = torch.unsqueeze(env.random_policy_output, 0)
     params = env.random_distr_params
-    policy_output__as_expected(env, policy_output, params)
+    policy_output__as_expected(env, policy_outputs, params)
 
 
-def policy_output__as_expected(env, policy_output, params):
+def policy_output__as_expected(env, policy_outputs, params):
     assert torch.all(
-        env._get_policy_betas_weights(policy_output) == params["beta_weights"]
+        env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
     )
-    assert torch.all(env._get_policy_betas_alpha(policy_output) == params["beta_alpha"])
-    assert torch.all(env._get_policy_betas_beta(policy_output) == params["beta_beta"])
     assert torch.all(
-        env._get_policy_bw_zero_increment_logits(policy_output)
+        env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
+    )
+    assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
+    assert torch.all(
+        env._get_policy_bw_zero_increment_logits(policy_outputs)
         == params["bernoulli_bw_zero_incr_logits"]
     )
     assert torch.all(
-        env._get_policy_eos_logit(policy_output) == params["bernoulli_eos_logit"]
+        env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
     )
     assert torch.all(
-        env._get_policy_source_logit(policy_output) == params["bernoulli_source_logit"]
+        env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
     )
 
 
@@ -95,19 +99,19 @@ def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
     [
         (
             [0.0],
-            [True, False, True],
+            [False, False, True],
         ),
         (
             [0.5],
-            [True, True, False],
+            [False, True, False],
         ),
         (
             [0.90],
-            [True, True, False],
+            [False, True, False],
         ),
         (
             [0.95],
-            [False, True, False],
+            [True, True, False],
         ),
     ],
 )
@@ -122,27 +126,27 @@ def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
     [
         (
             [0.0, 0.0],
-            [True, True, False, True],
+            [False, False, False, True],
         ),
         (
             [0.5, 0.5],
-            [True, True, True, False],
+            [False, False, True, False],
         ),
         (
             [0.90, 0.5],
-            [True, True, True, False],
+            [False, False, True, False],
         ),
         (
             [0.95, 0.5],
-            [False, True, True, False],
+            [True, False, True, False],
         ),
         (
             [0.5, 0.90],
-            [True, True, True, False],
+            [False, False, True, False],
         ),
         (
             [0.5, 0.95],
-            [True, False, True, False],
+            [False, True, True, False],
         ),
     ],
 )
@@ -238,6 +242,243 @@ def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
     assert mask == mask_expected
 
 
+@pytest.mark.parametrize(
+    "state, increments_rel, min_increments, state_expected",
+    [
+        (
+            [0.0, 0.0],
+            [0.5, 0.5],
+            [0.0, 0.0],
+            [0.5, 0.5],
+        ),
+        (
+            [0.0, 0.0],
+            [0.0, 0.0],
+            [0.0, 0.0],
+            [0.0, 0.0],
+        ),
+        (
+            [0.0, 0.0],
+            [0.1794, 0.9589],
+            [0.0, 0.0],
+            [0.1794, 0.9589],
+        ),
+        (
+            [0.3, 0.5],
+            [0.0, 0.0],
+            [0.1, 0.1],
+            [0.4, 0.6],
+        ),
+        (
+            [0.3, 0.5],
+            [1.0, 1.0],
+            [0.1, 0.1],
+            [1.0, 1.0],
+        ),
+        (
+            [0.3, 0.5],
+            [0.5, 0.5],
+            [0.1, 0.1],
+            [0.7, 0.8],
+        ),
+        (
+            [0.27, 0.85],
+            [0.12, 0.76],
+            [0.1, 0.1],
+            [0.4456, 0.988],
+        ),
+        (
+            [0.27, 0.95],
+            [0.12, 0.0],
+            [0.1, 0.0],
+            [0.4456, 0.95],
+        ),
+        (
+            [0.95, 0.27],
+            [0.0, 0.12],
+            [0.0, 0.1],
+            [0.95, 0.4456],
+        ),
+    ],
+)
+def test__relative_to_absolute_increments__2d__returns_expected(
+    cube2d, state, increments_rel, min_increments, state_expected
+):
+    env = cube2d
+    # Convert to tensors
+    states = tfloat([state], float_type=env.float, device=env.device)
+    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+    min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
+    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+    # Get absolute increments
+    increments_abs = env.relative_to_absolute_increments(
+        states, increments_rel, min_increments, env.max_val
+    )
+    states_next = states + increments_abs
+    assert torch.all(torch.isclose(states_next, states_expected))
+
+
+@pytest.mark.parametrize(
+    "state, action, state_expected",
+    [
+        (
+            [0.0, 0.0],
+            (0.5, 0.5),
+            [0.5, 0.5],
+        ),
+        (
+            [0.0, 0.0],
+            (0.0, 0.0),
+            [0.0, 0.0],
+        ),
+        (
+            [0.0, 0.0],
+            (0.1794, 0.9589),
+            [0.1794, 0.9589],
+        ),
+        (
+            [0.3, 0.5],
+            (0.1, 0.1),
+            [0.4, 0.6],
+        ),
+        (
+            [0.3, 0.5],
+            (0.7, 0.5),
+            [1.0, 1.0],
+        ),
+        (
+            [0.3, 0.5],
+            (0.4, 0.3),
+            [0.7, 0.8],
+        ),
+        (
+            [0.27, 0.85],
+            (0.1756, 0.138),
+            [0.4456, 0.988],
+        ),
+        (
+            [0.27, 0.95],
+            (0.1756, 0.0),
+            [0.4456, 0.95],
+        ),
+        (
+            [0.95, 0.27],
+            (0.0, 0.1756),
+            [0.95, 0.4456],
+        ),
+    ],
+)
+def test__step_forward__2d__returns_expected(cube2d, state, action, state_expected):
+    env = cube2d
+    env.set_state(state)
+    state_new, action, valid = env.step(action)
+    assert env.isclose(state_new, state_expected)
+
+
+@pytest.mark.parametrize(
+    "states, force_eos",
+    [
+        (
+            [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, True, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+            [False, True, True, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+            [False, False, False, True, True],
+        ),
+    ],
+)
+def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos):
+    env = cube2d
+    n_states = len(states)
+    force_eos = tbool(force_eos, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Define Beta distribution with low variance and get confident range
+    n_samples = 10000
+    beta_params_min = 0.0
+    beta_params_max = 10000
+    alpha = 10
+    alphas_presigmoid = alpha * torch.ones(n_samples)
+    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    beta = 1.0
+    betas_presigmoid = beta * torch.ones(n_samples)
+    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    beta_distr = Beta(alphas, betas)
+    samples = beta_distr.sample()
+    mean_incr_rel = 0.9 * samples.mean()
+    min_incr_rel = 0.9 * samples.min()
+    max_incr_rel = 1.1 * samples.max()
+    # Define Bernoulli parameters for EOS with deterministic probability
+    logit_force_eos = torch.inf
+    logit_force_noeos = -torch.inf
+    # Estimate confident intervals of absolute actions
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    is_source = torch.all(states_torch == 0.0, dim=1)
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    min_increments = torch.full_like(
+        states_torch, env.min_incr, dtype=env.float, device=env.device
+    )
+    min_increments[is_source, :] = 0.0
+    min_increments[is_near_edge] = 0.0
+    increments_rel_min = torch.full_like(
+        states_torch, min_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_rel_min[is_near_edge] = 0.0
+    increments_rel_max = torch.full_like(
+        states_torch, max_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_abs_min = env.relative_to_absolute_increments(
+        states_torch, increments_rel_min, min_increments, env.max_val
+    )
+    increments_abs_max = env.relative_to_absolute_increments(
+        states_torch, increments_rel_max, min_increments, env.max_val
+    )
+    # Get EOS actions
+    is_eos_forced = torch.all(is_near_edge, dim=1)
+    is_eos = torch.logical_or(is_eos_forced, force_eos)
+    increments_abs_min[is_eos] = torch.inf
+    increments_abs_max[is_eos] = torch.inf
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = 0.0
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha
+    params["beta_beta"] = beta
+    params["bernoulli_eos_logit"] = logit_force_noeos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    policy_outputs[force_eos, -1] = logit_force_eos
+    # Sample actions
+    actions, _ = env.sample_actions_batch(
+        policy_outputs, masks, states, is_backward=False
+    )
+    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+    actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
+    assert torch.all(actions_eos == is_eos)
+    assert torch.all(actions_tensor >= increments_abs_min)
+    assert torch.all(actions_tensor <= increments_abs_max)
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From e59777c1ed8e51ad8ddb2ac08f6bf54d29149112 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 19:13:55 -0400
Subject: [PATCH 080/206] First step in simplifying policy - dimensions cannot
 do zero increments; Backward masks remain.

---
 config/env/ccube.yaml             |  2 -
 gflownet/envs/cube.py             | 72 +++++++++----------------------
 tests/gflownet/envs/test_ccube.py | 26 +++++------
 3 files changed, 33 insertions(+), 67 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 220a9fea6..c2f65bb5f 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -17,14 +17,12 @@ fixed_distribution:
   beta_weights: 1.0
   beta_alpha: 2.0
   beta_beta: 5.0
-  bernoulli_bw_zero_incr_logits: 1.0
   bernoulli_source_logit: 1.0
   bernoulli_eos_logit: 1.0
 random_distribution:
   beta_weights: 1.0
   beta_alpha: 1.0
   beta_beta: 1.0
-  bernoulli_bw_zero_incr_logits: 1.0
   bernoulli_source_logit: 1.0
   bernoulli_eos_logit: 1.0
 # Buffer
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 079807cf6..244f5923c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -52,7 +52,6 @@ def __init__(
             "beta_weights": 1.0,
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
-            "bernoulli_bw_zero_incr_logits": 1.0,
             "bernoulli_source_logit": 1.0,
             "bernoulli_eos_logit": 1.0,
         },
@@ -60,7 +59,6 @@ def __init__(
             "beta_weights": 1.0,
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
-            "bernoulli_bw_zero_incr_logits": 1.0,
             "bernoulli_source_logit": 1.0,
             "bernoulli_eos_logit": 1.0,
         },
@@ -714,12 +712,7 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         action and another logit (pos -2) for the (discrete) backward probability of
         returning to the source node.
 
-        Finally, the backward distribution requires a discrete probability distribution
-        (Bernoulli) for each dimension, to model the probability of sampling an
-        increment (decrement, since backwards) equal to zero when the value at the
-        dimension is larger than 1 - min_incr. These are stored after the continuous
-        part.
-
+        * TODO: review count
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
@@ -734,13 +727,6 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         policy_output_cont[0::3] = params["beta_weights"]
         policy_output_cont[1::3] = params["beta_alpha"]
         policy_output_cont[2::3] = params["beta_beta"]
-        # Logits for Bernouilli distributions to model backward zero increments
-        policy_output_bw_zero_incrs = torch.full(
-            (self.n_dim,),
-            params["bernoulli_bw_zero_incr_logits"],
-            dtype=self.float,
-            device=self.device,
-        )
         # Logit for Bernoulli distribution to model EOS action
         policy_output_eos = torch.tensor(
             [params["bernoulli_eos_logit"]], dtype=self.float, device=self.device
@@ -753,7 +739,6 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         policy_output = torch.cat(
             (
                 policy_output_cont,
-                policy_output_bw_zero_incrs,
                 policy_output_source,
                 policy_output_eos,
             )
@@ -793,19 +778,6 @@ def _get_policy_betas_beta(
         """
         return policy_output[:, 2 : self._len_policy_output_cont : 3]
 
-    def _get_policy_bw_zero_increment_logits(
-        self, policy_output: TensorType["n_states", "policy_output_dim"]
-    ) -> TensorType["n_states", "n_dim"]:
-        """
-        Reduces a given policy output to the part corresponding to the logits of the
-        Bernoulli distributions to model the backward zero increments of each dimension.
-
-        See: get_policy_output()
-        """
-        return policy_output[
-            :, self._len_policy_output_cont : self._len_policy_output_cont + self.n_dim
-        ]
-
     def _get_policy_eos_logit(
         self, policy_output: TensorType["n_states", "policy_output_dim"]
     ) -> TensorType["n_states", "1"]:
@@ -849,17 +821,16 @@ def get_mask_invalid_actions_forward(
 
         The forward mask has the following structure:
 
-        - 0:n_dim : whether the dimension cannot be further incremented (increment is
-          invalid). True if the value at the dimension is larger than 1 - min_incr,
-          False otherwise.
-        - -2 : special case when the state is the source state. False when the state is
+        - 0 : whether a continuous action is invalid. True if the value at any
+          dimension is larger than 1 - min_incr, or if done is True.  False otherwise.
+        - 1 : special case when the state is the source state. False when the state is
           the source state, True otherwise.
-        - -1 : whether EOS action is invalid. EOS is valid from any state, except the
+        - 2 : whether EOS action is invalid. EOS is valid from any state, except the
           source state or if done is True.
         """
         state = self._get_state(state)
         done = self._get_done(done)
-        mask_dim = self.n_dim + 2
+        mask_dim = 3
         # If done, the entire mask is True (all actions are "invalid" and no special
         # cases)
         if done:
@@ -867,17 +838,17 @@ def get_mask_invalid_actions_forward(
         mask = [False] * mask_dim
         # If the state is not the source state, EOS is invalid
         if state == self.source:
-            mask[-1] = True
+            mask[2] = True
         # If the state is not the source, indicate not special case (True)
         else:
-            mask[-2] = True
-        # Dimensions whose value is greater than 1 - min_incr cannot be further
-        # incremented (special case, thus False)
-        for dim, s in enumerate(state):
-            if s > 1 - self.min_incr:
-                mask[dim] = True
+            mask[1] = True
+        # If the value of any dimension is greater than 1 - min_incr, then continuous
+        # actions are invalid (True).
+        if any([s > 1 - self.min_incr for s in state]):
+            mask[0] = True
         return mask
 
+    # TODO: re-do
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
         The action space is continuous, thus the mask is not only of invalid actions as
@@ -1156,19 +1127,18 @@ def _sample_actions_batch_forward(
         - The originating state is the source state: in this case, the minimum
           increment is 0.0 instead of self.min_incr. This is to ensure that the entire
           state space can be reached. This is indicated by mask[-2] being False.
-        - The value at a dimension is at a distance from the cube edge smaller than the
-          minimum increment (x > 1 - m). In this case, absolute increment must be 0.0.
-          This is indicated by mask[d] being True.
+        - The value at any dimension is at a distance from the cube edge smaller than the
+          minimum increment (x > 1 - m). In this case, only EOS is valid.
+          This is indicated by mask[0] being True (continuous actions are invalid).
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]
         is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
         # Determine source states
-        is_source = ~mask[:, -2]
-        # EOS is the only possible action if no dimension can be sampled (mask of all
-        # dimensions is "invalid" i.e. True)
-        is_near_edge = mask[:, : self.n_dim]
-        is_eos_forced = torch.all(is_near_edge, dim=1)
+        is_source = ~mask[:, 1]
+        # EOS is the only possible action continuous actions are invalid (mask[0] is
+        # True)
+        is_eos_forced = mask[:, 0]
         is_eos[is_eos_forced] = True
         # Ensure that is_eos_forced does not include any source state
         assert not torch.any(torch.logical_and(is_source, is_eos_forced))
@@ -1218,8 +1188,6 @@ def _sample_actions_batch_forward(
         increments_abs = self.relative_to_absolute_increments(
             states_from_do_increments, increments_rel, min_increments, self.max_val
         )
-        # Set 0 increments in near edge dimensions that cannot be further incremented
-        increments_abs[is_near_edge[do_increments]] = 0.0
         # Build actions
         actions_tensor = torch.full(
             (n_states, self.n_dim), torch.inf, dtype=self.float, device=self.device
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 78f18bbf6..d6c96b259 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -57,10 +57,6 @@ def policy_output__as_expected(env, policy_outputs, params):
         env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
     )
     assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
-    assert torch.all(
-        env._get_policy_bw_zero_increment_logits(policy_outputs)
-        == params["bernoulli_bw_zero_incr_logits"]
-    )
     assert torch.all(
         env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
     )
@@ -126,27 +122,31 @@ def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
     [
         (
             [0.0, 0.0],
-            [False, False, False, True],
+            [False, False, True],
         ),
         (
             [0.5, 0.5],
-            [False, False, True, False],
+            [False, True, False],
         ),
         (
             [0.90, 0.5],
-            [False, False, True, False],
+            [False, True, False],
         ),
         (
             [0.95, 0.5],
-            [True, False, True, False],
+            [True, True, False],
         ),
         (
             [0.5, 0.90],
-            [False, False, True, False],
+            [False, True, False],
         ),
         (
             [0.5, 0.95],
-            [False, True, True, False],
+            [True, True, False],
+        ),
+        (
+            [0.95, 0.95],
+            [True, True, False],
         ),
     ],
 )
@@ -185,6 +185,7 @@ def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
         ),
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
     env = cube1d
     mask = env.get_mask_invalid_actions_backward(state)
@@ -236,6 +237,7 @@ def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
         ),
     ],
 )
+@pytest.mark.skip(reason="skip while developping other tests")
 def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
     env = cube2d
     mask = env.get_mask_invalid_actions_backward(state)
@@ -438,11 +440,9 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
         states_torch, env.min_incr, dtype=env.float, device=env.device
     )
     min_increments[is_source, :] = 0.0
-    min_increments[is_near_edge] = 0.0
     increments_rel_min = torch.full_like(
         states_torch, min_incr_rel, dtype=env.float, device=env.device
     )
-    increments_rel_min[is_near_edge] = 0.0
     increments_rel_max = torch.full_like(
         states_torch, max_incr_rel, dtype=env.float, device=env.device
     )
@@ -453,7 +453,7 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
         states_torch, increments_rel_max, min_increments, env.max_val
     )
     # Get EOS actions
-    is_eos_forced = torch.all(is_near_edge, dim=1)
+    is_eos_forced = torch.any(is_near_edge, dim=1)
     is_eos = torch.logical_or(is_eos_forced, force_eos)
     increments_abs_min[is_eos] = torch.inf
     increments_abs_max[is_eos] = torch.inf

From 180d700697a055e0518b3850cb71fc6afff7b5d3 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 20:29:58 -0400
Subject: [PATCH 081/206] Implement backward sampling and its tests.

---
 gflownet/envs/cube.py             | 199 ++++++++++++++++++++++++------
 tests/gflownet/envs/test_ccube.py | 190 +++++++++++++++++++++++++---
 2 files changed, 332 insertions(+), 57 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 244f5923c..2008e3d60 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -822,7 +822,7 @@ def get_mask_invalid_actions_forward(
         The forward mask has the following structure:
 
         - 0 : whether a continuous action is invalid. True if the value at any
-          dimension is larger than 1 - min_incr, or if done is True.  False otherwise.
+          dimension is larger than 1 - min_incr, or if done is True. False otherwise.
         - 1 : special case when the state is the source state. False when the state is
           the source state, True otherwise.
         - 2 : whether EOS action is invalid. EOS is valid from any state, except the
@@ -848,7 +848,7 @@ def get_mask_invalid_actions_forward(
             mask[0] = True
         return mask
 
-    # TODO: re-do
+    # TODO: can we simplify to 2 values?
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
         The action space is continuous, thus the mask is not only of invalid actions as
@@ -863,39 +863,31 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
 
         The backward mask has the following structure:
 
-        - 0:n_dim : special case when a dimension can remain as is, that is sampling a
-          decrement of exactly 0 is possible. False if the value at the dimension is
-          larger than 1 - min_incr, True otherwise. If the cube is 1D, then this
-          special case never occurs, hence the value is always True.
-        - -2 : special case when back-to-source action is the only possible action.
+        - 0 : whether a continuous action is invalid. True if the value at any
+          dimension is smaller than min_incr, or if done is True. False otherwise.
+        - 1 : special case when back-to-source action is the only possible action.
           False if any dimension is smaller than min_incr, True otherwise.
-        - -1 : whether EOS action is invalid. False only if done is True, True
+        - 2 : whether EOS action is invalid. False only if done is True, True
           (invalid) otherwise.
         """
         state = self._get_state(state)
         done = self._get_done(done)
-        mask_dim = self.n_dim + 2
+        mask_dim = 3
         mask = [True] * mask_dim
         # If state is source, all actions are invalid and no special cases.
         if state == self.source:
             return mask
         # If done, only valid action is EOS.
         if done:
-            mask[-1] = False
+            mask[2] = False
             return mask
-        # If any dimension is smaller than m, then back-to-source action is not invalid
-        # (False)
+        # If any dimension is smaller than m, then back-to-source action is the only
+        # possible actiona.
         if any([s < self.min_incr for s in state]):
-            mask[-2] = False
-            return mask
-        # Dimensions whose value is greater than 1 - min_incr can remain as are
-        # (special case, thus False)
-        if self.n_dim == 1:
+            mask[1] = False
             return mask
-        for dim, s in enumerate(state):
-            if s > 1 - self.min_incr:
-                mask[dim] = False
-        # TODO: if all dims are special cases, at least one should decrease.
+        # Otherwise, continuous actions are valid
+        mask[0] = False
         return mask
 
     # TODO: remove all together?
@@ -960,6 +952,7 @@ def relative_to_absolute_increments(
         increments_rel: TensorType["n_states", "n_dim"],
         min_increments: TensorType["n_states", "n_dim"],
         max_val: float,
+        is_backward: bool,
     ):
         """
         Returns a batch of absolute increments (actions) given a batch of states,
@@ -971,9 +964,12 @@ def relative_to_absolute_increments(
         a = m + r * (1 - x - m)
         """
         max_val = torch.full_like(states, max_val)
-        increments_abs = min_increments + increments_rel * (
-            max_val - states - min_increments
-        )
+        if is_backward:
+            increments_abs = min_increments + increments_rel * (states - min_increments)
+        else:
+            increments_abs = min_increments + increments_rel * (
+                max_val - states - min_increments
+            )
         return increments_abs
 
     def sample_actions(
@@ -1086,6 +1082,10 @@ def sample_actions_batch(
             return self._sample_actions_batch_forward(
                 policy_outputs, mask, states_from, sampling_method, temperature_logits
             )
+        else:
+            return self._sample_actions_batch_backward(
+                policy_outputs, mask, states_from, sampling_method, temperature_logits
+            )
 
     def _sample_actions_batch_forward(
         self,
@@ -1176,23 +1176,148 @@ def _sample_actions_batch_forward(
                 distr_increments = MixtureSameFamily(mix, beta_distr)
             # Shape of increments_rel: [n_do_increments, n_dim]
             increments_rel = distr_increments.sample()
-        # Get minimum increments
-        min_increments = torch.full_like(
-            increments_rel, self.min_incr, dtype=self.float, device=self.device
-        )
-        min_increments[is_source[do_increments]] = 0.0
-        # Compute absolute increments
-        states_from_do_increments = tfloat(
-            states_from, float_type=self.float, device=self.device
-        )[do_increments]
-        increments_abs = self.relative_to_absolute_increments(
-            states_from_do_increments, increments_rel, min_increments, self.max_val
-        )
+            # Get minimum increments
+            min_increments = torch.full_like(
+                increments_rel, self.min_incr, dtype=self.float, device=self.device
+            )
+            min_increments[is_source[do_increments]] = 0.0
+            # Compute absolute increments
+            states_from_do_increments = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[do_increments]
+            increments_abs = self.relative_to_absolute_increments(
+                states_from_do_increments,
+                increments_rel,
+                min_increments,
+                self.max_val,
+                is_backward=False,
+            )
         # Build actions
         actions_tensor = torch.full(
             (n_states, self.n_dim), torch.inf, dtype=self.float, device=self.device
         )
-        actions_tensor[do_increments] = increments_abs
+        if torch.any(do_increments):
+            actions_tensor[do_increments] = increments_abs
+        actions = [tuple(a.tolist()) for a in actions_tensor]
+        return actions, None
+
+    # TODO: Rewrite docstring
+    # TODO: Write function common to forward and backward
+    # TODO: Catch source states?
+    def _sample_actions_batch_backward(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
+        states_from: Optional[List] = None,
+        sampling_method: Optional[str] = "policy",
+        temperature_logits: Optional[float] = 1.0,
+        max_sampling_attempts: Optional[int] = 10,
+    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
+        """
+        Samples a a batch of backward actions from a batch of policy outputs.
+
+        An action indicates, for each dimension, the absolute increment of the
+        dimension value. However, in order to ensure that trajectories have finite
+        length, increments must have a minumum increment (self.min_incr) except if the
+        originating state is the source state (special case, see
+        get_mask_invalid_actions_backward()). Furthermore, absolute increments must also
+        be smaller than the distance from the dimension value to the edge of the cube
+        (self.max_val). In order to accomodate these constraints, first relative
+        increments (in [0, 1]) are sampled from a (mixture of) Beta distribution(s),
+        where 0.0 indicates an absolute increment of min_incr and 1.0 indicates an
+        absolute increment of 1 - x + min_incr (going to the edge).
+
+        Therefore, given a dimension value x, a relative increment r, a minimum
+        increment m and a maximum value 1, the absolute increment a is given by:
+
+        a = m + r * (1 - x - m)
+
+        The continuous distribution to sample the continuous action described above
+        must be mixed with the discrete distribution to model the sampling of the EOS
+        action. The EOS action can be sampled from any state except from the source
+        state or whether the trajectory is done. That the EOS action is invalid is
+        indicated by mask[-1] being False.
+
+        Finally, regarding the constraints on the increments, the following special
+        cases are taken into account:
+
+        - The originating state is the source state: in this case, the minimum
+          increment is 0.0 instead of self.min_incr. This is to ensure that the entire
+          state space can be reached. This is indicated by mask[-2] being False.
+        - The value at any dimension is at a distance from the cube edge smaller than the
+          minimum increment (x > 1 - m). In this case, only EOS is valid.
+          This is indicated by mask[0] being True (continuous actions are invalid).
+        """
+        # Initialize variables
+        n_states = policy_outputs.shape[0]
+        is_bts = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        # EOS is the only possible action only if the entire mask is True
+        is_eos = torch.all(mask, dim=1)
+        # Back-to-source (BTS) is the only possible action if mask[1] is False
+        is_bts_forced = ~mask[:, 1]
+        is_bts[is_bts_forced] = True
+        # Sample BTS from Bernoulli distribution
+        do_bts = torch.logical_and(~is_bts_forced, ~is_eos)
+        if torch.any(do_bts):
+            is_bts_sampled = torch.zeros_like(do_bts)
+            logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
+            distr_bts = Bernoulli(logits=logits_bts)
+            is_bts_sampled[do_bts] = tbool(distr_bts.sample(), device=self.device)
+            is_bts[is_bts_sampled] = True
+        # Sample relative increments if actions are neither BTS nor EOS
+        do_increments = torch.logical_and(~is_bts, ~is_eos)
+        if torch.any(do_increments):
+            if sampling_method == "uniform":
+                raise NotImplementedError()
+            elif sampling_method == "policy":
+                mix_logits = self._get_policy_betas_weights(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                mix = Categorical(logits=mix_logits)
+                alphas = self._get_policy_betas_alpha(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                alphas = (
+                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+                )
+                betas = self._get_policy_betas_beta(policy_outputs)[
+                    do_increments
+                ].reshape(-1, self.n_dim, self.n_comp)
+                betas = (
+                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+                )
+                beta_distr = Beta(alphas, betas)
+                distr_increments = MixtureSameFamily(mix, beta_distr)
+            # Shape of increments_rel: [n_do_increments, n_dim]
+            increments_rel = distr_increments.sample()
+            # Set minimum increments
+            min_increments = torch.full_like(
+                increments_rel, self.min_incr, dtype=self.float, device=self.device
+            )
+            # Compute absolute increments
+            states_from_do_increments = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[do_increments]
+            increments_abs = self.relative_to_absolute_increments(
+                states_from_do_increments,
+                increments_rel,
+                min_increments,
+                self.max_val,
+                is_backward=True,
+            )
+        # Build actions
+        actions_tensor = torch.zeros(
+            (n_states, self.n_dim), dtype=self.float, device=self.device
+        )
+        actions_tensor[is_eos] = torch.inf
+        if torch.any(do_increments):
+            actions_tensor[do_increments] = increments_abs
+        if torch.any(is_bts):
+            # BTS actions are equal to the originating states
+            actions_bts = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[is_bts]
+            actions_tensor[is_bts] = actions_bts
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index d6c96b259..acfce303d 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -165,7 +165,7 @@ def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
         ),
         (
             [0.1],
-            [True, True, True],
+            [False, True, True],
         ),
         (
             [0.05],
@@ -173,19 +173,18 @@ def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
         ),
         (
             [0.5],
-            [True, True, True],
+            [False, True, True],
         ),
         (
             [0.90],
-            [True, True, True],
+            [False, True, True],
         ),
         (
             [0.95],
-            [True, True, True],
+            [False, True, True],
         ),
     ],
 )
-@pytest.mark.skip(reason="skip while developping other tests")
 def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
     env = cube1d
     mask = env.get_mask_invalid_actions_backward(state)
@@ -197,47 +196,46 @@ def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
     [
         (
             [0.0, 0.0],
-            [True, True, True, True],
+            [True, True, True],
         ),
         (
             [0.5, 0.5],
-            [True, True, True, True],
+            [False, True, True],
         ),
         (
             [0.05, 0.5],
-            [True, True, False, True],
+            [True, False, True],
         ),
         (
             [0.5, 0.05],
-            [True, True, False, True],
+            [True, False, True],
         ),
         (
             [0.05, 0.05],
-            [True, True, False, True],
+            [True, False, True],
         ),
         (
             [0.90, 0.5],
-            [True, True, True, True],
+            [False, True, True],
         ),
         (
             [0.5, 0.90],
-            [True, True, True, True],
+            [False, True, True],
         ),
         (
             [0.95, 0.5],
-            [False, True, True, True],
+            [False, True, True],
         ),
         (
             [0.5, 0.95],
-            [True, False, True, True],
+            [False, True, True],
         ),
         (
             [0.95, 0.95],
-            [False, False, True, True],
+            [False, True, True],
         ),
     ],
 )
-@pytest.mark.skip(reason="skip while developping other tests")
 def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
     env = cube2d
     mask = env.get_mask_invalid_actions_backward(state)
@@ -303,7 +301,7 @@ def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
         ),
     ],
 )
-def test__relative_to_absolute_increments__2d__returns_expected(
+def test__relative_to_absolute_increments__2d_forward__returns_expected(
     cube2d, state, increments_rel, min_increments, state_expected
 ):
     env = cube2d
@@ -314,12 +312,64 @@ def test__relative_to_absolute_increments__2d__returns_expected(
     states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
     # Get absolute increments
     increments_abs = env.relative_to_absolute_increments(
-        states, increments_rel, min_increments, env.max_val
+        states, increments_rel, min_increments, env.max_val, is_backward=False
     )
     states_next = states + increments_abs
     assert torch.all(torch.isclose(states_next, states_expected))
 
 
+@pytest.mark.parametrize(
+    "state, increments_rel, min_increments, state_expected",
+    [
+        (
+            [1.0, 1.0],
+            [0.0, 0.0],
+            [0.1, 0.1],
+            [0.9, 0.9],
+        ),
+        (
+            [1.0, 1.0],
+            [1.0, 1.0],
+            [0.1, 0.1],
+            [0.0, 0.0],
+        ),
+        (
+            [1.0, 1.0],
+            [0.1794, 0.9589],
+            [0.1, 0.1],
+            [0.73854, 0.03699],
+        ),
+        (
+            [0.3, 0.5],
+            [0.0, 0.0],
+            [0.1, 0.1],
+            [0.2, 0.4],
+        ),
+        (
+            [0.3, 0.5],
+            [1.0, 1.0],
+            [0.1, 0.1],
+            [0.0, 0.0],
+        ),
+    ],
+)
+def test__relative_to_absolute_increments__2d_backward__returns_expected(
+    cube2d, state, increments_rel, min_increments, state_expected
+):
+    env = cube2d
+    # Convert to tensors
+    states = tfloat([state], float_type=env.float, device=env.device)
+    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+    min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
+    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+    # Get absolute increments
+    increments_abs = env.relative_to_absolute_increments(
+        states, increments_rel, min_increments, env.max_val, is_backward=True
+    )
+    states_next = states - increments_abs
+    assert torch.all(torch.isclose(states_next, states_expected))
+
+
 @pytest.mark.parametrize(
     "state, action, state_expected",
     [
@@ -447,10 +497,10 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
         states_torch, max_incr_rel, dtype=env.float, device=env.device
     )
     increments_abs_min = env.relative_to_absolute_increments(
-        states_torch, increments_rel_min, min_increments, env.max_val
+        states_torch, increments_rel_min, min_increments, env.max_val, is_backward=False
     )
     increments_abs_max = env.relative_to_absolute_increments(
-        states_torch, increments_rel_max, min_increments, env.max_val
+        states_torch, increments_rel_max, min_increments, env.max_val, is_backward=False
     )
     # Get EOS actions
     is_eos_forced = torch.any(is_near_edge, dim=1)
@@ -479,6 +529,106 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     assert torch.all(actions_tensor <= increments_abs_max)
 
 
+@pytest.mark.parametrize(
+    "states, force_bst",
+    [
+        (
+            [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.05], [0.16, 0.93]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.0001, 0.0], [0.001, 0.01], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, True, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [1.0, 1.0], [0.16, 0.93]],
+            [False, True, True, True, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+            [False, False, False, True, True],
+        ),
+    ],
+)
+def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
+    env = cube2d
+    n_states = len(states)
+    force_bst = tbool(force_bst, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Define Beta distribution with low variance and get confident range
+    n_samples = 10000
+    beta_params_min = 0.0
+    beta_params_max = 10000
+    alpha = 10
+    alphas_presigmoid = alpha * torch.ones(n_samples)
+    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    beta = 1.0
+    betas_presigmoid = beta * torch.ones(n_samples)
+    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    beta_distr = Beta(alphas, betas)
+    samples = beta_distr.sample()
+    mean_incr_rel = 0.9 * samples.mean()
+    min_incr_rel = 0.9 * samples.min()
+    max_incr_rel = 1.1 * samples.max()
+    # Define Bernoulli parameters for BST with deterministic probability
+    logit_force_bst = torch.inf
+    logit_force_nobst = -torch.inf
+    # Estimate confident intervals of absolute actions
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    is_near_edge = states_torch < env.min_incr
+    min_increments = torch.full_like(
+        states_torch, env.min_incr, dtype=env.float, device=env.device
+    )
+    increments_rel_min = torch.full_like(
+        states_torch, min_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_rel_max = torch.full_like(
+        states_torch, max_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_abs_min = env.relative_to_absolute_increments(
+        states_torch, increments_rel_min, min_increments, env.max_val, is_backward=True
+    )
+    increments_abs_max = env.relative_to_absolute_increments(
+        states_torch, increments_rel_max, min_increments, env.max_val, is_backward=True
+    )
+    # Get BST actions
+    is_bst_forced = torch.any(is_near_edge, dim=1)
+    is_bst = torch.logical_or(is_bst_forced, force_bst)
+    increments_abs_min[is_bst] = states_torch[is_bst]
+    increments_abs_max[is_bst] = states_torch[is_bst]
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = 0.0
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha
+    params["beta_beta"] = beta
+    params["bernoulli_source_logit"] = logit_force_nobst
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    policy_outputs[force_bst, -2] = logit_force_bst
+    # Sample actions
+    actions, _ = env.sample_actions_batch(
+        policy_outputs, masks, states, is_backward=True
+    )
+    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+    actions_bst = torch.all(actions_tensor == states_torch, dim=1)
+    assert torch.all(actions_bst == is_bst)
+    assert torch.all(actions_tensor >= increments_abs_min)
+    assert torch.all(actions_tensor <= increments_abs_max)
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From 4574c6b9e12d7f4a2d695dd0ab30f12597ddd49f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 20:30:32 -0400
Subject: [PATCH 082/206] Change variable names in ctorus to match cube

---
 gflownet/envs/ctorus.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 9bad1f4bc..1ca9f03eb 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -262,18 +262,18 @@ def sample_actions_batch(
             actions_tensor[do_sample] = angles_sampled
             logprobs[do_sample] = distr_angles.log_prob(angles_sampled)
         logprobs = torch.sum(logprobs, axis=1)
-        # Catch special case for backwards return-to-source actions
+        # Catch special case for backwards backt-to-source (BTS) actions
         if is_backward:
-            do_return_to_source = mask[:, 0]
-            if torch.any(do_return_to_source):
+            do_bts = mask[:, 0]
+            if torch.any(do_bts):
                 source_angles = tfloat(
                     self.source[: self.n_dim], float_type=self.float, device=self.device
                 )
                 states_from_angles = tfloat(
                     states_from, float_type=self.float, device=self.device
-                )[do_return_to_source, : self.n_dim]
-                actions_return_to_source = states_from_angles - source_angles
-                actions_tensor[do_return_to_source] = actions_return_to_source
+                )[do_bts, : self.n_dim]
+                actions_bts = states_from_angles - source_angles
+                actions_tensor[do_bts] = actions_bts
         # TODO: is this too inefficient because of the multiple data transfers?
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, logprobs

From e20a0f63a7f5587b4ce6eaba25e7edbbb79c4f62 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 20:53:21 -0400
Subject: [PATCH 083/206] Minor changes in typing, ctorus

---
 gflownet/envs/ctorus.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 1ca9f03eb..b61622dbb 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -318,7 +318,7 @@ def _step(
         self,
         action: Tuple[float],
         backward: bool,
-    ) -> Tuple[List[float], Tuple[int, float], bool]:
+    ) -> Tuple[List[float], Tuple[float], bool]:
         """
         Updates self.state given a non-EOS action. This method is called by both step()
         and step_backwards(), with the corresponding value of argument backward.
@@ -368,7 +368,7 @@ def _step(
 
     def step(
         self, action: Tuple[float], skip_mask_check: bool = False
-    ) -> Tuple[List[float], Tuple[int, float], bool]:
+    ) -> Tuple[List[float], Tuple[float], bool]:
         """
         Executes forward step given an action.
 
@@ -414,7 +414,7 @@ def step(
 
     def step_backwards(
         self, action: Tuple[float], skip_mask_check: bool = False
-    ) -> Tuple[List[float], Tuple[int, float], bool]:
+    ) -> Tuple[List[float], Tuple[float], bool]:
         """
         Executes backward step given an action.
 

From 35e9e1a9bd54ffa0ea8f41535f8ea64600589f58 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 8 Sep 2023 20:54:08 -0400
Subject: [PATCH 084/206] Implement step_backwards

---
 gflownet/envs/cube.py             | 112 ++++++++++++++++++++++++------
 tests/gflownet/envs/test_ccube.py |   9 ++-
 2 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2008e3d60..641e7a1c7 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1251,8 +1251,8 @@ def _sample_actions_batch_backward(
         # Initialize variables
         n_states = policy_outputs.shape[0]
         is_bts = torch.zeros(n_states, dtype=torch.bool, device=self.device)
-        # EOS is the only possible action only if the entire mask is True
-        is_eos = torch.all(mask, dim=1)
+        # EOS is the only possible action only if done is True (mask[2] is False)
+        is_eos = ~mask[:, 2]
         # Back-to-source (BTS) is the only possible action if mask[1] is False
         is_bts_forced = ~mask[:, 1]
         is_bts[is_bts_forced] = True
@@ -1558,19 +1558,23 @@ def get_jacobian_diag(
         else:
             return 1.0 / ((states - min_increments) + epsilon)
 
-    def step(
-        self, action: Tuple[int, float]
-    ) -> Tuple[List[float], Tuple[int, float], bool]:
+    def _step(
+        self,
+        action: Tuple[float],
+        backward: bool,
+    ) -> Tuple[List[float], Tuple[float], bool]:
         """
-        Executes step given an action. An action is the absolute increment of each
-        dimension.
+        Updates self.state given a non-EOS action. This method is called by both step()
+        and step_backwards(), with the corresponding value of argument backward.
 
         Args
         ----
         action : tuple
-            Action to be executed. An action is a tuple of length n_dim + 1, with the
-            relative increment for each dimension, and the minumum increment at the
-            last entry of the tuple.
+            Action to be executed. An action is a tuple of length n_dim, with the
+            absolute increment for each dimension.
+
+        backward : bool
+            If True, perform backward step. Otherwise (default), perform forward step.
 
         Returns
         -------
@@ -1584,17 +1588,12 @@ def step(
             False, if the action is not allowed for the current state, e.g. stop at the
             root state
         """
-        if self.done:
-            return self.state, action, False
-        if action == self.eos:
-            assert self.state != self.source
-            self.done = True
-            self.n_actions += 1
-            return self.state, self.eos, True
-        # Generic action
         epsilon = 1e-9
         for dim, incr in enumerate(action):
-            self.state[dim] += incr
+            if backward:
+                self.state[dim] -= incr
+            else:
+                self.state[dim] += incr
         assert all(
             [s <= (self.max_val + epsilon) for s in self.state]
         ), f"""
@@ -1607,9 +1606,82 @@ def step(
         State is out of cube bounds.
         \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
         """
-        self.n_actions += 1
         return self.state, action, True
 
+    def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bool]:
+        """
+        Executes step given an action. An action is the absolute increment of each
+        dimension.
+
+        Args
+        ----
+        action : tuple
+            Action to be executed. An action is a tuple of length n_dim, with the
+            absolute increment for each dimension.
+
+        Returns
+        -------
+        self.state : list
+            The sequence after executing the action
+
+        action : int
+            Action executed
+
+        valid : bool
+            False, if the action is not allowed for the current state, e.g. stop at the
+            root state
+        """
+        if self.done:
+            return self.state, action, False
+        if action == self.eos:
+            assert self.state != self.source
+            self.done = True
+            self.n_actions += 1
+            return self.state, self.eos, True
+        # Otherwise perform action
+        else:
+            self.n_actions += 1
+            self._step(action, backward=False)
+            return self.state, action, True
+
+    def step_backwards(
+        self, action: Tuple[int, float]
+    ) -> Tuple[List[float], Tuple[int, float], bool]:
+        """
+        Executes backward step given an action. An action is the absolute decrement of
+        each dimension.
+
+        Args
+        ----
+        action : tuple
+            Action to be executed. An action is a tuple of length n_dim, with the
+            absolute decrement for each dimension.
+
+        Returns
+        -------
+        self.state : list
+            The sequence after executing the action
+
+        action : int
+            Action executed
+
+        valid : bool
+            False, if the action is not allowed for the current state, e.g. stop at the
+            root state
+        """
+        # If done is True, set done to False, increment n_actions and return same state
+        if self.done:
+            assert action == self.eos
+            self.done = False
+            self.n_actions += 1
+            return self.state, action, True
+        # Otherwise perform action
+        else:
+            assert action != self.eos
+            self.n_actions += 1
+            self._step(action, backward=True)
+            return self.state, action, True
+
     def get_grid_terminating_states(self, n_states: int) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
         linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index acfce303d..2c5ba08eb 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -708,5 +708,10 @@ def test__get_mask_invalid_actions_forward__returns_expected(env, state, expecte
 
 
 @pytest.mark.skip(reason="skip while developping other tests")
-def test__continuous_env_common(env):
-    return common.test__continuous_env_common(env)
+def test__continuous_env_common__cube1d(cube1d):
+    return common.test__continuous_env_common(cube1d)
+
+
+@pytest.mark.skip(reason="skip while developping other tests")
+def test__continuous_env_common__cube2d(cube2d):
+    return common.test__continuous_env_common(cube2d)

From f5ea29cda4b7ab034eca412ad7cff45fb2dd33e8 Mon Sep 17 00:00:00 2001
From: Alex <alexhg@e.email>
Date: Sat, 9 Sep 2023 14:17:29 -0400
Subject: [PATCH 085/206] Add new common env tests.

---
 tests/gflownet/envs/common.py | 105 +++++++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 26 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index a9576819c..6997c853b 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -6,14 +6,16 @@
 from hydra import compose, initialize
 from omegaconf import OmegaConf
 
-from gflownet.utils.common import copy
+from gflownet.utils.common import copy, tbool, tfloat
 
 
 def test__all_env_common(env):
     test__init__state_is_source_no_parents(env)
     test__reset__state_is_source_no_parents(env)
+    test__set_state__creates_new_copy_of_state(env)
     test__step__returns_same_state_action_and_invalid_if_done(env)
     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
+    test__sample_actions__backward__returns_eos_if_done(env)
     test__step_random__does_not_sample_invalid_actions(env)
     test__get_parents_step_get_mask__are_compatible(env)
     test__sample_backwards_reaches_source(env)
@@ -25,7 +27,9 @@ def test__all_env_common(env):
 
 def test__continuous_env_common(env):
     test__reset__state_is_source(env)
-    test__get_parents__returns_no_parents_in_initial_state(env)
+    test__set_state__creates_new_copy_of_state(env)
+    test__sampling_forwards_reaches_done_in_finite_steps(env)
+    test__sample_actions__backward__returns_eos_if_done(env)
     #     test__gflownet_minimal_runs(env)
     #     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
     #     test__get_parents__returns_same_state_and_eos_if_done(env)
@@ -34,6 +38,34 @@ def test__continuous_env_common(env):
     test__sample_backwards_reaches_source(env)
 
 
+def _get_terminating_states(env, n):
+    # Hacky way of skipping the Crystal BW sampling test until fixed
+    if env.__class__.__name__ == "Crystal":
+        return
+    if hasattr(env, "get_all_terminating_states"):
+        return env.get_all_terminating_states()
+    elif hasattr(env, "get_grid_terminating_states"):
+        return env.get_grid_terminating_states(n)
+    elif hasattr(env, "get_uniform_terminating_states"):
+        return env.get_uniform_terminating_states(n, 0)
+    elif hasattr(env, "get_random_terminating_states"):
+        return env.get_random_terminating_states(n, 0)
+    else:
+        print(
+            f"""
+        Testing backward sampling or setting terminating states requires that the
+        environment implements one of the following:
+            - get_all_terminating_states()
+            - get_grid_terminating_states()
+            - get_uniform_terminating_states()
+            - get_random_terminating_states()
+        Environment {env.__class__} does not have any of the above, therefore backward
+        sampling will not be tested.
+        """
+        )
+        return None
+
+
 @pytest.mark.repeat(100)
 def test__step_random__does_not_sample_invalid_actions(env):
     env = env.reset()
@@ -75,32 +107,53 @@ def test__get_parents_step_get_mask__are_compatible(env):
             assert mask[env.action_space.index(p_a)] is False
 
 
+@pytest.mark.repeat(500)
+def test__sampling_forwards_reaches_done_in_finite_steps(env):
+    n_actions = 0
+    while not env.done:
+        # Sample random action
+        state_next, action, valid = env.step_random()
+        n_actions += 1
+        assert n_actions <= env.max_traj_length
+
+
+@pytest.mark.repeat(5)
+def test__set_state__creates_new_copy_of_state(env):
+    states = _get_terminating_states(env, 5)
+    if states is None:
+        return
+    state_ids = []
+    for state in states:
+        for idx in range(5):
+            env_new = env.copy().reset(idx)
+            env_new.set_state(state, done=True)
+            state_ids.append(id(env.state))
+    assert len(np.unique(state_ids)) == len(state_ids)
+
+
+@pytest.mark.repeat(5)
+def test__sample_actions__backward__returns_eos_if_done(env, n=5):
+    states = _get_terminating_states(env, n)
+    if states is None:
+        return
+    # Set states, done and get masks
+    masks = []
+    for state in states:
+        env.set_state(state, done=True)
+        masks.append(env.get_mask_invalid_actions_backward())
+    # Build random policy outputs and tensor masks
+    policy_outputs = torch.tile(torch.tensor(env.random_policy_output), (n, 1))
+    masks_invalid_torch = tbool(masks, device=env.device)
+    actions, _ = env.sample_actions_batch(
+        policy_outputs, masks, states, is_backward=True
+    )
+    assert all([action == env.eos for action in actions])
+
+
 @pytest.mark.repeat(100)
 def test__sample_backwards_reaches_source(env, n=100):
-    # Hacky way of skipping the Crystal BW sampling test until fixed
-    if env.__class__.__name__ == "Crystal":
-        return
-    if hasattr(env, "get_all_terminating_states"):
-        x = env.get_all_terminating_states()
-    elif hasattr(env, "get_grid_terminating_states"):
-        x = env.get_grid_terminating_states(n)
-    elif hasattr(env, "get_uniform_terminating_states"):
-        x = env.get_uniform_terminating_states(n, 0)
-    elif hasattr(env, "get_random_terminating_states"):
-        x = env.get_random_terminating_states(n, 0)
-    else:
-        print(
-            f"""
-        Testing backward sampling requires that the environment implements one of the
-        following:
-            - get_all_terminating_states()
-            - get_grid_terminating_states()
-            - get_uniform_terminating_states()
-            - get_random_terminating_states()
-        Environment {env.__class__} does not have any of the above, therefore backward
-        sampling will not be tested.
-        """
-        )
+    states = _get_terminating_states(env, n)
+    if states is None:
         return
     for state in x:
         env.set_state(state, done=True)

From d6fdcfe3ea2dd60ff2ccfb83a7a3ff14a9b5434f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 9 Sep 2023 14:38:47 -0400
Subject: [PATCH 086/206] Fixes in new tests.

---
 tests/gflownet/envs/common.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 6997c853b..f9aa21e5c 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -28,14 +28,16 @@ def test__all_env_common(env):
 def test__continuous_env_common(env):
     test__reset__state_is_source(env)
     test__set_state__creates_new_copy_of_state(env)
-    test__sampling_forwards_reaches_done_in_finite_steps(env)
-    test__sample_actions__backward__returns_eos_if_done(env)
-    #     test__gflownet_minimal_runs(env)
-    #     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
-    #     test__get_parents__returns_same_state_and_eos_if_done(env)
-    test__step__returns_same_state_action_and_invalid_if_done(env)
-    test__actions2indices__returns_expected_tensor(env)
-    test__sample_backwards_reaches_source(env)
+
+
+#     test__sampling_forwards_reaches_done_in_finite_steps(env)
+#     test__sample_actions__backward__returns_eos_if_done(env)
+#     test__gflownet_minimal_runs(env)
+#     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
+#     test__get_parents__returns_same_state_and_eos_if_done(env)
+#     test__step__returns_same_state_action_and_invalid_if_done(env)
+#     test__actions2indices__returns_expected_tensor(env)
+#     test__sample_backwards_reaches_source(env)
 
 
 def _get_terminating_states(env, n):
@@ -122,12 +124,13 @@ def test__set_state__creates_new_copy_of_state(env):
     states = _get_terminating_states(env, 5)
     if states is None:
         return
-    state_ids = []
+    envs = []
     for state in states:
         for idx in range(5):
             env_new = env.copy().reset(idx)
             env_new.set_state(state, done=True)
-            state_ids.append(id(env.state))
+            envs.append(env_new)
+    state_ids = [id(env.state) for env in envs]
     assert len(np.unique(state_ids)) == len(state_ids)
 
 
@@ -142,8 +145,13 @@ def test__sample_actions__backward__returns_eos_if_done(env, n=5):
         env.set_state(state, done=True)
         masks.append(env.get_mask_invalid_actions_backward())
     # Build random policy outputs and tensor masks
-    policy_outputs = torch.tile(torch.tensor(env.random_policy_output), (n, 1))
-    masks_invalid_torch = tbool(masks, device=env.device)
+    policy_outputs = torch.tile(
+        tfloat(env.random_policy_output, float_type=env.float, device=env.device),
+        (len(states), 1),
+    )
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    masks = tbool(masks, device=env.device)
     actions, _ = env.sample_actions_batch(
         policy_outputs, masks, states, is_backward=True
     )
@@ -155,7 +163,7 @@ def test__sample_backwards_reaches_source(env, n=100):
     states = _get_terminating_states(env, n)
     if states is None:
         return
-    for state in x:
+    for state in states:
         env.set_state(state, done=True)
         n_actions = 0
         while True:

From 8ef8e54cc8df50ef46518be663be68fa078985b6 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 9 Sep 2023 15:10:00 -0400
Subject: [PATCH 087/206] Fix passing atol to isclose

---
 gflownet/envs/base.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index 992234b2d..7ed941bc0 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -959,10 +959,12 @@ def isclose(state_x, state_y, atol=1e-8):
                 y_nan = torch.isnan(state_y)
                 if not torch.equal(x_nan, y_nan):
                     return False
-                return torch.all(torch.isclose(state_x[~x_nan], state_y[~y_nan], atol))
+                return torch.all(
+                    torch.isclose(state_x[~x_nan], state_y[~y_nan], atol=atol)
+                )
             return torch.equal(state_x, state_y)
         else:
-            return np.all(np.isclose(state_x, state_y, atol))
+            return np.all(np.isclose(state_x, state_y, atol=atol))
 
     def set_energies_stats(self, energies_stats):
         self.energies_stats = energies_stats

From 5fa35264535ba7f96a400b1e0f946cce771d8b80 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 9 Sep 2023 15:11:13 -0400
Subject: [PATCH 088/206] Fix action space and check if state is close enough
 to source.

---
 gflownet/envs/cube.py | 46 ++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 641e7a1c7..e47e31c3a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -15,7 +15,7 @@
 from torchtyping import TensorType
 
 from gflownet.envs.base import GFlowNetEnv
-from gflownet.utils.common import tbool, tfloat
+from gflownet.utils.common import copy, tbool, tfloat
 
 
 class Cube(GFlowNetEnv, ABC):
@@ -667,25 +667,22 @@ class ContinuousCube(Cube):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    # TODO: rewrite docstring
     def get_action_space(self):
         """
-        The actions are tuples of length n_dim + 1, where the value at position d indicates
-        the (positive, relative) increment of dimension d. The value at the last
-        position indicates the minimum increment: 0.0 if the transition is from the
-        source state, min_incr otherwise.
+        The action space is continuous, thus not defined as such here.
 
-        Additionally, there are two special discrete actions:
-            - EOS action. Indicated by np.inf for all dimensions. Only valid forwards.
-            - Back-to-source action. Indicated by -1 for all dimensions. Only valid
-              backwards.
-        """
-        generic_action = tuple([0.0 for _ in range(self.n_dim)] + [self.min_incr])
-        from_source = tuple([0.0 for _ in range(self.n_dim)] + [0.0])
-        to_source = tuple([-1.0 for _ in range(self.n_dim + 1)])
-        self.eos = tuple([np.inf for _ in range(self.n_dim + 1)])
-        actions = [generic_action, from_source, to_source, self.eos]
-        return actions
+        The actions are tuples of length n_dim, where the value at position d indicates
+        the increment of dimension d.
+
+        EOS is indicated by np.inf for all dimensions.
+
+        This method defines self.eos and the returned action space is simply a
+        representative (arbitrary) action with an increment of 0.0 in all dimensions,
+        and EOS.
+        """
+        self.eos = tuple([np.inf] * self.n_dim)
+        self.representative_action = tuple([0.0] * self.n_dim)
+        return [self.representative_action, self.eos]
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
@@ -874,9 +871,6 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         done = self._get_done(done)
         mask_dim = 3
         mask = [True] * mask_dim
-        # If state is source, all actions are invalid and no special cases.
-        if state == self.source:
-            return mask
         # If done, only valid action is EOS.
         if done:
             mask[2] = False
@@ -1594,12 +1588,24 @@ def _step(
                 self.state[dim] -= incr
             else:
                 self.state[dim] += incr
+        # If state is close enough to source, set source to avoid escaping comparison
+        # to source.
+        if self.isclose(self.state, self.source, atol=1e-6):
+            self.state = copy(self.source)
+        if not all([s <= (self.max_val + epsilon) for s in self.state]):
+            import ipdb
+
+            ipdb.set_trace()
         assert all(
             [s <= (self.max_val + epsilon) for s in self.state]
         ), f"""
         State is out of cube bounds.
         \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
         """
+        if not all([s >= (0.0 - epsilon) for s in self.state]):
+            import ipdb
+
+            ipdb.set_trace()
         assert all(
             [s >= (0.0 - epsilon) for s in self.state]
         ), f"""

From 21eac3af8c74fc4e05ba62b71ab3bea483e0e77e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 9 Sep 2023 15:12:05 -0400
Subject: [PATCH 089/206] Re-enable and adapt tests (passed by cube)

---
 tests/gflownet/envs/common.py     | 8 ++++----
 tests/gflownet/envs/test_ccube.py | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index f9aa21e5c..f72cdf32b 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -28,16 +28,16 @@ def test__all_env_common(env):
 def test__continuous_env_common(env):
     test__reset__state_is_source(env)
     test__set_state__creates_new_copy_of_state(env)
+    test__sampling_forwards_reaches_done_in_finite_steps(env)
+    test__sample_actions__backward__returns_eos_if_done(env)
+    test__step__returns_same_state_action_and_invalid_if_done(env)
+    test__sample_backwards_reaches_source(env)
 
 
-#     test__sampling_forwards_reaches_done_in_finite_steps(env)
-#     test__sample_actions__backward__returns_eos_if_done(env)
 #     test__gflownet_minimal_runs(env)
 #     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
 #     test__get_parents__returns_same_state_and_eos_if_done(env)
-#     test__step__returns_same_state_action_and_invalid_if_done(env)
 #     test__actions2indices__returns_expected_tensor(env)
-#     test__sample_backwards_reaches_source(env)
 
 
 def _get_terminating_states(env, n):
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 2c5ba08eb..21ba83e4b 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -161,7 +161,7 @@ def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
     [
         (
             [0.0],
-            [True, True, True],
+            [True, False, True],
         ),
         (
             [0.1],
@@ -196,7 +196,7 @@ def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
     [
         (
             [0.0, 0.0],
-            [True, True, True],
+            [True, False, True],
         ),
         (
             [0.5, 0.5],
@@ -712,6 +712,5 @@ def test__continuous_env_common__cube1d(cube1d):
     return common.test__continuous_env_common(cube1d)
 
 
-@pytest.mark.skip(reason="skip while developping other tests")
 def test__continuous_env_common__cube2d(cube2d):
     return common.test__continuous_env_common(cube2d)

From 54f5559557346cbf589a3f721c1238db19394c8f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 18:32:42 -0400
Subject: [PATCH 090/206] Pack code to make increment distribution into
 separate shared method.

---
 gflownet/envs/cube.py | 59 ++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e47e31c3a..7f2812b45 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1081,6 +1081,25 @@ def sample_actions_batch(
                 policy_outputs, mask, states_from, sampling_method, temperature_logits
             )
 
+    def _make_increments_distribution(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+    ) -> MixtureSameFamily:
+        mix_logits = self._get_policy_betas_weights(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        mix = Categorical(logits=mix_logits)
+        alphas = self._get_policy_betas_alpha(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+        betas = self._get_policy_betas_beta(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+        beta_distr = Beta(alphas, betas)
+        return MixtureSameFamily(mix, beta_distr)
+
     def _sample_actions_batch_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1150,24 +1169,9 @@ def _sample_actions_batch_forward(
             if sampling_method == "uniform":
                 raise NotImplementedError()
             elif sampling_method == "policy":
-                mix_logits = self._get_policy_betas_weights(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                mix = Categorical(logits=mix_logits)
-                alphas = self._get_policy_betas_alpha(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                alphas = (
-                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-                )
-                betas = self._get_policy_betas_beta(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                betas = (
-                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+                distr_increments = self._make_increments_distribution(
+                    policy_outputs[do_increments]
                 )
-                beta_distr = Beta(alphas, betas)
-                distr_increments = MixtureSameFamily(mix, beta_distr)
             # Shape of increments_rel: [n_do_increments, n_dim]
             increments_rel = distr_increments.sample()
             # Get minimum increments
@@ -1264,24 +1268,9 @@ def _sample_actions_batch_backward(
             if sampling_method == "uniform":
                 raise NotImplementedError()
             elif sampling_method == "policy":
-                mix_logits = self._get_policy_betas_weights(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                mix = Categorical(logits=mix_logits)
-                alphas = self._get_policy_betas_alpha(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                alphas = (
-                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+                distr_increments = self._make_increments_distribution(
+                    policy_outputs[do_increments]
                 )
-                betas = self._get_policy_betas_beta(policy_outputs)[
-                    do_increments
-                ].reshape(-1, self.n_dim, self.n_comp)
-                betas = (
-                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
-                )
-                beta_distr = Beta(alphas, betas)
-                distr_increments = MixtureSameFamily(mix, beta_distr)
             # Shape of increments_rel: [n_do_increments, n_dim]
             increments_rel = distr_increments.sample()
             # Set minimum increments
@@ -1614,6 +1603,7 @@ def _step(
         """
         return self.state, action, True
 
+    # TODO: make generic for continuous environments
     def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bool]:
         """
         Executes step given an action. An action is the absolute increment of each
@@ -1650,6 +1640,7 @@ def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bo
             self._step(action, backward=False)
             return self.state, action, True
 
+    # TODO: make generic for continuous environments
     def step_backwards(
         self, action: Tuple[int, float]
     ) -> Tuple[List[float], Tuple[int, float], bool]:

From 36f7fc9461e83ab10cae59a65a2953cf731ba4a7 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 19:26:02 -0400
Subject: [PATCH 091/206] First version of new version of get_logprobs
 (forward)

---
 gflownet/envs/cube.py | 142 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 140 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7f2812b45..07fc503ea 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1149,7 +1149,7 @@ def _sample_actions_batch_forward(
         is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
         # Determine source states
         is_source = ~mask[:, 1]
-        # EOS is the only possible action continuous actions are invalid (mask[0] is
+        # EOS is the only possible action if continuous actions are invalid (mask[0] is
         # True)
         is_eos_forced = mask[:, 0]
         is_eos[is_eos_forced] = True
@@ -1304,7 +1304,100 @@ def _sample_actions_batch_backward(
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
+    # TODO: Remove need for states_to?
     def get_logprobs(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        is_forward: bool,
+        actions: TensorType["n_states", "n_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
+        mask_invalid_actions: TensorType["n_states", "3"] = None,
+    ) -> TensorType["batch_size"]:
+        """
+        Computes log probabilities of actions given policy outputs and actions.
+        """
+        if is_forward:
+            return self._get_logprobs_forward(
+                policy_outputs, actions, states_from, states_to, mask
+            )
+        else:
+            raise NotImplementedError()
+
+    # TODO: Unify sample_actions and get_logprobs
+    def _get_logprobs_forward(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        actions: TensorType["n_states", "n_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
+        mask: TensorType["n_states", "3"] = None,
+    ) -> TensorType["batch_size"]:
+        """
+        Computes log probabilities of actions.
+        """
+        # Initialize variables
+        n_states = policy_outputs.shape[0]
+        is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        logprobs_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        logprobs_increments_rel = torch.zeros(
+            n_states, dtype=torch.bool, device=self.device
+        )
+        jacobian_diag = torch.ones(
+            (n_states, self.n_dim), device=self.device, dtype=self.float
+        )
+        eos_tensor = tfloat(self.eos, float_type=self.float, device=self.device)
+        # Determine source states
+        is_source = ~mask[:, 1]
+        # EOS is the only possible action if continuous actions are invalid (mask[0] is
+        # True)
+        is_eos_forced = mask[:, 0]
+        is_eos[is_eos_forced] = True
+        # Ensure that is_eos_forced does not include any source state
+        assert not torch.any(torch.logical_and(is_source, is_eos_forced))
+        # Get sampled EOS actions and get log probs from Bernoulli distribution
+        do_eos = torch.logical_and(~is_source, ~is_eos_forced)
+        if torch.any(do_eos):
+            is_eos_sampled = torch.all(actions[do_eos] == eos_tensor, dim=1)
+            is_eos[is_eos_sampled] = True
+            logits_eos = self._get_policy_eos_logit(policy_outputs)[do_eos]
+            distr_eos = Bernoulli(logits=logits_eos)
+            logprobs_eos[do_eos] = distr_eos.log_prob(is_eos_sampled.to(self.float))
+        # Get log probs of relative increments if EOS was not the sampled or forced
+        # action
+        do_increments = ~is_eos
+        if torch.any(do_increments):
+            # Shape of increments_rel: [n_do_increments, n_dim]
+            increments_rel = actions[do_increments]
+            distr_increments = self._make_increments_distribution(
+                policy_outputs[do_increments]
+            )
+            logprobs_increments_rel[do_increments] = distr_increments.log_prob(
+                increments_rel
+            )
+            # Get minimum increments
+            min_increments = torch.full_like(
+                increments_rel, self.min_incr, dtype=self.float, device=self.device
+            )
+            min_increments[is_source[do_increments]] = 0.0
+            # Compute diagonal of the Jacobian (see _get_jacobian_diag())
+            states_from_do_increments = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[do_increments]
+            jacobian_diag[do_sample] = self._get_jacobian_diag(
+                states_from_do_increments,
+                min_increments,
+                self.max_val,
+                is_backward=False,
+            )
+        # Get log determinant of the Jacobian
+        log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
+        # Compute combined probabilities
+        sumlogprobs_increments = logprobs_increments_rel.sum(axis=1)
+        logprobs = logprobs_eos + sumlogprobs_increments + log_det_jacobian
+        return logprobs
+
+    def get_logprobs_old(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
@@ -1498,7 +1591,7 @@ def get_logprobs(
         return logprobs
 
     # TODO: min_incr is zero from source!
-    def get_jacobian_diag(
+    def get_jacobian_diag_old(
         self,
         states: TensorType["batch_size", "state_dim"],
         is_forward: bool,
@@ -1541,6 +1634,51 @@ def get_jacobian_diag(
         else:
             return 1.0 / ((states - min_increments) + epsilon)
 
+    # TODO: min_incr is zero from source!
+    @staticmethod
+    def _get_jacobian_diag(
+        states_from: TensorType["n_states", "n_dim"],
+        min_increments: TensorType["n_states", "n_dim"],
+        max_val: float,
+        is_backward: bool,
+    ):
+        """
+        Computes the diagonal of the Jacobian of the sampled actions with respect to
+        the target states.
+
+        Forward: the sampled variables are the relative increments r_f and the state
+        updates (s -> s') are (assuming max_val = 1):
+
+        s' = s + m + r_f(1 - s - m)
+        r_f = (s' - s - m) / (1 - s - m)
+
+        Therefore, the derivative of r_f wrt to s' is
+
+        dr_f/ds' = 1 / (1 - s - m)
+
+        Backward: the sampled variables are the relative decrements r_b and the state
+        updates (s' -> s) are:
+
+        s = s' - m - r_b(s' - m)
+        r_b = (s' - s - m) / (s' - m)
+
+        Therefore, the derivative of r_b wrt to s is
+
+        dr_b/ds = -1 / (s' - m)
+
+        We take the absolute value of the derivative (Jacobian).
+
+        The derivatives of the components of r with respect to dimensions of s or s'
+        other than itself are zero. Therefore, the Jacobian is diagonal and the
+        determinant is the product of the diagonal.
+        """
+        epsilon = 1e-9
+        max_val = torch.full_like(states_from, max_val)
+        if is_backward:
+            return 1.0 / ((states_from - min_increments) + epsilon)
+        else:
+            return 1.0 / ((max_val - states_from - min_increments) + epsilon)
+
     def _step(
         self,
         action: Tuple[float],

From b2b164baa481a7999574d10d3dad5c03ffa2a295 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 20:31:04 -0400
Subject: [PATCH 092/206] Test of fw logprobs for EOS actions and fixes.

---
 gflownet/envs/cube.py             | 16 ++++--
 tests/gflownet/envs/test_ccube.py | 82 +++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 07fc503ea..e608a10fb 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -797,6 +797,7 @@ def _get_policy_source_logit(
         """
         return policy_output[:, -2]
 
+    # TODO: EOS must be valid from source too
     def get_mask_invalid_actions_forward(
         self,
         state: Optional[List] = None,
@@ -1305,6 +1306,8 @@ def _sample_actions_batch_backward(
         return actions, None
 
     # TODO: Remove need for states_to?
+    # TODO: reorganise args
+    # TODO: mask_invalid_actions -> mask
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1319,7 +1322,7 @@ def get_logprobs(
         """
         if is_forward:
             return self._get_logprobs_forward(
-                policy_outputs, actions, states_from, states_to, mask
+                policy_outputs, actions, states_from, states_to, mask_invalid_actions
             )
         else:
             raise NotImplementedError()
@@ -1339,9 +1342,9 @@ def _get_logprobs_forward(
         # Initialize variables
         n_states = policy_outputs.shape[0]
         is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
-        logprobs_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        logprobs_eos = torch.zeros(n_states, dtype=self.float, device=self.device)
         logprobs_increments_rel = torch.zeros(
-            n_states, dtype=torch.bool, device=self.device
+            (n_states, self.n_dim), dtype=self.float, device=self.device
         )
         jacobian_diag = torch.ones(
             (n_states, self.n_dim), device=self.device, dtype=self.float
@@ -1358,11 +1361,14 @@ def _get_logprobs_forward(
         # Get sampled EOS actions and get log probs from Bernoulli distribution
         do_eos = torch.logical_and(~is_source, ~is_eos_forced)
         if torch.any(do_eos):
-            is_eos_sampled = torch.all(actions[do_eos] == eos_tensor, dim=1)
+            is_eos_sampled = torch.zeros_like(do_eos)
+            is_eos_sampled[do_eos] = torch.all(actions[do_eos] == eos_tensor, dim=1)
             is_eos[is_eos_sampled] = True
             logits_eos = self._get_policy_eos_logit(policy_outputs)[do_eos]
             distr_eos = Bernoulli(logits=logits_eos)
-            logprobs_eos[do_eos] = distr_eos.log_prob(is_eos_sampled.to(self.float))
+            logprobs_eos[do_eos] = distr_eos.log_prob(
+                is_eos_sampled[do_eos].to(self.float)
+            )
         # Get log probs of relative increments if EOS was not the sampled or forced
         # action
         do_increments = ~is_eos
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 21ba83e4b..4cd48e04b 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -629,6 +629,88 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bs
     assert torch.all(actions_tensor <= increments_abs_max)
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+        ),
+        (
+            [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
+            [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actions):
+    """
+    The only valid action from 'near-edge' states is EOS, thus the the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Build policy outputs
+    params = env.fixed_distr_params
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, True, actions, states_torch, None, masks
+    )
+    assert torch.all(logprobs == 0.0)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+        ),
+        (
+            [[0.5, 0.97], [0.01, 0.01], [1.0, 1.0]],
+            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__eos_actions_return_expected(
+    cube2d, states, actions
+):
+    """
+    The only valid action from 'near-edge' states is EOS, thus the the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Define Bernoulli parameter for EOS with deterministic probability (force EOS)
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_force_eos = 1000
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_force_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, True, actions, states_torch, None, masks
+    )
+    assert torch.all(logprobs == 0.0)
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From 5fc722d4a25638f97a1786ff70e54c3973323e6b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 20:57:21 -0400
Subject: [PATCH 093/206] Adjust test

---
 tests/gflownet/envs/test_ccube.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 4cd48e04b..b66a93ffa 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -675,7 +675,7 @@ def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actio
             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
         ),
         (
-            [[0.5, 0.97], [0.01, 0.01], [1.0, 1.0]],
+            [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
         ),
     ],
@@ -695,20 +695,24 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     masks = tbool(
         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
     )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for EOS with deterministic probability (force EOS)
     # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_force_eos = 1000
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_force_eos
+    params["bernoulli_eos_logit"] = logit_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Add noise to policy outputs
-    policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
     logprobs = env.get_logprobs(
         policy_outputs, True, actions, states_torch, None, masks
     )
-    assert torch.all(logprobs == 0.0)
+    assert torch.all(logprobs[is_eos_forced] == 0.0)
+    assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
 
 
 @pytest.mark.parametrize(

From 1558f34b19d1e4a7e2c03ae89da22b820704e7fc Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 21:45:52 -0400
Subject: [PATCH 094/206] Test of fw logprobs for actions from source

---
 config/env/ccube.yaml             |  1 +
 gflownet/envs/cube.py             |  4 ++-
 tests/gflownet/envs/test_ccube.py | 52 +++++++++++++++++++++++++++++--
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index c2f65bb5f..c28535864 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -21,6 +21,7 @@ fixed_distribution:
   bernoulli_eos_logit: 1.0
 random_distribution:
   beta_weights: 1.0
+  # IMPORTANT: adjust because of sigmoid!
   beta_alpha: 1.0
   beta_beta: 1.0
   bernoulli_source_logit: 1.0
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e608a10fb..d0beb522f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1308,6 +1308,7 @@ def _sample_actions_batch_backward(
     # TODO: Remove need for states_to?
     # TODO: reorganise args
     # TODO: mask_invalid_actions -> mask
+    # TODO: states_from must be tensor or could be list?
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1378,6 +1379,7 @@ def _get_logprobs_forward(
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
+            # TODO: deal with increments of 0.0 or 1.0 which will yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
                 increments_rel
             )
@@ -1390,7 +1392,7 @@ def _get_logprobs_forward(
             states_from_do_increments = tfloat(
                 states_from, float_type=self.float, device=self.device
             )[do_increments]
-            jacobian_diag[do_sample] = self._get_jacobian_diag(
+            jacobian_diag[do_increments] = self._get_jacobian_diag(
                 states_from_do_increments,
                 min_increments,
                 self.max_val,
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index b66a93ffa..6fdacf656 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -509,7 +509,7 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     increments_abs_max[is_eos] = torch.inf
     # Reconfigure environment
     env.n_comp = 1
-    env.beta_params_min = 0.0
+    env.beta_params_min = beta_params_min
     env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
@@ -609,7 +609,7 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bs
     increments_abs_max[is_bst] = states_torch[is_bst]
     # Reconfigure environment
     env.n_comp = 1
-    env.beta_params_min = 0.0
+    env.beta_params_min = beta_params_min
     env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
@@ -715,6 +715,54 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
 
 
+@pytest.mark.parametrize(
+    "actions",
+    [
+        [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+        [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
+    ],
+)
+def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
+    cube2d, actions
+):
+    """
+    With Uniform increment policy, all the actions from the source must have the same
+    probability.
+    """
+    env = cube2d
+    n_states = len(actions)
+    states = [env.source for _ in range(n_states)]
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
+    beta_params_min = 0.0
+    beta_params_max = 1.0
+    alpha_presigmoid = 1000.0
+    betas_presigmoid = 1000.0
+    # Define Bernoulli parameter for impossible EOS
+    # If Bernouilli has logit -torch.inf, the logprobs are nan
+    logit_force_noeos = -1000
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = beta_params_min
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha_presigmoid
+    params["beta_beta"] = betas_presigmoid
+    params["bernoulli_eos_logit"] = logit_force_noeos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, True, actions, states_torch, None, masks
+    )
+    assert torch.all(logprobs == 0.0)
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From fa451d6a079877bc0d7e905221b0cbbd5ec8d138 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 22:04:39 -0400
Subject: [PATCH 095/206] First version of _get_logprobs_backward

---
 gflownet/envs/cube.py | 83 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d0beb522f..1569834b9 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1326,7 +1326,9 @@ def get_logprobs(
                 policy_outputs, actions, states_from, states_to, mask_invalid_actions
             )
         else:
-            raise NotImplementedError()
+            return self._get_logprobs_backward(
+                policy_outputs, actions, states_from, states_to, mask_invalid_actions
+            )
 
     # TODO: Unify sample_actions and get_logprobs
     def _get_logprobs_forward(
@@ -1338,7 +1340,7 @@ def _get_logprobs_forward(
         mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
-        Computes log probabilities of actions.
+        Computes log probabilities of forward actions.
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]
@@ -1405,6 +1407,83 @@ def _get_logprobs_forward(
         logprobs = logprobs_eos + sumlogprobs_increments + log_det_jacobian
         return logprobs
 
+    # TODO: Unify sample_actions and get_logprobs
+    def _get_logprobs_backward(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+        actions: TensorType["n_states", "n_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
+        states_to: TensorType["n_states", "policy_input_dim"],
+        mask: TensorType["n_states", "3"] = None,
+    ) -> TensorType["batch_size"]:
+        """
+        Computes log probabilities of backward actions.
+        """
+        # Initialize variables
+        n_states = policy_outputs.shape[0]
+        is_bts = torch.zeros(n_states, dtype=torch.bool, device=self.device)
+        logprobs_bts = torch.zeros(n_states, dtype=self.float, device=self.device)
+        logprobs_increments_rel = torch.zeros(
+            (n_states, self.n_dim), dtype=self.float, device=self.device
+        )
+        jacobian_diag = torch.ones(
+            (n_states, self.n_dim), device=self.device, dtype=self.float
+        )
+        # EOS is the only possible action only if done is True (mask[2] is False)
+        is_eos = ~mask[:, 2]
+        # Back-to-source (BTS) is the only possible action if mask[1] is False
+        is_bts_forced = ~mask[:, 1]
+        is_bts[is_bts_forced] = True
+        # Get sampled BTS actions and get log probs from Bernoulli distribution
+        do_bts = torch.logical_and(~is_bts_forced, ~is_eos)
+        if torch.any(do_bts):
+            # BTS actions are equal to the originating states
+            is_bts_sampled = torch.zeros_like(do_bts)
+            is_bts_sampled[do_bts] = torch.all(
+                actions[do_bts] == states_from[do_bts], dim=1
+            )
+            is_bts[is_bts_sampled] = True
+            logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
+            distr_bts = Bernoulli(logits=logits_bts)
+            logprobs_bts[do_bts] = distr_bts.log_prob(
+                is_bts_sampled[do_bts].to(self.float)
+            )
+        # Get log probs of relative increments if actions were neither BTS nor EOS
+        do_increments = torch.logical_and(~is_bts, ~is_eos)
+        if torch.any(do_increments):
+            # Shape of increments_rel: [n_do_increments, n_dim]
+            increments_rel = actions[do_increments]
+            distr_increments = self._make_increments_distribution(
+                policy_outputs[do_increments]
+            )
+            # TODO: deal with increments of 0.0 or 1.0 which will yield nan
+            logprobs_increments_rel[do_increments] = distr_increments.log_prob(
+                increments_rel
+            )
+            # Set minimum increments
+            min_increments = torch.full_like(
+                increments_rel, self.min_incr, dtype=self.float, device=self.device
+            )
+            # Compute diagonal of the Jacobian (see _get_jacobian_diag())
+            states_from_do_increments = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[do_increments]
+            jacobian_diag[do_increments] = self._get_jacobian_diag(
+                states_from_do_increments,
+                min_increments,
+                self.max_val,
+                is_backward=False,
+            )
+        # Get log determinant of the Jacobian
+        log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
+        # Compute combined probabilities
+        sumlogprobs_increments = logprobs_increments_rel.sum(axis=1)
+        logprobs = logprobs_bst + sumlogprobs_increments + log_det_jacobian
+        # Logprobs of forced EOS are 0
+        # TODO: is there any avoidable computation of is_eos actions?
+        logprobs[is_eos] = 0.0
+        return logprobs
+
     def get_logprobs_old(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],

From d67c36fefde3f5610d9a86306e5b350676c1b96b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 22:25:42 -0400
Subject: [PATCH 096/206] Fix typo

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 1569834b9..4cb797004 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1478,7 +1478,7 @@ def _get_logprobs_backward(
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
         # Compute combined probabilities
         sumlogprobs_increments = logprobs_increments_rel.sum(axis=1)
-        logprobs = logprobs_bst + sumlogprobs_increments + log_det_jacobian
+        logprobs = logprobs_bts + sumlogprobs_increments + log_det_jacobian
         # Logprobs of forced EOS are 0
         # TODO: is there any avoidable computation of is_eos actions?
         logprobs[is_eos] = 0.0

From a29afc2d73b9fd5a16da9bb4a92f49d19b5bad54 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 22:26:34 -0400
Subject: [PATCH 097/206] Add env common test:
 test__get_logprobs__backward__returns_zero_if_done

---
 tests/gflownet/envs/common.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index f72cdf32b..5c6888689 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -16,6 +16,7 @@ def test__all_env_common(env):
     test__step__returns_same_state_action_and_invalid_if_done(env)
     test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env)
     test__sample_actions__backward__returns_eos_if_done(env)
+    test__get_logprobs__backward__returns_zero_if_done(env)
     test__step_random__does_not_sample_invalid_actions(env)
     test__get_parents_step_get_mask__are_compatible(env)
     test__sample_backwards_reaches_source(env)
@@ -30,6 +31,7 @@ def test__continuous_env_common(env):
     test__set_state__creates_new_copy_of_state(env)
     test__sampling_forwards_reaches_done_in_finite_steps(env)
     test__sample_actions__backward__returns_eos_if_done(env)
+    test__get_logprobs__backward__returns_zero_if_done(env)
     test__step__returns_same_state_action_and_invalid_if_done(env)
     test__sample_backwards_reaches_source(env)
 
@@ -158,6 +160,33 @@ def test__sample_actions__backward__returns_eos_if_done(env, n=5):
     assert all([action == env.eos for action in actions])
 
 
+@pytest.mark.repeat(5)
+def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
+    states = _get_terminating_states(env, n)
+    if states is None:
+        return
+    # Set states, done and get masks
+    masks = []
+    for state in states:
+        env.set_state(state, done=True)
+        masks.append(env.get_mask_invalid_actions_backward())
+    # EOS actions
+    actions_eos = torch.tile(
+        tfloat(env.eos, float_type=env.float, device=env.device),
+        (len(states), 1),
+    )
+    # Build random policy outputs and tensor masks
+    policy_outputs = torch.tile(
+        tfloat(env.random_policy_output, float_type=env.float, device=env.device),
+        (len(states), 1),
+    )
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    masks = tbool(masks, device=env.device)
+    logprobs = env.get_logprobs(policy_outputs, False, actions_eos, states, None, masks)
+    assert torch.all(logprobs == 0.0)
+
+
 @pytest.mark.repeat(100)
 def test__sample_backwards_reaches_source(env, n=100):
     states = _get_terminating_states(env, n)

From b4e2d6fd74a856d7165816110dc993c11f21311d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 22:27:20 -0400
Subject: [PATCH 098/206] Add cube backward logprobs tests

---
 tests/gflownet/envs/test_ccube.py | 84 ++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 6fdacf656..e7c264e4e 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -698,7 +698,7 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     # Get EOS forced
     is_near_edge = states_torch > 1.0 - env.min_incr
     is_eos_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for EOS with deterministic probability (force EOS)
+    # Define Bernoulli parameter for EOS
     # If Bernouilli has logit torch.inf, the logprobs are nan
     logit_eos = 1
     distr_eos = Bernoulli(logits=logit_eos)
@@ -763,6 +763,88 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     assert torch.all(logprobs == 0.0)
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
+    """
+    The only valid backward action from 'near-edge' states is BTS, thus the the log
+    probability should be zero.
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Build policy outputs
+    params = env.fixed_distr_params
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, False, actions, states_torch, None, masks
+    )
+    assert torch.all(logprobs == 0.0)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+        ),
+        (
+            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__bts_actions_return_expected(
+    cube2d, states, actions
+):
+    """
+    The only valid action from 'near-edge' states is EOS, thus the the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Get BTS forced
+    is_near_edge = states_torch < env.min_incr
+    is_bts_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for BTS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_bts = 1
+    distr_bts = Bernoulli(logits=logit_bts)
+    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_source_logit"] = logit_bts
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, False, actions, states_torch, None, masks
+    )
+    assert torch.all(logprobs[is_bts_forced] == 0.0)
+    assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From 25ab3fabf56fa85718d81e928056058f76b084c8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:17:38 -0400
Subject: [PATCH 099/206] Clamp increments when input to log_prob to avoid nan

---
 gflownet/envs/cube.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 4cb797004..9f00d0b78 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1381,9 +1381,9 @@ def _get_logprobs_forward(
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
-            # TODO: deal with increments of 0.0 or 1.0 which will yield nan
+            # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                increments_rel
+                torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
             )
             # Get minimum increments
             min_increments = torch.full_like(
@@ -1456,9 +1456,9 @@ def _get_logprobs_backward(
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
-            # TODO: deal with increments of 0.0 or 1.0 which will yield nan
+            # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                increments_rel
+                torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
             )
             # Set minimum increments
             min_increments = torch.full_like(

From c6b4e0cbf667661d1b336f24d1ab6c1b9eb271dd Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:18:17 -0400
Subject: [PATCH 100/206] Tests including (0.0, 0.0) and (1.0, 1.0) actions.

---
 tests/gflownet/envs/test_ccube.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index e7c264e4e..39add328f 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -720,6 +720,7 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     [
         [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
         [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
+        [[0.0, 0.0], [1.0, 1.0]],
     ],
 )
 def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
@@ -770,6 +771,10 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
         ),
+        (
+            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+        ),
     ],
 )
 def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
@@ -808,6 +813,10 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
         ),
+        (
+            [[1.0, 1.0], [0.0, 0.0]],
+            [[1.0, 1.0], [0.0, 0.0]],
+        ),
     ],
 )
 def test__get_logprobs_backward__2d__bts_actions_return_expected(

From 65aaaace4e99c2f09aa6f4824c940512b76e55ea Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:19:10 -0400
Subject: [PATCH 101/206] Common tests of nonzero backward probs for forward
 actions and vice versa

---
 tests/gflownet/envs/common.py | 68 +++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 5c6888689..6c68ee1a2 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -18,6 +18,8 @@ def test__all_env_common(env):
     test__sample_actions__backward__returns_eos_if_done(env)
     test__get_logprobs__backward__returns_zero_if_done(env)
     test__step_random__does_not_sample_invalid_actions(env)
+    test__forward_actions_have_nonzero_backward_prob(env)
+    test__backward_actions_have_nonzero_forward_prob(env)
     test__get_parents_step_get_mask__are_compatible(env)
     test__sample_backwards_reaches_source(env)
     test__state2readable__is_reversible(env)
@@ -32,6 +34,8 @@ def test__continuous_env_common(env):
     test__sampling_forwards_reaches_done_in_finite_steps(env)
     test__sample_actions__backward__returns_eos_if_done(env)
     test__get_logprobs__backward__returns_zero_if_done(env)
+    test__forward_actions_have_nonzero_backward_prob(env)
+    test__backward_actions_have_nonzero_forward_prob(env)
     test__step__returns_same_state_action_and_invalid_if_done(env)
     test__sample_backwards_reaches_source(env)
 
@@ -319,6 +323,70 @@ def test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env):
         env.step(action)
 
 
+@pytest.mark.repeat(1000)
+def test__forward_actions_have_nonzero_backward_prob(env):
+    env = env.reset()
+    policy_random = torch.unsqueeze(
+        tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0
+    )
+    while not env.done:
+        state_new, action, valid = env.step_random(backward=False)
+        if not valid:
+            continue
+        # Get backward logprobs
+        mask_bw = env.get_mask_invalid_actions_backward()
+        masks = torch.unsqueeze(tbool(mask_bw, device=env.device), 0)
+        actions_torch = torch.unsqueeze(
+            tfloat(action, float_type=env.float, device=env.device), 0
+        )
+        states_torch = torch.unsqueeze(
+            tfloat(env.state, float_type=env.float, device=env.device), 0
+        )
+        policy_outputs = policy_random.clone().detach()
+        logprobs_bw = env.get_logprobs(
+            policy_outputs=policy_outputs,
+            is_forward=False,
+            actions=actions_torch,
+            states_from=states_torch,
+            states_to=None,
+            mask_invalid_actions=masks,
+        )
+        assert torch.exp(logprobs_bw) > 0.0
+
+
+def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
+    states = _get_terminating_states(env, n)
+    policy_random = torch.unsqueeze(
+        tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0
+    )
+    for state in states:
+        env.set_state(state, done=True)
+        while True:
+            if env.equal(env.state, env.source):
+                break
+            state_new, action, valid = env.step_random(backward=True)
+            assert valid
+            # Get forward logprobs
+            mask_fw = env.get_mask_invalid_actions_forward()
+            masks = torch.unsqueeze(tbool(mask_fw, device=env.device), 0)
+            actions_torch = torch.unsqueeze(
+                tfloat(action, float_type=env.float, device=env.device), 0
+            )
+            states_torch = torch.unsqueeze(
+                tfloat(env.state, float_type=env.float, device=env.device), 0
+            )
+            policy_outputs = policy_random.clone().detach()
+            logprobs_fw = env.get_logprobs(
+                policy_outputs=policy_outputs,
+                is_forward=True,
+                actions=actions_torch,
+                states_from=states_torch,
+                states_to=None,
+                mask_invalid_actions=masks,
+            )
+            assert torch.exp(logprobs_fw) > 0.0
+
+
 @pytest.mark.repeat(10)
 def test__init__state_is_source_no_parents(env):
     assert env.equal(env.state, env.source)

From 9392ff2e335b21ea4f9adc5f98929cb243f5e06e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:29:53 -0400
Subject: [PATCH 102/206] Adjust default cube params

---
 config/env/ccube.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index c28535864..48a06ad4f 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -10,7 +10,7 @@ n_dim: 2
 max_val: 1.0
 # Policy
 beta_params_min: 0.1
-beta_params_max: 2.0
+beta_params_max: 1000.0
 min_incr: 0.1
 n_comp: 1
 fixed_distribution:
@@ -22,8 +22,8 @@ fixed_distribution:
 random_distribution:
   beta_weights: 1.0
   # IMPORTANT: adjust because of sigmoid!
-  beta_alpha: 1.0
-  beta_beta: 1.0
+  beta_alpha: $beta_params_max
+  beta_beta: $beta_params_max
   bernoulli_source_logit: 1.0
   bernoulli_eos_logit: 1.0
 # Buffer

From 48cb710d2497d41a2848b38b5e9ecb324dc6a416 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:31:45 -0400
Subject: [PATCH 103/206] Clean up old code.

---
 gflownet/envs/cube.py | 331 ------------------------------------------
 1 file changed, 331 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9f00d0b78..df38cca27 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -967,99 +967,6 @@ def relative_to_absolute_increments(
             )
         return increments_abs
 
-    def sample_actions(
-        self,
-        policy_outputs: TensorType["n_states", "policy_output_dim"],
-        sampling_method: str = "policy",
-        mask_invalid_actions: TensorType["n_states", "1"] = None,
-        temperature_logits: float = 1.0,
-        loginf: float = 1000,
-    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
-        """
-        Samples a batch of actions from a batch of policy outputs.
-        """
-        device = policy_outputs.device
-        n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        mask_nofix = torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
-        idx_nofix = ns_range[mask_nofix]
-        # EOS
-        mask_can_eos = torch.logical_and(mask_nofix, ~mask_invalid_actions[:, -1])
-        idx_can_eos = ns_range[mask_can_eos]
-        distr_eos = Bernoulli(logits=policy_outputs[idx_can_eos, -1])
-        mask_sampled_eos = distr_eos.sample().to(torch.bool)
-        idx_sampled_eos = idx_can_eos[mask_sampled_eos]
-        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
-        # Sample increments
-        mask_sample = torch.zeros(n_states, device=device, dtype=torch.bool)
-        mask_sample[idx_nofix] = True
-        mask_sample[idx_sampled_eos] = False
-        idx_sample = ns_range[mask_sample]
-        mask_source_sample = torch.logical_and(
-            ~mask_invalid_actions[:, self.n_dim], mask_sample
-        )
-        mask_generic_sample = torch.logical_and(
-            mask_invalid_actions[:, self.n_dim], mask_sample
-        )
-        idx_source = ns_range[mask_source_sample]
-        idx_generic = ns_range[mask_generic_sample]
-        n_sample = idx_sample.shape[0]
-        logprobs_sample = torch.zeros(n_states, device=device, dtype=self.float)
-        increments = torch.inf * torch.ones(
-            (n_states, self.n_dim), device=device, dtype=self.float
-        )
-        if len(idx_sample) > 0:
-            if sampling_method == "uniform":
-                distr_increments = Uniform(
-                    torch.zeros(n_sample),
-                    torch.ones(n_sample),
-                )
-            elif sampling_method == "policy":
-                mix_logits = policy_outputs[idx_sample, self.n_dim : -2 : 3].reshape(
-                    -1, self.n_dim, self.n_comp
-                )
-                mix = Categorical(logits=mix_logits)
-                alphas = policy_outputs[idx_sample, self.n_dim + 1 : -2 : 3].reshape(
-                    -1, self.n_dim, self.n_comp
-                )
-                alphas = (
-                    self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-                )
-                betas = policy_outputs[idx_sample, self.n_dim + 2 : -2 : 3].reshape(
-                    -1, self.n_dim, self.n_comp
-                )
-                betas = (
-                    self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
-                )
-                beta_distr = Beta(alphas, betas)
-                distr_increments = MixtureSameFamily(mix, beta_distr)
-            increments[idx_sample, :] = distr_increments.sample()
-            logprobs_sample[idx_sample] = distr_increments.log_prob(
-                increments[idx_sample, :]
-            ).sum(axis=1)
-        # Combined probabilities
-        logprobs = logprobs_eos + logprobs_sample
-        # Set minimum increments
-        min_increments = torch.inf * torch.ones(
-            n_states, device=device, dtype=self.float
-        )
-        min_increments[idx_generic] = self.min_incr
-        min_increments[idx_source] = 0.0
-        # Make increments of near-edge dims 0
-        mask_nearedge_dims = mask_invalid_actions[:, : self.n_dim]
-        mask_sample = torch.zeros(
-            mask_nearedge_dims.shape, device=device, dtype=torch.bool
-        )
-        mask_sample[idx_sample, :] = True
-        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_sample)
-        increments[mask_nearedge_dims] = 0.0
-        # Build actions
-        actions = [
-            tuple(a.tolist() + [m.item()]) for a, m in zip(increments, min_increments)
-        ]
-        # TODO: implement logprobs here too
-        return actions, logprobs
-
     def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1484,244 +1391,6 @@ def _get_logprobs_backward(
         logprobs[is_eos] = 0.0
         return logprobs
 
-    def get_logprobs_old(
-        self,
-        policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
-        actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
-        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
-        loginf: float = 1000,
-    ) -> TensorType["batch_size"]:
-        """
-        Computes log probabilities of actions given policy outputs and actions.
-
-        For forward transitons, at every state, the probability of the EOS action is
-        p(EOS). Otherwise, the probability of an increment incr is p(incr) * (1 -
-        p(EOS)). When a dimension is larger than 1 - min_incr, the probabililty of
-        incrementing that dimension by 0 is 1.
-
-        For backward transitons, at every state, the probability of the back-to-source
-        action is p(back-to-source). Otherwise, the probability of an increment
-        (decrement) incr is p(incr) * (1 - p(back-to-source)). When a dimension is
-        larger than 1 - min_incr, the probabililty of incrementing that dimension by 0
-        must be non-zero and is p(zeroincr). In turn, the probability of sampling a
-        non-zero increment incr is (1 - p(zeroincr)) * p(incr).
-
-        Overall, we compute the log probabilities as follows:
-
-        log p = logprobs_eos + logprobs_source + logprobs_increments + logprobs_zeroincr
-
-        - logprobs_eos:
-            - 0, that is p(~EOS) = 1 for backward transitions.
-            - forward, the log p of the sampled event (EOS or not EOS)
-
-        - logprobs_source:
-            - 0, that is p(~source) = 1 for forward transitions.
-            - 0, that is p(~source) = 1 for backward transitions when any dimension is
-              smaller than min_incr.
-            - backward, the log p of the sampled event (source or not source)
-
-        - logprobs_increments:
-            - 0, that is p(~increment) = 1 for EOS or source events.
-            - otherwise, the log p of sampling the increment.
-
-        - logprobs_zeroincr:
-            - 0, that is p(~zeroincr) = 1 for forward transitions.
-            - 0, that is p(~zeroincr) = 1 for for dimensions that are smaller than or
-              equal to 1 - min_incr, backwards.
-            - otherwise, the log p of the sampled event (sampled 0 or not).
-        """
-        device = policy_outputs.device
-        n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        # Determine which states have non-deterministic actions
-        if is_forward:
-            # EOS is the only valid action if all dimensions are invalid. That is, the
-            # action is non-deterministic if any dimension is valid (i.e. mask = False).
-            mask_nofix = torch.any(~mask_invalid_actions[:, : self.n_dim], axis=1)
-            idx_nofix = ns_range[mask_nofix]
-        else:
-            # The action is non-deterministic if sampling EOS (last value of mask) is
-            # invalid (True) and back-to-source (second to last) is not the only action
-            # (True).
-            mask_nofix = torch.logical_and(
-                mask_invalid_actions[:, -1], mask_invalid_actions[:, -2]
-            )
-            idx_nofix = ns_range[mask_nofix]
-        # Log probs of EOS and source (backwards) actions
-        logprobs_eos = torch.zeros(n_states, device=device, dtype=self.float)
-        logprobs_source = torch.zeros(n_states, device=device, dtype=self.float)
-        if is_forward:
-            mask_eos = torch.all(actions[idx_nofix] == torch.inf, axis=1)
-            distr_eos = Bernoulli(logits=policy_outputs[idx_nofix, -1])
-            logprobs_eos[idx_nofix] = distr_eos.log_prob(mask_eos.to(self.float))
-            mask_sample = ~mask_eos
-        else:
-            source = torch.tensor(self.source, device=device)
-            mask_source = torch.all(states_to[idx_nofix] == source, axis=1)
-            distr_source = Bernoulli(logits=policy_outputs[idx_nofix, -2])
-            logprobs_source[idx_nofix] = distr_source.log_prob(
-                mask_source.to(self.float)
-            )
-            mask_sample = ~mask_source
-        # Log probs of sampled increments
-        idx_sample = idx_nofix[mask_sample]
-        logprobs_increments = torch.zeros(
-            (n_states, self.n_dim), device=device, dtype=self.float
-        )
-        logprobs_zeroincr = torch.zeros(
-            (n_states, self.n_dim), device=device, dtype=self.float
-        )
-        # Build mask of near-edge values
-        mask_nearedge_dims = ~mask_invalid_actions[:, : self.n_dim]
-        mask_idx_sample = torch.zeros(
-            mask_nearedge_dims.shape, device=device, dtype=torch.bool
-        )
-        mask_idx_sample[idx_sample, :] = True
-        mask_nearedge_dims = torch.logical_and(mask_nearedge_dims, mask_idx_sample)
-        if len(idx_sample) > 0:
-            mix_logits = policy_outputs[idx_sample, self.n_dim : -2 : 3].reshape(
-                -1, self.n_dim, self.n_comp
-            )
-            mix = Categorical(logits=mix_logits)
-            alphas = policy_outputs[idx_sample, self.n_dim + 1 : -2 : 3].reshape(
-                -1, self.n_dim, self.n_comp
-            )
-            alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-            betas = policy_outputs[idx_sample, self.n_dim + 2 : -2 : 3].reshape(
-                -1, self.n_dim, self.n_comp
-            )
-            betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
-            beta_distr = Beta(alphas, betas)
-            distr_increments = MixtureSameFamily(mix, beta_distr)
-            increments_f = actions[:, :-1].clone().detach()
-            # Compute backward relative increments (r_b) from forward relative
-            # increments (r_f)
-            # Forward (s -> s'): s' = s + m + r_f * (1 - s - m)
-            # Forward: r_f = (s' - s - m) / (1 - s - m)
-            # Backward (s' -> s): s = (s' - m - r_f * (1 - m) / (1 - r_f)
-            # Backward (s' -> s): s = s' - m - r_b * (s' - m)
-            # Backward: r_b = (s' - s - m) / (s' - m)
-            # r_b = r_f (1 - s - m) / (s' - m)
-            if not is_forward:
-                increments_b = (
-                    increments_f
-                    * (1 - states_to - self.min_incr)
-                    / (states_from - self.min_incr)
-                )
-                increments_b = torch.clip(increments_b, min=1e-6, max=1.0 - 1e-6)
-                increments = increments_b
-            else:
-                increments = increments_f
-            logprobs_increments[idx_sample] = distr_increments.log_prob(
-                increments[idx_sample]
-            )
-            # Make logprobs of "invalid" dimensions (value larger than 1 - mincr) 0.
-            # TODO: indexing can be done more efficiently to avoid sampling from the
-            # distribution above.
-            logprobs_increments[mask_nearedge_dims] = 0.0
-            # Log probs of sampling zero increments
-            if not is_forward:
-                mask_zeroincr = increments[mask_nearedge_dims] == 0.0
-                logits_zeroincr = policy_outputs[:, : self.n_dim][mask_nearedge_dims]
-                distr_zeroincr = Bernoulli(logits=logits_zeroincr)
-                logprobs_zeroincr[mask_nearedge_dims] = distr_zeroincr.log_prob(
-                    mask_zeroincr.to(self.float)
-                )
-                # TODO: make logprobs_increments = 0 if increment was zero and
-                # near-edge. Already done?
-        # Log determinant of the Jacobian
-        min_increments = self.min_incr * torch.ones(
-            len(idx_sample), device=device, dtype=self.float
-        )
-        if is_forward:
-            mask_source_sample = ~mask_invalid_actions[idx_sample, -2]
-            min_increments[mask_source_sample] = 0.0
-        jacobian_diag = torch.ones(
-            (n_states, self.n_dim), device=device, dtype=self.float
-        )
-        jacobian_diag[idx_sample] = self.get_jacobian_diag(
-            states_from[idx_sample], is_forward, min_increments
-        )
-        jacobian_diag[mask_nearedge_dims] = 1.0
-        log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
-        # Combined probabilities
-        sumlogprobs_increments = logprobs_increments.sum(axis=1)
-        sumlogprobs_zeroincr = logprobs_zeroincr.sum(axis=1)
-        logprobs = (
-            logprobs_eos
-            + logprobs_source
-            + sumlogprobs_increments
-            + sumlogprobs_zeroincr
-            + log_det_jacobian
-        )
-        # Sanity checks
-        assert not torch.any(torch.isnan(logprobs))
-        assert not torch.any(torch.isinf(logprobs))
-        if is_forward:
-            mask_fix = torch.all(mask_invalid_actions[:, : self.n_dim], axis=1)
-            assert torch.all(logprobs_source == 0.0)
-            assert torch.all(logprobs_zeroincr == 0.0)
-            assert torch.all(sumlogprobs_increments[idx_nofix][mask_eos] == 0.0)
-            mask_fixdim = mask_invalid_actions[:, : self.n_dim]
-            assert torch.all(logprobs_increments[mask_fixdim] == 0.0)
-        else:
-            mask_fix = ~mask_invalid_actions[:, -1]
-            assert torch.all(logprobs_eos == 0.0)
-            assert torch.all(sumlogprobs_increments[idx_nofix][mask_source] == 0.0)
-            assert torch.all(sumlogprobs_zeroincr[idx_nofix][mask_source] == 0.0)
-            mask_nozeroincr = mask_invalid_actions[:, : self.n_dim]
-            assert torch.all(logprobs_zeroincr[mask_nozeroincr] == 0.0)
-        assert torch.all(logprobs[mask_fix] == 0.0)
-        return logprobs
-
-    # TODO: min_incr is zero from source!
-    def get_jacobian_diag_old(
-        self,
-        states: TensorType["batch_size", "state_dim"],
-        is_forward: bool,
-        min_increments: TensorType["batch_size"],
-    ):
-        """
-        Computes the diagonal of the Jacobian of the sampled actions with respect to
-        the states.
-
-        Forward: the sampled variables are the relative increments r_f and the state
-        updates (s -> s') are:
-
-        s' = s + m + r_f(1 - s - m)
-        r_f = (s' - s - m) / (1 - s - m)
-
-        Therefore, the derivative of r_f wrt to s' is
-
-        dr_f/ds' = 1 / (1 - s - m)
-
-        Backward: the sampled variables are the relative decrements r_b and the state
-        updates (s' -> s) are:
-
-        s = s' - m - r_b(s' - m)
-        r_b = (s' - s - m) / (s' - m)
-
-        Therefore, the derivative of r_b wrt to s is
-
-        dr_b/ds = -1 / (s' - m)
-
-        We take the absolute value of the derivative (Jacobian).
-
-        The derivatives of the components of r with respect to dimensions of s or s'
-        other than itself are zero. Therefore, the Jacobian is diagonal and the
-        determinant is the product of the diagonal.
-        """
-        epsilon = 1e-9
-        min_increments = min_increments.unsqueeze(-1).repeat(1, states.shape[1])
-        if is_forward:
-            return 1.0 / ((1 - states - min_increments) + epsilon)
-        else:
-            return 1.0 / ((states - min_increments) + epsilon)
-
-    # TODO: min_incr is zero from source!
     @staticmethod
     def _get_jacobian_diag(
         states_from: TensorType["n_states", "n_dim"],

From 39397837cc54306cdca7edfdbf80ef25c21bb2d9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:32:18 -0400
Subject: [PATCH 104/206] Adjust default cube params

---
 gflownet/envs/cube.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index df38cca27..46080349b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -47,7 +47,7 @@ def __init__(
         min_incr: float = 0.1,
         n_comp: int = 1,
         beta_params_min: float = 0.1,
-        beta_params_max: float = 2.0,
+        beta_params_max: float = 1000.0,
         fixed_distr_params: dict = {
             "beta_weights": 1.0,
             "beta_alpha": 2.0,
@@ -57,8 +57,8 @@ def __init__(
         },
         random_distr_params: dict = {
             "beta_weights": 1.0,
-            "beta_alpha": 1.0,
-            "beta_beta": 1.0,
+            "beta_alpha": 1000.0,
+            "beta_beta": 1000.0,
             "bernoulli_source_logit": 1.0,
             "bernoulli_eos_logit": 1.0,
         },

From 353414be7fe1565f6a677d0857f0dc6c641973f9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:37:19 -0400
Subject: [PATCH 105/206] Remove states_to from get_logprobs in envs.

---
 gflownet/envs/base.py   |  1 -
 gflownet/envs/ctorus.py |  1 -
 gflownet/envs/cube.py   | 10 ++--------
 gflownet/envs/htorus.py |  2 +-
 gflownet/envs/tree.py   | 14 +++++++-------
 5 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index 7ed941bc0..d84180514 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -508,7 +508,6 @@ def get_logprobs(
         is_forward: bool,
         actions: TensorType["n_states", "actions_dim"],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index b61622dbb..164bcd263 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -284,7 +284,6 @@ def get_logprobs(
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 46080349b..a3bcc8bfd 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -215,7 +215,6 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -548,7 +547,6 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
         loginf: float = 1000,
     ) -> TensorType["batch_size"]:
@@ -1212,7 +1210,6 @@ def _sample_actions_batch_backward(
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
-    # TODO: Remove need for states_to?
     # TODO: reorganise args
     # TODO: mask_invalid_actions -> mask
     # TODO: states_from must be tensor or could be list?
@@ -1222,7 +1219,6 @@ def get_logprobs(
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -1230,11 +1226,11 @@ def get_logprobs(
         """
         if is_forward:
             return self._get_logprobs_forward(
-                policy_outputs, actions, states_from, states_to, mask_invalid_actions
+                policy_outputs, actions, states_from, mask_invalid_actions
             )
         else:
             return self._get_logprobs_backward(
-                policy_outputs, actions, states_from, states_to, mask_invalid_actions
+                policy_outputs, actions, states_from, mask_invalid_actions
             )
 
     # TODO: Unify sample_actions and get_logprobs
@@ -1243,7 +1239,6 @@ def _get_logprobs_forward(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -1320,7 +1315,6 @@ def _get_logprobs_backward(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index d780f6f50..6a85a11c0 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -394,13 +394,13 @@ def sample_actions_batch(
         ]
         return actions, logprobs
 
+    # TODO: requires states_to but it is deprecated anyway
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
         states_from: TensorType["n_states", "policy_input_dim"],
-        states_to: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index d9ee0652c..acb21aa81 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -754,15 +754,15 @@ def get_logprobs_continuous(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
         """
         n_states = policy_outputs.shape[0]
-        if states_target is None:
-            states_target = torch.empty(
+        if states_from is None:
+            states_from = torch.empty(
                 (n_states, self.policy_input_dim), device=self.device
             )
         logprobs = torch.zeros(n_states, device=self.device, dtype=self.float)
@@ -776,7 +776,7 @@ def get_logprobs_continuous(
                 policy_outputs_discrete,
                 is_forward,
                 actions[mask_discrete],
-                states_target[mask_discrete],
+                states_from[mask_discrete],
                 mask_invalid_actions[
                     mask_discrete, : self._index_continuous_policy_output
                 ],
@@ -806,7 +806,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_target: TensorType["n_states", "policy_input_dim"],
+        states_from: TensorType["n_states", "policy_input_dim"],
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -817,7 +817,7 @@ def get_logprobs(
                 policy_outputs=policy_outputs,
                 is_forward=is_forward,
                 actions=actions,
-                states_target=states_target,
+                states_from=states_from,
                 mask_invalid_actions=mask_invalid_actions,
             )
         else:
@@ -825,7 +825,7 @@ def get_logprobs(
                 policy_outputs=policy_outputs,
                 is_forward=is_forward,
                 actions=actions,
-                states_target=states_target,
+                states_from=states_from,
                 mask_invalid_actions=mask_invalid_actions,
             )
 

From a24a2a3c8f8f6517fe8d03b0f5e1f0f0cf9506b6 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:39:39 -0400
Subject: [PATCH 106/206] Remove states_to from get_logprobs in tests.

---
 tests/gflownet/envs/common.py     |  5 +----
 tests/gflownet/envs/test_ccube.py | 10 +++++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 6c68ee1a2..683b780a3 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -187,7 +187,7 @@ def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     masks = tbool(masks, device=env.device)
-    logprobs = env.get_logprobs(policy_outputs, False, actions_eos, states, None, masks)
+    logprobs = env.get_logprobs(policy_outputs, False, actions_eos, states, masks)
     assert torch.all(logprobs == 0.0)
 
 
@@ -314,7 +314,6 @@ def test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env):
             is_forward=True,
             actions=actions_torch,
             states_from=None,
-            states_to=None,
             mask_invalid_actions=masks_invalid_torch,
         )
         action = actions[0]
@@ -348,7 +347,6 @@ def test__forward_actions_have_nonzero_backward_prob(env):
             is_forward=False,
             actions=actions_torch,
             states_from=states_torch,
-            states_to=None,
             mask_invalid_actions=masks,
         )
         assert torch.exp(logprobs_bw) > 0.0
@@ -381,7 +379,6 @@ def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
                 is_forward=True,
                 actions=actions_torch,
                 states_from=states_torch,
-                states_to=None,
                 mask_invalid_actions=masks,
             )
             assert torch.exp(logprobs_fw) > 0.0
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 39add328f..5773178b1 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -662,7 +662,7 @@ def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actio
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
     logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, None, masks
+        policy_outputs, True, actions, states_torch, masks
     )
     assert torch.all(logprobs == 0.0)
 
@@ -709,7 +709,7 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, None, masks
+        policy_outputs, True, actions, states_torch, masks
     )
     assert torch.all(logprobs[is_eos_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
@@ -759,7 +759,7 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, None, masks
+        policy_outputs, True, actions, states_torch, masks
     )
     assert torch.all(logprobs == 0.0)
 
@@ -797,7 +797,7 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
     logprobs = env.get_logprobs(
-        policy_outputs, False, actions, states_torch, None, masks
+        policy_outputs, False, actions, states_torch, masks
     )
     assert torch.all(logprobs == 0.0)
 
@@ -848,7 +848,7 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
-        policy_outputs, False, actions, states_torch, None, masks
+        policy_outputs, False, actions, states_torch, masks
     )
     assert torch.all(logprobs[is_bts_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))

From 05385ae088afa71f84e4fe65fb42495c5e76266f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:41:01 -0400
Subject: [PATCH 107/206] Fix hanging old name of method

---
 gflownet/envs/cube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index a3bcc8bfd..0da0c1553 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -197,7 +197,7 @@ def get_parents(
         pass
 
     @abstractmethod
-    def sample_actions(
+    def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         sampling_method: str = "policy",
@@ -481,7 +481,7 @@ def get_parents(
             parents = [state]
             return parents, [action]
 
-    def sample_actions(
+    def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         sampling_method: str = "policy",

From 045aa0ca7c91c52622e3e8978c9ee3cfbde87c98 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 10 Sep 2023 23:48:58 -0400
Subject: [PATCH 108/206] Replace taking exponential of logprobs by comparing
 with -1e6

---
 tests/gflownet/envs/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 683b780a3..56a517184 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -349,7 +349,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
             states_from=states_torch,
             mask_invalid_actions=masks,
         )
-        assert torch.exp(logprobs_bw) > 0.0
+        assert logprobs_bw > -1e6
 
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
@@ -381,7 +381,7 @@ def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
                 states_from=states_torch,
                 mask_invalid_actions=masks,
             )
-            assert torch.exp(logprobs_fw) > 0.0
+            assert logprobs_fw > -1e6
 
 
 @pytest.mark.repeat(10)

From a189cc53e6ddf43b0cd3169ae64b4cbf471ede2c Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 00:01:43 -0400
Subject: [PATCH 109/206] Add continuous attribute to cube

---
 config/env/ccube.yaml | 1 +
 gflownet/envs/cube.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 48a06ad4f..7135b071d 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -4,6 +4,7 @@ defaults:
 _target_: gflownet.envs.cube.ContinuousCube
 
 id: ccube
+continuous: True
 func: corners
 # Dimensions of hypercube
 n_dim: 2
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0da0c1553..b8773ae77 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -68,7 +68,6 @@ def __init__(
         assert max_val > 0.0
         assert n_comp > 0
         # Main properties
-        self.continuous = True
         self.n_dim = n_dim
         self.eos = self.n_dim
         self.max_val = max_val
@@ -97,6 +96,7 @@ def __init__(
             random_distr_params=random_distr_params,
             **kwargs,
         )
+        self.continuous = True
 
     @abstractmethod
     def get_action_space(self):

From 466090082daf4301b8d3066f8865f1c33e24529a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 00:05:51 -0400
Subject: [PATCH 110/206] Restore old test method

---
 gflownet/gflownet.py | 196 ++++++++++++++++++++++---------------------
 1 file changed, 99 insertions(+), 97 deletions(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index c95d34be2..9d07caae3 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -954,117 +954,119 @@ def test(self, **plot_kwargs):
         Computes metrics by sampling trajectories from the forward policy.
         """
         if self.buffer.test_pkl is None:
-            l1, kl, jsd, corr_prob_traj_rewards, nll_tt = (
+            return (
                 self.l1,
                 self.kl,
                 self.jsd,
                 self.corr_prob_traj_rewards,
                 self.nll_tt,
+                (None,),
+                {},
             )
-            # TODO: Improve conditions where x_sampled is obtained
-            x_sampled = None
-        else:
-            with open(self.buffer.test_pkl, "rb") as f:
-                dict_tt = pickle.load(f)
-                x_tt = dict_tt["x"]
-            # Compute correlation between the rewards of the test data and the log
-            # likelihood of the data according the the GFlowNet policy; and NLL.
-            # TODO: organise code for better efficiency and readability
-            logprobs_x_tt = self.estimate_logprobs_data(
-                x_tt,
-                n_trajectories=self.logger.test.n_trajs_logprobs,
-                max_data_size=self.logger.test.max_data_logprobs,
+        with open(self.buffer.test_pkl, "rb") as f:
+            dict_tt = pickle.load(f)
+            x_tt = dict_tt["x"]
+
+        # Compute correlation between the rewards of the test data and the log
+        # likelihood of the data according the the GFlowNet policy; and NLL.
+        # TODO: organise code for better efficiency and readability
+        logprobs_x_tt = self.estimate_logprobs_data(
+            x_tt,
+            n_trajectories=self.logger.test.n_trajs_logprobs,
+            max_data_size=self.logger.test.max_data_logprobs,
+        )
+        rewards_x_tt = self.env.reward_batch(x_tt)
+        corr_prob_traj_rewards = np.corrcoef(
+            np.exp(logprobs_x_tt.cpu().numpy()), rewards_x_tt
+        )[0, 1]
+        nll_tt = -logprobs_x_tt.mean().item()
+
+        batch, _ = self.sample_batch(n_forward=self.logger.test.n, train=False)
+        assert batch.is_valid()
+        x_sampled = batch.get_terminating_states()
+
+        if self.buffer.test_type is not None and self.buffer.test_type == "all":
+            if "density_true" in dict_tt:
+                density_true = dict_tt["density_true"]
+            else:
+                rewards = self.env.reward_batch(x_tt)
+                z_true = rewards.sum()
+                density_true = rewards / z_true
+                with open(self.buffer.test_pkl, "wb") as f:
+                    dict_tt["density_true"] = density_true
+                    pickle.dump(dict_tt, f)
+            hist = defaultdict(int)
+            for x in x_sampled:
+                hist[tuple(x)] += 1
+            z_pred = sum([hist[tuple(x)] for x in x_tt]) + 1e-9
+            density_pred = np.array([hist[tuple(x)] / z_pred for x in x_tt])
+            log_density_true = np.log(density_true + 1e-8)
+            log_density_pred = np.log(density_pred + 1e-8)
+        elif self.buffer.test_type == "random":
+            # TODO: refactor
+            env_metrics = self.env.test(x_sampled)
+            return (
+                self.l1,
+                self.kl,
+                self.jsd,
+                self.corr_prob_traj_rewards,
+                self.nll_tt,
+                (None,),
+                env_metrics,
             )
-            rewards_x_tt = self.env.reward_batch(x_tt)
-            corr_prob_traj_rewards = np.corrcoef(
-                np.exp(logprobs_x_tt.cpu().numpy()), rewards_x_tt
-            )[0, 1]
-            nll_tt = -logprobs_x_tt.mean().item()
-
-            batch, _ = self.sample_batch(n_forward=self.logger.test.n, train=False)
-            assert batch.is_valid()
-            x_sampled = batch.get_terminating_states()
-
-            if self.buffer.test_type is not None and self.buffer.test_type == "all":
-                if "density_true" in dict_tt:
-                    density_true = dict_tt["density_true"]
-                else:
-                    rewards = self.env.reward_batch(x_tt)
-                    z_true = rewards.sum()
-                    density_true = rewards / z_true
-                    with open(self.buffer.test_pkl, "wb") as f:
-                        dict_tt["density_true"] = density_true
-                        pickle.dump(dict_tt, f)
-                hist = defaultdict(int)
-                for x in x_sampled:
-                    hist[tuple(x)] += 1
-                z_pred = sum([hist[tuple(x)] for x in x_tt]) + 1e-9
-                density_pred = np.array([hist[tuple(x)] / z_pred for x in x_tt])
-                log_density_true = np.log(density_true + 1e-8)
-                log_density_pred = np.log(density_pred + 1e-8)
-            elif self.buffer.test_type == "random":
-                # TODO: refactor
-                env_metrics = self.env.test(x_sampled)
-                return l1, kl, jsd, corr_prob_traj_rewards, nll_tt, (None,), env_metrics
-            elif self.continuous:
-                # TODO make it work with conditional env
-                x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
-                x_tt = torch2np(self.env.statebatch2proxy(x_tt))
-                kde_pred = self.env.fit_kde(
-                    x_sampled,
+        elif self.continuous:
+            # TODO make it work with conditional env
+            x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
+            x_tt = torch2np(self.env.statebatch2proxy(x_tt))
+            kde_pred = self.env.fit_kde(
+                x_sampled,
+                kernel=self.logger.test.kde.kernel,
+                bandwidth=self.logger.test.kde.bandwidth,
+            )
+            if "log_density_true" in dict_tt and "kde_true" in dict_tt:
+                log_density_true = dict_tt["log_density_true"]
+                kde_true = dict_tt["kde_true"]
+            else:
+                # Sample from reward via rejection sampling
+                x_from_reward = self.env.sample_from_reward(
+                    n_samples=self.logger.test.n
+                )
+                x_from_reward = torch2np(self.env.statetorch2proxy(x_from_reward))
+                # Fit KDE with samples from reward
+                kde_true = self.env.fit_kde(
+                    x_from_reward,
                     kernel=self.logger.test.kde.kernel,
                     bandwidth=self.logger.test.kde.bandwidth,
                 )
-                if "log_density_true" in dict_tt and "kde_true" in dict_tt:
-                    log_density_true = dict_tt["log_density_true"]
-                    kde_true = dict_tt["kde_true"]
-                else:
-                    # Sample from reward via rejection sampling
-                    x_from_reward = self.env.sample_from_reward(
-                        n_samples=self.logger.test.n
-                    )
-                    x_from_reward = torch2np(self.env.statetorch2proxy(x_from_reward))
-                    # Fit KDE with samples from reward
-                    kde_true = self.env.fit_kde(
-                        x_from_reward,
-                        kernel=self.logger.test.kde.kernel,
-                        bandwidth=self.logger.test.kde.bandwidth,
-                    )
-                    # Estimate true log density using test samples
-                    # TODO: this may be specific-ish for the torus or not
-                    scores_true = kde_true.score_samples(x_tt)
-                    log_density_true = scores_true - logsumexp(scores_true, axis=0)
-                    # Add log_density_true and kde_true to pickled test dict
-                    with open(self.buffer.test_pkl, "wb") as f:
-                        dict_tt["log_density_true"] = log_density_true
-                        dict_tt["kde_true"] = kde_true
-                        pickle.dump(dict_tt, f)
-                # Estimate pred log density using test samples
+                # Estimate true log density using test samples
                 # TODO: this may be specific-ish for the torus or not
-                scores_pred = kde_pred.score_samples(x_tt)
-                log_density_pred = scores_pred - logsumexp(scores_pred, axis=0)
-                density_true = np.exp(log_density_true)
-                density_pred = np.exp(log_density_pred)
-            else:
-                raise NotImplementedError
-            # L1 error
-            l1 = np.abs(density_pred - density_true).mean()
-            # KL divergence
-            kl = (density_true * (log_density_true - log_density_pred)).mean()
-            # Jensen-Shannon divergence
-            log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(
-                0.5
-            )
-            jsd = 0.5 * np.sum(density_true * (log_density_true - log_mean_dens))
-            jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
+                scores_true = kde_true.score_samples(x_tt)
+                log_density_true = scores_true - logsumexp(scores_true, axis=0)
+                # Add log_density_true and kde_true to pickled test dict
+                with open(self.buffer.test_pkl, "wb") as f:
+                    dict_tt["log_density_true"] = log_density_true
+                    dict_tt["kde_true"] = kde_true
+                    pickle.dump(dict_tt, f)
+            # Estimate pred log density using test samples
+            # TODO: this may be specific-ish for the torus or not
+            scores_pred = kde_pred.score_samples(x_tt)
+            log_density_pred = scores_pred - logsumexp(scores_pred, axis=0)
+            density_true = np.exp(log_density_true)
+            density_pred = np.exp(log_density_pred)
+        else:
+            raise NotImplementedError
+        # L1 error
+        l1 = np.abs(density_pred - density_true).mean()
+        # KL divergence
+        kl = (density_true * (log_density_true - log_density_pred)).mean()
+        # Jensen-Shannon divergence
+        log_mean_dens = np.logaddexp(log_density_true, log_density_pred) + np.log(0.5)
+        jsd = 0.5 * np.sum(density_true * (log_density_true - log_mean_dens))
+        jsd += 0.5 * np.sum(density_pred * (log_density_pred - log_mean_dens))
 
         # Plots
+
         if hasattr(self.env, "plot_reward_samples"):
-            if x_sampled is None:
-                batch, _ = self.sample_batch(n_forward=self.logger.test.n, train=False)
-                assert batch.is_valid()
-                x_sampled = batch.get_terminating_states()
-                x_sampled = torch2np(self.env.statebatch2proxy(x_sampled))
             fig_reward_samples = self.env.plot_reward_samples(x_sampled, **plot_kwargs)
         else:
             fig_reward_samples = None

From c41ccda3ac65bcfbd7eec87e779b3ede7cf20b05 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 08:39:20 -0400
Subject: [PATCH 111/206] Change conversions to proxy and policy

---
 gflownet/envs/cube.py | 53 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index b8773ae77..8fa6a87f0 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -82,11 +82,7 @@ def __init__(
         self.action_source = (self.n_dim, 0)
         # End-of-sequence action: (n_dim + 1, 0)
         self.eos = (self.n_dim + 1, 0)
-        # Conversions: only conversions to policy are implemented and the rest are the
-        # same
-        self.state2proxy = self.state2policy
-        self.statebatch2proxy = self.statebatch2policy
-        self.statetorch2proxy = self.statetorch2policy
+        # Conversions
         self.state2oracle = self.state2proxy
         self.statebatch2oracle = self.statebatch2proxy
         self.statetorch2oracle = self.statetorch2proxy
@@ -118,7 +114,7 @@ def get_mask_invalid_actions_forward(
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         pass
 
-    def statetorch2policy(
+    def statetorch2proxy(
         self, states: TensorType["batch", "state_dim"] = None
     ) -> TensorType["batch", "policy_input_dim"]:
         """
@@ -131,7 +127,7 @@ def statetorch2policy(
         """
         return 2.0 * torch.clip(states, min=0.0, max=self.max_val) - 1.0
 
-    def statebatch2policy(
+    def statebatch2proxy(
         self, states: List[List]
     ) -> TensorType["batch", "state_proxy_dim"]:
         """
@@ -142,11 +138,11 @@ def statebatch2policy(
         state : list
             State
         """
-        return self.statetorch2policy(
-            torch.tensor(states, device=self.device, dtype=self.float)
+        return self.statetorch2proxy(
+            tfloat(states, device=self.device, float_type=self.float)
         )
 
-    def state2policy(self, state: List = None) -> List:
+    def state2proxy(self, state: List = None) -> List:
         """
         Clips the state into [0, max_val] and maps it to [-1.0, 1.0]
         """
@@ -154,6 +150,43 @@ def state2policy(self, state: List = None) -> List:
             state = self.state.copy()
         return [2.0 * min(max(0.0, s), self.max_val) - 1.0 for s in state]
 
+    # TODO: Check issue with get_logprobs using states_from in policy format.
+    def statetorch2policy(
+        self, states: TensorType["batch", "state_dim"] = None
+    ) -> TensorType["batch", "policy_input_dim"]:
+        """
+        Clips the states into [0, max_val]
+
+        Args
+        ----
+        state : list
+            State
+        """
+        return torch.clip(states, min=0.0, max=self.max_val)
+
+    def statebatch2policy(
+        self, states: List[List]
+    ) -> TensorType["batch", "state_proxy_dim"]:
+        """
+        Clips the states into [0, max_val]
+
+        Args
+        ----
+        state : list
+            State
+        """
+        return self.statetorch2policy(
+            tfloat(states, device=self.device, float_type=self.float)
+        )
+
+    def state2policy(self, state: List = None) -> List:
+        """
+        Clips the state into [0, max_val]
+        """
+        if state is None:
+            state = self.state.copy()
+        return [min(max(0.0, s), self.max_val) for s in state]
+
     def state2readable(self, state: List) -> str:
         """
         Converts a state (a list of positions) into a human-readable string

From bb55e2fa51f145d3e301d48af76ddca9d36b61be Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 08:42:00 -0400
Subject: [PATCH 112/206] Add assert that logprobs are finite.

---
 tests/gflownet/envs/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 56a517184..5be3286dd 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -349,6 +349,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
             states_from=states_torch,
             mask_invalid_actions=masks,
         )
+        assert torch.isfinite(logprobs_bw)
         assert logprobs_bw > -1e6
 
 
@@ -381,6 +382,7 @@ def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
                 states_from=states_torch,
                 mask_invalid_actions=masks,
             )
+            assert torch.isfinite(logprobs_fw)
             assert logprobs_fw > -1e6
 
 

From 6465a2bff740b28cdd174e7b712659a58dfc3c3b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 08:42:41 -0400
Subject: [PATCH 113/206] Add assert that logprobs are finite.

---
 tests/gflownet/envs/test_ccube.py | 64 +++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 5773178b1..c267fdcc3 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -661,9 +661,7 @@ def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actio
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, masks
-    )
+    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
     assert torch.all(logprobs == 0.0)
 
 
@@ -708,9 +706,7 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     params["bernoulli_eos_logit"] = logit_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, masks
-    )
+    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
     assert torch.all(logprobs[is_eos_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
 
@@ -758,12 +754,54 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     params["bernoulli_eos_logit"] = logit_force_noeos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, True, actions, states_torch, masks
-    )
+    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
     assert torch.all(logprobs == 0.0)
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
+            [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
+        ),
+        (
+            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
+            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+        ),
+        (
+            [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
+            [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__notnan(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for EOS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
+    assert torch.all(logprobs[is_eos_forced] == 0.0)
+    assert torch.all(torch.isfinite(logprobs))
+
+
 @pytest.mark.parametrize(
     "states, actions",
     [
@@ -796,9 +834,7 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, False, actions, states_torch, masks
-    )
+    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
     assert torch.all(logprobs == 0.0)
 
 
@@ -847,9 +883,7 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     params["bernoulli_source_logit"] = logit_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, False, actions, states_torch, masks
-    )
+    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
     assert torch.all(logprobs[is_bts_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
 

From 62f2f025dd6e90e23dc35a1f1f66dd3ae33dee66 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 08:44:30 -0400
Subject: [PATCH 114/206] Add TODO

---
 gflownet/gflownet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 9d07caae3..46072a501 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -855,6 +855,7 @@ def train(self):
                     )  # returns (opt loss, *metrics)
                 else:
                     print("Unknown loss!")
+                # TODO: deal with this in a better way
                 if not all([torch.isfinite(loss) for loss in losses]):
                     if self.logger.debug:
                         print("Loss is not finite - skipping iteration")

From a43dcb0c3c903e8db6c66280932d8cecb121ffe5 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 08:58:13 -0400
Subject: [PATCH 115/206] Fix that get_logprobs truly receives states_from and
 not states_target

---
 gflownet/envs/base.py   |  3 ++-
 gflownet/envs/ctorus.py |  2 +-
 gflownet/envs/cube.py   | 32 ++++++++++++++++----------------
 gflownet/envs/htorus.py |  2 +-
 gflownet/envs/tree.py   |  5 +++--
 gflownet/gflownet.py    | 14 ++++++++------
 6 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index d84180514..ca25b7d82 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -502,12 +502,13 @@ def sample_actions_batch(
         actions = [self.action_space[idx] for idx in action_indices]
         return actions, logprobs
 
+    # TODO: Extend docstring
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "actions_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 164bcd263..b4383774d 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -283,7 +283,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 8fa6a87f0..28994f97c 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1002,7 +1002,7 @@ def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
-        states_from: Optional[List] = None,
+        states_from: List = None,
         is_backward: Optional[bool] = False,
         sampling_method: Optional[str] = "policy",
         temperature_logits: Optional[float] = 1.0,
@@ -1043,7 +1043,7 @@ def _sample_actions_batch_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
-        states_from: Optional[List] = None,
+        states_from: List = None,
         sampling_method: Optional[str] = "policy",
         temperature_logits: Optional[float] = 1.0,
         max_sampling_attempts: Optional[int] = 10,
@@ -1145,7 +1145,7 @@ def _sample_actions_batch_backward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         mask: Optional[TensorType["n_states", "policy_output_dim"]] = None,
-        states_from: Optional[List] = None,
+        states_from: List = None,
         sampling_method: Optional[str] = "policy",
         temperature_logits: Optional[float] = 1.0,
         max_sampling_attempts: Optional[int] = 10,
@@ -1245,13 +1245,13 @@ def _sample_actions_batch_backward(
 
     # TODO: reorganise args
     # TODO: mask_invalid_actions -> mask
-    # TODO: states_from must be tensor or could be list?
+    # TODO: Add docstring
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: List,
         mask_invalid_actions: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -1271,7 +1271,7 @@ def _get_logprobs_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: List,
         mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -1279,6 +1279,9 @@ def _get_logprobs_forward(
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]
+        states_from_tensor = tfloat(
+            states_from, float_type=self.float, device=self.device
+        )
         is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
         logprobs_eos = torch.zeros(n_states, dtype=self.float, device=self.device)
         logprobs_increments_rel = torch.zeros(
@@ -1326,11 +1329,8 @@ def _get_logprobs_forward(
             )
             min_increments[is_source[do_increments]] = 0.0
             # Compute diagonal of the Jacobian (see _get_jacobian_diag())
-            states_from_do_increments = tfloat(
-                states_from, float_type=self.float, device=self.device
-            )[do_increments]
             jacobian_diag[do_increments] = self._get_jacobian_diag(
-                states_from_do_increments,
+                states_from_tensor[do_increments],
                 min_increments,
                 self.max_val,
                 is_backward=False,
@@ -1347,7 +1347,7 @@ def _get_logprobs_backward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: List,
         mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
@@ -1355,6 +1355,9 @@ def _get_logprobs_backward(
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]
+        states_from_tensor = tfloat(
+            states_from, float_type=self.float, device=self.device
+        )
         is_bts = torch.zeros(n_states, dtype=torch.bool, device=self.device)
         logprobs_bts = torch.zeros(n_states, dtype=self.float, device=self.device)
         logprobs_increments_rel = torch.zeros(
@@ -1374,7 +1377,7 @@ def _get_logprobs_backward(
             # BTS actions are equal to the originating states
             is_bts_sampled = torch.zeros_like(do_bts)
             is_bts_sampled[do_bts] = torch.all(
-                actions[do_bts] == states_from[do_bts], dim=1
+                actions[do_bts] == states_from_tensor[do_bts], dim=1
             )
             is_bts[is_bts_sampled] = True
             logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
@@ -1399,11 +1402,8 @@ def _get_logprobs_backward(
                 increments_rel, self.min_incr, dtype=self.float, device=self.device
             )
             # Compute diagonal of the Jacobian (see _get_jacobian_diag())
-            states_from_do_increments = tfloat(
-                states_from, float_type=self.float, device=self.device
-            )[do_increments]
             jacobian_diag[do_increments] = self._get_jacobian_diag(
-                states_from_do_increments,
+                states_from_tensor[do_increments],
                 min_increments,
                 self.max_val,
                 is_backward=False,
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 6a85a11c0..ebcbe6091 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -400,7 +400,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index acb21aa81..29b3c3edb 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -754,13 +754,14 @@ def get_logprobs_continuous(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
         """
         n_states = policy_outputs.shape[0]
+        # TODO: make nicer
         if states_from is None:
             states_from = torch.empty(
                 (n_states, self.policy_input_dim), device=self.device
@@ -806,7 +807,7 @@ def get_logprobs(
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: TensorType["n_states", "policy_input_dim"],
+        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["n_states", "1"] = None,
     ) -> TensorType["batch_size"]:
         """
diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 46072a501..234868718 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -541,23 +541,25 @@ def compute_logprobs_trajectories(self, batch: Batch, backward: bool = False):
         assert batch.is_valid()
         # Make indices of batch consecutive since they are used for indexing here
         # Get necessary tensors from batch
-        states = batch.get_states(policy=True)
+        states_policy = batch.get_states(policy=True)
+        states = batch.get_states(policy=False)
         actions = batch.get_actions()
-        parents = batch.get_parents(policy=True)
+        parents_policy = batch.get_parents(policy=True)
+        parents = batch.get_parents(policy=False)
         traj_indices = batch.get_trajectory_indices(consecutive=True)
         if backward:
             # Backward trajectories
             masks_b = batch.get_masks_backward()
-            policy_output_b = self.backward_policy(states)
+            policy_output_b = self.backward_policy(states_policy)
             logprobs_states = self.env.get_logprobs(
-                policy_output_b, False, actions, parents, masks_b
+                policy_output_b, False, actions, states, masks_b
             )
         else:
             # Forward trajectories
             masks_f = batch.get_masks_forward(of_parents=True)
-            policy_output_f = self.forward_policy(parents)
+            policy_output_f = self.forward_policy(parents_policy)
             logprobs_states = self.env.get_logprobs(
-                policy_output_f, True, actions, states, masks_f
+                policy_output_f, True, actions, parents, masks_f
             )
         # Sum log probabilities of all transitions in each trajectory
         logprobs = torch.zeros(

From c898932c0f517c61e783b841af94c6f64f1a9fa8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 09:34:13 -0400
Subject: [PATCH 116/206] Fix: get_logprobs needs to compute absolute
 increments before computing logprobs.

---
 gflownet/envs/cube.py | 83 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 13 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 28994f97c..147f1380b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -987,7 +987,13 @@ def relative_to_absolute_increments(
         Given a dimension value x, a relative increment r, a minimum increment m and a
         maximum value 1, the absolute increment a is given by:
 
+        Forward:
+
         a = m + r * (1 - x - m)
+
+        Backward:
+
+        a = m + r * (x - m)
         """
         max_val = torch.full_like(states, max_val)
         if is_backward:
@@ -998,6 +1004,40 @@ def relative_to_absolute_increments(
             )
         return increments_abs
 
+    @staticmethod
+    def absolute_to_relative_increments(
+        states: TensorType["n_states", "n_dim"],
+        increments_abs: TensorType["n_states", "n_dim"],
+        min_increments: TensorType["n_states", "n_dim"],
+        max_val: float,
+        is_backward: bool,
+    ):
+        """
+        Returns a batch of relative increments (as sampled by the Beta distributions)
+        given a batch of states, absolute increments (actions) and minimum_increments.
+
+        Given a dimension value x, an absolute increment a, a minimum increment m and a
+        maximum value 1, the relative increment r is given by:
+
+        Forward:
+
+        r = (a - m) / (1 - x - m)
+
+        Backward:
+
+        r = (a - m) / (x - m)
+        """
+        max_val = torch.full_like(states, max_val)
+        if is_backward:
+            increments_rel = (increments_abs - min_increments) / (
+                states - min_increments
+            )
+        else:
+            increments_rel = (increments_abs - min_increments) / (
+                max_val - states - min_increments
+            )
+        return increments_rel
+
     def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -1314,8 +1354,22 @@ def _get_logprobs_forward(
         # action
         do_increments = ~is_eos
         if torch.any(do_increments):
-            # Shape of increments_rel: [n_do_increments, n_dim]
-            increments_rel = actions[do_increments]
+            # Get absolute increments
+            increments_abs = actions[do_increments]
+            # Get minimum increments
+            min_increments = torch.full_like(
+                increments_abs, self.min_incr, dtype=self.float, device=self.device
+            )
+            min_increments[is_source[do_increments]] = 0.0
+            # Get relative increments
+            increments_rel = self.absolute_to_relative_increments(
+                states_from_tensor[do_increments],
+                increments_abs,
+                min_increments,
+                self.max_val,
+                is_backward=False,
+            )
+            # Get logprobs
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
@@ -1323,11 +1377,6 @@ def _get_logprobs_forward(
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
                 torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
             )
-            # Get minimum increments
-            min_increments = torch.full_like(
-                increments_rel, self.min_incr, dtype=self.float, device=self.device
-            )
-            min_increments[is_source[do_increments]] = 0.0
             # Compute diagonal of the Jacobian (see _get_jacobian_diag())
             jacobian_diag[do_increments] = self._get_jacobian_diag(
                 states_from_tensor[do_increments],
@@ -1388,8 +1437,20 @@ def _get_logprobs_backward(
         # Get log probs of relative increments if actions were neither BTS nor EOS
         do_increments = torch.logical_and(~is_bts, ~is_eos)
         if torch.any(do_increments):
-            # Shape of increments_rel: [n_do_increments, n_dim]
-            increments_rel = actions[do_increments]
+            # Get absolute increments
+            increments_abs = actions[do_increments]
+            min_increments = torch.full_like(
+                increments_abs, self.min_incr, dtype=self.float, device=self.device
+            )
+            # Get relative increments
+            increments_rel = self.absolute_to_relative_increments(
+                states_from_tensor[do_increments],
+                increments_abs,
+                min_increments,
+                self.max_val,
+                is_backward=True,
+            )
+            # Get logprobs
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
@@ -1397,10 +1458,6 @@ def _get_logprobs_backward(
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
                 torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
             )
-            # Set minimum increments
-            min_increments = torch.full_like(
-                increments_rel, self.min_incr, dtype=self.float, device=self.device
-            )
             # Compute diagonal of the Jacobian (see _get_jacobian_diag())
             jacobian_diag[do_increments] = self._get_jacobian_diag(
                 states_from_tensor[do_increments],

From 5327bea3da66194082d16e2e65b85a17235c25d8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 09:41:31 -0400
Subject: [PATCH 117/206] Fix bug in backwards logprobs

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 147f1380b..636c5ba39 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1463,7 +1463,7 @@ def _get_logprobs_backward(
                 states_from_tensor[do_increments],
                 min_increments,
                 self.max_val,
-                is_backward=False,
+                is_backward=True,
             )
         # Get log determinant of the Jacobian
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)

From 908e9434de58f166cfc2288c0cf51877e55e715f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 09:41:58 -0400
Subject: [PATCH 118/206] Add logprobs backward test

---
 tests/gflownet/envs/test_ccube.py | 44 +++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index c267fdcc3..e8d9127fe 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -888,6 +888,50 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+        ),
+        (
+            [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+        ),
+        (
+            [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
+            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Get BTS forced
+    is_near_edge = states_torch < env.min_incr
+    is_bts_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for BTS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_bts = 1
+    distr_bts = Bernoulli(logits=logit_bts)
+    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_source_logit"] = logit_bts
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
+    assert torch.all(logprobs[is_bts_forced] == 0.0)
+    assert torch.all(torch.isfinite(logprobs))
+
+
 @pytest.mark.parametrize(
     "state, expected",
     [

From a7fbc2f252fb00a1d92db4d8328fc8adc9e9f762 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 11:58:07 -0400
Subject: [PATCH 119/206] Add TODO

---
 gflownet/envs/cube.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 636c5ba39..9a18c6893 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1060,6 +1060,7 @@ def sample_actions_batch(
                 policy_outputs, mask, states_from, sampling_method, temperature_logits
             )
 
+    # TODO: consider using relu and clamp instead sigmoid
     def _make_increments_distribution(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],

From a15182acd6754cc4640ec996acc2a6dd712b5415 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 14:36:11 -0400
Subject: [PATCH 120/206] Return None in plots if n_dim != 2

---
 gflownet/envs/cube.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9a18c6893..396fe3e53 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1713,6 +1713,8 @@ def plot_reward_samples(
         max_samples=500,
         **kwargs,
     ):
+        if self.n_dim != 2:
+            return None
         # Sample a grid of points in the state space and obtain the rewards
         x = np.linspace(cell_min, cell_max, 201)
         y = np.linspace(cell_min, cell_max, 201)
@@ -1751,6 +1753,8 @@ def plot_kde(
         colorbar=True,
         **kwargs,
     ):
+        if self.n_dim != 2:
+            return None
         # Sample a grid of points in the state space and score them with the KDE
         x = np.linspace(cell_min, cell_max, 201)
         y = np.linspace(cell_min, cell_max, 201)

From 36fb8b33efd29d42b6589d34a717dbf761cf1484 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 17:48:11 -0400
Subject: [PATCH 121/206] Revert "Change conversions to proxy and policy"

This reverts commit c41ccda3ac65bcfbd7eec87e779b3ede7cf20b05.
---
 gflownet/envs/cube.py | 53 ++++++++-----------------------------------
 1 file changed, 10 insertions(+), 43 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 396fe3e53..427509a32 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -82,7 +82,11 @@ def __init__(
         self.action_source = (self.n_dim, 0)
         # End-of-sequence action: (n_dim + 1, 0)
         self.eos = (self.n_dim + 1, 0)
-        # Conversions
+        # Conversions: only conversions to policy are implemented and the rest are the
+        # same
+        self.state2proxy = self.state2policy
+        self.statebatch2proxy = self.statebatch2policy
+        self.statetorch2proxy = self.statetorch2policy
         self.state2oracle = self.state2proxy
         self.statebatch2oracle = self.statebatch2proxy
         self.statetorch2oracle = self.statetorch2proxy
@@ -114,7 +118,7 @@ def get_mask_invalid_actions_forward(
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         pass
 
-    def statetorch2proxy(
+    def statetorch2policy(
         self, states: TensorType["batch", "state_dim"] = None
     ) -> TensorType["batch", "policy_input_dim"]:
         """
@@ -127,48 +131,11 @@ def statetorch2proxy(
         """
         return 2.0 * torch.clip(states, min=0.0, max=self.max_val) - 1.0
 
-    def statebatch2proxy(
-        self, states: List[List]
-    ) -> TensorType["batch", "state_proxy_dim"]:
-        """
-        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
-
-        Args
-        ----
-        state : list
-            State
-        """
-        return self.statetorch2proxy(
-            tfloat(states, device=self.device, float_type=self.float)
-        )
-
-    def state2proxy(self, state: List = None) -> List:
-        """
-        Clips the state into [0, max_val] and maps it to [-1.0, 1.0]
-        """
-        if state is None:
-            state = self.state.copy()
-        return [2.0 * min(max(0.0, s), self.max_val) - 1.0 for s in state]
-
-    # TODO: Check issue with get_logprobs using states_from in policy format.
-    def statetorch2policy(
-        self, states: TensorType["batch", "state_dim"] = None
-    ) -> TensorType["batch", "policy_input_dim"]:
-        """
-        Clips the states into [0, max_val]
-
-        Args
-        ----
-        state : list
-            State
-        """
-        return torch.clip(states, min=0.0, max=self.max_val)
-
     def statebatch2policy(
         self, states: List[List]
     ) -> TensorType["batch", "state_proxy_dim"]:
         """
-        Clips the states into [0, max_val]
+        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
 
         Args
         ----
@@ -176,16 +143,16 @@ def statebatch2policy(
             State
         """
         return self.statetorch2policy(
-            tfloat(states, device=self.device, float_type=self.float)
+            torch.tensor(states, device=self.device, dtype=self.float)
         )
 
     def state2policy(self, state: List = None) -> List:
         """
-        Clips the state into [0, max_val]
+        Clips the state into [0, max_val] and maps it to [-1.0, 1.0]
         """
         if state is None:
             state = self.state.copy()
-        return [min(max(0.0, s), self.max_val) for s in state]
+        return [2.0 * min(max(0.0, s), self.max_val) - 1.0 for s in state]
 
     def state2readable(self, state: List) -> str:
         """

From 5a6a0783f86590471edb6397306f90ae40914f4a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 17:51:17 -0400
Subject: [PATCH 122/206] tfloat instead of torch.tensor

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 427509a32..bee65e5a9 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -143,7 +143,7 @@ def statebatch2policy(
             State
         """
         return self.statetorch2policy(
-            torch.tensor(states, device=self.device, dtype=self.float)
+            tfloat(states, device=self.device, float_type=self.float)
         )
 
     def state2policy(self, state: List = None) -> List:

From 2ba2bd93d14671b43d47f38e03ab37358a098a76 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 17:53:43 -0400
Subject: [PATCH 123/206] Remove HybridCube code and config file since it is
 not up to date.

---
 config/env/hcube.yaml |  19 ---
 gflownet/envs/cube.py | 381 ------------------------------------------
 2 files changed, 400 deletions(-)
 delete mode 100644 config/env/hcube.yaml

diff --git a/config/env/hcube.yaml b/config/env/hcube.yaml
deleted file mode 100644
index 01471535f..000000000
--- a/config/env/hcube.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-defaults:
-  - base
-
-_target_: gflownet.envs.cube.HybridCube
-
-id: hcube
-func: corners
-# Dimensions of hypercube
-n_dim: 2
-# Maximum length of trajecotry
-max_traj_length: 10
-# Buffer
-buffer:
-  data_path: null
-  train: null
-  test:
-    type: grid
-    n: 1000
-    output_csv: cube_test.csv
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index bee65e5a9..806899b4a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -250,387 +250,6 @@ def step(
         pass
 
 
-class HybridCube(Cube):
-    """
-    Continuous (hybrid: discrete and continuous) hyper-cube environment (continuous
-    version of a hyper-grid) in which the action space consists of the increment of
-    dimension d, modelled by a beta distribution.
-
-    The states space is the value of each dimension. If the value of a dimension gets
-    larger than max_val, then the trajectory is ended.
-
-    Attributes
-    ----------
-    n_dim : int
-        Dimensionality of the hyper-cube.
-
-    max_val : float
-        Max length of the hyper-cube.
-
-    min_incr : float
-        Minimum increment in the actions, expressed as the fraction of max_val. This is
-        necessary to ensure coverage of the state space.
-    """
-
-    def __init__(
-        self,
-        n_dim: int = 2,
-        max_val: float = 1.0,
-        min_incr: float = 0.1,
-        n_comp: int = 1,
-        do_nonzero_source_prob: bool = True,
-        fixed_distr_params: dict = {
-            "beta_alpha": 2.0,
-            "beta_beta": 5.0,
-        },
-        random_distr_params: dict = {
-            "beta_alpha": 1.0,
-            "beta_beta": 1.0,
-        },
-        **kwargs,
-    ):
-        assert n_dim > 0
-        assert max_val > 0.0
-        assert n_comp > 0
-        # Main properties
-        self.continuous = True
-        self.n_dim = n_dim
-        self.eos = self.n_dim
-        self.max_val = max_val
-        self.min_incr = min_incr * self.max_val
-        # Parameters of fixed policy distribution
-        self.n_comp = n_comp
-        if do_nonzero_source_prob:
-            self.n_params_per_dim = 4
-        else:
-            self.n_params_per_dim = 3
-        # Source state: position 0 at all dimensions
-        self.source = [0.0 for _ in range(self.n_dim)]
-        # Action from source: (n_dim, 0)
-        self.action_source = (self.n_dim, 0)
-        # End-of-sequence action: (n_dim + 1, 0)
-        self.eos = (self.n_dim + 1, 0)
-        # Conversions: only conversions to policy are implemented and the rest are the
-        # same
-        self.state2proxy = self.state2policy
-        self.statebatch2proxy = self.statebatch2policy
-        self.statetorch2proxy = self.statetorch2policy
-        self.state2oracle = self.state2proxy
-        self.statebatch2oracle = self.statebatch2proxy
-        self.statetorch2oracle = self.statetorch2proxy
-        # Base class init
-        super().__init__(
-            fixed_distr_params=fixed_distr_params,
-            random_distr_params=random_distr_params,
-            **kwargs,
-        )
-
-    def get_action_space(self):
-        """
-        Since this is a hybrid (continuous/discrete) environment, this method
-        constructs a list with the discrete actions.
-
-        The actions are tuples with two values: (dimension, increment) where dimension
-        indicates the index of the dimension on which the action is to be performed and
-        increment indicates the increment of the dimension.
-
-        Additionally, there are two special discrete actions:
-            - Sample an increment for all dimensions. Only valid from the source state.
-            - EOS action
-
-        The (discrete) action space is then one tuple per dimension (with 0 increment),
-        plus the EOS action.
-        """
-        actions = [(d, 0) for d in range(self.n_dim)]
-        actions.append(self.action_source)
-        actions.append(self.eos)
-        return actions
-
-    def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
-        """
-        Defines the structure of the output of the policy model, from which an
-        action is to be determined or sampled, by returning a vector with a fixed
-        random policy.
-
-        For each dimension d of the hyper-cube and component c of the mixture, the
-        output of the policy should return
-          1) the weight of the component in the mixture
-          2) the logit(alpha) parameter of the Beta distribution to sample the increment
-          3) the logit(beta) parameter of the Beta distribution to sample the increment
-
-        Additionally, the policy output contains one logit per dimension plus one logit
-        for the EOS action, for the categorical distribution over dimensions.
-
-        Therefore, the output of the policy model has dimensionality D x C x 3 + D + 1,
-        where D is the number of dimensions (self.n_dim) and C is the number of
-        components (self.n_comp). The first D + 1 entries in the policy output
-        correspond to the categorical logits. Then, the next 3 x C entries in the
-        policy output correspond to the first dimension, and so on.
-        """
-        policy_output = torch.ones(
-            self.n_dim * self.n_comp * 3 + self.n_dim + 1,
-            device=self.device,
-            dtype=self.float,
-        )
-        policy_output[self.n_dim + 2 :: 3] = params["beta_alpha"]
-        policy_output[self.n_dim + 3 :: 3] = params["beta_beta"]
-        return policy_output
-
-    def get_mask_invalid_actions_forward(
-        self,
-        state: Optional[List] = None,
-        done: Optional[bool] = None,
-    ) -> List:
-        """
-        Returns a vector with the length of the discrete part of the action space + 1:
-        True if action is invalid going forward given the current state, False
-        otherwise.
-
-        All discrete actions are valid, including eos, except if the value of any
-        dimension has excedded max_val, in which case the only valid action is eos.
-        """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
-        if done:
-            return [True for _ in range(self.action_space_dim)]
-        # If state is source, then next action can only be the action from source.
-        if all([s == ss for s in zip(self.state, self.source)]):
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-2] = False
-        # If the value of any dimension is greater than max_val, then next action can
-        # only be EOS.
-        elif any([s > self.max_val for s in self.state]):
-            mask = [True for _ in range(self.action_space_dim)]
-            mask[-1] = False
-        else:
-            mask = [False for _ in range(self.action_space_dim)]
-        return mask
-
-    def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
-        """
-        Returns a vector with the length of the discrete part of the action space + 1:
-        True if action is invalid going backward given the current state, False
-        otherwise.
-
-        The backward mask has the following structure:
-
-        - 0:n_dim : whether keeping a dimension as is, that is sampling a decrement of
-          0, can have zero probability. True if the value at the dimension is smaller
-          than or equal to 1 - min_incr.
-        - n_dim : whether going to source is invalid. Always valid, hence always False,
-          except if done.
-        - n_dim + 1 : whether sampling EOS is invalid. Only valid if done.
-        """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
-        mask_dim = self.n_dim + 2
-        # If done, only valid action is EOS.
-        if done:
-            mask = [True for _ in range(mask_dim)]
-            mask[-1] = False
-        mask = [True for _ in range(mask_dim)]
-        mask[-2] = False
-        # Dimensions whose value is greater than 1 - min_incr must have non-zero
-        # probability of sampling a decrement of exactly zero.
-        for dim, s in enumerate(state):
-            if s > 1 - self.min_incr:
-                mask[dim] = False
-        return mask
-
-    def get_parents(
-        self, state: List = None, done: bool = None, action: Tuple[int, float] = None
-    ) -> Tuple[List[List], List[Tuple[int, float]]]:
-        """
-        Determines all parents and actions that lead to state.
-
-        Args
-        ----
-        state : list
-            Representation of a state
-
-        done : bool
-            Whether the trajectory is done. If None, done is taken from instance.
-
-        action : int
-            Last action performed
-
-        Returns
-        -------
-        parents : list
-            List of parents in state format
-
-        actions : list
-            List of actions that lead to state for each parent in parents
-        """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
-        if done:
-            return [state], [self.eos]
-        # If source state
-        elif state[-1] == 0:
-            return [], []
-        else:
-            dim, incr = action
-            state[dim] -= incr
-            parents = [state]
-            return parents, [action]
-
-    def sample_actions_batch(
-        self,
-        policy_outputs: TensorType["n_states", "policy_output_dim"],
-        sampling_method: str = "policy",
-        mask_invalid_actions: TensorType["n_states", "1"] = None,
-        temperature_logits: float = 1.0,
-        loginf: float = 1000,
-    ) -> Tuple[List[Tuple], TensorType["n_states"]]:
-        """
-        Samples a batch of actions from a batch of policy outputs.
-        """
-        device = policy_outputs.device
-        n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        # Sample dimensions
-        if sampling_method == "uniform":
-            logits_dims = torch.ones(n_states, self.policy_output_dim).to(device)
-        elif sampling_method == "policy":
-            logits_dims = policy_outputs[:, 0 : self.n_dim + 1]
-            logits_dims /= temperature_logits
-        if mask_invalid_actions is not None:
-            logits_dims[mask_invalid_actions] = -loginf
-        dimensions = Categorical(logits=logits_dims).sample()
-        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
-        # Sample increments
-        ns_range_noeos = ns_range[dimensions != self.eos[0]]
-        dimensions_noeos = dimensions[dimensions != self.eos[0]]
-        increments = torch.zeros(n_states).to(device)
-        logprobs_increments = torch.zeros(n_states).to(device)
-        if len(dimensions_noeos) > 0:
-            if sampling_method == "uniform":
-                distr_increments = Uniform(
-                    torch.zeros(len(ns_range_noeos)),
-                    self.max_val * torch.ones(len(ns_range_noeos)),
-                )
-            elif sampling_method == "policy":
-                alphas = policy_outputs[:, self.n_dim + 2 :: 3][
-                    ns_range_noeos, dimensions_noeos
-                ]
-                betas = policy_outputs[:, self.n_dim + 3 :: 3][
-                    ns_range_noeos, dimensions_noeos
-                ]
-                distr_increments = Beta(torch.exp(alphas), torch.exp(betas))
-            increments[ns_range_noeos] = distr_increments.sample()
-            logprobs_increments[ns_range_noeos] = distr_increments.log_prob(
-                increments[ns_range_noeos]
-            )
-            # Apply minimum increment
-            increments[ns_range_noeos] = torch.min(
-                increments[ns_range_noeos],
-                self.min_incr * torch.ones(ns_range_noeos.shape[0]),
-            )
-        # Combined probabilities
-        logprobs = logprobs_dim + logprobs_increments
-        # Build actions
-        actions = [
-            (dimension, incr)
-            for dimension, incr in zip(dimensions.tolist(), increments.tolist())
-        ]
-        return actions, logprobs
-
-    def get_logprobs(
-        self,
-        policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
-        actions: TensorType["n_states", 2],
-        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
-        loginf: float = 1000,
-    ) -> TensorType["batch_size"]:
-        """
-        Computes log probabilities of actions given policy outputs and actions.
-        """
-        device = policy_outputs.device
-        dimensions, steps = zip(*actions)
-        dimensions = torch.LongTensor([d.long() for d in dimensions]).to(device)
-        steps = torch.FloatTensor(steps).to(device)
-        n_states = policy_outputs.shape[0]
-        ns_range = torch.arange(n_states).to(device)
-        # Dimensions
-        logits_dims = policy_outputs[:, 0::3]
-        if mask_invalid_actions is not None:
-            logits_dims[mask_invalid_actions] = -loginf
-        logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
-        # Steps
-        ns_range_noeos = ns_range[dimensions != self.eos]
-        dimensions_noeos = dimensions[dimensions != self.eos]
-        logprobs_steps = torch.zeros(n_states).to(device)
-        if len(dimensions_noeos) > 0:
-            alphas = policy_outputs[:, 1::3][ns_range_noeos, dimensions_noeos]
-            betas = policy_outputs[:, 2::3][ns_range_noeos, dimensions_noeos]
-            distr_steps = Beta(torch.exp(alphas), torch.exp(betas))
-            logprobs_steps[ns_range_noeos] = distr_steps.log_prob(steps[ns_range_noeos])
-        # Combined probabilities
-        logprobs = logprobs_dim + logprobs_steps
-        return logprobs
-
-    def step(
-        self, action: Tuple[int, float]
-    ) -> Tuple[List[float], Tuple[int, float], bool]:
-        """
-        Executes step given an action.
-
-        Args
-        ----
-        action : tuple
-            Action to be executed. An action is a tuple with two values:
-            (dimension, increment).
-
-        Returns
-        -------
-        self.state : list
-            The sequence after executing the action
-
-        action : int
-            Action executed
-
-        valid : bool
-            False, if the action is not allowed for the current state, e.g. stop at the
-            root state
-        """
-        if self.done:
-            return self.state, action, False
-        # If action is eos or any dimension is beyond max_val or n_actions has reached
-        # max_traj_length, then force eos
-        elif (
-            action[0] == self.eos
-            or any([s > self.max_val for s in self.state])
-            or self.n_actions >= self.max_traj_length
-        ):
-            self.done = True
-            self.n_actions += 1
-            return self.state, (self.eos, 0.0), True
-        # If action is not eos, then perform action
-        elif action[0] != self.eos:
-            self.n_actions += 1
-            self.state[action[0]] += action[1]
-            return self.state, action, True
-        # Otherwise (unreachable?) it is invalid
-        else:
-            return self.state, action, False
-
-    def get_grid_terminating_states(self, n_states: int) -> List[List]:
-        n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
-        linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
-        states = list(itertools.product(*linspaces))
-        # TODO: check if necessary
-        states = [list(el) for el in states]
-        return states
-
-
 class ContinuousCube(Cube):
     """
     Continuous hyper-cube environment (continuous version of a hyper-grid) in which the

From 64e42bf6c590bcfc761f5dfa953dfff229f1ab2e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 17:57:10 -0400
Subject: [PATCH 124/206] Remove TODO

---
 gflownet/envs/cube.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 806899b4a..8c99a99d8 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -326,7 +326,6 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         action and another logit (pos -2) for the (discrete) backward probability of
         returning to the source node.
 
-        * TODO: review count
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).

From 81d3ec0c14d6ae917a59db81483a7ce3befc3e34 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 18:26:15 -0400
Subject: [PATCH 125/206] Remove get_parents code, re-organise code and edit
 docstrings

---
 gflownet/envs/cube.py | 132 ++++++++++++------------------------------
 1 file changed, 38 insertions(+), 94 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 8c99a99d8..4aafb80e5 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -462,7 +462,6 @@ def get_mask_invalid_actions_forward(
             mask[0] = True
         return mask
 
-    # TODO: can we simplify to 2 values?
     def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=None):
         """
         The action space is continuous, thus the mask is not only of invalid actions as
@@ -501,61 +500,14 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         mask[0] = False
         return mask
 
-    # TODO: remove all together?
     def get_parents(
         self, state: List = None, done: bool = None, action: Tuple[int, float] = None
     ) -> Tuple[List[List], List[Tuple[int, float]]]:
         """
-        Determines all parents and actions that lead to state.
-
-        Args
-        ----
-        state : list
-            Representation of a state
-
-        done : bool
-            Whether the trajectory is done. If None, done is taken from instance.
-
-        action : int
-            Last action performed
-
-        Returns
-        -------
-        parents : list
-            List of parents in state format
-
-        actions : list
-            List of actions that lead to state for each parent in parents
+        Defined only because it is required. A ContinuousEnv should be created to avoid
+        this issue.
         """
-        if state is None:
-            state = self.state.copy()
-        if done is None:
-            done = self.done
-        if done:
-            return [state], [self.eos]
-        # If source state
-        if all([s == ss for s, ss in zip(state, self.source)]):
-            return [], []
-        else:
-            min_incr = action[-1]
-            for dim, incr_rel_f in enumerate(action[:-1]):
-                state[dim] = (state[dim] - min_incr - incr_rel_f * (1.0 - min_incr)) / (
-                    1.0 - incr_rel_f
-                )
-            epsilon = 1e-9
-            assert all(
-                [s <= (self.max_val + epsilon) for s in state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
-            """
-            assert all(
-                [s >= (0.0 - epsilon) for s in state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{state}\nAction:\n{action}\nIncrement: {incr}
-            """
-            return [state], [action]
+        pass
 
     @staticmethod
     def relative_to_absolute_increments(
@@ -623,6 +575,26 @@ def absolute_to_relative_increments(
             )
         return increments_rel
 
+    # TODO: consider using relu and clamp instead sigmoid
+    def _make_increments_distribution(
+        self,
+        policy_outputs: TensorType["n_states", "policy_output_dim"],
+    ) -> MixtureSameFamily:
+        mix_logits = self._get_policy_betas_weights(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        mix = Categorical(logits=mix_logits)
+        alphas = self._get_policy_betas_alpha(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
+        betas = self._get_policy_betas_beta(policy_outputs).reshape(
+            -1, self.n_dim, self.n_comp
+        )
+        betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
+        beta_distr = Beta(alphas, betas)
+        return MixtureSameFamily(mix, beta_distr)
+
     def sample_actions_batch(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -645,26 +617,6 @@ def sample_actions_batch(
                 policy_outputs, mask, states_from, sampling_method, temperature_logits
             )
 
-    # TODO: consider using relu and clamp instead sigmoid
-    def _make_increments_distribution(
-        self,
-        policy_outputs: TensorType["n_states", "policy_output_dim"],
-    ) -> MixtureSameFamily:
-        mix_logits = self._get_policy_betas_weights(policy_outputs).reshape(
-            -1, self.n_dim, self.n_comp
-        )
-        mix = Categorical(logits=mix_logits)
-        alphas = self._get_policy_betas_alpha(policy_outputs).reshape(
-            -1, self.n_dim, self.n_comp
-        )
-        alphas = self.beta_params_max * torch.sigmoid(alphas) + self.beta_params_min
-        betas = self._get_policy_betas_beta(policy_outputs).reshape(
-            -1, self.n_dim, self.n_comp
-        )
-        betas = self.beta_params_max * torch.sigmoid(betas) + self.beta_params_min
-        beta_distr = Beta(alphas, betas)
-        return MixtureSameFamily(mix, beta_distr)
-
     def _sample_actions_batch_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -781,35 +733,27 @@ def _sample_actions_batch_backward(
 
         An action indicates, for each dimension, the absolute increment of the
         dimension value. However, in order to ensure that trajectories have finite
-        length, increments must have a minumum increment (self.min_incr) except if the
-        originating state is the source state (special case, see
-        get_mask_invalid_actions_backward()). Furthermore, absolute increments must also
-        be smaller than the distance from the dimension value to the edge of the cube
-        (self.max_val). In order to accomodate these constraints, first relative
-        increments (in [0, 1]) are sampled from a (mixture of) Beta distribution(s),
-        where 0.0 indicates an absolute increment of min_incr and 1.0 indicates an
-        absolute increment of 1 - x + min_incr (going to the edge).
+        length, increments must have a minumum increment (self.min_incr). Furthermore,
+        absolute increments must also be smaller than the distance from the dimension
+        value to the edge of the cube. In order to accomodate these constraints, first
+        relative increments (in [0, 1]) are sampled from a (mixture of) Beta
+        distribution(s), where 0.0 indicates an absolute increment of min_incr and 1.0
+        indicates an absolute increment of x (going back to the source).
 
         Therefore, given a dimension value x, a relative increment r, a minimum
         increment m and a maximum value 1, the absolute increment a is given by:
 
-        a = m + r * (1 - x - m)
+        a = m + r * (x - m)
 
         The continuous distribution to sample the continuous action described above
-        must be mixed with the discrete distribution to model the sampling of the EOS
-        action. The EOS action can be sampled from any state except from the source
-        state or whether the trajectory is done. That the EOS action is invalid is
-        indicated by mask[-1] being False.
-
-        Finally, regarding the constraints on the increments, the following special
-        cases are taken into account:
-
-        - The originating state is the source state: in this case, the minimum
-          increment is 0.0 instead of self.min_incr. This is to ensure that the entire
-          state space can be reached. This is indicated by mask[-2] being False.
-        - The value at any dimension is at a distance from the cube edge smaller than the
-          minimum increment (x > 1 - m). In this case, only EOS is valid.
-          This is indicated by mask[0] being True (continuous actions are invalid).
+        must be mixed with the discrete distribution to model the sampling of the back
+        to source (BST) action. While the BST action is also a continuous action, it
+        needs to be modelled with a (discrete) Bernoulli distribution in order to
+        ensure that this action has positive likelihood.
+
+        Finally, regarding the constraints on the increments, the special case where
+        the trajectory is done and the only possible action is EOS, is also taken into
+        account.
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]

From c174437a7783298cbdd587b41c4ba834d068d2e2 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 19:06:19 -0400
Subject: [PATCH 126/206] Change default parameters

---
 config/env/ccube.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 7135b071d..f672e0b42 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -10,23 +10,23 @@ func: corners
 n_dim: 2
 max_val: 1.0
 # Policy
-beta_params_min: 0.1
+beta_params_min: 0.01
 beta_params_max: 1000.0
 min_incr: 0.1
 n_comp: 1
 fixed_distribution:
   beta_weights: 1.0
-  beta_alpha: 2.0
-  beta_beta: 5.0
-  bernoulli_source_logit: 1.0
-  bernoulli_eos_logit: 1.0
+  beta_alpha: 0.01
+  beta_beta: 0.01
+  bernoulli_source_logit: 0.0
+  bernoulli_eos_logit: 0.0
 random_distribution:
   beta_weights: 1.0
   # IMPORTANT: adjust because of sigmoid!
-  beta_alpha: $beta_params_max
+  beta_alpha: 0.01
   beta_beta: $beta_params_max
-  bernoulli_source_logit: 1.0
-  bernoulli_eos_logit: 1.0
+  bernoulli_source_logit: 0.0
+  bernoulli_eos_logit: 0.0
 # Buffer
 buffer:
   data_path: null

From ba2d7b07d89c07a0d616dd56687f754a36e7c6a6 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 19:59:18 -0400
Subject: [PATCH 127/206] Refactor get_logprobs

---
 gflownet/envs/base.py             | 34 ++++++++++++++++----
 gflownet/envs/ctorus.py           | 22 +++++++++++--
 gflownet/envs/cube.py             | 52 ++++++++++++++++++-------------
 gflownet/envs/htorus.py           |  6 ++--
 gflownet/envs/tree.py             | 16 +++++-----
 gflownet/gflownet.py              |  4 +--
 tests/gflownet/envs/common.py     | 16 +++++-----
 tests/gflownet/envs/test_ccube.py | 28 ++++++++++++-----
 8 files changed, 122 insertions(+), 56 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index ca25b7d82..e958b1b18 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -502,25 +502,47 @@ def sample_actions_batch(
         actions = [self.action_space[idx] for idx in action_indices]
         return actions, logprobs
 
-    # TODO: Extend docstring
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", "actions_dim"],
+        mask: TensorType["batch_size", "policy_output_dim"] = None,
         states_from: Optional[List] = None,
-        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
+        is_backward: bool = False,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions. This
         implementation is generally valid for all discrete environments but continuous
         environments will likely have to implement its own.
+
+        Args
+        ----
+        policy_outputs : tensor
+            The output of the GFlowNet policy model.
+
+        mask : tensor
+            The mask of invalid actions. For continuous or mixed environments, the mask
+            may be tensor with an arbitrary length contaning information about special
+            states, as defined elsewhere in the environment.
+
+        actions : tensor
+            The actions from each state in the batch for which to compute the log
+            probability.
+
+        states_from : tensor
+            The states originating the actions, in GFlowNet format. Ignored in discrete
+            environments and only required in certain continuous environments.
+
+        is_backward : bool
+            True if the actions are backward, False if the actions are forward
+            (default). Ignored in discrete environments and only required in certain
+            continuous environments.
         """
         device = policy_outputs.device
         ns_range = torch.arange(policy_outputs.shape[0]).to(device)
         logits = policy_outputs
-        if mask_invalid_actions is not None:
-            logits[mask_invalid_actions] = -torch.inf
+        if mask is not None:
+            logits[mask] = -torch.inf
         action_indices = (
             torch.tensor(
                 [self.action_space.index(tuple(action.tolist())) for action in actions]
@@ -534,7 +556,7 @@ def get_logprobs(
     def get_jacobian_diag(
         self,
         states: TensorType["batch_size", "state_dim"],
-        is_forward: bool,
+        is_backward: bool = False,
         **kwargs,
     ):
         """
diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index b4383774d..1c83e4ede 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -281,13 +281,31 @@ def sample_actions_batch(
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
+        mask_invalid_actions: TensorType["n_states", "1"],
         states_from: Optional[List] = None,
-        mask_invalid_actions: TensorType["n_states", "1"] = None,
+        is_backward: bool = False,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
+
+        Args
+        ----
+        policy_outputs : tensor
+            The output of the GFlowNet policy model.
+
+        mask : tensor
+            The mask containing information special cases.
+
+        actions : tensor
+            The actions (angle increments) from each state in the batch for which to
+            compute the log probability.
+
+        states_from : tensor
+            Ignored.
+
+        is_backward : bool
+            Ignored.
         """
         device = policy_outputs.device
         do_sample = torch.all(~mask_invalid_actions, dim=1)
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 4aafb80e5..0c160d221 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -413,7 +413,6 @@ def _get_policy_source_logit(
         """
         return policy_output[:, -2]
 
-    # TODO: EOS must be valid from source too
     def get_mask_invalid_actions_forward(
         self,
         state: Optional[List] = None,
@@ -575,7 +574,6 @@ def absolute_to_relative_increments(
             )
         return increments_rel
 
-    # TODO: consider using relu and clamp instead sigmoid
     def _make_increments_distribution(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -716,9 +714,6 @@ def _sample_actions_batch_forward(
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
-    # TODO: Rewrite docstring
-    # TODO: Write function common to forward and backward
-    # TODO: Catch source states?
     def _sample_actions_batch_backward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -813,36 +808,53 @@ def _sample_actions_batch_backward(
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
-    # TODO: reorganise args
-    # TODO: mask_invalid_actions -> mask
-    # TODO: Add docstring
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
+        mask: TensorType["n_states", "3"],
         states_from: List,
-        mask_invalid_actions: TensorType["n_states", "3"] = None,
+        is_backward: bool,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
+
+        Args
+        ----
+        policy_outputs : tensor
+            The output of the GFlowNet policy model.
+
+        mask : tensor
+            The mask containing information invalid actions and special cases.
+
+        actions : tensor
+            The actions (absolute increments) from each state in the batch for which to
+            compute the log probability.
+
+        states_from : tensor
+            The states originating the actions, in GFlowNet format. They are required
+            so as to compute the relative increments and the Jacobian.
+
+        is_backward : bool
+            True if the actions are backward, False if the actions are forward
+            (default). Required, since the computation for forward and backward actions
+            is different.
         """
-        if is_forward:
-            return self._get_logprobs_forward(
-                policy_outputs, actions, states_from, mask_invalid_actions
+        if is_backward:
+            return self._get_logprobs_backward(
+                policy_outputs, actions, mask, states_from
             )
         else:
-            return self._get_logprobs_backward(
-                policy_outputs, actions, states_from, mask_invalid_actions
+            return self._get_logprobs_forward(
+                policy_outputs, actions, mask, states_from
             )
 
-    # TODO: Unify sample_actions and get_logprobs
     def _get_logprobs_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
+        mask: TensorType["n_states", "3"],
         states_from: List,
-        mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of forward actions.
@@ -921,13 +933,12 @@ def _get_logprobs_forward(
         logprobs = logprobs_eos + sumlogprobs_increments + log_det_jacobian
         return logprobs
 
-    # TODO: Unify sample_actions and get_logprobs
     def _get_logprobs_backward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
+        mask: TensorType["n_states", "3"],
         states_from: List,
-        mask: TensorType["n_states", "3"] = None,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of backward actions.
@@ -1000,8 +1011,7 @@ def _get_logprobs_backward(
         # Compute combined probabilities
         sumlogprobs_increments = logprobs_increments_rel.sum(axis=1)
         logprobs = logprobs_bts + sumlogprobs_increments + log_det_jacobian
-        # Logprobs of forced EOS are 0
-        # TODO: is there any avoidable computation of is_eos actions?
+        # Ensure that logprobs of forced EOS are 0
         logprobs[is_eos] = 0.0
         return logprobs
 
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index ebcbe6091..0e34c667b 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -394,14 +394,14 @@ def sample_actions_batch(
         ]
         return actions, logprobs
 
-    # TODO: requires states_to but it is deprecated anyway
+    # TODO: deprecated
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", 2],
-        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
+        states_from: Optional[List] = None,
+        is_backward: bool = False,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index 29b3c3edb..7e205346a 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -752,10 +752,10 @@ def sample_actions_batch(
     def get_logprobs_continuous(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["n_states", "1"] = None,
+        states_from: Optional[List] = None,
+        is_backward: bool = False,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
@@ -775,7 +775,7 @@ def get_logprobs_continuous(
             ]
             logprobs_discrete = super().get_logprobs(
                 policy_outputs_discrete,
-                is_forward,
+                is_backward,
                 actions[mask_discrete],
                 states_from[mask_discrete],
                 mask_invalid_actions[
@@ -805,10 +805,10 @@ def get_logprobs_continuous(
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        is_forward: bool,
         actions: TensorType["n_states", "n_dim"],
-        states_from: Optional[List] = None,
         mask_invalid_actions: TensorType["n_states", "1"] = None,
+        states_from: Optional[List] = None,
+        is_backward: bool = False,
     ) -> TensorType["batch_size"]:
         """
         Computes log probabilities of actions given policy outputs and actions.
@@ -816,15 +816,15 @@ def get_logprobs(
         if self.continuous:
             return self.get_logprobs_continuous(
                 policy_outputs=policy_outputs,
-                is_forward=is_forward,
                 actions=actions,
-                states_from=states_from,
                 mask_invalid_actions=mask_invalid_actions,
+                states_from=states_from,
+                is_backward=is_backward,
             )
         else:
             return super().get_logprobs(
                 policy_outputs=policy_outputs,
-                is_forward=is_forward,
+                is_backward=is_backward,
                 actions=actions,
                 states_from=states_from,
                 mask_invalid_actions=mask_invalid_actions,
diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 234868718..e58195147 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -552,14 +552,14 @@ def compute_logprobs_trajectories(self, batch: Batch, backward: bool = False):
             masks_b = batch.get_masks_backward()
             policy_output_b = self.backward_policy(states_policy)
             logprobs_states = self.env.get_logprobs(
-                policy_output_b, False, actions, states, masks_b
+                policy_output_b, actions, masks_b, states, backward
             )
         else:
             # Forward trajectories
             masks_f = batch.get_masks_forward(of_parents=True)
             policy_output_f = self.forward_policy(parents_policy)
             logprobs_states = self.env.get_logprobs(
-                policy_output_f, True, actions, parents, masks_f
+                policy_output_f, actions, masks_f, parents, backward
             )
         # Sum log probabilities of all transitions in each trajectory
         logprobs = torch.zeros(
diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 5be3286dd..e162b9cf3 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -187,7 +187,9 @@ def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     masks = tbool(masks, device=env.device)
-    logprobs = env.get_logprobs(policy_outputs, False, actions_eos, states, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions_eos, masks, states, is_backward=True
+    )
     assert torch.all(logprobs == 0.0)
 
 
@@ -311,10 +313,10 @@ def test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env):
         actions_torch = torch.tensor(actions)
         logprobs_glp = env.get_logprobs(
             policy_outputs=policy_outputs,
-            is_forward=True,
             actions=actions_torch,
-            states_from=None,
             mask_invalid_actions=masks_invalid_torch,
+            states_from=None,
+            is_backward=False,
         )
         action = actions[0]
         assert env.action2representative(action) in valid_actions
@@ -344,10 +346,10 @@ def test__forward_actions_have_nonzero_backward_prob(env):
         policy_outputs = policy_random.clone().detach()
         logprobs_bw = env.get_logprobs(
             policy_outputs=policy_outputs,
-            is_forward=False,
             actions=actions_torch,
-            states_from=states_torch,
             mask_invalid_actions=masks,
+            states_from=states_torch,
+            is_backward=True,
         )
         assert torch.isfinite(logprobs_bw)
         assert logprobs_bw > -1e6
@@ -377,10 +379,10 @@ def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
             policy_outputs = policy_random.clone().detach()
             logprobs_fw = env.get_logprobs(
                 policy_outputs=policy_outputs,
-                is_forward=True,
                 actions=actions_torch,
-                states_from=states_torch,
                 mask_invalid_actions=masks,
+                states_from=states_torch,
+                is_backward=False,
             )
             assert torch.isfinite(logprobs_fw)
             assert logprobs_fw > -1e6
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index e8d9127fe..267854c04 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -661,7 +661,9 @@ def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actio
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
     assert torch.all(logprobs == 0.0)
 
 
@@ -706,7 +708,9 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     params["bernoulli_eos_logit"] = logit_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
     assert torch.all(logprobs[is_eos_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
 
@@ -754,7 +758,9 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     params["bernoulli_eos_logit"] = logit_force_noeos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
     assert torch.all(logprobs == 0.0)
 
 
@@ -797,7 +803,9 @@ def test__get_logprobs_forward__2d__notnan(cube2d, states, actions):
     params["bernoulli_eos_logit"] = logit_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, True, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
     assert torch.all(logprobs[is_eos_forced] == 0.0)
     assert torch.all(torch.isfinite(logprobs))
 
@@ -834,7 +842,9 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
     # Add noise to policy outputs
     policy_outputs += torch.randn(policy_outputs.shape)
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
     assert torch.all(logprobs == 0.0)
 
 
@@ -883,7 +893,9 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     params["bernoulli_source_logit"] = logit_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
     assert torch.all(logprobs[is_bts_forced] == 0.0)
     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
 
@@ -927,7 +939,9 @@ def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
     params["bernoulli_source_logit"] = logit_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
-    logprobs = env.get_logprobs(policy_outputs, False, actions, states_torch, masks)
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
     assert torch.all(logprobs[is_bts_forced] == 0.0)
     assert torch.all(torch.isfinite(logprobs))
 

From 1acd903d53c448232eba46de91ac4aef8654832a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 20:00:44 -0400
Subject: [PATCH 128/206] Fix typo

---
 tests/gflownet/envs/common.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index e162b9cf3..09b55b413 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -314,7 +314,7 @@ def test__sample_actions__get_logprobs__return_valid_actions_and_logprobs(env):
         logprobs_glp = env.get_logprobs(
             policy_outputs=policy_outputs,
             actions=actions_torch,
-            mask_invalid_actions=masks_invalid_torch,
+            mask=masks_invalid_torch,
             states_from=None,
             is_backward=False,
         )
@@ -347,7 +347,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
         logprobs_bw = env.get_logprobs(
             policy_outputs=policy_outputs,
             actions=actions_torch,
-            mask_invalid_actions=masks,
+            mask=masks,
             states_from=states_torch,
             is_backward=True,
         )
@@ -380,7 +380,7 @@ def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
             logprobs_fw = env.get_logprobs(
                 policy_outputs=policy_outputs,
                 actions=actions_torch,
-                mask_invalid_actions=masks,
+                mask=masks,
                 states_from=states_torch,
                 is_backward=False,
             )

From f5b41cb3071f36a3b551f0e682957cfb14472699 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 20:04:38 -0400
Subject: [PATCH 129/206] Fix typos

---
 gflownet/envs/ctorus.py |  4 ++--
 gflownet/envs/htorus.py |  6 +++---
 gflownet/envs/tree.py   | 30 ++++++++++++++----------------
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/gflownet/envs/ctorus.py b/gflownet/envs/ctorus.py
index 1c83e4ede..3cf8543bd 100644
--- a/gflownet/envs/ctorus.py
+++ b/gflownet/envs/ctorus.py
@@ -282,7 +282,7 @@ def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
-        mask_invalid_actions: TensorType["n_states", "1"],
+        mask: TensorType["n_states", "1"],
         states_from: Optional[List] = None,
         is_backward: bool = False,
     ) -> TensorType["batch_size"]:
@@ -308,7 +308,7 @@ def get_logprobs(
             Ignored.
         """
         device = policy_outputs.device
-        do_sample = torch.all(~mask_invalid_actions, dim=1)
+        do_sample = torch.all(~mask, dim=1)
         n_states = policy_outputs.shape[0]
         logprobs = torch.zeros(n_states, self.n_dim).to(device)
         if torch.any(do_sample):
diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 0e34c667b..2e267bfaa 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -399,7 +399,7 @@ def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", 2],
-        mask_invalid_actions: TensorType["batch_size", "policy_output_dim"] = None,
+        mask: TensorType["batch_size", "policy_output_dim"] = None,
         states_from: Optional[List] = None,
         is_backward: bool = False,
     ) -> TensorType["batch_size"]:
@@ -414,8 +414,8 @@ def get_logprobs(
         ns_range = torch.arange(n_states).to(device)
         # Dimensions
         logits_dims = policy_outputs[:, 0 :: self.n_params_per_dim]
-        if mask_invalid_actions is not None:
-            logits_dims[mask_invalid_actions] = -torch.inf
+        if mask is not None:
+            logits_dims[mask] = -torch.inf
         logprobs_dim = self.logsoftmax(logits_dims)[ns_range, dimensions]
         # Angle increments
         # Cases where p(angle) should be computed (nofix):
diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index 7e205346a..c98bf4936 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -753,7 +753,7 @@ def get_logprobs_continuous(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
-        mask_invalid_actions: TensorType["n_states", "1"] = None,
+        mask: TensorType["n_states", "1"] = None,
         states_from: Optional[List] = None,
         is_backward: bool = False,
     ) -> TensorType["batch_size"]:
@@ -768,7 +768,7 @@ def get_logprobs_continuous(
             )
         logprobs = torch.zeros(n_states, device=self.device, dtype=self.float)
         # Discrete actions
-        mask_discrete = mask_invalid_actions[:, self._action_index_pick_threshold]
+        mask_discrete = mask[:, self._action_index_pick_threshold]
         if torch.any(mask_discrete):
             policy_outputs_discrete = policy_outputs[
                 mask_discrete, : self._index_continuous_policy_output
@@ -778,9 +778,7 @@ def get_logprobs_continuous(
                 is_backward,
                 actions[mask_discrete],
                 states_from[mask_discrete],
-                mask_invalid_actions[
-                    mask_discrete, : self._index_continuous_policy_output
-                ],
+                mask[mask_discrete, : self._index_continuous_policy_output],
             )
             logprobs[mask_discrete] = logprobs_discrete
         if torch.all(mask_discrete):
@@ -806,7 +804,7 @@ def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
         actions: TensorType["n_states", "n_dim"],
-        mask_invalid_actions: TensorType["n_states", "1"] = None,
+        mask: TensorType["n_states", "1"] = None,
         states_from: Optional[List] = None,
         is_backward: bool = False,
     ) -> TensorType["batch_size"]:
@@ -815,19 +813,19 @@ def get_logprobs(
         """
         if self.continuous:
             return self.get_logprobs_continuous(
-                policy_outputs=policy_outputs,
-                actions=actions,
-                mask_invalid_actions=mask_invalid_actions,
-                states_from=states_from,
-                is_backward=is_backward,
+                policy_outputs,
+                actions,
+                mask,
+                states_from,
+                is_backward,
             )
         else:
             return super().get_logprobs(
-                policy_outputs=policy_outputs,
-                is_backward=is_backward,
-                actions=actions,
-                states_from=states_from,
-                mask_invalid_actions=mask_invalid_actions,
+                policy_outputs,
+                actions,
+                mask,
+                states_from,
+                is_backward,
             )
 
     def state2policy_mlp(

From 6a19fdc50c3a2a9b2a197bb09c707ac1e36cd64f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 12 Sep 2023 10:49:55 -0400
Subject: [PATCH 130/206] Restore previous sample_from_reward of htorus

---
 gflownet/envs/htorus.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/htorus.py b/gflownet/envs/htorus.py
index 2e267bfaa..6a82dee1a 100644
--- a/gflownet/envs/htorus.py
+++ b/gflownet/envs/htorus.py
@@ -548,18 +548,29 @@ def sample_from_reward(
         Returns a tensor in GFloNet (state) format.
         """
         samples_final = []
-        max_reward = self.proxy2reward(self.proxy.min)
+        max_reward = self.proxy2reward(torch.tensor([self.proxy.min])).to(self.device)
         while len(samples_final) < n_samples:
-            samples_uniform = self.statebatch2proxy(
-                self.get_uniform_terminating_states(n_samples)
+            angles_uniform = (
+                torch.rand(
+                    (n_samples, self.n_dim), dtype=self.float, device=self.device
+                )
+                * 2
+                * np.pi
+            )
+            samples = torch.cat(
+                (
+                    angles_uniform,
+                    torch.ones((angles_uniform.shape[0], 1)).to(angles_uniform),
+                ),
+                axis=1,
             )
-            rewards = self.proxy2reward(self.proxy(samples_uniform))
+            rewards = self.reward_torchbatch(samples)
             mask = (
                 torch.rand(n_samples, dtype=self.float, device=self.device)
                 * (max_reward + epsilon)
                 < rewards
             )
-            samples_accepted = samples_uniform[mask]
+            samples_accepted = samples[mask, :]
             samples_final.extend(samples_accepted[-(n_samples - len(samples_final)) :])
         return torch.vstack(samples_final)
 

From 124da893ddc87e83ea65f24dbb476a79cf356280 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 13 Sep 2023 17:02:56 -0400
Subject: [PATCH 131/206] Add owl cube config file.

---
 config/experiments/ccube/ccube_owl.yaml | 71 +++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 config/experiments/ccube/ccube_owl.yaml

diff --git a/config/experiments/ccube/ccube_owl.yaml b/config/experiments/ccube/ccube_owl.yaml
new file mode 100644
index 000000000..8266ebab7
--- /dev/null
+++ b/config/experiments/ccube/ccube_owl.yaml
@@ -0,0 +1,71 @@
+# @package _global_
+
+defaults:
+   - override /env: ccube
+   - override /gflownet: trajectorybalance
+   - override /proxy: corners
+   - override /logger: wandb
+   - override /user: alex
+
+# Environment
+env:
+  n_comp: 5
+  n_dim: 2
+  beta_params_min: 0.01
+  beta_params_max: 100.0
+  min_incr: 0.1
+  fixed_distribution:
+    beta_weights: 1.0
+    beta_alpha: 0.01
+    beta_beta: 0.01
+    bernoulli_source_logit: 1.0
+    bernoulli_eos_logit: 1.0
+  random_distribution:
+    beta_weights: 1.0
+    beta_alpha: 0.01
+    beta_beta: 0.01
+    bernoulli_source_logit: 1.0
+    bernoulli_eos_logit: 1.0
+  reward_func: identity
+
+# GFlowNet hyperparameters
+gflownet:
+  random_action_prob: 0.1
+  optimizer:
+    batch_size:
+      forward: 100
+    lr: 0.0001
+    z_dim: 16
+    lr_z_mult: 100
+    n_train_steps: 10000
+  policy:
+    forward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      checkpoint: forward
+    backward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      shared_weights: False
+      checkpoint: backward
+
+# WandB
+logger:
+  lightweight: True
+  project_name: "GFlowNet Cube"
+  tags: 
+    - gflownet
+    - continuous
+    - ccube
+  test:
+    period: 500
+    n: 1000
+  checkpoints:
+    period: 500
+
+# Hydra
+hydra:
+  run:
+    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}

From 4cbb974b84293acdca2b9d82fb8938942eeae9e7 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 13 Sep 2023 23:36:26 -0400
Subject: [PATCH 132/206] Add metric: variance of (log(rewards) - logprobs) on
 test.

---
 gflownet/gflownet.py     | 10 ++++++++++
 gflownet/utils/logger.py |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index e58195147..5012b87ec 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -175,6 +175,7 @@ def __init__(
         self.kl = -1.0
         self.jsd = -1.0
         self.corr_prob_traj_rewards = 0.0
+        self.var_logrewards_logp = -1.0
         self.nll_tt = 0.0
 
     def parameters(self):
@@ -813,6 +814,7 @@ def train(self):
                     self.kl,
                     self.jsd,
                     self.corr_prob_traj_rewards,
+                    self.var_logrewards_logp,
                     self.nll_tt,
                     figs,
                     env_metrics,
@@ -822,6 +824,7 @@ def train(self):
                     self.kl,
                     self.jsd,
                     self.corr_prob_traj_rewards,
+                    self.var_logrewards_logp,
                     self.nll_tt,
                     it,
                     self.use_context,
@@ -962,6 +965,7 @@ def test(self, **plot_kwargs):
                 self.kl,
                 self.jsd,
                 self.corr_prob_traj_rewards,
+                self.var_logrewards_logp,
                 self.nll_tt,
                 (None,),
                 {},
@@ -982,6 +986,10 @@ def test(self, **plot_kwargs):
         corr_prob_traj_rewards = np.corrcoef(
             np.exp(logprobs_x_tt.cpu().numpy()), rewards_x_tt
         )[0, 1]
+        var_logrewards_logp = torch.var(
+            torch.log(tfloat(rewards_x_tt, float_type=self.float, device=self.device))
+            - logprobs_x_tt
+        ).item()
         nll_tt = -logprobs_x_tt.mean().item()
 
         batch, _ = self.sample_batch(n_forward=self.logger.test.n, train=False)
@@ -1013,6 +1021,7 @@ def test(self, **plot_kwargs):
                 self.kl,
                 self.jsd,
                 self.corr_prob_traj_rewards,
+                self.var_logrewards_logp,
                 self.nll_tt,
                 (None,),
                 env_metrics,
@@ -1084,6 +1093,7 @@ def test(self, **plot_kwargs):
             kl,
             jsd,
             corr_prob_traj_rewards,
+            var_logrewards_logp,
             nll_tt,
             [fig_reward_samples, fig_kde_pred, fig_kde_true],
             {},
diff --git a/gflownet/utils/logger.py b/gflownet/utils/logger.py
index 3e9cf3377..e9556f9c5 100644
--- a/gflownet/utils/logger.py
+++ b/gflownet/utils/logger.py
@@ -329,6 +329,7 @@ def log_test_metrics(
         kl: float,
         jsd: float,
         corr_prob_traj_rewards: float,
+        var_logrewards_logp: float,
         nll_tt: float,
         step: int,
         use_context: bool,
@@ -342,9 +343,10 @@ def log_test_metrics(
                     "KL Div.",
                     "Jensen Shannon Div.",
                     "Corr. (test probs., rewards)",
+                    "Var(logR - logp) test",
                     "NLL of test data",
                 ],
-                [l1, kl, jsd, corr_prob_traj_rewards, nll_tt],
+                [l1, kl, jsd, corr_prob_traj_rewards, var_logrewards_logp, nll_tt],
             )
         )
         self.log_metrics(

From b86d08b6edab9056cd7f7ee3cb43410cb697c7b5 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Mon, 18 Sep 2023 23:03:42 +0200
Subject: [PATCH 133/206] Fix typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Koziarski <michal.koziarski@gmail.com>
---
 gflownet/gflownet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 5012b87ec..87947f5cb 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -1021,7 +1021,7 @@ def test(self, **plot_kwargs):
                 self.kl,
                 self.jsd,
                 self.corr_prob_traj_rewards,
-                self.var_logrewards_logp,
+                var_logrewards_logp,
                 self.nll_tt,
                 (None,),
                 env_metrics,

From 4c63e73bb8d1ef7439e9219d0e7cc82733683cc3 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 18 Sep 2023 17:12:14 -0400
Subject: [PATCH 134/206] Update docstring of base Cube

---
 gflownet/envs/cube.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0c160d221..9dfd7f797 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -20,9 +20,9 @@
 
 class Cube(GFlowNetEnv, ABC):
     """
-    Continuous (hybrid: discrete and continuous) hyper-cube environment (continuous
-    version of a hyper-grid) in which the action space consists of the increment of
-    dimension d, modelled by a beta distribution.
+    Base class for hyper-cube environments, continuous or hybrid versions of the
+    hyper-grid in which the continuous increments are modelled by a (mixture of) Beta
+    distribution(s).
 
     The states space is the value of each dimension. If the value of a dimension gets
     larger than max_val, then the trajectory is ended.

From abb02c1911c22ed306741df5ddc310c4c3638de3 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Mon, 18 Sep 2023 23:26:00 +0200
Subject: [PATCH 135/206] Fix typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Koziarski <michal.koziarski@gmail.com>
---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0c160d221..63be6e6fb 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1212,7 +1212,7 @@ def get_uniform_terminating_states(
         states = rng.uniform(low=0.0, high=self.max_val, size=(n_states, self.n_dim))
         return states.tolist()
 
-    #     # TODO: make generic for all environments
+    # TODO: make generic for all environments
     def sample_from_reward(
         self, n_samples: int, epsilon=1e-4
     ) -> TensorType["n_samples", "state_dim"]:

From a3671ac2be06d1164ae4009dd3b22e7829c77cb2 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 18 Sep 2023 17:47:40 -0400
Subject: [PATCH 136/206] Fix double space.

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9dfd7f797..c080b22e3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1217,7 +1217,7 @@ def sample_from_reward(
         self, n_samples: int, epsilon=1e-4
     ) -> TensorType["n_samples", "state_dim"]:
         """
-        Rejection sampling  with proposal the uniform distribution in
+        Rejection sampling with proposal the uniform distribution in
         [0, max_val]]^n_dim.
 
         Returns a tensor in GFloNet (state) format.

From d9aee66453cc886bce91943f8af48599b81851ff Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 18 Sep 2023 17:54:25 -0400
Subject: [PATCH 137/206] Warning instead of print.

---
 tests/gflownet/envs/common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 09b55b413..44173bc19 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -1,3 +1,5 @@
+import warnings
+
 import hydra
 import numpy as np
 import pytest
@@ -59,7 +61,7 @@ def _get_terminating_states(env, n):
     elif hasattr(env, "get_random_terminating_states"):
         return env.get_random_terminating_states(n, 0)
     else:
-        print(
+        warnings.warn(
             f"""
         Testing backward sampling or setting terminating states requires that the
         environment implements one of the following:

From dd4b4fea0f92562ec11f6ab3a61799741f8c7151 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 18 Sep 2023 18:58:10 -0400
Subject: [PATCH 138/206] Fix issue in config files: rename random_distribution
 -> random_distr_params

---
 config/env/alaninedipeptide.yaml        | 2 +-
 config/env/ccube.yaml                   | 2 +-
 config/env/ctorus.yaml                  | 2 +-
 config/env/htorus.yaml                  | 2 +-
 config/env/tree.yaml                    | 2 +-
 config/experiments/ccube/ccube_owl.yaml | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config/env/alaninedipeptide.yaml b/config/env/alaninedipeptide.yaml
index 71be8f7f6..9196a6d12 100644
--- a/config/env/alaninedipeptide.yaml
+++ b/config/env/alaninedipeptide.yaml
@@ -16,7 +16,7 @@ fixed_distribution:
   vonmises_mean: 0.0
   vonmises_concentration: 0.5
 # Parameters of the random policy output distribution
-random_distribution:
+random_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 0.001
 # Buffer
diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index f672e0b42..e00a525b7 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -20,7 +20,7 @@ fixed_distribution:
   beta_beta: 0.01
   bernoulli_source_logit: 0.0
   bernoulli_eos_logit: 0.0
-random_distribution:
+random_distr_params:
   beta_weights: 1.0
   # IMPORTANT: adjust because of sigmoid!
   beta_alpha: 0.01
diff --git a/config/env/ctorus.yaml b/config/env/ctorus.yaml
index d7d86384e..933200a08 100644
--- a/config/env/ctorus.yaml
+++ b/config/env/ctorus.yaml
@@ -17,7 +17,7 @@ fixed_distribution:
   vonmises_mean: 0.0
   vonmises_concentration: 1.0
 # Parameters of the random policy output distribution
-random_distribution:
+random_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 0.01
 # Buffer
diff --git a/config/env/htorus.yaml b/config/env/htorus.yaml
index 315cd7ab4..0b3f8f09d 100644
--- a/config/env/htorus.yaml
+++ b/config/env/htorus.yaml
@@ -17,7 +17,7 @@ fixed_distribution:
   vonmises_mean: 0.0
   vonmises_concentration: 0.5
 # Parameters of the random policy output distribution
-random_distribution:
+random_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 0.001
 # Buffer
diff --git a/config/env/tree.yaml b/config/env/tree.yaml
index c714bb9bf..93296ff31 100644
--- a/config/env/tree.yaml
+++ b/config/env/tree.yaml
@@ -25,7 +25,7 @@ beta_params_max: 100.0
 fixed_distribution:
   beta_alpha: 2.0
   beta_beta: 5.0
-random_distribution:
+random_distr_params:
   beta_alpha: 1.0
   beta_beta: 1.0
 # Buffer
diff --git a/config/experiments/ccube/ccube_owl.yaml b/config/experiments/ccube/ccube_owl.yaml
index 8266ebab7..308e95b2f 100644
--- a/config/experiments/ccube/ccube_owl.yaml
+++ b/config/experiments/ccube/ccube_owl.yaml
@@ -20,7 +20,7 @@ env:
     beta_beta: 0.01
     bernoulli_source_logit: 1.0
     bernoulli_eos_logit: 1.0
-  random_distribution:
+  random_distr_params:
     beta_weights: 1.0
     beta_alpha: 0.01
     beta_beta: 0.01

From 9ee9de16c8fbe034c6f20d748718ed3a6ecefe3b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 18 Sep 2023 18:57:15 -0400
Subject: [PATCH 139/206] Fix issue in config files: rename fixed_distribution
 -> fixed_distr_params

---
 config/env/alaninedipeptide.yaml        | 2 +-
 config/env/ccube.yaml                   | 2 +-
 config/env/ctorus.yaml                  | 2 +-
 config/env/htorus.yaml                  | 2 +-
 config/env/tree.yaml                    | 2 +-
 config/experiments/ccube/ccube_owl.yaml | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config/env/alaninedipeptide.yaml b/config/env/alaninedipeptide.yaml
index 9196a6d12..17a3877a2 100644
--- a/config/env/alaninedipeptide.yaml
+++ b/config/env/alaninedipeptide.yaml
@@ -12,7 +12,7 @@ length_traj: 10
 vonmises_min_concentration: 1e-3
 # Parameters of the fixed policy output distribution
 n_comp: 3
-fixed_distribution:
+fixed_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 0.5
 # Parameters of the random policy output distribution
diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index e00a525b7..dc98126f0 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -14,7 +14,7 @@ beta_params_min: 0.01
 beta_params_max: 1000.0
 min_incr: 0.1
 n_comp: 1
-fixed_distribution:
+fixed_distr_params:
   beta_weights: 1.0
   beta_alpha: 0.01
   beta_beta: 0.01
diff --git a/config/env/ctorus.yaml b/config/env/ctorus.yaml
index 933200a08..fa194956f 100644
--- a/config/env/ctorus.yaml
+++ b/config/env/ctorus.yaml
@@ -13,7 +13,7 @@ length_traj: 3
 vonmises_min_concentration: 1e-3
 # Parameters of the fixed policy output distribution
 n_comp: 3
-fixed_distribution:
+fixed_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 1.0
 # Parameters of the random policy output distribution
diff --git a/config/env/htorus.yaml b/config/env/htorus.yaml
index 0b3f8f09d..c53b95acc 100644
--- a/config/env/htorus.yaml
+++ b/config/env/htorus.yaml
@@ -13,7 +13,7 @@ policy_encoding_dim_per_angle: null
 length_traj: 3
 vonmises_min_concentration: 1e-3
 # Parameters of the fixed policy output distribution
-fixed_distribution:
+fixed_distr_params:
   vonmises_mean: 0.0
   vonmises_concentration: 0.5
 # Parameters of the random policy output distribution
diff --git a/config/env/tree.yaml b/config/env/tree.yaml
index 93296ff31..af6bec2eb 100644
--- a/config/env/tree.yaml
+++ b/config/env/tree.yaml
@@ -22,7 +22,7 @@ test_args:
 threshold_components: 3
 beta_params_min: 0.1
 beta_params_max: 100.0
-fixed_distribution:
+fixed_distr_params:
   beta_alpha: 2.0
   beta_beta: 5.0
 random_distr_params:
diff --git a/config/experiments/ccube/ccube_owl.yaml b/config/experiments/ccube/ccube_owl.yaml
index 308e95b2f..009d826d6 100644
--- a/config/experiments/ccube/ccube_owl.yaml
+++ b/config/experiments/ccube/ccube_owl.yaml
@@ -14,7 +14,7 @@ env:
   beta_params_min: 0.01
   beta_params_max: 100.0
   min_incr: 0.1
-  fixed_distribution:
+  fixed_distr_params:
     beta_weights: 1.0
     beta_alpha: 0.01
     beta_beta: 0.01

From 49b23bafda9d915256c41cc089ef0b8d6628128f Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Tue, 19 Sep 2023 17:39:58 +0200
Subject: [PATCH 140/206] Fix docstring typo.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Koziarski <michal.koziarski@gmail.com>
---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d9684ac82..5b1f169bd 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -254,7 +254,7 @@ class ContinuousCube(Cube):
     """
     Continuous hyper-cube environment (continuous version of a hyper-grid) in which the
     action space consists of the increment of each dimension d, modelled by a mixture
-    of Beta distributions. The states space is the value of each dimension. In order to
+    of Beta distributions. The state space is the value of each dimension. In order to
     ensure that all trajectories are of finite length, actions have a minimum increment
     for all dimensions determined by min_incr. If the value of any dimension is larger
     than 1 - min_incr, then that dimension can be further incremented. In order to

From 33f1917b67ed8a6962d1c3b6948a67d58429110d Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Tue, 19 Sep 2023 17:45:33 +0200
Subject: [PATCH 141/206] Fix docstring typo.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Koziarski <michal.koziarski@gmail.com>
---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 5b1f169bd..40815a9bd 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -257,7 +257,7 @@ class ContinuousCube(Cube):
     of Beta distributions. The state space is the value of each dimension. In order to
     ensure that all trajectories are of finite length, actions have a minimum increment
     for all dimensions determined by min_incr. If the value of any dimension is larger
-    than 1 - min_incr, then that dimension can be further incremented. In order to
+    than 1 - min_incr, then that dimension can't be further incremented. In order to
     ensure the coverage of the state space, the first action (from the source state) is
     not constrained by the minimum increment.
 

From 625f6616287ebd150e415b605203def4f677d261 Mon Sep 17 00:00:00 2001
From: Alex <alexhg15@gmail.com>
Date: Tue, 19 Sep 2023 17:47:03 +0200
Subject: [PATCH 142/206] Update gflownet/envs/cube.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Michał Koziarski <michal.koziarski@gmail.com>
---
 gflownet/envs/cube.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 40815a9bd..2605ef232 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -311,10 +311,10 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         Continuous actions
 
         For each dimension d of the hyper-cube and component c of the mixture, the
-        output of the policy should return
-          1) the weight of the component in the mixture
-          2) the logit(alpha) parameter of the Beta distribution to sample the increment
-          3) the logit(beta) parameter of the Beta distribution to sample the increment
+        output of the policy should return:
+          1) the weight of the component in the mixture,
+          2) the logit(alpha) parameter of the Beta distribution to sample the increment,
+          3) the logit(beta) parameter of the Beta distribution to sample the increment.
 
         These parameters are the first n_dim * n_comp * 3 of the policy output such
         that the first 3 x C elements correspond to the first dimension, and so on.

From 7c58b412e3f588debf1742f5a4f4aeb177751d1b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 15:54:23 -0400
Subject: [PATCH 143/206] CubeAbstract <- Cube

---
 gflownet/envs/cube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d9684ac82..46ac40a34 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -18,7 +18,7 @@
 from gflownet.utils.common import copy, tbool, tfloat
 
 
-class Cube(GFlowNetEnv, ABC):
+class CubeAbstract(GFlowNetEnv, ABC):
     """
     Base class for hyper-cube environments, continuous or hybrid versions of the
     hyper-grid in which the continuous increments are modelled by a (mixture of) Beta
@@ -250,7 +250,7 @@ def step(
         pass
 
 
-class ContinuousCube(Cube):
+class ContinuousCube(CubeAbstract):
     """
     Continuous hyper-cube environment (continuous version of a hyper-grid) in which the
     action space consists of the increment of each dimension d, modelled by a mixture

From 403c8a2013646e6652e53c5432a99aa6c0d245ab Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 16:03:25 -0400
Subject: [PATCH 144/206] Redefine Cube init: CubeBase <- CubeAbstract;
 eliminate max_val.

---
 gflownet/envs/cube.py | 45 +++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 46ac40a34..1a0c18c90 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -18,32 +18,32 @@
 from gflownet.utils.common import copy, tbool, tfloat
 
 
-class CubeAbstract(GFlowNetEnv, ABC):
+class CubeBase(GFlowNetEnv, ABC):
     """
     Base class for hyper-cube environments, continuous or hybrid versions of the
     hyper-grid in which the continuous increments are modelled by a (mixture of) Beta
     distribution(s).
 
-    The states space is the value of each dimension. If the value of a dimension gets
-    larger than max_val, then the trajectory is ended.
+    The states space is the value of each dimension, defined in the closed set [0, 1].
+    If the value of a dimension gets larger than max_val, then the trajectory is ended
+    (the only possible action is EOS).
 
     Attributes
     ----------
     n_dim : int
         Dimensionality of the hyper-cube.
 
-    max_val : float
-        Max length of the hyper-cube.
-
     min_incr : float
-        Minimum increment in the actions, expressed as the fraction of max_val. This is
-        necessary to ensure coverage of the state space.
+        Minimum increment in the actions, in (0, 1). This is necessary to ensure
+        that all trajectories have finite length.
+
+    n_comp : int
+        Number of components in the mixture of Beta distributions.
     """
 
     def __init__(
         self,
         n_dim: int = 2,
-        max_val: float = 1.0,
         min_incr: float = 0.1,
         n_comp: int = 1,
         beta_params_min: float = 0.1,
@@ -65,23 +65,18 @@ def __init__(
         **kwargs,
     ):
         assert n_dim > 0
-        assert max_val > 0.0
+        assert min_incr > 0.0
+        assert min_incr < 1.0
         assert n_comp > 0
         # Main properties
         self.n_dim = n_dim
-        self.eos = self.n_dim
-        self.max_val = max_val
-        self.min_incr = min_incr * self.max_val
+        self.min_incr = min_incr
         # Parameters of the policy distribution
         self.n_comp = n_comp
         self.beta_params_min = beta_params_min
         self.beta_params_max = beta_params_max
-        # Source state: position 0 at all dimensions
-        self.source = [0.0 for _ in range(self.n_dim)]
-        # Action from source: (n_dim, 0)
-        self.action_source = (self.n_dim, 0)
-        # End-of-sequence action: (n_dim + 1, 0)
-        self.eos = (self.n_dim + 1, 0)
+        # Source state is abstract - not included in the cube: -1 for all dimensions.
+        self.source = [-1 for _ in range(self.n_dim)]
         # Conversions: only conversions to policy are implemented and the rest are the
         # same
         self.state2proxy = self.state2policy
@@ -250,7 +245,7 @@ def step(
         pass
 
 
-class ContinuousCube(CubeAbstract):
+class ContinuousCube(CubeBase):
     """
     Continuous hyper-cube environment (continuous version of a hyper-grid) in which the
     action space consists of the increment of each dimension d, modelled by a mixture
@@ -273,12 +268,12 @@ class ContinuousCube(CubeAbstract):
     n_dim : int
         Dimensionality of the hyper-cube.
 
-    max_val : float
-        Max length of the hyper-cube.
-
     min_incr : float
-        Minimum increment in the actions, expressed as the fraction of max_val. This is
-        necessary to ensure that trajectories have finite length.
+        Minimum increment in the actions, in (0, 1). This is necessary to ensure
+        that all trajectories have finite length.
+
+    n_comp : int
+        Number of components in the mixture of Beta distributions.
     """
 
     def __init__(self, **kwargs):

From 91ab7f9026a12dc10a6b1a067aa2a8d23e6092ec Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 16:39:51 -0400
Subject: [PATCH 145/206] Re-implement sample_actions_* without need to compute
 absolute increments from source.

---
 gflownet/envs/cube.py | 67 ++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 1a0c18c90..2e51ef821 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -444,7 +444,7 @@ def get_mask_invalid_actions_forward(
         if done:
             return [True] * mask_dim
         mask = [False] * mask_dim
-        # If the state is not the source state, EOS is invalid
+        # If the state is the source state, EOS is invalid
         if state == self.source:
             mask[2] = True
         # If the state is not the source, indicate not special case (True)
@@ -503,20 +503,20 @@ def get_parents(
         """
         pass
 
+    # TODO: rethink if not necessary from source
     @staticmethod
     def relative_to_absolute_increments(
         states: TensorType["n_states", "n_dim"],
         increments_rel: TensorType["n_states", "n_dim"],
         min_increments: TensorType["n_states", "n_dim"],
-        max_val: float,
         is_backward: bool,
     ):
         """
         Returns a batch of absolute increments (actions) given a batch of states,
         relative increments and minimum_increments.
 
-        Given a dimension value x, a relative increment r, a minimum increment m and a
-        maximum value 1, the absolute increment a is given by:
+        Given a dimension value x, a relative increment r, and a minimum increment m,
+        then the absolute increment a is given by:
 
         Forward:
 
@@ -531,24 +531,24 @@ def relative_to_absolute_increments(
             increments_abs = min_increments + increments_rel * (states - min_increments)
         else:
             increments_abs = min_increments + increments_rel * (
-                max_val - states - min_increments
+                1.0 - states - min_increments
             )
         return increments_abs
 
+    # TODO: rethink if not necessary from source
     @staticmethod
     def absolute_to_relative_increments(
         states: TensorType["n_states", "n_dim"],
         increments_abs: TensorType["n_states", "n_dim"],
         min_increments: TensorType["n_states", "n_dim"],
-        max_val: float,
         is_backward: bool,
     ):
         """
         Returns a batch of relative increments (as sampled by the Beta distributions)
         given a batch of states, absolute increments (actions) and minimum_increments.
 
-        Given a dimension value x, an absolute increment a, a minimum increment m and a
-        maximum value 1, the relative increment r is given by:
+        Given a dimension value x, an absolute increment a, and a minimum increment m,
+        then the relative increment r is given by:
 
         Forward:
 
@@ -565,7 +565,7 @@ def absolute_to_relative_increments(
             )
         else:
             increments_rel = (increments_abs - min_increments) / (
-                max_val - states - min_increments
+                1.0 - states - min_increments
             )
         return increments_rel
 
@@ -673,7 +673,7 @@ def _sample_actions_batch_forward(
             distr_eos = Bernoulli(logits=logits_eos)
             is_eos_sampled[do_eos] = tbool(distr_eos.sample(), device=self.device)
             is_eos[is_eos_sampled] = True
-        # Sample relative increments if EOS is not the sampled or forced action
+        # Sample (relative) increments if EOS is not the (sampled or forced) action
         do_increments = ~is_eos
         if torch.any(do_increments):
             if sampling_method == "uniform":
@@ -682,22 +682,24 @@ def _sample_actions_batch_forward(
                 distr_increments = self._make_increments_distribution(
                     policy_outputs[do_increments]
                 )
-            # Shape of increments_rel: [n_do_increments, n_dim]
-            increments_rel = distr_increments.sample()
-            # Get minimum increments
+            # Shape of increments: [n_do_increments, n_dim]
+            increments = distr_increments.sample()
+            # Compute absolute increments from sampled relative increments if state is
+            # not source
+            is_relative = ~is_source[do_increments]
             min_increments = torch.full_like(
-                increments_rel, self.min_incr, dtype=self.float, device=self.device
+                increments[is_relative],
+                self.min_incr,
+                dtype=self.float,
+                device=self.device,
             )
-            min_increments[is_source[do_increments]] = 0.0
-            # Compute absolute increments
-            states_from_do_increments = tfloat(
+            states_from_rel = tfloat(
                 states_from, float_type=self.float, device=self.device
-            )[do_increments]
-            increments_abs = self.relative_to_absolute_increments(
-                states_from_do_increments,
-                increments_rel,
+            )[is_relative]
+            increments[is_relative] = self.relative_to_absolute_increments(
+                states_from_rel,
+                increments[is_relative],
                 min_increments,
-                self.max_val,
                 is_backward=False,
             )
         # Build actions
@@ -705,7 +707,7 @@ def _sample_actions_batch_forward(
             (n_states, self.n_dim), torch.inf, dtype=self.float, device=self.device
         )
         if torch.any(do_increments):
-            actions_tensor[do_increments] = increments_abs
+            actions_tensor[do_increments] = increments
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
@@ -771,20 +773,18 @@ def _sample_actions_batch_backward(
                     policy_outputs[do_increments]
                 )
             # Shape of increments_rel: [n_do_increments, n_dim]
-            increments_rel = distr_increments.sample()
-            # Set minimum increments
+            increments = distr_increments.sample()
+            # Compute absolute increments from all sampled relative increments
             min_increments = torch.full_like(
-                increments_rel, self.min_incr, dtype=self.float, device=self.device
+                increments, self.min_incr, dtype=self.float, device=self.device
             )
-            # Compute absolute increments
-            states_from_do_increments = tfloat(
+            states_from_rel = tfloat(
                 states_from, float_type=self.float, device=self.device
             )[do_increments]
-            increments_abs = self.relative_to_absolute_increments(
-                states_from_do_increments,
-                increments_rel,
+            increments = self.relative_to_absolute_increments(
+                states_from_rel,
+                increments,
                 min_increments,
-                self.max_val,
                 is_backward=True,
             )
         # Build actions
@@ -793,7 +793,8 @@ def _sample_actions_batch_backward(
         )
         actions_tensor[is_eos] = torch.inf
         if torch.any(do_increments):
-            actions_tensor[do_increments] = increments_abs
+            actions_tensor[do_increments] = increments
+        # TODO: Should BTS actions be special?
         if torch.any(is_bts):
             # BTS actions are equal to the originating states
             actions_bts = tfloat(

From 03c05379721b3d5331e4742625d9c1e26ea58821 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 17:14:24 -0400
Subject: [PATCH 146/206] Re-implement _get_logprobs_* with new BTS.

---
 gflownet/envs/cube.py | 145 ++++++++++++++++++------------------------
 1 file changed, 63 insertions(+), 82 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2e51ef821..d38e1076a 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -288,11 +288,14 @@ def get_action_space(self):
 
         EOS is indicated by np.inf for all dimensions.
 
-        This method defines self.eos and the returned action space is simply a
-        representative (arbitrary) action with an increment of 0.0 in all dimensions,
-        and EOS.
+        BTS (back to source) is indicated by -1 for all dimensions.
+
+        This method defines self.eos, self.bts and the returned action space is simply
+        a representative (arbitrary) action with an increment of 0.0 in all dimensions,
+        EOS and BTS.
         """
         self.eos = tuple([np.inf] * self.n_dim)
+        self.bts = tuple([-1] * self.n_dim)
         self.representative_action = tuple([0.0] * self.n_dim)
         return [self.representative_action, self.eos]
 
@@ -508,7 +511,6 @@ def get_parents(
     def relative_to_absolute_increments(
         states: TensorType["n_states", "n_dim"],
         increments_rel: TensorType["n_states", "n_dim"],
-        min_increments: TensorType["n_states", "n_dim"],
         is_backward: bool,
     ):
         """
@@ -526,21 +528,19 @@ def relative_to_absolute_increments(
 
         a = m + r * (x - m)
         """
-        max_val = torch.full_like(states, max_val)
+        min_increments = torch.full_like(
+            increments_rel, self.min_incr, dtype=self.float, device=self.device
+        )
         if is_backward:
-            increments_abs = min_increments + increments_rel * (states - min_increments)
+            return min_increments + increments_rel * (states - min_increments)
         else:
-            increments_abs = min_increments + increments_rel * (
-                1.0 - states - min_increments
-            )
-        return increments_abs
+            return min_increments + increments_rel * (1.0 - states - min_increments)
 
     # TODO: rethink if not necessary from source
     @staticmethod
     def absolute_to_relative_increments(
         states: TensorType["n_states", "n_dim"],
         increments_abs: TensorType["n_states", "n_dim"],
-        min_increments: TensorType["n_states", "n_dim"],
         is_backward: bool,
     ):
         """
@@ -558,16 +558,13 @@ def absolute_to_relative_increments(
 
         r = (a - m) / (x - m)
         """
-        max_val = torch.full_like(states, max_val)
+        min_increments = torch.full_like(
+            increments_abs, self.min_incr, dtype=self.float, device=self.device
+        )
         if is_backward:
-            increments_rel = (increments_abs - min_increments) / (
-                states - min_increments
-            )
+            return (increments_abs - min_increments) / (states - min_increments)
         else:
-            increments_rel = (increments_abs - min_increments) / (
-                1.0 - states - min_increments
-            )
-        return increments_rel
+            return (increments_abs - min_increments) / (1.0 - states - min_increments)
 
     def _make_increments_distribution(
         self,
@@ -656,6 +653,9 @@ def _sample_actions_batch_forward(
         """
         # Initialize variables
         n_states = policy_outputs.shape[0]
+        states_from_tensor = tfloat(
+            states_from, float_type=self.float, device=self.device
+        )
         is_eos = torch.zeros(n_states, dtype=torch.bool, device=self.device)
         # Determine source states
         is_source = ~mask[:, 1]
@@ -687,19 +687,14 @@ def _sample_actions_batch_forward(
             # Compute absolute increments from sampled relative increments if state is
             # not source
             is_relative = ~is_source[do_increments]
-            min_increments = torch.full_like(
-                increments[is_relative],
-                self.min_incr,
-                dtype=self.float,
-                device=self.device,
-            )
             states_from_rel = tfloat(
-                states_from, float_type=self.float, device=self.device
+                states_from_tensor[do_increments],
+                float_type=self.float,
+                device=self.device,
             )[is_relative]
             increments[is_relative] = self.relative_to_absolute_increments(
                 states_from_rel,
                 increments[is_relative],
-                min_increments,
                 is_backward=False,
             )
         # Build actions
@@ -775,32 +770,27 @@ def _sample_actions_batch_backward(
             # Shape of increments_rel: [n_do_increments, n_dim]
             increments = distr_increments.sample()
             # Compute absolute increments from all sampled relative increments
-            min_increments = torch.full_like(
-                increments, self.min_incr, dtype=self.float, device=self.device
-            )
             states_from_rel = tfloat(
                 states_from, float_type=self.float, device=self.device
             )[do_increments]
             increments = self.relative_to_absolute_increments(
                 states_from_rel,
                 increments,
-                min_increments,
                 is_backward=True,
             )
         # Build actions
         actions_tensor = torch.zeros(
             (n_states, self.n_dim), dtype=self.float, device=self.device
         )
-        actions_tensor[is_eos] = torch.inf
+        actions_tensor[is_eos] = tfloat(
+            self.eos, float_type=self.float, device=self.device
+        )
         if torch.any(do_increments):
             actions_tensor[do_increments] = increments
-        # TODO: Should BTS actions be special?
         if torch.any(is_bts):
-            # BTS actions are equal to the originating states
-            actions_bts = tfloat(
-                states_from, float_type=self.float, device=self.device
-            )[is_bts]
-            actions_tensor[is_bts] = actions_bts
+            actions_tensor[is_bts] = tfloat(
+                self.bts, float_type=self.float, device=self.device
+            )
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
@@ -893,18 +883,25 @@ def _get_logprobs_forward(
         do_increments = ~is_eos
         if torch.any(do_increments):
             # Get absolute increments
-            increments_abs = actions[do_increments]
-            # Get minimum increments
-            min_increments = torch.full_like(
-                increments_abs, self.min_incr, dtype=self.float, device=self.device
-            )
-            min_increments[is_source[do_increments]] = 0.0
-            # Get relative increments
-            increments_rel = self.absolute_to_relative_increments(
+            increments = actions[do_increments]
+            # Compute relative increments from absolute increments if state is not
+            # source
+            is_relative = ~is_source[do_increments]
+            states_from_rel = tfloat(
                 states_from_tensor[do_increments],
-                increments_abs,
-                min_increments,
-                self.max_val,
+                float_type=self.float,
+                device=self.device,
+            )[is_relative]
+            increments[is_relative] = self.absolute_to_relative_increments(
+                states_from_rel,
+                increments,
+                is_backward=False,
+            )
+            # Compute diagonal of the Jacobian (see _get_jacobian_diag()) if state is
+            # not source
+            is_relative = torch.logical_and(do_increments, ~is_source)
+            jacobian_diag[is_relative] = self._get_jacobian_diag(
+                states_from_rel,
                 is_backward=False,
             )
             # Get logprobs
@@ -913,14 +910,7 @@ def _get_logprobs_forward(
             )
             # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
-            )
-            # Compute diagonal of the Jacobian (see _get_jacobian_diag())
-            jacobian_diag[do_increments] = self._get_jacobian_diag(
-                states_from_tensor[do_increments],
-                min_increments,
-                self.max_val,
-                is_backward=False,
+                torch.clamp(increments, min=1e-6, max=(1 - 1e-6))
             )
         # Get log determinant of the Jacobian
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
@@ -952,6 +942,7 @@ def _get_logprobs_backward(
         jacobian_diag = torch.ones(
             (n_states, self.n_dim), device=self.device, dtype=self.float
         )
+        bts_tensor = tfloat(self.bts, float_type=self.float, device=self.device)
         # EOS is the only possible action only if done is True (mask[2] is False)
         is_eos = ~mask[:, 2]
         # Back-to-source (BTS) is the only possible action if mask[1] is False
@@ -960,11 +951,8 @@ def _get_logprobs_backward(
         # Get sampled BTS actions and get log probs from Bernoulli distribution
         do_bts = torch.logical_and(~is_bts_forced, ~is_eos)
         if torch.any(do_bts):
-            # BTS actions are equal to the originating states
             is_bts_sampled = torch.zeros_like(do_bts)
-            is_bts_sampled[do_bts] = torch.all(
-                actions[do_bts] == states_from_tensor[do_bts], dim=1
-            )
+            is_bts_sampled[do_bts] = torch.all(actions[do_bts] == bts_tensor)
             is_bts[is_bts_sampled] = True
             logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
             distr_bts = Bernoulli(logits=logits_bts)
@@ -975,16 +963,16 @@ def _get_logprobs_backward(
         do_increments = torch.logical_and(~is_bts, ~is_eos)
         if torch.any(do_increments):
             # Get absolute increments
-            increments_abs = actions[do_increments]
-            min_increments = torch.full_like(
-                increments_abs, self.min_incr, dtype=self.float, device=self.device
+            increments = actions[do_increments]
+            # Compute absolute increments from all sampled relative increments
+            increments = self.absolute_to_relative_increments(
+                states_from_tensor[do_increments],
+                increments,
+                is_backward=True,
             )
-            # Get relative increments
-            increments_rel = self.absolute_to_relative_increments(
+            # Compute diagonal of the Jacobian (see _get_jacobian_diag())
+            jacobian_diag[do_increments] = self._get_jacobian_diag(
                 states_from_tensor[do_increments],
-                increments_abs,
-                min_increments,
-                self.max_val,
                 is_backward=True,
             )
             # Get logprobs
@@ -993,14 +981,7 @@ def _get_logprobs_backward(
             )
             # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                torch.clamp(increments_rel, min=1e-6, max=(1 - 1e-6))
-            )
-            # Compute diagonal of the Jacobian (see _get_jacobian_diag())
-            jacobian_diag[do_increments] = self._get_jacobian_diag(
-                states_from_tensor[do_increments],
-                min_increments,
-                self.max_val,
-                is_backward=True,
+                torch.clamp(increments, min=1e-6, max=(1 - 1e-6))
             )
         # Get log determinant of the Jacobian
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
@@ -1014,7 +995,6 @@ def _get_logprobs_backward(
     @staticmethod
     def _get_jacobian_diag(
         states_from: TensorType["n_states", "n_dim"],
-        min_increments: TensorType["n_states", "n_dim"],
         max_val: float,
         is_backward: bool,
     ):
@@ -1048,12 +1028,13 @@ def _get_jacobian_diag(
         other than itself are zero. Therefore, the Jacobian is diagonal and the
         determinant is the product of the diagonal.
         """
-        epsilon = 1e-9
-        max_val = torch.full_like(states_from, max_val)
+        min_increments = torch.full_like(
+            states_from, self.min_incr, dtype=self.float, device=self.device
+        )
         if is_backward:
-            return 1.0 / ((states_from - min_increments) + epsilon)
+            return 1.0 / ((states_from - min_increments))
         else:
-            return 1.0 / ((max_val - states_from - min_increments) + epsilon)
+            return 1.0 / ((1.0 - states_from - min_increments))
 
     def _step(
         self,

From 84bbc1672b9b5247818c960c7094dcd6d2d4a084 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 17:15:43 -0400
Subject: [PATCH 147/206] Add BTS action to action space list

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d38e1076a..d0c8477eb 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -297,7 +297,7 @@ def get_action_space(self):
         self.eos = tuple([np.inf] * self.n_dim)
         self.bts = tuple([-1] * self.n_dim)
         self.representative_action = tuple([0.0] * self.n_dim)
-        return [self.representative_action, self.eos]
+        return [self.representative_action, self.bts, self.eos]
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """

From b2a1920ae6c99bd71ccf1aa50ecc91ea4804b745 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 17:20:12 -0400
Subject: [PATCH 148/206] Eliminate references to max_val

---
 gflownet/envs/cube.py             |   32 +-
 tests/gflownet/envs/test_ccube.py | 1999 ++++++++++++++---------------
 2 files changed, 1014 insertions(+), 1017 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d0c8477eb..183f0cab8 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -25,8 +25,8 @@ class CubeBase(GFlowNetEnv, ABC):
     distribution(s).
 
     The states space is the value of each dimension, defined in the closed set [0, 1].
-    If the value of a dimension gets larger than max_val, then the trajectory is ended
-    (the only possible action is EOS).
+    If the value of a dimension gets larger than 1 - min_incr, then the trajectory is
+    ended (the only possible action is EOS).
 
     Attributes
     ----------
@@ -117,20 +117,20 @@ def statetorch2policy(
         self, states: TensorType["batch", "state_dim"] = None
     ) -> TensorType["batch", "policy_input_dim"]:
         """
-        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
+        Clips the states into [0, 1] and maps them to [-1.0, 1.0]
 
         Args
         ----
         state : list
             State
         """
-        return 2.0 * torch.clip(states, min=0.0, max=self.max_val) - 1.0
+        return 2.0 * torch.clip(states, min=0.0, max=1.0) - 1.0
 
     def statebatch2policy(
         self, states: List[List]
     ) -> TensorType["batch", "state_proxy_dim"]:
         """
-        Clips the states into [0, max_val] and maps them to [-1.0, 1.0]
+        Clips the states into [0, 1] and maps them to [-1.0, 1.0]
 
         Args
         ----
@@ -143,11 +143,11 @@ def statebatch2policy(
 
     def state2policy(self, state: List = None) -> List:
         """
-        Clips the state into [0, max_val] and maps it to [-1.0, 1.0]
+        Clips the state into [0, 1] and maps it to [-1.0, 1.0]
         """
         if state is None:
             state = self.state.copy()
-        return [2.0 * min(max(0.0, s), self.max_val) - 1.0 for s in state]
+        return [2.0 * min(max(0.0, s), 1.0) - 1.0 for s in state]
 
     def state2readable(self, state: List) -> str:
         """
@@ -259,7 +259,7 @@ class ContinuousCube(CubeBase):
     Actions do not represent absolute increments but rather the relative increment with
     respect to the distance to the edges of the hyper-cube, from the minimum increment.
     That is, if dimension d of a state has value 0.3, the minimum increment (min_incr)
-    is 0.1 and the maximum value (max_val) is 1.0, an action of 0.5 will increment the
+    is 0.1 and the maximum value is 1.0, an action of 0.5 will increment the
     value of the dimension in 0.5 * (1.0 - 0.3 - 0.1) = 0.5 * 0.6 = 0.3. Therefore, the
     value of d in the next state will be 0.3 + 0.3 = 0.6.
 
@@ -625,7 +625,7 @@ def _sample_actions_batch_forward(
         originating state is the source state (special case, see
         get_mask_invalid_actions_forward()). Furthermore, absolute increments must also
         be smaller than the distance from the dimension value to the edge of the cube
-        (self.max_val). In order to accomodate these constraints, first relative
+        (1.0). In order to accomodate these constraints, first relative
         increments (in [0, 1]) are sampled from a (mixture of) Beta distribution(s),
         where 0.0 indicates an absolute increment of min_incr and 1.0 indicates an
         absolute increment of 1 - x + min_incr (going to the edge).
@@ -995,7 +995,6 @@ def _get_logprobs_backward(
     @staticmethod
     def _get_jacobian_diag(
         states_from: TensorType["n_states", "n_dim"],
-        max_val: float,
         is_backward: bool,
     ):
         """
@@ -1003,7 +1002,7 @@ def _get_jacobian_diag(
         the target states.
 
         Forward: the sampled variables are the relative increments r_f and the state
-        updates (s -> s') are (assuming max_val = 1):
+        updates (s -> s') are:
 
         s' = s + m + r_f(1 - s - m)
         r_f = (s' - s - m) / (1 - s - m)
@@ -1076,12 +1075,12 @@ def _step(
         # to source.
         if self.isclose(self.state, self.source, atol=1e-6):
             self.state = copy(self.source)
-        if not all([s <= (self.max_val + epsilon) for s in self.state]):
+        if not all([s <= (1.0 + epsilon) for s in self.state]):
             import ipdb
 
             ipdb.set_trace()
         assert all(
-            [s <= (self.max_val + epsilon) for s in self.state]
+            [s <= (1.0 + epsilon) for s in self.state]
         ), f"""
         State is out of cube bounds.
         \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
@@ -1176,7 +1175,7 @@ def step_backwards(
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
-        linspaces = [np.linspace(0, self.max_val, n_per_dim) for _ in range(self.n_dim)]
+        linspaces = [np.linspace(0.0, 1.0, n_per_dim) for _ in range(self.n_dim)]
         states = list(itertools.product(*linspaces))
         # TODO: check if necessary
         states = [list(el) for el in states]
@@ -1186,7 +1185,7 @@ def get_uniform_terminating_states(
         self, n_states: int, seed: int = None
     ) -> List[List]:
         rng = np.random.default_rng(seed)
-        states = rng.uniform(low=0.0, high=self.max_val, size=(n_states, self.n_dim))
+        states = rng.uniform(low=0.0, high=1.0, size=(n_states, self.n_dim))
         return states.tolist()
 
     # TODO: make generic for all environments
@@ -1194,8 +1193,7 @@ def sample_from_reward(
         self, n_samples: int, epsilon=1e-4
     ) -> TensorType["n_samples", "state_dim"]:
         """
-        Rejection sampling with proposal the uniform distribution in
-        [0, max_val]]^n_dim.
+        Rejection sampling with proposal the uniform distribution in [0, 1]^n_dim.
 
         Returns a tensor in GFloNet (state) format.
         """
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 267854c04..58109c537 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -28,1006 +28,1005 @@ def cube2d():
         ],
     ],
 )
-@pytest.mark.skip(reason="skip while developping other tests")
-def test__get_action_space__returns_expected(env, action_space):
+def test__get_action_space__returns_expected(cube2d, action_space):
     assert set(action_space) == set(env.action_space)
 
 
-@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-def test__get_policy_output__fixed_as_expected(env, request):
-    env = request.getfixturevalue(env)
-    policy_outputs = torch.unsqueeze(env.fixed_policy_output, 0)
-    params = env.fixed_distr_params
-    policy_output__as_expected(env, policy_outputs, params)
-
-
-@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-def test__get_policy_output__random_as_expected(env, request):
-    env = request.getfixturevalue(env)
-    policy_outputs = torch.unsqueeze(env.random_policy_output, 0)
-    params = env.random_distr_params
-    policy_output__as_expected(env, policy_outputs, params)
-
-
-def policy_output__as_expected(env, policy_outputs, params):
-    assert torch.all(
-        env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
-    )
-    assert torch.all(
-        env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
-    )
-    assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
-    assert torch.all(
-        env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
-    )
-    assert torch.all(
-        env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
-    )
-
-
-@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-def test__mask_forward__returns_all_true_if_done(env, request):
-    env = request.getfixturevalue(env)
-    # Sample states
-    states = env.get_uniform_terminating_states(100)
-    # Iterate over state and test
-    for state in states:
-        env.set_state(state, done=True)
-        mask = env.get_mask_invalid_actions_forward()
-        assert all(mask)
-
-
-@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
-    env = request.getfixturevalue(env)
-    # Sample states
-    states = env.get_uniform_terminating_states(100)
-    # Iterate over state and test
-    for state in states:
-        env.set_state(state, done=True)
-        mask = env.get_mask_invalid_actions_backward()
-        assert all(mask[:-1])
-        assert mask[-1] is False
-
-
-@pytest.mark.parametrize(
-    "state, mask_expected",
-    [
-        (
-            [0.0],
-            [False, False, True],
-        ),
-        (
-            [0.5],
-            [False, True, False],
-        ),
-        (
-            [0.90],
-            [False, True, False],
-        ),
-        (
-            [0.95],
-            [True, True, False],
-        ),
-    ],
-)
-def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
-    env = cube1d
-    mask = env.get_mask_invalid_actions_forward(state)
-    assert mask == mask_expected
-
-
-@pytest.mark.parametrize(
-    "state, mask_expected",
-    [
-        (
-            [0.0, 0.0],
-            [False, False, True],
-        ),
-        (
-            [0.5, 0.5],
-            [False, True, False],
-        ),
-        (
-            [0.90, 0.5],
-            [False, True, False],
-        ),
-        (
-            [0.95, 0.5],
-            [True, True, False],
-        ),
-        (
-            [0.5, 0.90],
-            [False, True, False],
-        ),
-        (
-            [0.5, 0.95],
-            [True, True, False],
-        ),
-        (
-            [0.95, 0.95],
-            [True, True, False],
-        ),
-    ],
-)
-def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
-    env = cube2d
-    mask = env.get_mask_invalid_actions_forward(state)
-    assert mask == mask_expected
-
-
-@pytest.mark.parametrize(
-    "state, mask_expected",
-    [
-        (
-            [0.0],
-            [True, False, True],
-        ),
-        (
-            [0.1],
-            [False, True, True],
-        ),
-        (
-            [0.05],
-            [True, False, True],
-        ),
-        (
-            [0.5],
-            [False, True, True],
-        ),
-        (
-            [0.90],
-            [False, True, True],
-        ),
-        (
-            [0.95],
-            [False, True, True],
-        ),
-    ],
-)
-def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
-    env = cube1d
-    mask = env.get_mask_invalid_actions_backward(state)
-    assert mask == mask_expected
-
-
-@pytest.mark.parametrize(
-    "state, mask_expected",
-    [
-        (
-            [0.0, 0.0],
-            [True, False, True],
-        ),
-        (
-            [0.5, 0.5],
-            [False, True, True],
-        ),
-        (
-            [0.05, 0.5],
-            [True, False, True],
-        ),
-        (
-            [0.5, 0.05],
-            [True, False, True],
-        ),
-        (
-            [0.05, 0.05],
-            [True, False, True],
-        ),
-        (
-            [0.90, 0.5],
-            [False, True, True],
-        ),
-        (
-            [0.5, 0.90],
-            [False, True, True],
-        ),
-        (
-            [0.95, 0.5],
-            [False, True, True],
-        ),
-        (
-            [0.5, 0.95],
-            [False, True, True],
-        ),
-        (
-            [0.95, 0.95],
-            [False, True, True],
-        ),
-    ],
-)
-def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
-    env = cube2d
-    mask = env.get_mask_invalid_actions_backward(state)
-    assert mask == mask_expected
-
-
-@pytest.mark.parametrize(
-    "state, increments_rel, min_increments, state_expected",
-    [
-        (
-            [0.0, 0.0],
-            [0.5, 0.5],
-            [0.0, 0.0],
-            [0.5, 0.5],
-        ),
-        (
-            [0.0, 0.0],
-            [0.0, 0.0],
-            [0.0, 0.0],
-            [0.0, 0.0],
-        ),
-        (
-            [0.0, 0.0],
-            [0.1794, 0.9589],
-            [0.0, 0.0],
-            [0.1794, 0.9589],
-        ),
-        (
-            [0.3, 0.5],
-            [0.0, 0.0],
-            [0.1, 0.1],
-            [0.4, 0.6],
-        ),
-        (
-            [0.3, 0.5],
-            [1.0, 1.0],
-            [0.1, 0.1],
-            [1.0, 1.0],
-        ),
-        (
-            [0.3, 0.5],
-            [0.5, 0.5],
-            [0.1, 0.1],
-            [0.7, 0.8],
-        ),
-        (
-            [0.27, 0.85],
-            [0.12, 0.76],
-            [0.1, 0.1],
-            [0.4456, 0.988],
-        ),
-        (
-            [0.27, 0.95],
-            [0.12, 0.0],
-            [0.1, 0.0],
-            [0.4456, 0.95],
-        ),
-        (
-            [0.95, 0.27],
-            [0.0, 0.12],
-            [0.0, 0.1],
-            [0.95, 0.4456],
-        ),
-    ],
-)
-def test__relative_to_absolute_increments__2d_forward__returns_expected(
-    cube2d, state, increments_rel, min_increments, state_expected
-):
-    env = cube2d
-    # Convert to tensors
-    states = tfloat([state], float_type=env.float, device=env.device)
-    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
-    min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
-    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
-    # Get absolute increments
-    increments_abs = env.relative_to_absolute_increments(
-        states, increments_rel, min_increments, env.max_val, is_backward=False
-    )
-    states_next = states + increments_abs
-    assert torch.all(torch.isclose(states_next, states_expected))
-
-
-@pytest.mark.parametrize(
-    "state, increments_rel, min_increments, state_expected",
-    [
-        (
-            [1.0, 1.0],
-            [0.0, 0.0],
-            [0.1, 0.1],
-            [0.9, 0.9],
-        ),
-        (
-            [1.0, 1.0],
-            [1.0, 1.0],
-            [0.1, 0.1],
-            [0.0, 0.0],
-        ),
-        (
-            [1.0, 1.0],
-            [0.1794, 0.9589],
-            [0.1, 0.1],
-            [0.73854, 0.03699],
-        ),
-        (
-            [0.3, 0.5],
-            [0.0, 0.0],
-            [0.1, 0.1],
-            [0.2, 0.4],
-        ),
-        (
-            [0.3, 0.5],
-            [1.0, 1.0],
-            [0.1, 0.1],
-            [0.0, 0.0],
-        ),
-    ],
-)
-def test__relative_to_absolute_increments__2d_backward__returns_expected(
-    cube2d, state, increments_rel, min_increments, state_expected
-):
-    env = cube2d
-    # Convert to tensors
-    states = tfloat([state], float_type=env.float, device=env.device)
-    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
-    min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
-    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
-    # Get absolute increments
-    increments_abs = env.relative_to_absolute_increments(
-        states, increments_rel, min_increments, env.max_val, is_backward=True
-    )
-    states_next = states - increments_abs
-    assert torch.all(torch.isclose(states_next, states_expected))
-
-
-@pytest.mark.parametrize(
-    "state, action, state_expected",
-    [
-        (
-            [0.0, 0.0],
-            (0.5, 0.5),
-            [0.5, 0.5],
-        ),
-        (
-            [0.0, 0.0],
-            (0.0, 0.0),
-            [0.0, 0.0],
-        ),
-        (
-            [0.0, 0.0],
-            (0.1794, 0.9589),
-            [0.1794, 0.9589],
-        ),
-        (
-            [0.3, 0.5],
-            (0.1, 0.1),
-            [0.4, 0.6],
-        ),
-        (
-            [0.3, 0.5],
-            (0.7, 0.5),
-            [1.0, 1.0],
-        ),
-        (
-            [0.3, 0.5],
-            (0.4, 0.3),
-            [0.7, 0.8],
-        ),
-        (
-            [0.27, 0.85],
-            (0.1756, 0.138),
-            [0.4456, 0.988],
-        ),
-        (
-            [0.27, 0.95],
-            (0.1756, 0.0),
-            [0.4456, 0.95],
-        ),
-        (
-            [0.95, 0.27],
-            (0.0, 0.1756),
-            [0.95, 0.4456],
-        ),
-    ],
-)
-def test__step_forward__2d__returns_expected(cube2d, state, action, state_expected):
-    env = cube2d
-    env.set_state(state)
-    state_new, action, valid = env.step(action)
-    assert env.isclose(state_new, state_expected)
-
-
-@pytest.mark.parametrize(
-    "states, force_eos",
-    [
-        (
-            [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-            [False, False, False, True, False],
-        ),
-        (
-            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
-            [False, True, True, False, False],
-        ),
-        (
-            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
-            [False, False, False, True, True],
-        ),
-    ],
-)
-def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos):
-    env = cube2d
-    n_states = len(states)
-    force_eos = tbool(force_eos, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-    )
-    # Define Beta distribution with low variance and get confident range
-    n_samples = 10000
-    beta_params_min = 0.0
-    beta_params_max = 10000
-    alpha = 10
-    alphas_presigmoid = alpha * torch.ones(n_samples)
-    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
-    beta = 1.0
-    betas_presigmoid = beta * torch.ones(n_samples)
-    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
-    beta_distr = Beta(alphas, betas)
-    samples = beta_distr.sample()
-    mean_incr_rel = 0.9 * samples.mean()
-    min_incr_rel = 0.9 * samples.min()
-    max_incr_rel = 1.1 * samples.max()
-    # Define Bernoulli parameters for EOS with deterministic probability
-    logit_force_eos = torch.inf
-    logit_force_noeos = -torch.inf
-    # Estimate confident intervals of absolute actions
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    is_source = torch.all(states_torch == 0.0, dim=1)
-    is_near_edge = states_torch > 1.0 - env.min_incr
-    min_increments = torch.full_like(
-        states_torch, env.min_incr, dtype=env.float, device=env.device
-    )
-    min_increments[is_source, :] = 0.0
-    increments_rel_min = torch.full_like(
-        states_torch, min_incr_rel, dtype=env.float, device=env.device
-    )
-    increments_rel_max = torch.full_like(
-        states_torch, max_incr_rel, dtype=env.float, device=env.device
-    )
-    increments_abs_min = env.relative_to_absolute_increments(
-        states_torch, increments_rel_min, min_increments, env.max_val, is_backward=False
-    )
-    increments_abs_max = env.relative_to_absolute_increments(
-        states_torch, increments_rel_max, min_increments, env.max_val, is_backward=False
-    )
-    # Get EOS actions
-    is_eos_forced = torch.any(is_near_edge, dim=1)
-    is_eos = torch.logical_or(is_eos_forced, force_eos)
-    increments_abs_min[is_eos] = torch.inf
-    increments_abs_max[is_eos] = torch.inf
-    # Reconfigure environment
-    env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["beta_alpha"] = alpha
-    params["beta_beta"] = beta
-    params["bernoulli_eos_logit"] = logit_force_noeos
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    policy_outputs[force_eos, -1] = logit_force_eos
-    # Sample actions
-    actions, _ = env.sample_actions_batch(
-        policy_outputs, masks, states, is_backward=False
-    )
-    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-    actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
-    assert torch.all(actions_eos == is_eos)
-    assert torch.all(actions_tensor >= increments_abs_min)
-    assert torch.all(actions_tensor <= increments_abs_max)
-
-
-@pytest.mark.parametrize(
-    "states, force_bst",
-    [
-        (
-            [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.05], [0.16, 0.93]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
-            [False, False, False, False, False],
-        ),
-        (
-            [[0.0001, 0.0], [0.001, 0.01], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-            [False, False, False, True, False],
-        ),
-        (
-            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [1.0, 1.0], [0.16, 0.93]],
-            [False, True, True, True, False],
-        ),
-        (
-            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
-            [False, False, False, True, True],
-        ),
-    ],
-)
-def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
-    env = cube2d
-    n_states = len(states)
-    force_bst = tbool(force_bst, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-    )
-    # Define Beta distribution with low variance and get confident range
-    n_samples = 10000
-    beta_params_min = 0.0
-    beta_params_max = 10000
-    alpha = 10
-    alphas_presigmoid = alpha * torch.ones(n_samples)
-    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
-    beta = 1.0
-    betas_presigmoid = beta * torch.ones(n_samples)
-    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
-    beta_distr = Beta(alphas, betas)
-    samples = beta_distr.sample()
-    mean_incr_rel = 0.9 * samples.mean()
-    min_incr_rel = 0.9 * samples.min()
-    max_incr_rel = 1.1 * samples.max()
-    # Define Bernoulli parameters for BST with deterministic probability
-    logit_force_bst = torch.inf
-    logit_force_nobst = -torch.inf
-    # Estimate confident intervals of absolute actions
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    is_near_edge = states_torch < env.min_incr
-    min_increments = torch.full_like(
-        states_torch, env.min_incr, dtype=env.float, device=env.device
-    )
-    increments_rel_min = torch.full_like(
-        states_torch, min_incr_rel, dtype=env.float, device=env.device
-    )
-    increments_rel_max = torch.full_like(
-        states_torch, max_incr_rel, dtype=env.float, device=env.device
-    )
-    increments_abs_min = env.relative_to_absolute_increments(
-        states_torch, increments_rel_min, min_increments, env.max_val, is_backward=True
-    )
-    increments_abs_max = env.relative_to_absolute_increments(
-        states_torch, increments_rel_max, min_increments, env.max_val, is_backward=True
-    )
-    # Get BST actions
-    is_bst_forced = torch.any(is_near_edge, dim=1)
-    is_bst = torch.logical_or(is_bst_forced, force_bst)
-    increments_abs_min[is_bst] = states_torch[is_bst]
-    increments_abs_max[is_bst] = states_torch[is_bst]
-    # Reconfigure environment
-    env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["beta_alpha"] = alpha
-    params["beta_beta"] = beta
-    params["bernoulli_source_logit"] = logit_force_nobst
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    policy_outputs[force_bst, -2] = logit_force_bst
-    # Sample actions
-    actions, _ = env.sample_actions_batch(
-        policy_outputs, masks, states, is_backward=True
-    )
-    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-    actions_bst = torch.all(actions_tensor == states_torch, dim=1)
-    assert torch.all(actions_bst == is_bst)
-    assert torch.all(actions_tensor >= increments_abs_min)
-    assert torch.all(actions_tensor <= increments_abs_max)
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
-            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-        ),
-        (
-            [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
-            [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
-        ),
-    ],
-)
-def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actions):
-    """
-    The only valid action from 'near-edge' states is EOS, thus the the log probability
-    should be zero, regardless of the action and the policy outputs
-    """
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-    )
-    # Build policy outputs
-    params = env.fixed_distr_params
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Add noise to policy outputs
-    policy_outputs += torch.randn(policy_outputs.shape)
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=False
-    )
-    assert torch.all(logprobs == 0.0)
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
-        ),
-        (
-            [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
-            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
-        ),
-    ],
-)
-def test__get_logprobs_forward__2d__eos_actions_return_expected(
-    cube2d, states, actions
-):
-    """
-    The only valid action from 'near-edge' states is EOS, thus the the log probability
-    should be zero, regardless of the action and the policy outputs
-    """
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-    )
-    # Get EOS forced
-    is_near_edge = states_torch > 1.0 - env.min_incr
-    is_eos_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for EOS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
-    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=False
-    )
-    assert torch.all(logprobs[is_eos_forced] == 0.0)
-    assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
-
-
-@pytest.mark.parametrize(
-    "actions",
-    [
-        [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-        [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
-        [[0.0, 0.0], [1.0, 1.0]],
-    ],
-)
-def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
-    cube2d, actions
-):
-    """
-    With Uniform increment policy, all the actions from the source must have the same
-    probability.
-    """
-    env = cube2d
-    n_states = len(actions)
-    states = [env.source for _ in range(n_states)]
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-    )
-    # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
-    beta_params_min = 0.0
-    beta_params_max = 1.0
-    alpha_presigmoid = 1000.0
-    betas_presigmoid = 1000.0
-    # Define Bernoulli parameter for impossible EOS
-    # If Bernouilli has logit -torch.inf, the logprobs are nan
-    logit_force_noeos = -1000
-    # Reconfigure environment
-    env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["beta_alpha"] = alpha_presigmoid
-    params["beta_beta"] = betas_presigmoid
-    params["bernoulli_eos_logit"] = logit_force_noeos
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=False
-    )
-    assert torch.all(logprobs == 0.0)
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
-            [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
-        ),
-        (
-            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
-            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
-        ),
-        (
-            [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
-            [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
-        ),
-    ],
-)
-def test__get_logprobs_forward__2d__notnan(cube2d, states, actions):
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-    )
-    # Get EOS forced
-    is_near_edge = states_torch > 1.0 - env.min_incr
-    is_eos_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for EOS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
-    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=False
-    )
-    assert torch.all(logprobs[is_eos_forced] == 0.0)
-    assert torch.all(torch.isfinite(logprobs))
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-        ),
-        (
-            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-        ),
-    ],
-)
-def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
-    """
-    The only valid backward action from 'near-edge' states is BTS, thus the the log
-    probability should be zero.
-    """
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-    )
-    # Build policy outputs
-    params = env.fixed_distr_params
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Add noise to policy outputs
-    policy_outputs += torch.randn(policy_outputs.shape)
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=True
-    )
-    assert torch.all(logprobs == 0.0)
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-        ),
-        (
-            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-        ),
-        (
-            [[1.0, 1.0], [0.0, 0.0]],
-            [[1.0, 1.0], [0.0, 0.0]],
-        ),
-    ],
-)
-def test__get_logprobs_backward__2d__bts_actions_return_expected(
-    cube2d, states, actions
-):
-    """
-    The only valid action from 'near-edge' states is EOS, thus the the log probability
-    should be zero, regardless of the action and the policy outputs
-    """
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-    )
-    # Get BTS forced
-    is_near_edge = states_torch < env.min_incr
-    is_bts_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for BTS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_bts = 1
-    distr_bts = Bernoulli(logits=logit_bts)
-    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["bernoulli_source_logit"] = logit_bts
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=True
-    )
-    assert torch.all(logprobs[is_bts_forced] == 0.0)
-    assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
-
-
-@pytest.mark.parametrize(
-    "states, actions",
-    [
-        (
-            [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
-        ),
-        (
-            [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
-        ),
-        (
-            [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
-            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
-        ),
-    ],
-)
-def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
-    env = cube2d
-    n_states = len(states)
-    states_torch = tfloat(states, float_type=env.float, device=env.device)
-    actions = tfloat(actions, float_type=env.float, device=env.device)
-    # Get masks
-    masks = tbool(
-        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-    )
-    # Get BTS forced
-    is_near_edge = states_torch < env.min_incr
-    is_bts_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for BTS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_bts = 1
-    distr_bts = Bernoulli(logits=logit_bts)
-    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
-    # Build policy outputs
-    params = env.fixed_distr_params
-    params["bernoulli_source_logit"] = logit_bts
-    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    # Get log probs
-    logprobs = env.get_logprobs(
-        policy_outputs, actions, masks, states_torch, is_backward=True
-    )
-    assert torch.all(logprobs[is_bts_forced] == 0.0)
-    assert torch.all(torch.isfinite(logprobs))
-
-
-@pytest.mark.parametrize(
-    "state, expected",
-    [
-        (
-            [0.0, 0.0],
-            [0.0, 0.0],
-        ),
-        (
-            [1.0, 1.0],
-            [1.0, 1.0],
-        ),
-        (
-            [1.1, 1.00001],
-            [1.0, 1.0],
-        ),
-        (
-            [-0.1, 1.00001],
-            [0.0, 1.0],
-        ),
-        (
-            [0.1, 0.21],
-            [0.1, 0.21],
-        ),
-    ],
-)
-@pytest.mark.skip(reason="skip while developping other tests")
-def test__state2policy_returns_expected(env, state, expected):
-    assert env.state2policy(state) == expected
-
-
-@pytest.mark.parametrize(
-    "states, expected",
-    [
-        (
-            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
-            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
-        ),
-    ],
-)
-@pytest.mark.skip(reason="skip while developping other tests")
-def test__statetorch2policy_returns_expected(env, states, expected):
-    assert torch.equal(
-        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
-    )
-
-
-@pytest.mark.parametrize(
-    "state, expected",
-    [
-        (
-            [0.0, 0.0],
-            [True, False, False],
-        ),
-        (
-            [0.1, 0.1],
-            [False, True, False],
-        ),
-        (
-            [1.0, 0.0],
-            [False, True, False],
-        ),
-        (
-            [1.1, 0.0],
-            [True, True, False],
-        ),
-        (
-            [0.1, 1.1],
-            [True, True, False],
-        ),
-    ],
-)
-@pytest.mark.skip(reason="skip while developping other tests")
-def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
-    assert env.get_mask_invalid_actions_forward(state) == expected, print(
-        state, expected, env.get_mask_invalid_actions_forward(state)
-    )
-
-
-@pytest.mark.skip(reason="skip while developping other tests")
-def test__continuous_env_common__cube1d(cube1d):
-    return common.test__continuous_env_common(cube1d)
-
-
-def test__continuous_env_common__cube2d(cube2d):
-    return common.test__continuous_env_common(cube2d)
+# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+# def test__get_policy_output__fixed_as_expected(env, request):
+#     env = request.getfixturevalue(env)
+#     policy_outputs = torch.unsqueeze(env.fixed_policy_output, 0)
+#     params = env.fixed_distr_params
+#     policy_output__as_expected(env, policy_outputs, params)
+# 
+# 
+# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+# def test__get_policy_output__random_as_expected(env, request):
+#     env = request.getfixturevalue(env)
+#     policy_outputs = torch.unsqueeze(env.random_policy_output, 0)
+#     params = env.random_distr_params
+#     policy_output__as_expected(env, policy_outputs, params)
+# 
+# 
+# def policy_output__as_expected(env, policy_outputs, params):
+#     assert torch.all(
+#         env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
+#     )
+#     assert torch.all(
+#         env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
+#     )
+#     assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
+#     assert torch.all(
+#         env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
+#     )
+#     assert torch.all(
+#         env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
+#     )
+# 
+# 
+# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+# def test__mask_forward__returns_all_true_if_done(env, request):
+#     env = request.getfixturevalue(env)
+#     # Sample states
+#     states = env.get_uniform_terminating_states(100)
+#     # Iterate over state and test
+#     for state in states:
+#         env.set_state(state, done=True)
+#         mask = env.get_mask_invalid_actions_forward()
+#         assert all(mask)
+# 
+# 
+# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+# def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
+#     env = request.getfixturevalue(env)
+#     # Sample states
+#     states = env.get_uniform_terminating_states(100)
+#     # Iterate over state and test
+#     for state in states:
+#         env.set_state(state, done=True)
+#         mask = env.get_mask_invalid_actions_backward()
+#         assert all(mask[:-1])
+#         assert mask[-1] is False
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, mask_expected",
+#     [
+#         (
+#             [0.0],
+#             [False, False, True],
+#         ),
+#         (
+#             [0.5],
+#             [False, True, False],
+#         ),
+#         (
+#             [0.90],
+#             [False, True, False],
+#         ),
+#         (
+#             [0.95],
+#             [True, True, False],
+#         ),
+#     ],
+# )
+# def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
+#     env = cube1d
+#     mask = env.get_mask_invalid_actions_forward(state)
+#     assert mask == mask_expected
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, mask_expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             [False, False, True],
+#         ),
+#         (
+#             [0.5, 0.5],
+#             [False, True, False],
+#         ),
+#         (
+#             [0.90, 0.5],
+#             [False, True, False],
+#         ),
+#         (
+#             [0.95, 0.5],
+#             [True, True, False],
+#         ),
+#         (
+#             [0.5, 0.90],
+#             [False, True, False],
+#         ),
+#         (
+#             [0.5, 0.95],
+#             [True, True, False],
+#         ),
+#         (
+#             [0.95, 0.95],
+#             [True, True, False],
+#         ),
+#     ],
+# )
+# def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
+#     env = cube2d
+#     mask = env.get_mask_invalid_actions_forward(state)
+#     assert mask == mask_expected
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, mask_expected",
+#     [
+#         (
+#             [0.0],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.1],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.05],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.5],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.90],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.95],
+#             [False, True, True],
+#         ),
+#     ],
+# )
+# def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
+#     env = cube1d
+#     mask = env.get_mask_invalid_actions_backward(state)
+#     assert mask == mask_expected
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, mask_expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.5, 0.5],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.05, 0.5],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.5, 0.05],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.05, 0.05],
+#             [True, False, True],
+#         ),
+#         (
+#             [0.90, 0.5],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.5, 0.90],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.95, 0.5],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.5, 0.95],
+#             [False, True, True],
+#         ),
+#         (
+#             [0.95, 0.95],
+#             [False, True, True],
+#         ),
+#     ],
+# )
+# def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
+#     env = cube2d
+#     mask = env.get_mask_invalid_actions_backward(state)
+#     assert mask == mask_expected
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, increments_rel, min_increments, state_expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             [0.5, 0.5],
+#             [0.0, 0.0],
+#             [0.5, 0.5],
+#         ),
+#         (
+#             [0.0, 0.0],
+#             [0.0, 0.0],
+#             [0.0, 0.0],
+#             [0.0, 0.0],
+#         ),
+#         (
+#             [0.0, 0.0],
+#             [0.1794, 0.9589],
+#             [0.0, 0.0],
+#             [0.1794, 0.9589],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             [0.0, 0.0],
+#             [0.1, 0.1],
+#             [0.4, 0.6],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             [1.0, 1.0],
+#             [0.1, 0.1],
+#             [1.0, 1.0],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             [0.5, 0.5],
+#             [0.1, 0.1],
+#             [0.7, 0.8],
+#         ),
+#         (
+#             [0.27, 0.85],
+#             [0.12, 0.76],
+#             [0.1, 0.1],
+#             [0.4456, 0.988],
+#         ),
+#         (
+#             [0.27, 0.95],
+#             [0.12, 0.0],
+#             [0.1, 0.0],
+#             [0.4456, 0.95],
+#         ),
+#         (
+#             [0.95, 0.27],
+#             [0.0, 0.12],
+#             [0.0, 0.1],
+#             [0.95, 0.4456],
+#         ),
+#     ],
+# )
+# def test__relative_to_absolute_increments__2d_forward__returns_expected(
+#     cube2d, state, increments_rel, min_increments, state_expected
+# ):
+#     env = cube2d
+#     # Convert to tensors
+#     states = tfloat([state], float_type=env.float, device=env.device)
+#     increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+#     min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
+#     states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+#     # Get absolute increments
+#     increments_abs = env.relative_to_absolute_increments(
+#         states, increments_rel, min_increments, env.max_val, is_backward=False
+#     )
+#     states_next = states + increments_abs
+#     assert torch.all(torch.isclose(states_next, states_expected))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, increments_rel, min_increments, state_expected",
+#     [
+#         (
+#             [1.0, 1.0],
+#             [0.0, 0.0],
+#             [0.1, 0.1],
+#             [0.9, 0.9],
+#         ),
+#         (
+#             [1.0, 1.0],
+#             [1.0, 1.0],
+#             [0.1, 0.1],
+#             [0.0, 0.0],
+#         ),
+#         (
+#             [1.0, 1.0],
+#             [0.1794, 0.9589],
+#             [0.1, 0.1],
+#             [0.73854, 0.03699],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             [0.0, 0.0],
+#             [0.1, 0.1],
+#             [0.2, 0.4],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             [1.0, 1.0],
+#             [0.1, 0.1],
+#             [0.0, 0.0],
+#         ),
+#     ],
+# )
+# def test__relative_to_absolute_increments__2d_backward__returns_expected(
+#     cube2d, state, increments_rel, min_increments, state_expected
+# ):
+#     env = cube2d
+#     # Convert to tensors
+#     states = tfloat([state], float_type=env.float, device=env.device)
+#     increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+#     min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
+#     states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+#     # Get absolute increments
+#     increments_abs = env.relative_to_absolute_increments(
+#         states, increments_rel, min_increments, env.max_val, is_backward=True
+#     )
+#     states_next = states - increments_abs
+#     assert torch.all(torch.isclose(states_next, states_expected))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, action, state_expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             (0.5, 0.5),
+#             [0.5, 0.5],
+#         ),
+#         (
+#             [0.0, 0.0],
+#             (0.0, 0.0),
+#             [0.0, 0.0],
+#         ),
+#         (
+#             [0.0, 0.0],
+#             (0.1794, 0.9589),
+#             [0.1794, 0.9589],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             (0.1, 0.1),
+#             [0.4, 0.6],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             (0.7, 0.5),
+#             [1.0, 1.0],
+#         ),
+#         (
+#             [0.3, 0.5],
+#             (0.4, 0.3),
+#             [0.7, 0.8],
+#         ),
+#         (
+#             [0.27, 0.85],
+#             (0.1756, 0.138),
+#             [0.4456, 0.988],
+#         ),
+#         (
+#             [0.27, 0.95],
+#             (0.1756, 0.0),
+#             [0.4456, 0.95],
+#         ),
+#         (
+#             [0.95, 0.27],
+#             (0.0, 0.1756),
+#             [0.95, 0.4456],
+#         ),
+#     ],
+# )
+# def test__step_forward__2d__returns_expected(cube2d, state, action, state_expected):
+#     env = cube2d
+#     env.set_state(state)
+#     state_new, action, valid = env.step(action)
+#     assert env.isclose(state_new, state_expected)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, force_eos",
+#     [
+#         (
+#             [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+#             [False, False, False, True, False],
+#         ),
+#         (
+#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+#             [False, True, True, False, False],
+#         ),
+#         (
+#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+#             [False, False, False, True, True],
+#         ),
+#     ],
+# )
+# def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos):
+#     env = cube2d
+#     n_states = len(states)
+#     force_eos = tbool(force_eos, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+#     )
+#     # Define Beta distribution with low variance and get confident range
+#     n_samples = 10000
+#     beta_params_min = 0.0
+#     beta_params_max = 10000
+#     alpha = 10
+#     alphas_presigmoid = alpha * torch.ones(n_samples)
+#     alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+#     beta = 1.0
+#     betas_presigmoid = beta * torch.ones(n_samples)
+#     betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+#     beta_distr = Beta(alphas, betas)
+#     samples = beta_distr.sample()
+#     mean_incr_rel = 0.9 * samples.mean()
+#     min_incr_rel = 0.9 * samples.min()
+#     max_incr_rel = 1.1 * samples.max()
+#     # Define Bernoulli parameters for EOS with deterministic probability
+#     logit_force_eos = torch.inf
+#     logit_force_noeos = -torch.inf
+#     # Estimate confident intervals of absolute actions
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     is_source = torch.all(states_torch == 0.0, dim=1)
+#     is_near_edge = states_torch > 1.0 - env.min_incr
+#     min_increments = torch.full_like(
+#         states_torch, env.min_incr, dtype=env.float, device=env.device
+#     )
+#     min_increments[is_source, :] = 0.0
+#     increments_rel_min = torch.full_like(
+#         states_torch, min_incr_rel, dtype=env.float, device=env.device
+#     )
+#     increments_rel_max = torch.full_like(
+#         states_torch, max_incr_rel, dtype=env.float, device=env.device
+#     )
+#     increments_abs_min = env.relative_to_absolute_increments(
+#         states_torch, increments_rel_min, min_increments, env.max_val, is_backward=False
+#     )
+#     increments_abs_max = env.relative_to_absolute_increments(
+#         states_torch, increments_rel_max, min_increments, env.max_val, is_backward=False
+#     )
+#     # Get EOS actions
+#     is_eos_forced = torch.any(is_near_edge, dim=1)
+#     is_eos = torch.logical_or(is_eos_forced, force_eos)
+#     increments_abs_min[is_eos] = torch.inf
+#     increments_abs_max[is_eos] = torch.inf
+#     # Reconfigure environment
+#     env.n_comp = 1
+#     env.beta_params_min = beta_params_min
+#     env.beta_params_max = beta_params_max
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["beta_alpha"] = alpha
+#     params["beta_beta"] = beta
+#     params["bernoulli_eos_logit"] = logit_force_noeos
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     policy_outputs[force_eos, -1] = logit_force_eos
+#     # Sample actions
+#     actions, _ = env.sample_actions_batch(
+#         policy_outputs, masks, states, is_backward=False
+#     )
+#     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+#     actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
+#     assert torch.all(actions_eos == is_eos)
+#     assert torch.all(actions_tensor >= increments_abs_min)
+#     assert torch.all(actions_tensor <= increments_abs_max)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, force_bst",
+#     [
+#         (
+#             [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.05], [0.16, 0.93]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+#             [False, False, False, False, False],
+#         ),
+#         (
+#             [[0.0001, 0.0], [0.001, 0.01], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+#             [False, False, False, True, False],
+#         ),
+#         (
+#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [1.0, 1.0], [0.16, 0.93]],
+#             [False, True, True, True, False],
+#         ),
+#         (
+#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+#             [False, False, False, True, True],
+#         ),
+#     ],
+# )
+# def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
+#     env = cube2d
+#     n_states = len(states)
+#     force_bst = tbool(force_bst, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+#     )
+#     # Define Beta distribution with low variance and get confident range
+#     n_samples = 10000
+#     beta_params_min = 0.0
+#     beta_params_max = 10000
+#     alpha = 10
+#     alphas_presigmoid = alpha * torch.ones(n_samples)
+#     alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+#     beta = 1.0
+#     betas_presigmoid = beta * torch.ones(n_samples)
+#     betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+#     beta_distr = Beta(alphas, betas)
+#     samples = beta_distr.sample()
+#     mean_incr_rel = 0.9 * samples.mean()
+#     min_incr_rel = 0.9 * samples.min()
+#     max_incr_rel = 1.1 * samples.max()
+#     # Define Bernoulli parameters for BST with deterministic probability
+#     logit_force_bst = torch.inf
+#     logit_force_nobst = -torch.inf
+#     # Estimate confident intervals of absolute actions
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     is_near_edge = states_torch < env.min_incr
+#     min_increments = torch.full_like(
+#         states_torch, env.min_incr, dtype=env.float, device=env.device
+#     )
+#     increments_rel_min = torch.full_like(
+#         states_torch, min_incr_rel, dtype=env.float, device=env.device
+#     )
+#     increments_rel_max = torch.full_like(
+#         states_torch, max_incr_rel, dtype=env.float, device=env.device
+#     )
+#     increments_abs_min = env.relative_to_absolute_increments(
+#         states_torch, increments_rel_min, min_increments, env.max_val, is_backward=True
+#     )
+#     increments_abs_max = env.relative_to_absolute_increments(
+#         states_torch, increments_rel_max, min_increments, env.max_val, is_backward=True
+#     )
+#     # Get BST actions
+#     is_bst_forced = torch.any(is_near_edge, dim=1)
+#     is_bst = torch.logical_or(is_bst_forced, force_bst)
+#     increments_abs_min[is_bst] = states_torch[is_bst]
+#     increments_abs_max[is_bst] = states_torch[is_bst]
+#     # Reconfigure environment
+#     env.n_comp = 1
+#     env.beta_params_min = beta_params_min
+#     env.beta_params_max = beta_params_max
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["beta_alpha"] = alpha
+#     params["beta_beta"] = beta
+#     params["bernoulli_source_logit"] = logit_force_nobst
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     policy_outputs[force_bst, -2] = logit_force_bst
+#     # Sample actions
+#     actions, _ = env.sample_actions_batch(
+#         policy_outputs, masks, states, is_backward=True
+#     )
+#     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+#     actions_bst = torch.all(actions_tensor == states_torch, dim=1)
+#     assert torch.all(actions_bst == is_bst)
+#     assert torch.all(actions_tensor >= increments_abs_min)
+#     assert torch.all(actions_tensor <= increments_abs_max)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
+#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+#         ),
+#         (
+#             [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
+#             [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actions):
+#     """
+#     The only valid action from 'near-edge' states is EOS, thus the the log probability
+#     should be zero, regardless of the action and the policy outputs
+#     """
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+#     )
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Add noise to policy outputs
+#     policy_outputs += torch.randn(policy_outputs.shape)
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=False
+#     )
+#     assert torch.all(logprobs == 0.0)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+#             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+#         ),
+#         (
+#             [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
+#             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_forward__2d__eos_actions_return_expected(
+#     cube2d, states, actions
+# ):
+#     """
+#     The only valid action from 'near-edge' states is EOS, thus the the log probability
+#     should be zero, regardless of the action and the policy outputs
+#     """
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+#     )
+#     # Get EOS forced
+#     is_near_edge = states_torch > 1.0 - env.min_incr
+#     is_eos_forced = torch.any(is_near_edge, dim=1)
+#     # Define Bernoulli parameter for EOS
+#     # If Bernouilli has logit torch.inf, the logprobs are nan
+#     logit_eos = 1
+#     distr_eos = Bernoulli(logits=logit_eos)
+#     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["bernoulli_eos_logit"] = logit_eos
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=False
+#     )
+#     assert torch.all(logprobs[is_eos_forced] == 0.0)
+#     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "actions",
+#     [
+#         [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+#         [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
+#         [[0.0, 0.0], [1.0, 1.0]],
+#     ],
+# )
+# def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
+#     cube2d, actions
+# ):
+#     """
+#     With Uniform increment policy, all the actions from the source must have the same
+#     probability.
+#     """
+#     env = cube2d
+#     n_states = len(actions)
+#     states = [env.source for _ in range(n_states)]
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+#     )
+#     # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
+#     beta_params_min = 0.0
+#     beta_params_max = 1.0
+#     alpha_presigmoid = 1000.0
+#     betas_presigmoid = 1000.0
+#     # Define Bernoulli parameter for impossible EOS
+#     # If Bernouilli has logit -torch.inf, the logprobs are nan
+#     logit_force_noeos = -1000
+#     # Reconfigure environment
+#     env.n_comp = 1
+#     env.beta_params_min = beta_params_min
+#     env.beta_params_max = beta_params_max
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["beta_alpha"] = alpha_presigmoid
+#     params["beta_beta"] = betas_presigmoid
+#     params["bernoulli_eos_logit"] = logit_force_noeos
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=False
+#     )
+#     assert torch.all(logprobs == 0.0)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
+#             [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
+#         ),
+#         (
+#             [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
+#             [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+#         ),
+#         (
+#             [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
+#             [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_forward__2d__notnan(cube2d, states, actions):
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+#     )
+#     # Get EOS forced
+#     is_near_edge = states_torch > 1.0 - env.min_incr
+#     is_eos_forced = torch.any(is_near_edge, dim=1)
+#     # Define Bernoulli parameter for EOS
+#     # If Bernouilli has logit torch.inf, the logprobs are nan
+#     logit_eos = 1
+#     distr_eos = Bernoulli(logits=logit_eos)
+#     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["bernoulli_eos_logit"] = logit_eos
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=False
+#     )
+#     assert torch.all(logprobs[is_eos_forced] == 0.0)
+#     assert torch.all(torch.isfinite(logprobs))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+#         ),
+#         (
+#             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+#             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
+#     """
+#     The only valid backward action from 'near-edge' states is BTS, thus the the log
+#     probability should be zero.
+#     """
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+#     )
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Add noise to policy outputs
+#     policy_outputs += torch.randn(policy_outputs.shape)
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=True
+#     )
+#     assert torch.all(logprobs == 0.0)
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+#         ),
+#         (
+#             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+#             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+#         ),
+#         (
+#             [[1.0, 1.0], [0.0, 0.0]],
+#             [[1.0, 1.0], [0.0, 0.0]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_backward__2d__bts_actions_return_expected(
+#     cube2d, states, actions
+# ):
+#     """
+#     The only valid action from 'near-edge' states is EOS, thus the the log probability
+#     should be zero, regardless of the action and the policy outputs
+#     """
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+#     )
+#     # Get BTS forced
+#     is_near_edge = states_torch < env.min_incr
+#     is_bts_forced = torch.any(is_near_edge, dim=1)
+#     # Define Bernoulli parameter for BTS
+#     # If Bernouilli has logit torch.inf, the logprobs are nan
+#     logit_bts = 1
+#     distr_bts = Bernoulli(logits=logit_bts)
+#     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["bernoulli_source_logit"] = logit_bts
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=True
+#     )
+#     assert torch.all(logprobs[is_bts_forced] == 0.0)
+#     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, actions",
+#     [
+#         (
+#             [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
+#             [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+#         ),
+#         (
+#             [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
+#             [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+#         ),
+#         (
+#             [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
+#             [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+#         ),
+#     ],
+# )
+# def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
+#     env = cube2d
+#     n_states = len(states)
+#     states_torch = tfloat(states, float_type=env.float, device=env.device)
+#     actions = tfloat(actions, float_type=env.float, device=env.device)
+#     # Get masks
+#     masks = tbool(
+#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+#     )
+#     # Get BTS forced
+#     is_near_edge = states_torch < env.min_incr
+#     is_bts_forced = torch.any(is_near_edge, dim=1)
+#     # Define Bernoulli parameter for BTS
+#     # If Bernouilli has logit torch.inf, the logprobs are nan
+#     logit_bts = 1
+#     distr_bts = Bernoulli(logits=logit_bts)
+#     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+#     # Build policy outputs
+#     params = env.fixed_distr_params
+#     params["bernoulli_source_logit"] = logit_bts
+#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+#     # Get log probs
+#     logprobs = env.get_logprobs(
+#         policy_outputs, actions, masks, states_torch, is_backward=True
+#     )
+#     assert torch.all(logprobs[is_bts_forced] == 0.0)
+#     assert torch.all(torch.isfinite(logprobs))
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             [0.0, 0.0],
+#         ),
+#         (
+#             [1.0, 1.0],
+#             [1.0, 1.0],
+#         ),
+#         (
+#             [1.1, 1.00001],
+#             [1.0, 1.0],
+#         ),
+#         (
+#             [-0.1, 1.00001],
+#             [0.0, 1.0],
+#         ),
+#         (
+#             [0.1, 0.21],
+#             [0.1, 0.21],
+#         ),
+#     ],
+# )
+# @pytest.mark.skip(reason="skip while developping other tests")
+# def test__state2policy_returns_expected(env, state, expected):
+#     assert env.state2policy(state) == expected
+# 
+# 
+# @pytest.mark.parametrize(
+#     "states, expected",
+#     [
+#         (
+#             [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
+#             [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
+#         ),
+#     ],
+# )
+# @pytest.mark.skip(reason="skip while developping other tests")
+# def test__statetorch2policy_returns_expected(env, states, expected):
+#     assert torch.equal(
+#         env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
+#     )
+# 
+# 
+# @pytest.mark.parametrize(
+#     "state, expected",
+#     [
+#         (
+#             [0.0, 0.0],
+#             [True, False, False],
+#         ),
+#         (
+#             [0.1, 0.1],
+#             [False, True, False],
+#         ),
+#         (
+#             [1.0, 0.0],
+#             [False, True, False],
+#         ),
+#         (
+#             [1.1, 0.0],
+#             [True, True, False],
+#         ),
+#         (
+#             [0.1, 1.1],
+#             [True, True, False],
+#         ),
+#     ],
+# )
+# @pytest.mark.skip(reason="skip while developping other tests")
+# def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
+#     assert env.get_mask_invalid_actions_forward(state) == expected, print(
+#         state, expected, env.get_mask_invalid_actions_forward(state)
+#     )
+# 
+# 
+# @pytest.mark.skip(reason="skip while developping other tests")
+# def test__continuous_env_common__cube1d(cube1d):
+#     return common.test__continuous_env_common(cube1d)
+# 
+# 
+# def test__continuous_env_common__cube2d(cube2d):
+#     return common.test__continuous_env_common(cube2d)

From d42b37e64ff52d3e5a5a075ce1d7e1470d6213d9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 17:52:33 -0400
Subject: [PATCH 149/206] staticmethods are not so anymore.

---
 gflownet/envs/cube.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 183f0cab8..7e7ef3df1 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -484,6 +484,9 @@ def get_mask_invalid_actions_backward(self, state=None, done=None, parents_a=Non
         done = self._get_done(done)
         mask_dim = 3
         mask = [True] * mask_dim
+        # If the state is the source state, entire mask is True
+        if state == self.source:
+            return mask
         # If done, only valid action is EOS.
         if done:
             mask[2] = False
@@ -506,9 +509,8 @@ def get_parents(
         """
         pass
 
-    # TODO: rethink if not necessary from source
-    @staticmethod
     def relative_to_absolute_increments(
+        self,
         states: TensorType["n_states", "n_dim"],
         increments_rel: TensorType["n_states", "n_dim"],
         is_backward: bool,
@@ -536,9 +538,8 @@ def relative_to_absolute_increments(
         else:
             return min_increments + increments_rel * (1.0 - states - min_increments)
 
-    # TODO: rethink if not necessary from source
-    @staticmethod
     def absolute_to_relative_increments(
+        self,
         states: TensorType["n_states", "n_dim"],
         increments_abs: TensorType["n_states", "n_dim"],
         is_backward: bool,
@@ -992,8 +993,8 @@ def _get_logprobs_backward(
         logprobs[is_eos] = 0.0
         return logprobs
 
-    @staticmethod
     def _get_jacobian_diag(
+        self,
         states_from: TensorType["n_states", "n_dim"],
         is_backward: bool,
     ):

From 1d73241736d55b2439ba7c7059804c32bfcbe696 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 18:13:51 -0400
Subject: [PATCH 150/206] Adapt step methods

---
 gflownet/envs/cube.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7e7ef3df1..6a9ea9f11 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1071,11 +1071,9 @@ def _step(
             if backward:
                 self.state[dim] -= incr
             else:
+                if self.state == self.source:
+                    self.state = [0.0 for _ in range(self.n_dim)]
                 self.state[dim] += incr
-        # If state is close enough to source, set source to avoid escaping comparison
-        # to source.
-        if self.isclose(self.state, self.source, atol=1e-6):
-            self.state = copy(self.source)
         if not all([s <= (1.0 + epsilon) for s in self.state]):
             import ipdb
 
@@ -1167,12 +1165,15 @@ def step_backwards(
             self.done = False
             self.n_actions += 1
             return self.state, action, True
-        # Otherwise perform action
-        else:
-            assert action != self.eos
+        if action == self.bts:
+            self.state = self.source
             self.n_actions += 1
-            self._step(action, backward=True)
             return self.state, action, True
+        # Otherwise perform action
+        assert action != self.eos
+        self.n_actions += 1
+        self._step(action, backward=True)
+        return self.state, action, True
 
     def get_grid_terminating_states(self, n_states: int) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))

From dccc14382923a51ab18cd91864ef0e76163333b4 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 20:18:41 -0400
Subject: [PATCH 151/206] Fixes, epsilon as parameter, and implement
 _get_beta_params_from_mean_variance although it is not used.

---
 gflownet/envs/cube.py | 75 +++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 6a9ea9f11..317b53db4 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -77,6 +77,8 @@ def __init__(
         self.beta_params_max = beta_params_max
         # Source state is abstract - not included in the cube: -1 for all dimensions.
         self.source = [-1 for _ in range(self.n_dim)]
+        # Small constant to clamp the inputs to the beta distribution
+        self.epsilon = 1e-6
         # Conversions: only conversions to policy are implemented and the rest are the
         # same
         self.state2proxy = self.state2policy
@@ -567,6 +569,43 @@ def absolute_to_relative_increments(
         else:
             return (increments_abs - min_increments) / (1.0 - states - min_increments)
 
+    @staticmethod
+    def _get_beta_params_from_mean_variance(
+        mean: TensorType["n_states", "n_dim_x_n_comp"],
+        variance: TensorType["n_states", "n_dim_x_n_comp"],
+    ) -> Tuple[
+        TensorType["n_states", "n_dim_x_n_comp"],
+        TensorType["n_states", "n_dim_x_n_comp"],
+    ]:
+        """
+        Calculates the alpha and beta parameters of a Beta distribution from the mean
+        and variance.
+
+        The method operates on tensors containing a batch of means and variances.
+
+        Args
+        ----
+        mean : tensor
+            A batch of means.
+
+        variance : tensor
+            A batch of variances.
+
+        Returns
+        -------
+        alpha : tensor
+            The alpha parameters for the Beta distributions as a function of the mean
+            and variances.
+
+        beta : tensor
+            The beta parameters for the Beta distributions as a function of the mean
+            and variances.
+        """
+        one_minus_mean = 1.0 - mean
+        beta = one_minus_mean * (mean * one_minus_mean - variance) / variance
+        alpha = (mean * beta) / one_minus_mean
+        return alpha, beta
+
     def _make_increments_distribution(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
@@ -888,30 +927,32 @@ def _get_logprobs_forward(
             # Compute relative increments from absolute increments if state is not
             # source
             is_relative = ~is_source[do_increments]
-            states_from_rel = tfloat(
-                states_from_tensor[do_increments],
-                float_type=self.float,
-                device=self.device,
-            )[is_relative]
-            increments[is_relative] = self.absolute_to_relative_increments(
-                states_from_rel,
-                increments,
-                is_backward=False,
-            )
+            if torch.any(is_relative):
+                states_from_rel = tfloat(
+                    states_from_tensor[do_increments],
+                    float_type=self.float,
+                    device=self.device,
+                )[is_relative]
+                increments[is_relative] = self.absolute_to_relative_increments(
+                    states_from_rel,
+                    increments[is_relative],
+                    is_backward=False,
+                )
             # Compute diagonal of the Jacobian (see _get_jacobian_diag()) if state is
             # not source
             is_relative = torch.logical_and(do_increments, ~is_source)
-            jacobian_diag[is_relative] = self._get_jacobian_diag(
-                states_from_rel,
-                is_backward=False,
-            )
+            if torch.any(is_relative):
+                jacobian_diag[is_relative] = self._get_jacobian_diag(
+                    states_from_rel,
+                    is_backward=False,
+                )
             # Get logprobs
             distr_increments = self._make_increments_distribution(
                 policy_outputs[do_increments]
             )
             # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                torch.clamp(increments, min=1e-6, max=(1 - 1e-6))
+                torch.clamp(increments, min=self.epsilon, max=(1 - self.epsilon))
             )
         # Get log determinant of the Jacobian
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)
@@ -953,7 +994,7 @@ def _get_logprobs_backward(
         do_bts = torch.logical_and(~is_bts_forced, ~is_eos)
         if torch.any(do_bts):
             is_bts_sampled = torch.zeros_like(do_bts)
-            is_bts_sampled[do_bts] = torch.all(actions[do_bts] == bts_tensor)
+            is_bts_sampled[do_bts] = torch.all(actions[do_bts] == bts_tensor, dim=1)
             is_bts[is_bts_sampled] = True
             logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
             distr_bts = Bernoulli(logits=logits_bts)
@@ -982,7 +1023,7 @@ def _get_logprobs_backward(
             )
             # Clamp because increments of 0.0 or 1.0 would yield nan
             logprobs_increments_rel[do_increments] = distr_increments.log_prob(
-                torch.clamp(increments, min=1e-6, max=(1 - 1e-6))
+                torch.clamp(increments, min=self.epsilon, max=(1 - self.epsilon))
             )
         # Get log determinant of the Jacobian
         log_det_jacobian = torch.sum(torch.log(jacobian_diag), dim=1)

From 81ffa79caa785f0ab08d6a220f0b4e2716b883f8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 20:19:26 -0400
Subject: [PATCH 152/206] Adapt tests

---
 tests/gflownet/envs/test_ccube.py | 2032 +++++++++++++++--------------
 1 file changed, 1033 insertions(+), 999 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 58109c537..45e4551bb 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -29,1004 +29,1038 @@ def cube2d():
     ],
 )
 def test__get_action_space__returns_expected(cube2d, action_space):
-    assert set(action_space) == set(env.action_space)
+    env = cube2d
+    assert action_space == env.action_space
 
 
-# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-# def test__get_policy_output__fixed_as_expected(env, request):
-#     env = request.getfixturevalue(env)
-#     policy_outputs = torch.unsqueeze(env.fixed_policy_output, 0)
-#     params = env.fixed_distr_params
-#     policy_output__as_expected(env, policy_outputs, params)
-# 
-# 
-# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-# def test__get_policy_output__random_as_expected(env, request):
-#     env = request.getfixturevalue(env)
-#     policy_outputs = torch.unsqueeze(env.random_policy_output, 0)
-#     params = env.random_distr_params
-#     policy_output__as_expected(env, policy_outputs, params)
-# 
-# 
-# def policy_output__as_expected(env, policy_outputs, params):
-#     assert torch.all(
-#         env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
-#     )
-#     assert torch.all(
-#         env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
-#     )
-#     assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
-#     assert torch.all(
-#         env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
-#     )
-#     assert torch.all(
-#         env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
-#     )
-# 
-# 
-# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-# def test__mask_forward__returns_all_true_if_done(env, request):
-#     env = request.getfixturevalue(env)
-#     # Sample states
-#     states = env.get_uniform_terminating_states(100)
-#     # Iterate over state and test
-#     for state in states:
-#         env.set_state(state, done=True)
-#         mask = env.get_mask_invalid_actions_forward()
-#         assert all(mask)
-# 
-# 
-# @pytest.mark.parametrize("env", ["cube1d", "cube2d"])
-# def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
-#     env = request.getfixturevalue(env)
-#     # Sample states
-#     states = env.get_uniform_terminating_states(100)
-#     # Iterate over state and test
-#     for state in states:
-#         env.set_state(state, done=True)
-#         mask = env.get_mask_invalid_actions_backward()
-#         assert all(mask[:-1])
-#         assert mask[-1] is False
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, mask_expected",
-#     [
-#         (
-#             [0.0],
-#             [False, False, True],
-#         ),
-#         (
-#             [0.5],
-#             [False, True, False],
-#         ),
-#         (
-#             [0.90],
-#             [False, True, False],
-#         ),
-#         (
-#             [0.95],
-#             [True, True, False],
-#         ),
-#     ],
-# )
-# def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
-#     env = cube1d
-#     mask = env.get_mask_invalid_actions_forward(state)
-#     assert mask == mask_expected
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, mask_expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             [False, False, True],
-#         ),
-#         (
-#             [0.5, 0.5],
-#             [False, True, False],
-#         ),
-#         (
-#             [0.90, 0.5],
-#             [False, True, False],
-#         ),
-#         (
-#             [0.95, 0.5],
-#             [True, True, False],
-#         ),
-#         (
-#             [0.5, 0.90],
-#             [False, True, False],
-#         ),
-#         (
-#             [0.5, 0.95],
-#             [True, True, False],
-#         ),
-#         (
-#             [0.95, 0.95],
-#             [True, True, False],
-#         ),
-#     ],
-# )
-# def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
-#     env = cube2d
-#     mask = env.get_mask_invalid_actions_forward(state)
-#     assert mask == mask_expected
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, mask_expected",
-#     [
-#         (
-#             [0.0],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.1],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.05],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.5],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.90],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.95],
-#             [False, True, True],
-#         ),
-#     ],
-# )
-# def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
-#     env = cube1d
-#     mask = env.get_mask_invalid_actions_backward(state)
-#     assert mask == mask_expected
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, mask_expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.5, 0.5],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.05, 0.5],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.5, 0.05],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.05, 0.05],
-#             [True, False, True],
-#         ),
-#         (
-#             [0.90, 0.5],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.5, 0.90],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.95, 0.5],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.5, 0.95],
-#             [False, True, True],
-#         ),
-#         (
-#             [0.95, 0.95],
-#             [False, True, True],
-#         ),
-#     ],
-# )
-# def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
-#     env = cube2d
-#     mask = env.get_mask_invalid_actions_backward(state)
-#     assert mask == mask_expected
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, increments_rel, min_increments, state_expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             [0.5, 0.5],
-#             [0.0, 0.0],
-#             [0.5, 0.5],
-#         ),
-#         (
-#             [0.0, 0.0],
-#             [0.0, 0.0],
-#             [0.0, 0.0],
-#             [0.0, 0.0],
-#         ),
-#         (
-#             [0.0, 0.0],
-#             [0.1794, 0.9589],
-#             [0.0, 0.0],
-#             [0.1794, 0.9589],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             [0.0, 0.0],
-#             [0.1, 0.1],
-#             [0.4, 0.6],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             [1.0, 1.0],
-#             [0.1, 0.1],
-#             [1.0, 1.0],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             [0.5, 0.5],
-#             [0.1, 0.1],
-#             [0.7, 0.8],
-#         ),
-#         (
-#             [0.27, 0.85],
-#             [0.12, 0.76],
-#             [0.1, 0.1],
-#             [0.4456, 0.988],
-#         ),
-#         (
-#             [0.27, 0.95],
-#             [0.12, 0.0],
-#             [0.1, 0.0],
-#             [0.4456, 0.95],
-#         ),
-#         (
-#             [0.95, 0.27],
-#             [0.0, 0.12],
-#             [0.0, 0.1],
-#             [0.95, 0.4456],
-#         ),
-#     ],
-# )
-# def test__relative_to_absolute_increments__2d_forward__returns_expected(
-#     cube2d, state, increments_rel, min_increments, state_expected
-# ):
-#     env = cube2d
-#     # Convert to tensors
-#     states = tfloat([state], float_type=env.float, device=env.device)
-#     increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
-#     min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
-#     states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
-#     # Get absolute increments
-#     increments_abs = env.relative_to_absolute_increments(
-#         states, increments_rel, min_increments, env.max_val, is_backward=False
-#     )
-#     states_next = states + increments_abs
-#     assert torch.all(torch.isclose(states_next, states_expected))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, increments_rel, min_increments, state_expected",
-#     [
-#         (
-#             [1.0, 1.0],
-#             [0.0, 0.0],
-#             [0.1, 0.1],
-#             [0.9, 0.9],
-#         ),
-#         (
-#             [1.0, 1.0],
-#             [1.0, 1.0],
-#             [0.1, 0.1],
-#             [0.0, 0.0],
-#         ),
-#         (
-#             [1.0, 1.0],
-#             [0.1794, 0.9589],
-#             [0.1, 0.1],
-#             [0.73854, 0.03699],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             [0.0, 0.0],
-#             [0.1, 0.1],
-#             [0.2, 0.4],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             [1.0, 1.0],
-#             [0.1, 0.1],
-#             [0.0, 0.0],
-#         ),
-#     ],
-# )
-# def test__relative_to_absolute_increments__2d_backward__returns_expected(
-#     cube2d, state, increments_rel, min_increments, state_expected
-# ):
-#     env = cube2d
-#     # Convert to tensors
-#     states = tfloat([state], float_type=env.float, device=env.device)
-#     increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
-#     min_increments = tfloat([min_increments], float_type=env.float, device=env.device)
-#     states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
-#     # Get absolute increments
-#     increments_abs = env.relative_to_absolute_increments(
-#         states, increments_rel, min_increments, env.max_val, is_backward=True
-#     )
-#     states_next = states - increments_abs
-#     assert torch.all(torch.isclose(states_next, states_expected))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, action, state_expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             (0.5, 0.5),
-#             [0.5, 0.5],
-#         ),
-#         (
-#             [0.0, 0.0],
-#             (0.0, 0.0),
-#             [0.0, 0.0],
-#         ),
-#         (
-#             [0.0, 0.0],
-#             (0.1794, 0.9589),
-#             [0.1794, 0.9589],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             (0.1, 0.1),
-#             [0.4, 0.6],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             (0.7, 0.5),
-#             [1.0, 1.0],
-#         ),
-#         (
-#             [0.3, 0.5],
-#             (0.4, 0.3),
-#             [0.7, 0.8],
-#         ),
-#         (
-#             [0.27, 0.85],
-#             (0.1756, 0.138),
-#             [0.4456, 0.988],
-#         ),
-#         (
-#             [0.27, 0.95],
-#             (0.1756, 0.0),
-#             [0.4456, 0.95],
-#         ),
-#         (
-#             [0.95, 0.27],
-#             (0.0, 0.1756),
-#             [0.95, 0.4456],
-#         ),
-#     ],
-# )
-# def test__step_forward__2d__returns_expected(cube2d, state, action, state_expected):
-#     env = cube2d
-#     env.set_state(state)
-#     state_new, action, valid = env.step(action)
-#     assert env.isclose(state_new, state_expected)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, force_eos",
-#     [
-#         (
-#             [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-#             [False, False, False, True, False],
-#         ),
-#         (
-#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
-#             [False, True, True, False, False],
-#         ),
-#         (
-#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
-#             [False, False, False, True, True],
-#         ),
-#     ],
-# )
-# def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos):
-#     env = cube2d
-#     n_states = len(states)
-#     force_eos = tbool(force_eos, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-#     )
-#     # Define Beta distribution with low variance and get confident range
-#     n_samples = 10000
-#     beta_params_min = 0.0
-#     beta_params_max = 10000
-#     alpha = 10
-#     alphas_presigmoid = alpha * torch.ones(n_samples)
-#     alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
-#     beta = 1.0
-#     betas_presigmoid = beta * torch.ones(n_samples)
-#     betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
-#     beta_distr = Beta(alphas, betas)
-#     samples = beta_distr.sample()
-#     mean_incr_rel = 0.9 * samples.mean()
-#     min_incr_rel = 0.9 * samples.min()
-#     max_incr_rel = 1.1 * samples.max()
-#     # Define Bernoulli parameters for EOS with deterministic probability
-#     logit_force_eos = torch.inf
-#     logit_force_noeos = -torch.inf
-#     # Estimate confident intervals of absolute actions
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     is_source = torch.all(states_torch == 0.0, dim=1)
-#     is_near_edge = states_torch > 1.0 - env.min_incr
-#     min_increments = torch.full_like(
-#         states_torch, env.min_incr, dtype=env.float, device=env.device
-#     )
-#     min_increments[is_source, :] = 0.0
-#     increments_rel_min = torch.full_like(
-#         states_torch, min_incr_rel, dtype=env.float, device=env.device
-#     )
-#     increments_rel_max = torch.full_like(
-#         states_torch, max_incr_rel, dtype=env.float, device=env.device
-#     )
-#     increments_abs_min = env.relative_to_absolute_increments(
-#         states_torch, increments_rel_min, min_increments, env.max_val, is_backward=False
-#     )
-#     increments_abs_max = env.relative_to_absolute_increments(
-#         states_torch, increments_rel_max, min_increments, env.max_val, is_backward=False
-#     )
-#     # Get EOS actions
-#     is_eos_forced = torch.any(is_near_edge, dim=1)
-#     is_eos = torch.logical_or(is_eos_forced, force_eos)
-#     increments_abs_min[is_eos] = torch.inf
-#     increments_abs_max[is_eos] = torch.inf
-#     # Reconfigure environment
-#     env.n_comp = 1
-#     env.beta_params_min = beta_params_min
-#     env.beta_params_max = beta_params_max
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["beta_alpha"] = alpha
-#     params["beta_beta"] = beta
-#     params["bernoulli_eos_logit"] = logit_force_noeos
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     policy_outputs[force_eos, -1] = logit_force_eos
-#     # Sample actions
-#     actions, _ = env.sample_actions_batch(
-#         policy_outputs, masks, states, is_backward=False
-#     )
-#     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-#     actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
-#     assert torch.all(actions_eos == is_eos)
-#     assert torch.all(actions_tensor >= increments_abs_min)
-#     assert torch.all(actions_tensor <= increments_abs_max)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, force_bst",
-#     [
-#         (
-#             [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.05], [0.16, 0.93]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
-#             [False, False, False, False, False],
-#         ),
-#         (
-#             [[0.0001, 0.0], [0.001, 0.01], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
-#             [False, False, False, True, False],
-#         ),
-#         (
-#             [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [1.0, 1.0], [0.16, 0.93]],
-#             [False, True, True, True, False],
-#         ),
-#         (
-#             [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
-#             [False, False, False, True, True],
-#         ),
-#     ],
-# )
-# def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
-#     env = cube2d
-#     n_states = len(states)
-#     force_bst = tbool(force_bst, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-#     )
-#     # Define Beta distribution with low variance and get confident range
-#     n_samples = 10000
-#     beta_params_min = 0.0
-#     beta_params_max = 10000
-#     alpha = 10
-#     alphas_presigmoid = alpha * torch.ones(n_samples)
-#     alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
-#     beta = 1.0
-#     betas_presigmoid = beta * torch.ones(n_samples)
-#     betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
-#     beta_distr = Beta(alphas, betas)
-#     samples = beta_distr.sample()
-#     mean_incr_rel = 0.9 * samples.mean()
-#     min_incr_rel = 0.9 * samples.min()
-#     max_incr_rel = 1.1 * samples.max()
-#     # Define Bernoulli parameters for BST with deterministic probability
-#     logit_force_bst = torch.inf
-#     logit_force_nobst = -torch.inf
-#     # Estimate confident intervals of absolute actions
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     is_near_edge = states_torch < env.min_incr
-#     min_increments = torch.full_like(
-#         states_torch, env.min_incr, dtype=env.float, device=env.device
-#     )
-#     increments_rel_min = torch.full_like(
-#         states_torch, min_incr_rel, dtype=env.float, device=env.device
-#     )
-#     increments_rel_max = torch.full_like(
-#         states_torch, max_incr_rel, dtype=env.float, device=env.device
-#     )
-#     increments_abs_min = env.relative_to_absolute_increments(
-#         states_torch, increments_rel_min, min_increments, env.max_val, is_backward=True
-#     )
-#     increments_abs_max = env.relative_to_absolute_increments(
-#         states_torch, increments_rel_max, min_increments, env.max_val, is_backward=True
-#     )
-#     # Get BST actions
-#     is_bst_forced = torch.any(is_near_edge, dim=1)
-#     is_bst = torch.logical_or(is_bst_forced, force_bst)
-#     increments_abs_min[is_bst] = states_torch[is_bst]
-#     increments_abs_max[is_bst] = states_torch[is_bst]
-#     # Reconfigure environment
-#     env.n_comp = 1
-#     env.beta_params_min = beta_params_min
-#     env.beta_params_max = beta_params_max
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["beta_alpha"] = alpha
-#     params["beta_beta"] = beta
-#     params["bernoulli_source_logit"] = logit_force_nobst
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     policy_outputs[force_bst, -2] = logit_force_bst
-#     # Sample actions
-#     actions, _ = env.sample_actions_batch(
-#         policy_outputs, masks, states, is_backward=True
-#     )
-#     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-#     actions_bst = torch.all(actions_tensor == states_torch, dim=1)
-#     assert torch.all(actions_bst == is_bst)
-#     assert torch.all(actions_tensor >= increments_abs_min)
-#     assert torch.all(actions_tensor <= increments_abs_max)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
-#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-#         ),
-#         (
-#             [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
-#             [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actions):
-#     """
-#     The only valid action from 'near-edge' states is EOS, thus the the log probability
-#     should be zero, regardless of the action and the policy outputs
-#     """
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-#     )
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Add noise to policy outputs
-#     policy_outputs += torch.randn(policy_outputs.shape)
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=False
-#     )
-#     assert torch.all(logprobs == 0.0)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-#             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
-#         ),
-#         (
-#             [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
-#             [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_forward__2d__eos_actions_return_expected(
-#     cube2d, states, actions
-# ):
-#     """
-#     The only valid action from 'near-edge' states is EOS, thus the the log probability
-#     should be zero, regardless of the action and the policy outputs
-#     """
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-#     )
-#     # Get EOS forced
-#     is_near_edge = states_torch > 1.0 - env.min_incr
-#     is_eos_forced = torch.any(is_near_edge, dim=1)
-#     # Define Bernoulli parameter for EOS
-#     # If Bernouilli has logit torch.inf, the logprobs are nan
-#     logit_eos = 1
-#     distr_eos = Bernoulli(logits=logit_eos)
-#     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["bernoulli_eos_logit"] = logit_eos
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=False
-#     )
-#     assert torch.all(logprobs[is_eos_forced] == 0.0)
-#     assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "actions",
-#     [
-#         [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-#         [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
-#         [[0.0, 0.0], [1.0, 1.0]],
-#     ],
-# )
-# def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
-#     cube2d, actions
-# ):
-#     """
-#     With Uniform increment policy, all the actions from the source must have the same
-#     probability.
-#     """
-#     env = cube2d
-#     n_states = len(actions)
-#     states = [env.source for _ in range(n_states)]
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-#     )
-#     # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
-#     beta_params_min = 0.0
-#     beta_params_max = 1.0
-#     alpha_presigmoid = 1000.0
-#     betas_presigmoid = 1000.0
-#     # Define Bernoulli parameter for impossible EOS
-#     # If Bernouilli has logit -torch.inf, the logprobs are nan
-#     logit_force_noeos = -1000
-#     # Reconfigure environment
-#     env.n_comp = 1
-#     env.beta_params_min = beta_params_min
-#     env.beta_params_max = beta_params_max
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["beta_alpha"] = alpha_presigmoid
-#     params["beta_beta"] = betas_presigmoid
-#     params["bernoulli_eos_logit"] = logit_force_noeos
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=False
-#     )
-#     assert torch.all(logprobs == 0.0)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
-#             [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
-#         ),
-#         (
-#             [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
-#             [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
-#         ),
-#         (
-#             [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
-#             [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_forward__2d__notnan(cube2d, states, actions):
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
-#     )
-#     # Get EOS forced
-#     is_near_edge = states_torch > 1.0 - env.min_incr
-#     is_eos_forced = torch.any(is_near_edge, dim=1)
-#     # Define Bernoulli parameter for EOS
-#     # If Bernouilli has logit torch.inf, the logprobs are nan
-#     logit_eos = 1
-#     distr_eos = Bernoulli(logits=logit_eos)
-#     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["bernoulli_eos_logit"] = logit_eos
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=False
-#     )
-#     assert torch.all(logprobs[is_eos_forced] == 0.0)
-#     assert torch.all(torch.isfinite(logprobs))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-#             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-#         ),
-#         (
-#             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-#             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
-#     """
-#     The only valid backward action from 'near-edge' states is BTS, thus the the log
-#     probability should be zero.
-#     """
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-#     )
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Add noise to policy outputs
-#     policy_outputs += torch.randn(policy_outputs.shape)
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=True
-#     )
-#     assert torch.all(logprobs == 0.0)
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-#             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-#         ),
-#         (
-#             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-#             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-#         ),
-#         (
-#             [[1.0, 1.0], [0.0, 0.0]],
-#             [[1.0, 1.0], [0.0, 0.0]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_backward__2d__bts_actions_return_expected(
-#     cube2d, states, actions
-# ):
-#     """
-#     The only valid action from 'near-edge' states is EOS, thus the the log probability
-#     should be zero, regardless of the action and the policy outputs
-#     """
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-#     )
-#     # Get BTS forced
-#     is_near_edge = states_torch < env.min_incr
-#     is_bts_forced = torch.any(is_near_edge, dim=1)
-#     # Define Bernoulli parameter for BTS
-#     # If Bernouilli has logit torch.inf, the logprobs are nan
-#     logit_bts = 1
-#     distr_bts = Bernoulli(logits=logit_bts)
-#     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["bernoulli_source_logit"] = logit_bts
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=True
-#     )
-#     assert torch.all(logprobs[is_bts_forced] == 0.0)
-#     assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, actions",
-#     [
-#         (
-#             [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
-#             [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
-#         ),
-#         (
-#             [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
-#             [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
-#         ),
-#         (
-#             [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
-#             [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
-#         ),
-#     ],
-# )
-# def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
-#     env = cube2d
-#     n_states = len(states)
-#     states_torch = tfloat(states, float_type=env.float, device=env.device)
-#     actions = tfloat(actions, float_type=env.float, device=env.device)
-#     # Get masks
-#     masks = tbool(
-#         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
-#     )
-#     # Get BTS forced
-#     is_near_edge = states_torch < env.min_incr
-#     is_bts_forced = torch.any(is_near_edge, dim=1)
-#     # Define Bernoulli parameter for BTS
-#     # If Bernouilli has logit torch.inf, the logprobs are nan
-#     logit_bts = 1
-#     distr_bts = Bernoulli(logits=logit_bts)
-#     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
-#     # Build policy outputs
-#     params = env.fixed_distr_params
-#     params["bernoulli_source_logit"] = logit_bts
-#     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-#     # Get log probs
-#     logprobs = env.get_logprobs(
-#         policy_outputs, actions, masks, states_torch, is_backward=True
-#     )
-#     assert torch.all(logprobs[is_bts_forced] == 0.0)
-#     assert torch.all(torch.isfinite(logprobs))
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             [0.0, 0.0],
-#         ),
-#         (
-#             [1.0, 1.0],
-#             [1.0, 1.0],
-#         ),
-#         (
-#             [1.1, 1.00001],
-#             [1.0, 1.0],
-#         ),
-#         (
-#             [-0.1, 1.00001],
-#             [0.0, 1.0],
-#         ),
-#         (
-#             [0.1, 0.21],
-#             [0.1, 0.21],
-#         ),
-#     ],
-# )
-# @pytest.mark.skip(reason="skip while developping other tests")
-# def test__state2policy_returns_expected(env, state, expected):
-#     assert env.state2policy(state) == expected
-# 
-# 
-# @pytest.mark.parametrize(
-#     "states, expected",
-#     [
-#         (
-#             [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
-#             [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
-#         ),
-#     ],
-# )
-# @pytest.mark.skip(reason="skip while developping other tests")
-# def test__statetorch2policy_returns_expected(env, states, expected):
-#     assert torch.equal(
-#         env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
-#     )
-# 
-# 
-# @pytest.mark.parametrize(
-#     "state, expected",
-#     [
-#         (
-#             [0.0, 0.0],
-#             [True, False, False],
-#         ),
-#         (
-#             [0.1, 0.1],
-#             [False, True, False],
-#         ),
-#         (
-#             [1.0, 0.0],
-#             [False, True, False],
-#         ),
-#         (
-#             [1.1, 0.0],
-#             [True, True, False],
-#         ),
-#         (
-#             [0.1, 1.1],
-#             [True, True, False],
-#         ),
-#     ],
-# )
-# @pytest.mark.skip(reason="skip while developping other tests")
-# def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
-#     assert env.get_mask_invalid_actions_forward(state) == expected, print(
-#         state, expected, env.get_mask_invalid_actions_forward(state)
-#     )
-# 
-# 
-# @pytest.mark.skip(reason="skip while developping other tests")
-# def test__continuous_env_common__cube1d(cube1d):
-#     return common.test__continuous_env_common(cube1d)
-# 
-# 
-# def test__continuous_env_common__cube2d(cube2d):
-#     return common.test__continuous_env_common(cube2d)
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__get_policy_output__fixed_as_expected(env, request):
+    env = request.getfixturevalue(env)
+    policy_outputs = torch.unsqueeze(env.fixed_policy_output, 0)
+    params = env.fixed_distr_params
+    policy_output__as_expected(env, policy_outputs, params)
+
+
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__get_policy_output__random_as_expected(env, request):
+    env = request.getfixturevalue(env)
+    policy_outputs = torch.unsqueeze(env.random_policy_output, 0)
+    params = env.random_distr_params
+    policy_output__as_expected(env, policy_outputs, params)
+
+
+def policy_output__as_expected(env, policy_outputs, params):
+    assert torch.all(
+        env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
+    )
+    assert torch.all(
+        env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
+    )
+    assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
+    assert torch.all(
+        env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
+    )
+    assert torch.all(
+        env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
+    )
+
+
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__mask_forward__returns_all_true_if_done(env, request):
+    env = request.getfixturevalue(env)
+    # Sample states
+    states = env.get_uniform_terminating_states(100)
+    # Iterate over state and test
+    for state in states:
+        env.set_state(state, done=True)
+        mask = env.get_mask_invalid_actions_forward()
+        assert all(mask)
+
+
+@pytest.mark.parametrize("env", ["cube1d", "cube2d"])
+def test__mask_backward__returns_all_true_except_eos_if_done(env, request):
+    env = request.getfixturevalue(env)
+    # Sample states
+    states = env.get_uniform_terminating_states(100)
+    # Iterate over state and test
+    for state in states:
+        env.set_state(state, done=True)
+        mask = env.get_mask_invalid_actions_backward()
+        assert all(mask[:-1])
+        assert mask[-1] is False
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [-1.0],
+            [False, False, True],
+        ),
+        (
+            [0.0],
+            [False, True, False],
+        ),
+        (
+            [0.5],
+            [False, True, False],
+        ),
+        (
+            [0.90],
+            [False, True, False],
+        ),
+        (
+            [0.95],
+            [True, True, False],
+        ),
+    ],
+)
+def test__mask_forward__1d__returns_expected(cube1d, state, mask_expected):
+    env = cube1d
+    mask = env.get_mask_invalid_actions_forward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [-1.0, -1.0],
+            [False, False, True],
+        ),
+        (
+            [0.0, 0.0],
+            [False, True, False],
+        ),
+        (
+            [0.5, 0.0],
+            [False, True, False],
+        ),
+        (
+            [0.0, 0.01],
+            [False, True, False],
+        ),
+        (
+            [0.5, 0.5],
+            [False, True, False],
+        ),
+        (
+            [0.90, 0.5],
+            [False, True, False],
+        ),
+        (
+            [0.95, 0.5],
+            [True, True, False],
+        ),
+        (
+            [0.5, 0.90],
+            [False, True, False],
+        ),
+        (
+            [0.5, 0.95],
+            [True, True, False],
+        ),
+        (
+            [0.95, 0.95],
+            [True, True, False],
+        ),
+    ],
+)
+def test__mask_forward__2d__returns_expected(cube2d, state, mask_expected):
+    env = cube2d
+    mask = env.get_mask_invalid_actions_forward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [-1.0],
+            [True, True, True],
+        ),
+        (
+            [0.0],
+            [True, False, True],
+        ),
+        (
+            [0.05],
+            [True, False, True],
+        ),
+        (
+            [0.1],
+            [False, True, True],
+        ),
+        (
+            [0.5],
+            [False, True, True],
+        ),
+        (
+            [0.90],
+            [False, True, True],
+        ),
+        (
+            [0.95],
+            [False, True, True],
+        ),
+    ],
+)
+def test__mask_backward__1d__returns_expected(cube1d, state, mask_expected):
+    env = cube1d
+    mask = env.get_mask_invalid_actions_backward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, mask_expected",
+    [
+        (
+            [-1.0, -1.0],
+            [True, True, True],
+        ),
+        (
+            [0.0, 0.0],
+            [True, False, True],
+        ),
+        (
+            [0.5, 0.5],
+            [False, True, True],
+        ),
+        (
+            [0.05, 0.5],
+            [True, False, True],
+        ),
+        (
+            [0.5, 0.05],
+            [True, False, True],
+        ),
+        (
+            [0.05, 0.05],
+            [True, False, True],
+        ),
+        (
+            [0.90, 0.5],
+            [False, True, True],
+        ),
+        (
+            [0.5, 0.90],
+            [False, True, True],
+        ),
+        (
+            [0.95, 0.5],
+            [False, True, True],
+        ),
+        (
+            [0.5, 0.95],
+            [False, True, True],
+        ),
+        (
+            [0.95, 0.95],
+            [False, True, True],
+        ),
+    ],
+)
+def test__mask_backward__2d__returns_expected(cube2d, state, mask_expected):
+    env = cube2d
+    mask = env.get_mask_invalid_actions_backward(state)
+    assert mask == mask_expected
+
+
+@pytest.mark.parametrize(
+    "state, increments_rel, state_expected",
+    [
+        (
+            [0.3, 0.5],
+            [0.0, 0.0],
+            [0.4, 0.6],
+        ),
+        (
+            [0.0, 0.0],
+            [0.1794, 0.9589],
+            [0.26146, 0.96301],
+        ),
+        (
+            [0.3, 0.5],
+            [1.0, 1.0],
+            [1.0, 1.0],
+        ),
+        (
+            [0.3, 0.5],
+            [0.5, 0.5],
+            [0.7, 0.8],
+        ),
+        (
+            [0.27, 0.85],
+            [0.12, 0.76],
+            [0.4456, 0.988],
+        ),
+    ],
+)
+def test__relative_to_absolute_increments__2d_forward__returns_expected(
+    cube2d, state, increments_rel, state_expected
+):
+    env = cube2d
+    # Convert to tensors
+    states = tfloat([state], float_type=env.float, device=env.device)
+    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+    # Get absolute increments
+    increments_abs = env.relative_to_absolute_increments(
+        states, increments_rel, is_backward=False
+    )
+    states_next = states + increments_abs
+    assert torch.all(torch.isclose(states_next, states_expected))
+
+
+@pytest.mark.parametrize(
+    "state, increments_rel, state_expected",
+    [
+        (
+            [1.0, 1.0],
+            [0.0, 0.0],
+            [0.9, 0.9],
+        ),
+        (
+            [1.0, 1.0],
+            [1.0, 1.0],
+            [0.0, 0.0],
+        ),
+        (
+            [1.0, 1.0],
+            [0.1794, 0.9589],
+            [0.73854, 0.03699],
+        ),
+        (
+            [0.3, 0.5],
+            [0.0, 0.0],
+            [0.2, 0.4],
+        ),
+        (
+            [0.3, 0.5],
+            [1.0, 1.0],
+            [0.0, 0.0],
+        ),
+    ],
+)
+def test__relative_to_absolute_increments__2d_backward__returns_expected(
+    cube2d, state, increments_rel, state_expected
+):
+    env = cube2d
+    # Convert to tensors
+    states = tfloat([state], float_type=env.float, device=env.device)
+    increments_rel = tfloat([increments_rel], float_type=env.float, device=env.device)
+    states_expected = tfloat([state_expected], float_type=env.float, device=env.device)
+    # Get absolute increments
+    increments_abs = env.relative_to_absolute_increments(
+        states, increments_rel, is_backward=True
+    )
+    states_next = states - increments_abs
+    assert torch.all(torch.isclose(states_next, states_expected))
+
+
+@pytest.mark.parametrize(
+    "state, action, state_expected",
+    [
+        (
+            [-1.0, -1.0],
+            (0.5, 0.5),
+            [0.5, 0.5],
+        ),
+        (
+            [-1.0, -1.0],
+            (0.0, 0.0),
+            [0.0, 0.0],
+        ),
+        (
+            [-1.0, -1.0],
+            (0.1794, 0.9589),
+            [0.1794, 0.9589],
+        ),
+        (
+            [0.0, 0.0],
+            (0.1, 0.1),
+            [0.1, 0.1],
+        ),
+        (
+            [0.0, 0.0],
+            (0.1794, 0.9589),
+            [0.1794, 0.9589],
+        ),
+        (
+            [0.3, 0.5],
+            (0.1, 0.1),
+            [0.4, 0.6],
+        ),
+        (
+            [0.3, 0.5],
+            (0.7, 0.5),
+            [1.0, 1.0],
+        ),
+        (
+            [0.3, 0.5],
+            (0.4, 0.3),
+            [0.7, 0.8],
+        ),
+        (
+            [0.27, 0.85],
+            (0.1756, 0.138),
+            [0.4456, 0.988],
+        ),
+        (
+            [0.45, 0.27],
+            (np.inf, np.inf),
+            [0.45, 0.27],
+        ),
+        (
+            [0.0, 0.0],
+            (np.inf, np.inf),
+            [0.0, 0.0],
+        ),
+    ],
+)
+def test__step_forward__2d__returns_expected(cube2d, state, action, state_expected):
+    env = cube2d
+    env.set_state(state)
+    state_new, action, valid = env.step(action)
+    assert env.isclose(state_new, state_expected)
+
+
+@pytest.mark.parametrize(
+    "state, action, state_expected",
+    [
+        (
+            [0.5, 0.9],
+            (0.3, 0.2),
+            [0.2, 0.7],
+        ),
+        (
+            [0.95, 0.4456],
+            (0.1, 0.27),
+            [0.85, 0.1756],
+        ),
+        (
+            [0.1, 0.2],
+            (0.1, 0.1),
+            [0.0, 0.1],
+        ),
+        (
+            [0.1, 0.2],
+            (-1.0, -1.0),
+            [-1.0, -1.0],
+        ),
+        (
+            [0.95, 0.0],
+            (-1.0, -1.0),
+            [-1.0, -1.0],
+        ),
+    ],
+)
+def test__step_backward__2d__returns_expected(cube2d, state, action, state_expected):
+    env = cube2d
+    env.set_state(state)
+    state_new, action, valid = env.step_backwards(action)
+    assert env.isclose(state_new, state_expected)
+
+
+@pytest.mark.parametrize(
+    "states, force_eos",
+    [
+        (
+            [[-1.0, -1.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.0, 0.0], [0.0, 0.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, True, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.0], [0.16, 0.93]],
+            [False, True, True, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+            [False, False, False, True, True],
+        ),
+    ],
+)
+def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos):
+    env = cube2d
+    n_states = len(states)
+    force_eos = tbool(force_eos, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Define Beta distribution with low variance and get confident range
+    n_samples = 10000
+    beta_params_min = 0.0
+    beta_params_max = 10000
+    alpha = 10
+    alphas_presigmoid = alpha * torch.ones(n_samples)
+    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    beta = 1.0
+    betas_presigmoid = beta * torch.ones(n_samples)
+    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    beta_distr = Beta(alphas, betas)
+    samples = beta_distr.sample()
+    mean_incr_rel = 0.9 * samples.mean()
+    min_incr_rel = 0.9 * samples.min()
+    max_incr_rel = 1.1 * samples.max()
+    # Define Bernoulli parameters for EOS with deterministic probability
+    logit_force_eos = torch.inf
+    logit_force_noeos = -torch.inf
+    # Estimate confident intervals of absolute actions
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    is_source = torch.all(states_torch == -1.0, dim=1)
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    increments_min = torch.full_like(
+        states_torch, min_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_max = torch.full_like(
+        states_torch, max_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_min[~is_source] = env.relative_to_absolute_increments(
+        states_torch[~is_source], increments_min[~is_source], is_backward=False
+    )
+    increments_max[~is_source] = env.relative_to_absolute_increments(
+        states_torch[~is_source], increments_max[~is_source], is_backward=False
+    )
+    # Get EOS actions
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    is_eos = torch.logical_or(is_eos_forced, force_eos)
+    increments_min[is_eos] = torch.inf
+    increments_max[is_eos] = torch.inf
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = beta_params_min
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha
+    params["beta_beta"] = beta
+    params["bernoulli_eos_logit"] = logit_force_noeos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    policy_outputs[force_eos, -1] = logit_force_eos
+    # Sample actions
+    actions, _ = env.sample_actions_batch(
+        policy_outputs, masks, states, is_backward=False
+    )
+    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+    actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
+    assert torch.all(actions_eos == is_eos)
+    assert torch.all(actions_tensor >= increments_min)
+    assert torch.all(actions_tensor <= increments_max)
+
+
+@pytest.mark.parametrize(
+    "states, force_bst",
+    [
+        (
+            [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [0.0, 0.05], [0.16, 0.93]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.3], [0.2, 0.95], [0.01, 0.01]],
+            [False, False, False, False, False],
+        ),
+        (
+            [[0.0001, 0.0], [0.001, 0.01], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
+            [False, False, False, True, False],
+        ),
+        (
+            [[0.12, 0.17], [0.56, 0.23], [0.9, 0.9], [1.0, 1.0], [0.16, 0.93]],
+            [False, True, True, True, False],
+        ),
+        (
+            [[0.05, 0.97], [0.56, 0.23], [0.95, 0.98], [0.92, 0.95], [0.01, 0.01]],
+            [False, False, False, True, True],
+        ),
+    ],
+)
+def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
+    env = cube2d
+    n_states = len(states)
+    force_bst = tbool(force_bst, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    # Define Beta distribution with low variance and get confident range
+    n_samples = 10000
+    beta_params_min = 0.0
+    beta_params_max = 10000
+    alpha = 10
+    alphas_presigmoid = alpha * torch.ones(n_samples)
+    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    beta = 1.0
+    betas_presigmoid = beta * torch.ones(n_samples)
+    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    beta_distr = Beta(alphas, betas)
+    samples = beta_distr.sample()
+    mean_incr_rel = 0.9 * samples.mean()
+    min_incr_rel = 0.9 * samples.min()
+    max_incr_rel = 1.1 * samples.max()
+    # Define Bernoulli parameters for BST with deterministic probability
+    logit_force_bst = torch.inf
+    logit_force_nobst = -torch.inf
+    # Get BST actions
+    is_near_edge = states_torch < env.min_incr
+    is_bst_forced = torch.any(is_near_edge, dim=1)
+    is_bst = torch.logical_or(is_bst_forced, force_bst)
+    # Estimate confident intervals of absolute actions
+    increments_min = torch.full_like(
+        states_torch[~is_bst], min_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_max = torch.full_like(
+        states_torch[~is_bst], max_incr_rel, dtype=env.float, device=env.device
+    )
+    increments_min = env.relative_to_absolute_increments(
+        states_torch[~is_bst], increments_min, is_backward=True
+    )
+    increments_max = env.relative_to_absolute_increments(
+        states_torch[~is_bst], increments_max, is_backward=True
+    )
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = beta_params_min
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha
+    params["beta_beta"] = beta
+    params["bernoulli_source_logit"] = logit_force_nobst
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    policy_outputs[force_bst, -2] = logit_force_bst
+    # Sample actions
+    actions, _ = env.sample_actions_batch(
+        policy_outputs, masks, states, is_backward=True
+    )
+    actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
+    actions_bst = torch.all(actions_tensor == -1, dim=1)
+    assert torch.all(actions_bst == is_bst)
+    assert torch.all(actions_tensor[~is_bst] >= increments_min)
+    assert torch.all(actions_tensor[~is_bst] <= increments_max)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+        ),
+        (
+            [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
+            [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actions):
+    """
+    The only valid action from 'near-edge' states is EOS, thus the the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Build policy outputs
+    params = env.fixed_distr_params
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
+    assert torch.all(logprobs == 0.0)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+        ),
+        (
+            [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
+            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__eos_actions_return_expected(
+    cube2d, states, actions
+):
+    """
+    The only valid action from 'near-edge' states is EOS, thus the the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for EOS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
+    assert torch.all(logprobs[is_eos_forced] == 0.0)
+    assert torch.all(torch.isclose(logprobs[~is_eos_forced], logprob_eos, atol=1e-6))
+
+
+@pytest.mark.parametrize(
+    "actions",
+    [
+        [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+        [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
+        [[0.0, 0.0], [1.0, 1.0]],
+    ],
+)
+def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
+    cube2d, actions
+):
+    """
+    With Uniform increment policy, all the actions from the source must have the same
+    probability.
+    """
+    env = cube2d
+    n_states = len(actions)
+    states = [env.source for _ in range(n_states)]
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
+    beta_params_min = 0.0
+    beta_params_max = 1.0
+    alpha_presigmoid = 1000.0
+    betas_presigmoid = 1000.0
+    # Define Bernoulli parameter for impossible EOS
+    # If Bernouilli has logit -torch.inf, the logprobs are nan
+    logit_force_noeos = -1000
+    # Reconfigure environment
+    env.n_comp = 1
+    env.beta_params_min = beta_params_min
+    env.beta_params_max = beta_params_max
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["beta_alpha"] = alpha_presigmoid
+    params["beta_beta"] = betas_presigmoid
+    params["bernoulli_eos_logit"] = logit_force_noeos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
+    assert torch.all(logprobs == 0.0)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
+            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+        ),
+        (
+            [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
+            [[0.2988, 0.3585], [0.2, 0.3], [0.11, 0.1001]],
+        ),
+        (
+            [[-1.0, -1.0], [-1.0, -1.0], [-1.0, -1.0]],
+            [[0.2988, 0.3585], [0.2, 0.3], [0.11, 0.1001]],
+        ),
+        (
+            [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
+            [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
+        ),
+        (
+            [[0.0, 0.0], [-1.0, -1.0], [0.0, 0.0]],
+            [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for EOS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
+    assert torch.all(logprobs[is_eos_forced] == 0.0)
+    assert torch.all(torch.isfinite(logprobs))
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+            [[-1, -1], [-1, -1], [-1, -1]],
+        ),
+        (
+            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+            [[-1, -1], [-1, -1], [-1, -1]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, actions):
+    """
+    The only valid backward action from 'near-edge' states is BTS, thus the the log
+    probability should be zero.
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Build policy outputs
+    params = env.fixed_distr_params
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Add noise to policy outputs
+    policy_outputs += torch.randn(policy_outputs.shape)
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
+    assert torch.all(logprobs == 0.0)
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+            [[-1, -1], [-1, -1], [-1, -1]],
+        ),
+        (
+            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+            [[-1, -1], [-1, -1], [-1, -1]],
+        ),
+        (
+            [[1.0, 1.0], [0.0, 0.0]],
+            [[-1, -1], [-1, -1]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__bts_actions_return_expected(
+    cube2d, states, actions
+):
+    """
+    The only valid action from 'near-edge' states is BTS, thus the log probability
+    should be zero, regardless of the action and the policy outputs
+    """
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Get BTS forced
+    is_near_edge = states_torch < env.min_incr
+    is_bts_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for BTS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_bts = 1
+    distr_bts = Bernoulli(logits=logit_bts)
+    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_source_logit"] = logit_bts
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
+    assert torch.all(logprobs[is_bts_forced] == 0.0)
+    assert torch.all(torch.isclose(logprobs[~is_bts_forced], logprob_bts, atol=1e-6))
+
+
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+        ),
+        (
+            [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+        ),
+        (
+            [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
+            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Get BTS forced
+    is_near_edge = states_torch < env.min_incr
+    is_bts_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for BTS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_bts = 1
+    distr_bts = Bernoulli(logits=logit_bts)
+    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_source_logit"] = logit_bts
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
+    assert torch.all(logprobs[is_bts_forced] == 0.0)
+    assert torch.all(torch.isfinite(logprobs))
+
+
+@pytest.mark.parametrize(
+    "state, expected",
+    [
+        (
+            [0.0, 0.0],
+            [0.0, 0.0],
+        ),
+        (
+            [1.0, 1.0],
+            [1.0, 1.0],
+        ),
+        (
+            [1.1, 1.00001],
+            [1.0, 1.0],
+        ),
+        (
+            [-0.1, 1.00001],
+            [0.0, 1.0],
+        ),
+        (
+            [0.1, 0.21],
+            [0.1, 0.21],
+        ),
+    ],
+)
+@pytest.mark.skip(reason="skip while developping other tests")
+def test__state2policy_returns_expected(env, state, expected):
+    assert env.state2policy(state) == expected
+
+
+@pytest.mark.parametrize(
+    "states, expected",
+    [
+        (
+            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
+            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
+        ),
+    ],
+)
+@pytest.mark.skip(reason="skip while developping other tests")
+def test__statetorch2policy_returns_expected(env, states, expected):
+    assert torch.equal(
+        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
+    )
+
+
+@pytest.mark.parametrize(
+    "state, expected",
+    [
+        (
+            [0.0, 0.0],
+            [True, False, False],
+        ),
+        (
+            [0.1, 0.1],
+            [False, True, False],
+        ),
+        (
+            [1.0, 0.0],
+            [False, True, False],
+        ),
+        (
+            [1.1, 0.0],
+            [True, True, False],
+        ),
+        (
+            [0.1, 1.1],
+            [True, True, False],
+        ),
+    ],
+)
+@pytest.mark.skip(reason="skip while developping other tests")
+def test__get_mask_invalid_actions_forward__returns_expected(env, state, expected):
+    assert env.get_mask_invalid_actions_forward(state) == expected, print(
+        state, expected, env.get_mask_invalid_actions_forward(state)
+    )
+
+
+def test__continuous_env_common__cube1d(cube1d):
+    return common.test__continuous_env_common(cube1d)
+
+
+def test__continuous_env_common__cube2d(cube2d):
+    return common.test__continuous_env_common(cube2d)

From aba939505271ed0d7dafe8dcb9bc2d47227d44bb Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 21:23:27 -0400
Subject: [PATCH 153/206] Dummy test

---
 tests/gflownet/envs/test_ccube.py | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 45e4551bb..603dc4bba 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -844,6 +844,43 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     assert torch.all(torch.isfinite(logprobs))
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
+            [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]],
+        ),
+    ],
+)
+def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
+    )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for EOS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=False
+    )
+    assert True
+
+
 @pytest.mark.parametrize(
     "states, actions",
     [

From 303780a817db528a531e48c1fb190b4ca096b453 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 21:27:22 -0400
Subject: [PATCH 154/206] Dummy test

---
 tests/gflownet/envs/test_ccube.py | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 603dc4bba..11ca5139c 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -881,6 +881,43 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     assert True
 
 
+@pytest.mark.parametrize(
+    "states, actions",
+    [
+        (
+            [[0.3, 0.3], [0.5, 0.5], [0.7, 0.7]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+        ),
+    ],
+)
+def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
+    env = cube2d
+    n_states = len(states)
+    states_torch = tfloat(states, float_type=env.float, device=env.device)
+    actions = tfloat(actions, float_type=env.float, device=env.device)
+    # Get masks
+    masks = tbool(
+        [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
+    )
+    # Get EOS forced
+    is_near_edge = states_torch > 1.0 - env.min_incr
+    is_eos_forced = torch.any(is_near_edge, dim=1)
+    # Define Bernoulli parameter for EOS
+    # If Bernouilli has logit torch.inf, the logprobs are nan
+    logit_eos = 1
+    distr_eos = Bernoulli(logits=logit_eos)
+    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    # Build policy outputs
+    params = env.fixed_distr_params
+    params["bernoulli_eos_logit"] = logit_eos
+    policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
+    # Get log probs
+    logprobs = env.get_logprobs(
+        policy_outputs, actions, masks, states_torch, is_backward=True
+    )
+    assert True
+
+
 @pytest.mark.parametrize(
     "states, actions",
     [

From 1636f001a0aa07071242de428381b07ddf0dd854 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 21:32:32 -0400
Subject: [PATCH 155/206] Fix dummy test

---
 tests/gflownet/envs/test_ccube.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 11ca5139c..f000ca03f 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -899,17 +899,14 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     masks = tbool(
         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
     )
-    # Get EOS forced
-    is_near_edge = states_torch > 1.0 - env.min_incr
-    is_eos_forced = torch.any(is_near_edge, dim=1)
-    # Define Bernoulli parameter for EOS
+    # Define Bernoulli parameter for BTS
     # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
-    logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
+    logit_bts = 1
+    distr_bts = Bernoulli(logits=logit_bts)
+    logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
+    params["bernoulli_source_logit"] = logit_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(

From cfd0ea03547f0c4aa1f1dd3982176202b2790e08 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 21:40:06 -0400
Subject: [PATCH 156/206] Extend dummy tests

---
 tests/gflownet/envs/test_ccube.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index f000ca03f..162b806ab 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -848,8 +848,8 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     "states, actions",
     [
         (
-            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
-            [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]],
+            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7], [-1.0, -1.0], [-1.0, -1.0]],
+            [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.3, 0.3], [0.5, 0.5]],
         ),
     ],
 )
@@ -878,6 +878,7 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=False
     )
+    import ipdb; ipdb.set_trace()
     assert True
 
 
@@ -885,8 +886,8 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     "states, actions",
     [
         (
-            [[0.3, 0.3], [0.5, 0.5], [0.7, 0.7]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+            [[0.3, 0.3], [0.5, 0.5], [0.7, 0.7], [0.05, 0.2], [0.05, 0.05]],
+            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2], [-1, -1], [-1, -1]],
         ),
     ],
 )
@@ -912,6 +913,7 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=True
     )
+    import ipdb; ipdb.set_trace()
     assert True
 
 

From 2420694b71dbc41bbf27714a2d1730b3816b218b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 22:08:24 -0400
Subject: [PATCH 157/206] Fix dummy test

---
 tests/gflownet/envs/test_ccube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 162b806ab..b53a687ec 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -849,7 +849,7 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     [
         (
             [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7], [-1.0, -1.0], [-1.0, -1.0]],
-            [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.3, 0.3], [0.5, 0.5]],
+            [[0.5, 0.5], [0.3, 0.3], [0.2, 0.2], [0.3, 0.3], [0.5, 0.5]],
         ),
     ],
 )

From 3ec6d0b669fa002f2c5304f6ad75619c65267178 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 22:23:44 -0400
Subject: [PATCH 158/206] Edit tests

---
 tests/gflownet/envs/test_ccube.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index b53a687ec..5aefdf469 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -848,8 +848,8 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     "states, actions",
     [
         (
-            [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7], [-1.0, -1.0], [-1.0, -1.0]],
-            [[0.5, 0.5], [0.3, 0.3], [0.2, 0.2], [0.3, 0.3], [0.5, 0.5]],
+            [[0.2, 0.2], [0.5, 0.5], [-1.0, -1.0], [-1.0, -1.0], [0.95, 0.95]],
+            [[0.5, 0.5], [0.3, 0.3], [0.3, 0.3], [0.5, 0.5], [np.inf, np.inf]],
         ),
     ],
 )
@@ -886,8 +886,8 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     "states, actions",
     [
         (
-            [[0.3, 0.3], [0.5, 0.5], [0.7, 0.7], [0.05, 0.2], [0.05, 0.05]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2], [-1, -1], [-1, -1]],
+            [[0.3, 0.3], [0.5, 0.5], [1.0, 1.0], [0.05, 0.2], [0.05, 0.05]],
+            [[0.2, 0.2], [0.2, 0.2], [0.5, 0.5], [-1, -1], [-1, -1]],
         ),
     ],
 )

From 392e958c264371e1c9a2bededa57168c4c53418e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Tue, 19 Sep 2023 23:34:55 -0400
Subject: [PATCH 159/206] Make BTS an actual continuous action again.

---
 gflownet/envs/cube.py             | 76 ++++++++++++++++---------------
 tests/gflownet/envs/test_ccube.py | 63 ++++++++++++-------------
 2 files changed, 72 insertions(+), 67 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 317b53db4..e3746ad1d 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -290,16 +290,13 @@ def get_action_space(self):
 
         EOS is indicated by np.inf for all dimensions.
 
-        BTS (back to source) is indicated by -1 for all dimensions.
-
-        This method defines self.eos, self.bts and the returned action space is simply
+        This method defines self.eos and the returned action space is simply
         a representative (arbitrary) action with an increment of 0.0 in all dimensions,
-        EOS and BTS.
+        and EOS.
         """
         self.eos = tuple([np.inf] * self.n_dim)
-        self.bts = tuple([-1] * self.n_dim)
         self.representative_action = tuple([0.0] * self.n_dim)
-        return [self.representative_action, self.bts, self.eos]
+        return [self.representative_action, self.eos]
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
@@ -774,7 +771,7 @@ def _sample_actions_batch_backward(
 
         The continuous distribution to sample the continuous action described above
         must be mixed with the discrete distribution to model the sampling of the back
-        to source (BST) action. While the BST action is also a continuous action, it
+        to source (BTS) action. While the BTS action is also a continuous action, it
         needs to be modelled with a (discrete) Bernoulli distribution in order to
         ensure that this action has positive likelihood.
 
@@ -828,9 +825,11 @@ def _sample_actions_batch_backward(
         if torch.any(do_increments):
             actions_tensor[do_increments] = increments
         if torch.any(is_bts):
-            actions_tensor[is_bts] = tfloat(
-                self.bts, float_type=self.float, device=self.device
-            )
+            # BTS actions are equal to the originating states
+            actions_bts = tfloat(
+                states_from, float_type=self.float, device=self.device
+            )[is_bts]
+            actions_tensor[is_bts] = actions_bts
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
@@ -984,7 +983,6 @@ def _get_logprobs_backward(
         jacobian_diag = torch.ones(
             (n_states, self.n_dim), device=self.device, dtype=self.float
         )
-        bts_tensor = tfloat(self.bts, float_type=self.float, device=self.device)
         # EOS is the only possible action only if done is True (mask[2] is False)
         is_eos = ~mask[:, 2]
         # Back-to-source (BTS) is the only possible action if mask[1] is False
@@ -993,8 +991,11 @@ def _get_logprobs_backward(
         # Get sampled BTS actions and get log probs from Bernoulli distribution
         do_bts = torch.logical_and(~is_bts_forced, ~is_eos)
         if torch.any(do_bts):
+            # BTS actions are equal to the originating states
             is_bts_sampled = torch.zeros_like(do_bts)
-            is_bts_sampled[do_bts] = torch.all(actions[do_bts] == bts_tensor, dim=1)
+            is_bts_sampled[do_bts] = torch.all(
+                actions[do_bts] == states_from_tensor[do_bts], dim=1
+            )
             is_bts[is_bts_sampled] = True
             logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
             distr_bts = Bernoulli(logits=logits_bts)
@@ -1111,30 +1112,37 @@ def _step(
         for dim, incr in enumerate(action):
             if backward:
                 self.state[dim] -= incr
+                # Add extra dimension in action to mark BTS.
+                if self.isclose(
+                    self.state, [0.0 for _ in range(self.n_dim)], atol=self.epsilon
+                ):
+                    self.state = self.source
             else:
                 if self.state == self.source:
                     self.state = [0.0 for _ in range(self.n_dim)]
                 self.state[dim] += incr
-        if not all([s <= (1.0 + epsilon) for s in self.state]):
-            import ipdb
-
-            ipdb.set_trace()
-        assert all(
-            [s <= (1.0 + epsilon) for s in self.state]
-        ), f"""
-        State is out of cube bounds.
-        \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-        """
-        if not all([s >= (0.0 - epsilon) for s in self.state]):
-            import ipdb
-
-            ipdb.set_trace()
-        assert all(
-            [s >= (0.0 - epsilon) for s in self.state]
-        ), f"""
-        State is out of cube bounds.
-        \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-        """
+        # TODO: remove when always correct
+        if self.state != self.source:
+            if not all([s <= (1.0 + epsilon) for s in self.state]):
+                import ipdb
+
+                ipdb.set_trace()
+            assert all(
+                [s <= (1.0 + epsilon) for s in self.state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+            """
+            if not all([s >= (0.0 - epsilon) for s in self.state]):
+                import ipdb
+
+                ipdb.set_trace()
+            assert all(
+                [s >= (0.0 - epsilon) for s in self.state]
+            ), f"""
+            State is out of cube bounds.
+            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
+            """
         return self.state, action, True
 
     # TODO: make generic for continuous environments
@@ -1206,10 +1214,6 @@ def step_backwards(
             self.done = False
             self.n_actions += 1
             return self.state, action, True
-        if action == self.bts:
-            self.state = self.source
-            self.n_actions += 1
-            return self.state, action, True
         # Otherwise perform action
         assert action != self.eos
         self.n_actions += 1
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 5aefdf469..06e724324 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -23,7 +23,6 @@ def cube2d():
     [
         [
             (0.0, 0.0),
-            (-1.0, -1.0),
             (np.inf, np.inf),
         ],
     ],
@@ -445,12 +444,12 @@ def test__step_forward__2d__returns_expected(cube2d, state, action, state_expect
         ),
         (
             [0.1, 0.2],
-            (-1.0, -1.0),
+            (0.1, 0.2),
             [-1.0, -1.0],
         ),
         (
             [0.95, 0.0],
-            (-1.0, -1.0),
+            (0.95, 0.0),
             [-1.0, -1.0],
         ),
     ],
@@ -561,7 +560,7 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
 
 
 @pytest.mark.parametrize(
-    "states, force_bst",
+    "states, force_bts",
     [
         (
             [[1.0, 1.0], [1.0, 1.0], [0.3, 0.5], [0.27, 0.85], [0.56, 0.23]],
@@ -589,10 +588,10 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
         ),
     ],
 )
-def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bst):
+def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bts):
     env = cube2d
     n_states = len(states)
-    force_bst = tbool(force_bst, device=env.device)
+    force_bts = tbool(force_bts, device=env.device)
     # Get masks
     masks = tbool(
         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
@@ -613,26 +612,28 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bs
     mean_incr_rel = 0.9 * samples.mean()
     min_incr_rel = 0.9 * samples.min()
     max_incr_rel = 1.1 * samples.max()
-    # Define Bernoulli parameters for BST with deterministic probability
-    logit_force_bst = torch.inf
-    logit_force_nobst = -torch.inf
-    # Get BST actions
-    is_near_edge = states_torch < env.min_incr
-    is_bst_forced = torch.any(is_near_edge, dim=1)
-    is_bst = torch.logical_or(is_bst_forced, force_bst)
+    # Define Bernoulli parameters for BTS with deterministic probability
+    logit_force_bts = torch.inf
+    logit_force_nobts = -torch.inf
     # Estimate confident intervals of absolute actions
     increments_min = torch.full_like(
-        states_torch[~is_bst], min_incr_rel, dtype=env.float, device=env.device
+        states_torch, min_incr_rel, dtype=env.float, device=env.device
     )
     increments_max = torch.full_like(
-        states_torch[~is_bst], max_incr_rel, dtype=env.float, device=env.device
+        states_torch, max_incr_rel, dtype=env.float, device=env.device
     )
     increments_min = env.relative_to_absolute_increments(
-        states_torch[~is_bst], increments_min, is_backward=True
+        states_torch, increments_min, is_backward=True
     )
     increments_max = env.relative_to_absolute_increments(
-        states_torch[~is_bst], increments_max, is_backward=True
+        states_torch, increments_max, is_backward=True
     )
+    # Get BTS actions
+    is_near_edge = states_torch < env.min_incr
+    is_bts_forced = torch.any(is_near_edge, dim=1)
+    is_bts = torch.logical_or(is_bts_forced, force_bts)
+    increments_min[is_bts] = states_torch[is_bts]
+    increments_max[is_bts] = states_torch[is_bts]
     # Reconfigure environment
     env.n_comp = 1
     env.beta_params_min = beta_params_min
@@ -641,18 +642,18 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bs
     params = env.fixed_distr_params
     params["beta_alpha"] = alpha
     params["beta_beta"] = beta
-    params["bernoulli_source_logit"] = logit_force_nobst
+    params["bernoulli_source_logit"] = logit_force_nobts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    policy_outputs[force_bst, -2] = logit_force_bst
+    policy_outputs[force_bts, -2] = logit_force_bts
     # Sample actions
     actions, _ = env.sample_actions_batch(
         policy_outputs, masks, states, is_backward=True
     )
     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-    actions_bst = torch.all(actions_tensor == -1, dim=1)
-    assert torch.all(actions_bst == is_bst)
-    assert torch.all(actions_tensor[~is_bst] >= increments_min)
-    assert torch.all(actions_tensor[~is_bst] <= increments_max)
+    actions_bts = torch.all(actions_tensor == states_torch, dim=1)
+    assert torch.all(actions_bts == is_bts)
+    assert torch.all(actions_tensor >= increments_min)
+    assert torch.all(actions_tensor <= increments_max)
 
 
 @pytest.mark.parametrize(
@@ -844,6 +845,7 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     assert torch.all(torch.isfinite(logprobs))
 
 
+# TODO: improve or remove
 @pytest.mark.parametrize(
     "states, actions",
     [
@@ -878,16 +880,16 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=False
     )
-    import ipdb; ipdb.set_trace()
     assert True
 
 
+# TODO: improve or remove
 @pytest.mark.parametrize(
     "states, actions",
     [
         (
             [[0.3, 0.3], [0.5, 0.5], [1.0, 1.0], [0.05, 0.2], [0.05, 0.05]],
-            [[0.2, 0.2], [0.2, 0.2], [0.5, 0.5], [-1, -1], [-1, -1]],
+            [[0.2, 0.2], [0.2, 0.2], [0.5, 0.5], [0.05, 0.2], [0.05, 0.05]],
         ),
     ],
 )
@@ -913,7 +915,6 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=True
     )
-    import ipdb; ipdb.set_trace()
     assert True
 
 
@@ -922,11 +923,11 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     [
         (
             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-            [[-1, -1], [-1, -1], [-1, -1]],
+            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
         ),
         (
             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-            [[-1, -1], [-1, -1], [-1, -1]],
+            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
         ),
     ],
 )
@@ -960,15 +961,15 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
     [
         (
             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-            [[-1, -1], [-1, -1], [-1, -1]],
+            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
         ),
         (
             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-            [[-1, -1], [-1, -1], [-1, -1]],
+            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
         ),
         (
             [[1.0, 1.0], [0.0, 0.0]],
-            [[-1, -1], [-1, -1]],
+            [[1.0, 1.0], [0.0, 0.0]],
         ),
     ],
 )

From 1500c349cdbdf2974b26703b6a7f8fefe9ba9996 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 09:33:20 -0400
Subject: [PATCH 160/206] Add dimension to actions to indicate whether action
 is from/to source.

---
 gflownet/envs/cube.py             | 53 +++++++++++++++++++------------
 tests/gflownet/envs/test_ccube.py |  4 +--
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e3746ad1d..d69643a15 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -285,8 +285,9 @@ def get_action_space(self):
         """
         The action space is continuous, thus not defined as such here.
 
-        The actions are tuples of length n_dim, where the value at position d indicates
-        the increment of dimension d.
+        The actions are tuples of length n_dim + 1, where the value at position d
+        indicates the increment of dimension d, and the value at position -1 indicates
+        whether the action is from or to source (1), or 0 otherwise.
 
         EOS is indicated by np.inf for all dimensions.
 
@@ -294,8 +295,9 @@ def get_action_space(self):
         a representative (arbitrary) action with an increment of 0.0 in all dimensions,
         and EOS.
         """
-        self.eos = tuple([np.inf] * self.n_dim)
-        self.representative_action = tuple([0.0] * self.n_dim)
+        actions_dim = self.n_dim + 1
+        self.eos = tuple([np.inf] * actions_dim)
+        self.representative_action = tuple([0.0] * actions_dim)
         return [self.representative_action, self.eos]
 
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
@@ -736,10 +738,14 @@ def _sample_actions_batch_forward(
             )
         # Build actions
         actions_tensor = torch.full(
-            (n_states, self.n_dim), torch.inf, dtype=self.float, device=self.device
+            (n_states, self.n_dim + 1), torch.inf, dtype=self.float, device=self.device
         )
         if torch.any(do_increments):
+            increments = torch.cat(
+                (increments, torch.zeros((increments.shape[0], 1))), dim=1
+            )
             actions_tensor[do_increments] = increments
+        actions_tensor[is_source, -1] = 1
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
 
@@ -817,18 +823,24 @@ def _sample_actions_batch_backward(
             )
         # Build actions
         actions_tensor = torch.zeros(
-            (n_states, self.n_dim), dtype=self.float, device=self.device
+            (n_states, self.n_dim + 1), dtype=self.float, device=self.device
         )
         actions_tensor[is_eos] = tfloat(
             self.eos, float_type=self.float, device=self.device
         )
         if torch.any(do_increments):
+            increments = torch.cat(
+                (increments, torch.zeros((increments.shape[0], 1))), dim=1
+            )
             actions_tensor[do_increments] = increments
         if torch.any(is_bts):
             # BTS actions are equal to the originating states
             actions_bts = tfloat(
                 states_from, float_type=self.float, device=self.device
             )[is_bts]
+            actions_bts = torch.cat(
+                (actions_bts, torch.ones((actions_bts.shape[0], 1))), dim=1
+            )
             actions_tensor[is_bts] = actions_bts
         actions = [tuple(a.tolist()) for a in actions_tensor]
         return actions, None
@@ -836,7 +848,7 @@ def _sample_actions_batch_backward(
     def get_logprobs(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        actions: TensorType["n_states", "n_dim"],
+        actions: TensorType["n_states", "actions_dim"],
         mask: TensorType["n_states", "3"],
         states_from: List,
         is_backward: bool,
@@ -877,7 +889,7 @@ def get_logprobs(
     def _get_logprobs_forward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        actions: TensorType["n_states", "n_dim"],
+        actions: TensorType["n_states", "actions_dim"],
         mask: TensorType["n_states", "3"],
         states_from: List,
     ) -> TensorType["batch_size"]:
@@ -922,7 +934,7 @@ def _get_logprobs_forward(
         do_increments = ~is_eos
         if torch.any(do_increments):
             # Get absolute increments
-            increments = actions[do_increments]
+            increments = actions[do_increments, :-1]
             # Compute relative increments from absolute increments if state is not
             # source
             is_relative = ~is_source[do_increments]
@@ -963,7 +975,7 @@ def _get_logprobs_forward(
     def _get_logprobs_backward(
         self,
         policy_outputs: TensorType["n_states", "policy_output_dim"],
-        actions: TensorType["n_states", "n_dim"],
+        actions: TensorType["n_states", "actions_dim"],
         mask: TensorType["n_states", "3"],
         states_from: List,
     ) -> TensorType["batch_size"]:
@@ -994,7 +1006,7 @@ def _get_logprobs_backward(
             # BTS actions are equal to the originating states
             is_bts_sampled = torch.zeros_like(do_bts)
             is_bts_sampled[do_bts] = torch.all(
-                actions[do_bts] == states_from_tensor[do_bts], dim=1
+                actions[do_bts, :-1] == states_from_tensor[do_bts], dim=1
             )
             is_bts[is_bts_sampled] = True
             logits_bts = self._get_policy_source_logit(policy_outputs)[do_bts]
@@ -1006,7 +1018,7 @@ def _get_logprobs_backward(
         do_increments = torch.logical_and(~is_bts, ~is_eos)
         if torch.any(do_increments):
             # Get absolute increments
-            increments = actions[do_increments]
+            increments = actions[do_increments, :-1]
             # Compute absolute increments from all sampled relative increments
             increments = self.absolute_to_relative_increments(
                 states_from_tensor[do_increments],
@@ -1109,18 +1121,19 @@ def _step(
             root state
         """
         epsilon = 1e-9
-        for dim, incr in enumerate(action):
+        # If forward action is from source, initialize state to all zeros.
+        if not backward and action[-1] == 1:
+            self.state = [0.0 for _ in range(self.n_dim)]
+        # Increment dimensions
+        for dim, incr in enumerate(action[:-1]):
             if backward:
                 self.state[dim] -= incr
-                # Add extra dimension in action to mark BTS.
-                if self.isclose(
-                    self.state, [0.0 for _ in range(self.n_dim)], atol=self.epsilon
-                ):
-                    self.state = self.source
             else:
-                if self.state == self.source:
-                    self.state = [0.0 for _ in range(self.n_dim)]
                 self.state[dim] += incr
+        # If backward action is to source, set state to source
+        if backward and action[-1] == 1:
+            self.state = self.source
+
         # TODO: remove when always correct
         if self.state != self.source:
             if not all([s <= (1.0 + epsilon) for s in self.state]):
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 06e724324..bdaafe1c6 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -22,8 +22,8 @@ def cube2d():
     "action_space",
     [
         [
-            (0.0, 0.0),
-            (np.inf, np.inf),
+            (0.0, 0.0, 0.0),
+            (np.inf, np.inf, np.inf),
         ],
     ],
 )

From 85de762be6c71696407294dd9bc58470df24a643 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 09:51:25 -0400
Subject: [PATCH 161/206] Add epsilon to get_grid_terminating_states() and
 get_uniform_terminating_states()

---
 gflownet/envs/cube.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index d69643a15..e8919e46e 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1233,19 +1233,25 @@ def step_backwards(
         self._step(action, backward=True)
         return self.state, action, True
 
-    def get_grid_terminating_states(self, n_states: int) -> List[List]:
+    def get_grid_terminating_states(
+        self, n_states: int, epsilon: float = 1e-6
+    ) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
-        linspaces = [np.linspace(0.0, 1.0, n_per_dim) for _ in range(self.n_dim)]
+        linspaces = [
+            np.linspace(epsilon, 1.0 - epsilon, n_per_dim) for _ in range(self.n_dim)
+        ]
         states = list(itertools.product(*linspaces))
         # TODO: check if necessary
         states = [list(el) for el in states]
         return states
 
     def get_uniform_terminating_states(
-        self, n_states: int, seed: int = None
+        self, n_states: int, seed: int = None, epsilon: float = 1e-6
     ) -> List[List]:
         rng = np.random.default_rng(seed)
-        states = rng.uniform(low=0.0, high=1.0, size=(n_states, self.n_dim))
+        states = rng.uniform(
+            low=epsilon, high=1.0 - epsilon, size=(n_states, self.n_dim)
+        )
         return states.tolist()
 
     # TODO: make generic for all environments

From 391dee6773a1ba8b8710e70a66802f2db4126e7d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 10:30:03 -0400
Subject: [PATCH 162/206] Adjust tests

---
 tests/gflownet/envs/test_ccube.py | 106 ++++++++++++++++++------------
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index bdaafe1c6..d81e9b805 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -362,57 +362,57 @@ def test__relative_to_absolute_increments__2d_backward__returns_expected(
     [
         (
             [-1.0, -1.0],
-            (0.5, 0.5),
+            (0.5, 0.5, 1.0),
             [0.5, 0.5],
         ),
         (
             [-1.0, -1.0],
-            (0.0, 0.0),
+            (0.0, 0.0, 1.0),
             [0.0, 0.0],
         ),
         (
             [-1.0, -1.0],
-            (0.1794, 0.9589),
+            (0.1794, 0.9589, 1.0),
             [0.1794, 0.9589],
         ),
         (
             [0.0, 0.0],
-            (0.1, 0.1),
+            (0.1, 0.1, 0.0),
             [0.1, 0.1],
         ),
         (
             [0.0, 0.0],
-            (0.1794, 0.9589),
+            (0.1794, 0.9589, 0.0),
             [0.1794, 0.9589],
         ),
         (
             [0.3, 0.5],
-            (0.1, 0.1),
+            (0.1, 0.1, 0.0),
             [0.4, 0.6],
         ),
         (
             [0.3, 0.5],
-            (0.7, 0.5),
+            (0.7, 0.5, 0.0),
             [1.0, 1.0],
         ),
         (
             [0.3, 0.5],
-            (0.4, 0.3),
+            (0.4, 0.3, 0.0),
             [0.7, 0.8],
         ),
         (
             [0.27, 0.85],
-            (0.1756, 0.138),
+            (0.1756, 0.138, 0.0),
             [0.4456, 0.988],
         ),
         (
             [0.45, 0.27],
-            (np.inf, np.inf),
+            (np.inf, np.inf, np.inf),
             [0.45, 0.27],
         ),
         (
             [0.0, 0.0],
-            (np.inf, np.inf),
+            (np.inf, np.inf, np.inf),
             [0.0, 0.0],
         ),
     ],
@@ -429,27 +429,27 @@ def test__step_forward__2d__returns_expected(cube2d, state, action, state_expect
     [
         (
             [0.5, 0.9],
-            (0.3, 0.2),
+            (0.3, 0.2, 0.0),
             [0.2, 0.7],
         ),
         (
             [0.95, 0.4456],
-            (0.1, 0.27),
+            (0.1, 0.27, 0.0),
             [0.85, 0.1756],
         ),
         (
             [0.1, 0.2],
-            (0.1, 0.1),
+            (0.1, 0.1, 0.0),
             [0.0, 0.1],
         ),
         (
             [0.1, 0.2],
-            (0.1, 0.2),
+            (0.1, 0.2, 1.0),
             [-1.0, -1.0],
         ),
         (
             [0.95, 0.0],
-            (0.95, 0.0),
+            (0.95, 0.0, 1.0),
             [-1.0, -1.0],
         ),
     ],
@@ -555,8 +555,8 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
     actions_eos = torch.all(actions_tensor == torch.inf, dim=1)
     assert torch.all(actions_eos == is_eos)
-    assert torch.all(actions_tensor >= increments_min)
-    assert torch.all(actions_tensor <= increments_max)
+    assert torch.all(actions_tensor[:, :-1] >= increments_min)
+    assert torch.all(actions_tensor[:, :-1] <= increments_max)
 
 
 @pytest.mark.parametrize(
@@ -650,10 +650,10 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
         policy_outputs, masks, states, is_backward=True
     )
     actions_tensor = tfloat(actions, float_type=env.float, device=env.device)
-    actions_bts = torch.all(actions_tensor == states_torch, dim=1)
+    actions_bts = torch.all(actions_tensor[:, :-1] == states_torch, dim=1)
     assert torch.all(actions_bts == is_bts)
-    assert torch.all(actions_tensor >= increments_min)
-    assert torch.all(actions_tensor <= increments_max)
+    assert torch.all(actions_tensor[:, :-1] >= increments_min)
+    assert torch.all(actions_tensor[:, :-1] <= increments_max)
 
 
 @pytest.mark.parametrize(
@@ -661,11 +661,11 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
     [
         (
             [[0.95, 0.97], [0.96, 0.5], [0.5, 0.96]],
-            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+            [[0.02, 0.01, 0.0], [0.01, 0.2, 0.0], [0.3, 0.01, 0.0]],
         ),
         (
             [[0.95, 0.97], [0.901, 0.5], [1.0, 1.0]],
-            [[np.inf, np.inf], [0.01, 0.2], [0.3, 0.01]],
+            [[np.inf, np.inf, np.inf], [0.01, 0.2, 0.0], [0.3, 0.01, 0.0]],
         ),
     ],
 )
@@ -699,11 +699,19 @@ def test__get_logprobs_forward__2d__nearedge_returns_prob1(cube2d, states, actio
     [
         (
             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+            [
+                [np.inf, np.inf, np.inf],
+                [np.inf, np.inf, np.inf],
+                [np.inf, np.inf, np.inf],
+            ],
         ),
         (
             [[1.0, 1.0], [0.01, 0.01], [0.001, 0.1]],
-            [[np.inf, np.inf], [np.inf, np.inf], [np.inf, np.inf]],
+            [
+                [np.inf, np.inf, np.inf],
+                [np.inf, np.inf, np.inf],
+                [np.inf, np.inf, np.inf],
+            ],
         ),
     ],
 )
@@ -745,9 +753,9 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
 @pytest.mark.parametrize(
     "actions",
     [
-        [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-        [[0.999, 0.999], [0.0001, 0.0001], [0.5, 0.5]],
-        [[0.0, 0.0], [1.0, 1.0]],
+        [[0.1, 0.2, 1.0], [0.3, 0.5, 1.0], [0.5, 0.95, 1.0]],
+        [[0.999, 0.999, 1.0], [0.0001, 0.0001, 1.0], [0.5, 0.5, 1.0]],
+        [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
     ],
 )
 def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1(
@@ -796,23 +804,23 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     [
         (
             [[0.2, 0.2], [0.5, 0.5], [0.7, 0.7]],
-            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+            [[0.1, 0.1, 0.0], [0.1, 0.1, 0.0], [0.1, 0.1, 0.0]],
         ),
         (
             [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
-            [[0.2988, 0.3585], [0.2, 0.3], [0.11, 0.1001]],
+            [[0.2988, 0.3585, 0.0], [0.2, 0.3, 0.0], [0.11, 0.1001, 0.0]],
         ),
         (
             [[-1.0, -1.0], [-1.0, -1.0], [-1.0, -1.0]],
-            [[0.2988, 0.3585], [0.2, 0.3], [0.11, 0.1001]],
+            [[0.2988, 0.3585, 1.0], [0.2, 0.3, 1.0], [0.11, 0.1001, 1.0]],
         ),
         (
             [[0.6384, 0.4577], [0.5, 0.5], [0.7, 0.7]],
-            [[0.2988, 0.3585], [0.1, 0.1], [0.1, 0.1]],
+            [[0.2988, 0.3585, 0.0], [0.1, 0.1, 0.0], [0.1, 0.1, 0.0]],
         ),
         (
             [[0.0, 0.0], [-1.0, -1.0], [0.0, 0.0]],
-            [[0.1, 0.2], [0.001, 0.001], [0.5, 0.5]],
+            [[0.1, 0.2, 0.0], [0.001, 0.001, 1.0], [0.5, 0.5, 0.0]],
         ),
     ],
 )
@@ -851,7 +859,13 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     [
         (
             [[0.2, 0.2], [0.5, 0.5], [-1.0, -1.0], [-1.0, -1.0], [0.95, 0.95]],
-            [[0.5, 0.5], [0.3, 0.3], [0.3, 0.3], [0.5, 0.5], [np.inf, np.inf]],
+            [
+                [0.5, 0.5, 0.0],
+                [0.3, 0.3, 0.0],
+                [0.3, 0.3, 1.0],
+                [0.5, 0.5, 1.0],
+                [np.inf, np.inf, np.inf],
+            ],
         ),
     ],
 )
@@ -889,7 +903,13 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     [
         (
             [[0.3, 0.3], [0.5, 0.5], [1.0, 1.0], [0.05, 0.2], [0.05, 0.05]],
-            [[0.2, 0.2], [0.2, 0.2], [0.5, 0.5], [0.05, 0.2], [0.05, 0.05]],
+            [
+                [0.2, 0.2, 0.0],
+                [0.2, 0.2, 0.0],
+                [0.5, 0.5, 0.0],
+                [0.05, 0.2, 1.0],
+                [0.05, 0.05, 1.0],
+            ],
         ),
     ],
 )
@@ -923,11 +943,11 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     [
         (
             [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
-            [[0.02, 0.01], [0.01, 0.2], [0.3, 0.01]],
+            [[0.02, 0.01, 1.0], [0.01, 0.2, 1.0], [0.3, 0.01, 1.0]],
         ),
         (
             [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
-            [[0.0, 0.0], [0.0, 0.2], [0.3, 0.0]],
+            [[0.0, 0.0, 1.0], [0.0, 0.2, 1.0], [0.3, 0.0, 1.0]],
         ),
     ],
 )
@@ -961,15 +981,15 @@ def test__get_logprobs_backward__2d__nearedge_returns_prob1(cube2d, states, acti
     [
         (
             [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
-            [[0.1, 0.2], [0.3, 0.5], [0.5, 0.95]],
+            [[0.1, 0.2, 1.0], [0.3, 0.5, 1.0], [0.5, 0.95, 1.0]],
         ),
         (
             [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
-            [[0.99, 0.99], [0.01, 0.01], [0.001, 0.1]],
+            [[0.99, 0.99, 1.0], [0.01, 0.01, 1.0], [0.001, 0.1, 1.0]],
         ),
         (
             [[1.0, 1.0], [0.0, 0.0]],
-            [[1.0, 1.0], [0.0, 0.0]],
+            [[1.0, 1.0, 1.0], [0.0, 0.0, 1.0]],
         ),
     ],
 )
@@ -1013,15 +1033,15 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     [
         (
             [[0.3, 0.3], [0.5, 0.5], [0.8, 0.8]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+            [[0.2, 0.2, 0.0], [0.2, 0.2, 0.0], [0.2, 0.2, 0.0]],
         ),
         (
             [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]],
-            [[0.2, 0.2], [0.2, 0.2], [0.2, 0.2]],
+            [[0.2, 0.2, 0.0], [0.2, 0.2, 0.0], [0.2, 0.2, 0.0]],
         ),
         (
             [[1.0, 1.0], [0.5, 0.5], [0.3, 0.3]],
-            [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]],
+            [[0.1, 0.1, 0.0], [0.1, 0.1, 0.0], [0.1, 0.1, 0.0]],
         ),
     ],
 )

From 663b989c105b99fc9c012fa8c80291688061d495 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 10:30:30 -0400
Subject: [PATCH 163/206] Make default epsilon of data sets 1e-3

---
 gflownet/envs/cube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index e8919e46e..35f66ff7e 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1234,7 +1234,7 @@ def step_backwards(
         return self.state, action, True
 
     def get_grid_terminating_states(
-        self, n_states: int, epsilon: float = 1e-6
+        self, n_states: int, epsilon: float = 1e-3
     ) -> List[List]:
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
         linspaces = [
@@ -1246,7 +1246,7 @@ def get_grid_terminating_states(
         return states
 
     def get_uniform_terminating_states(
-        self, n_states: int, seed: int = None, epsilon: float = 1e-6
+        self, n_states: int, seed: int = None, epsilon: float = 1e-3
     ) -> List[List]:
         rng = np.random.default_rng(seed)
         states = rng.uniform(

From 9b63a061357962b0807b9ba71393ffacce08c467 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 10:47:21 -0400
Subject: [PATCH 164/206] Add docstrings

---
 gflownet/envs/cube.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 35f66ff7e..9fb2693d3 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1236,18 +1236,47 @@ def step_backwards(
     def get_grid_terminating_states(
         self, n_states: int, epsilon: float = 1e-3
     ) -> List[List]:
+        """
+        Constructs a grid of terminating states within the range of the hyper-cube.
+
+        Args
+        ----
+        n_states : int
+            Requested number of states. The actual number of states will be rounded up
+            such that all dimensions have the same number of states.
+
+        epsilon : float
+            Small constant indicating the distance to the theoretical limits of the
+            cube [0, 1], in order to avoid innacuracies in the computation of the log
+            probabilities due to clamping. The grid will thus be in [epsilon, 1 -
+            epsilon]
+        """
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
         linspaces = [
             np.linspace(epsilon, 1.0 - epsilon, n_per_dim) for _ in range(self.n_dim)
         ]
         states = list(itertools.product(*linspaces))
-        # TODO: check if necessary
         states = [list(el) for el in states]
         return states
 
     def get_uniform_terminating_states(
         self, n_states: int, seed: int = None, epsilon: float = 1e-3
     ) -> List[List]:
+        """
+        Constructs a set of terminating states sampled uniformly within the range of
+        the hyper-cube.
+
+        Args
+        ----
+        n_states : int
+            Number of states in the returned list.
+
+        epsilon : float
+            Small constant indicating the distance to the theoretical limits of the
+            cube [0, 1], in order to avoid innacuracies in the computation of the log
+            probabilities due to clamping. The states will thus be uniformly sampled in
+            [epsilon, 1 - epsilon]
+        """
         rng = np.random.default_rng(seed)
         states = rng.uniform(
             low=epsilon, high=1.0 - epsilon, size=(n_states, self.n_dim)

From 015bff877b7753cbf46851fc269514c29b0dd8fe Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 10:54:15 -0400
Subject: [PATCH 165/206] Small adjustments

---
 gflownet/envs/cube.py             | 19 +++++--------------
 tests/gflownet/envs/test_ccube.py | 11 ++++-------
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 9fb2693d3..a663f03db 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -1120,7 +1120,6 @@ def _step(
             False, if the action is not allowed for the current state, e.g. stop at the
             root state
         """
-        epsilon = 1e-9
         # If forward action is from source, initialize state to all zeros.
         if not backward and action[-1] == 1:
             self.state = [0.0 for _ in range(self.n_dim)]
@@ -1134,31 +1133,23 @@ def _step(
         if backward and action[-1] == 1:
             self.state = self.source
 
-        # TODO: remove when always correct
+        # Check that state is within bounds
         if self.state != self.source:
-            if not all([s <= (1.0 + epsilon) for s in self.state]):
-                import ipdb
-
-                ipdb.set_trace()
             assert all(
-                [s <= (1.0 + epsilon) for s in self.state]
+                [s <= 1.0 for s in self.state]
             ), f"""
             State is out of cube bounds.
             \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
             """
-            if not all([s >= (0.0 - epsilon) for s in self.state]):
-                import ipdb
-
-                ipdb.set_trace()
             assert all(
-                [s >= (0.0 - epsilon) for s in self.state]
+                [s >= 0.0 for s in self.state]
             ), f"""
             State is out of cube bounds.
             \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
             """
         return self.state, action, True
 
-    # TODO: make generic for continuous environments
+    # TODO: make generic for continuous environments?
     def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bool]:
         """
         Executes step given an action. An action is the absolute increment of each
@@ -1195,7 +1186,7 @@ def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bo
             self._step(action, backward=False)
             return self.state, action, True
 
-    # TODO: make generic for continuous environments
+    # TODO: make generic for continuous environments?
     def step_backwards(
         self, action: Tuple[int, float]
     ) -> Tuple[List[float], Tuple[int, float], bool]:
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index d81e9b805..ff1fa3a51 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -853,7 +853,6 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     assert torch.all(torch.isfinite(logprobs))
 
 
-# TODO: improve or remove
 @pytest.mark.parametrize(
     "states, actions",
     [
@@ -869,7 +868,7 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
         ),
     ],
 )
-def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
+def test__get_logprobs_forward__2d__is_finite(cube2d, states, actions):
     env = cube2d
     n_states = len(states)
     states_torch = tfloat(states, float_type=env.float, device=env.device)
@@ -894,10 +893,8 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=False
     )
-    assert True
-
+    assert torch.all(torch.isfinite(logprobs))
 
-# TODO: improve or remove
 @pytest.mark.parametrize(
     "states, actions",
     [
@@ -913,7 +910,7 @@ def test__get_logprobs_forward__2d__as_expected(cube2d, states, actions):
         ),
     ],
 )
-def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
+def test__get_logprobs_backward__2d__is_finite(cube2d, states, actions):
     env = cube2d
     n_states = len(states)
     states_torch = tfloat(states, float_type=env.float, device=env.device)
@@ -935,7 +932,7 @@ def test__get_logprobs_backward__2d__as_expected(cube2d, states, actions):
     logprobs = env.get_logprobs(
         policy_outputs, actions, masks, states_torch, is_backward=True
     )
-    assert True
+    assert torch.all(torch.isfinite(logprobs))
 
 
 @pytest.mark.parametrize(

From 3f0ddf6526a46aa9dcbe0610e072cee7e222b4c1 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Wed, 20 Sep 2023 14:31:03 -0400
Subject: [PATCH 166/206] Add functionality so that config distribution
 parameters are probs instead of logits and the actual alpha and beta that
 will go in the Beta policy

---
 config/env/ccube.yaml                         | 21 ++---
 .../experiments/ccube/ccube_pigeon_new.yaml   | 76 +++++++++++++++++++
 gflownet/envs/cube.py                         | 67 ++++++++++++----
 3 files changed, 139 insertions(+), 25 deletions(-)
 create mode 100644 config/experiments/ccube/ccube_pigeon_new.yaml

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index dc98126f0..eeb40af67 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -15,18 +15,21 @@ beta_params_max: 1000.0
 min_incr: 0.1
 n_comp: 1
 fixed_distr_params:
+  beta_params_min: ${beta_params_min}
+  beta_params_max: ${beta_params_max}
   beta_weights: 1.0
-  beta_alpha: 0.01
-  beta_beta: 0.01
-  bernoulli_source_logit: 0.0
-  bernoulli_eos_logit: 0.0
+  beta_alpha: 10.0
+  beta_beta: 10.0
+  bernoulli_bts_prob: 0.1
+  bernoulli_eos_prob: 0.1
 random_distr_params:
+  beta_params_min: ${beta_params_min}
+  beta_params_max: ${beta_params_max}
   beta_weights: 1.0
-  # IMPORTANT: adjust because of sigmoid!
-  beta_alpha: 0.01
-  beta_beta: $beta_params_max
-  bernoulli_source_logit: 0.0
-  bernoulli_eos_logit: 0.0
+  beta_alpha: 1.0
+  beta_beta: 1.0
+  bernoulli_bts_prob: 0.1
+  bernoulli_eos_prob: 0.1
 # Buffer
 buffer:
   data_path: null
diff --git a/config/experiments/ccube/ccube_pigeon_new.yaml b/config/experiments/ccube/ccube_pigeon_new.yaml
new file mode 100644
index 000000000..cf31398d4
--- /dev/null
+++ b/config/experiments/ccube/ccube_pigeon_new.yaml
@@ -0,0 +1,76 @@
+# @package _global_
+# Like hawk but with logit for EOS from source
+
+defaults:
+   - override /env: ccube
+   - override /gflownet: trajectorybalance
+   - override /proxy: corners
+   - override /logger: wandb
+   - override /user: alex
+
+# Environment
+env:
+  n_comp: 5
+  n_dim: 2
+  beta_params_min: 0.01
+  beta_params_max: 100.0
+  min_incr: 0.1
+  fixed_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 88.09
+    beta_beta: 99.34
+    bernoulli_eos_prob: 0.7311
+    bernoulli_bts_prob: 0.7311
+  random_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 100.0
+    beta_beta: 100.0
+    bernoulli_eos_prob: 0.7311
+    bernoulli_bts_prob: 0.7311
+  reward_func: identity
+
+# GFlowNet hyperparameters
+gflownet:
+  random_action_prob: 0.1
+  optimizer:
+    batch_size:
+      forward: 100
+    lr: 0.0001
+    z_dim: 16
+    lr_z_mult: 100
+    n_train_steps: 10000
+  policy:
+    forward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      checkpoint: forward
+    backward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      shared_weights: False
+      checkpoint: backward
+
+# WandB
+logger:
+  lightweight: True
+  project_name: "GFlowNet Cube"
+  tags: 
+    - gflownet
+    - continuous
+    - ccube
+  test:
+    period: 500
+    n: 1000
+  checkpoints:
+    period: 500
+
+# Hydra
+hydra:
+  run:
+    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index a663f03db..7cf049a4f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -46,21 +46,23 @@ def __init__(
         n_dim: int = 2,
         min_incr: float = 0.1,
         n_comp: int = 1,
-        beta_params_min: float = 0.1,
-        beta_params_max: float = 1000.0,
         fixed_distr_params: dict = {
+            "beta_params_min": 0.1,
+            "beta_params_max": 1000.0,
             "beta_weights": 1.0,
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
-            "bernoulli_source_logit": 1.0,
-            "bernoulli_eos_logit": 1.0,
+            "bernoulli_bts_prob": 1.0,
+            "bernoulli_eos_prob": 1.0,
         },
         random_distr_params: dict = {
+            "beta_params_min": 0.1,
+            "beta_params_max": 1000.0,
             "beta_weights": 1.0,
             "beta_alpha": 1000.0,
             "beta_beta": 1000.0,
-            "bernoulli_source_logit": 1.0,
-            "bernoulli_eos_logit": 1.0,
+            "bernoulli_bts_prob": 1.0,
+            "bernoulli_eos_prob": 1.0,
         },
         **kwargs,
     ):
@@ -73,8 +75,8 @@ def __init__(
         self.min_incr = min_incr
         # Parameters of the policy distribution
         self.n_comp = n_comp
-        self.beta_params_min = beta_params_min
-        self.beta_params_max = beta_params_max
+        self.beta_params_min = fixed_distr_params["beta_params_min"]
+        self.beta_params_max = fixed_distr_params["beta_params_max"]
         # Source state is abstract - not included in the cube: -1 for all dimensions.
         self.source = [-1 for _ in range(self.n_dim)]
         # Small constant to clamp the inputs to the beta distribution
@@ -246,6 +248,31 @@ def step(
         """
         pass
 
+    def _beta_params_to_policy_outputs(self, param: str, params_dict: dict):
+        """
+        Maps the values of alpha and beta given in the configuration to new values such
+        that when passed to _make_increments_distribution, the actual alpha and beta
+        passed to the Beta distribution(s) are the ones from the configuration.
+
+        Args
+        ----
+        param : str
+            Name of the parameter to transform: alpha or beta
+
+        params_dict : dict
+            Dictionary with the complete set of parameters of the distribution.
+
+        See
+        ---
+        _make_increments_distribution()
+        """
+        param_min = params_dict["beta_params_min"]
+        param_max = params_dict["beta_params_max"]
+        param_value = tfloat(
+            params_dict[f"beta_{param}"], float_type=self.float, device=self.device
+        )
+        return torch.logit((param_value - param_min) / param_max)
+
 
 class ContinuousCube(CubeBase):
     """
@@ -337,22 +364,30 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
             device=self.device,
         )
         policy_output_cont[0::3] = params["beta_weights"]
-        policy_output_cont[1::3] = params["beta_alpha"]
-        policy_output_cont[2::3] = params["beta_beta"]
+        policy_output_cont[1::3] = self._beta_params_to_policy_outputs("alpha", params)
+        policy_output_cont[2::3] = self._beta_params_to_policy_outputs("beta", params)
         # Logit for Bernoulli distribution to model EOS action
-        policy_output_eos = torch.tensor(
-            [params["bernoulli_eos_logit"]], dtype=self.float, device=self.device
+        policy_output_eos_logit = torch.logit(
+            tfloat(
+                [params["bernoulli_eos_prob"]],
+                float_type=self.float,
+                device=self.device,
+            )
         )
         # Logit for Bernoulli distribution to model back-to-source action
-        policy_output_source = torch.tensor(
-            [params["bernoulli_source_logit"]], dtype=self.float, device=self.device
+        policy_output_bts_logit = torch.logit(
+            tfloat(
+                [params["bernoulli_bts_prob"]],
+                float_type=self.float,
+                device=self.device,
+            )
         )
         # Concatenate all outputs
         policy_output = torch.cat(
             (
                 policy_output_cont,
-                policy_output_source,
-                policy_output_eos,
+                policy_output_bts_logit,
+                policy_output_eos_logit,
             )
         )
         return policy_output

From 8a00330f19500c3382a0ad33417154e0f1d7aaca Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 10:25:31 -0400
Subject: [PATCH 167/206] Fix config file.

---
 config/env/ccube.yaml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index eeb40af67..cb8abd15a 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -8,23 +8,22 @@ continuous: True
 func: corners
 # Dimensions of hypercube
 n_dim: 2
-max_val: 1.0
 # Policy
-beta_params_min: 0.01
-beta_params_max: 1000.0
 min_incr: 0.1
 n_comp: 1
+beta_params_min: 0.01
+beta_params_max: 1000.0
 fixed_distr_params:
-  beta_params_min: ${beta_params_min}
-  beta_params_max: ${beta_params_max}
+  beta_params_min: ${env.beta_params_min}
+  beta_params_max: ${env.beta_params_max}
   beta_weights: 1.0
   beta_alpha: 10.0
   beta_beta: 10.0
   bernoulli_bts_prob: 0.1
   bernoulli_eos_prob: 0.1
 random_distr_params:
-  beta_params_min: ${beta_params_min}
-  beta_params_max: ${beta_params_max}
+  beta_params_min: ${env.beta_params_min}
+  beta_params_max: ${env.beta_params_max}
   beta_weights: 1.0
   beta_alpha: 1.0
   beta_beta: 1.0

From 50046323f60a164b290e30a11d90abedea23e8b3 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 10:28:58 -0400
Subject: [PATCH 168/206] Add epsilon in attributes of class and config

---
 config/env/ccube.yaml | 1 +
 gflownet/envs/cube.py | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index cb8abd15a..7b8ae9b9e 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -11,6 +11,7 @@ n_dim: 2
 # Policy
 min_incr: 0.1
 n_comp: 1
+epsilon: 1e-6
 beta_params_min: 0.01
 beta_params_max: 1000.0
 fixed_distr_params:
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 7cf049a4f..2cae84b05 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -39,6 +39,11 @@ class CubeBase(GFlowNetEnv, ABC):
 
     n_comp : int
         Number of components in the mixture of Beta distributions.
+
+    epsilon : float
+        Small constant to control the clamping interval of the inputs to the
+        calculation of log probabilities. Clamping interval will be [epsilon, 1 -
+        epsilon].
     """
 
     def __init__(
@@ -46,6 +51,7 @@ def __init__(
         n_dim: int = 2,
         min_incr: float = 0.1,
         n_comp: int = 1,
+        epsilon: float: 1e-6
         fixed_distr_params: dict = {
             "beta_params_min": 0.1,
             "beta_params_max": 1000.0,
@@ -80,7 +86,7 @@ def __init__(
         # Source state is abstract - not included in the cube: -1 for all dimensions.
         self.source = [-1 for _ in range(self.n_dim)]
         # Small constant to clamp the inputs to the beta distribution
-        self.epsilon = 1e-6
+        self.epsilon = epsilon
         # Conversions: only conversions to policy are implemented and the rest are the
         # same
         self.state2proxy = self.state2policy

From 147a9b57ec12f0c58a7daea022e39a97569154df Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:12:23 -0400
Subject: [PATCH 169/206] Change default parameters of cube to a set that is
 both simple (2 components) and effective with the corners proxy.

---
 config/env/ccube.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 7b8ae9b9e..2e60b0bbe 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -12,8 +12,8 @@ n_dim: 2
 min_incr: 0.1
 n_comp: 1
 epsilon: 1e-6
-beta_params_min: 0.01
-beta_params_max: 1000.0
+beta_params_min: 0.1
+beta_params_max: 100.0
 fixed_distr_params:
   beta_params_min: ${env.beta_params_min}
   beta_params_max: ${env.beta_params_max}
@@ -26,8 +26,8 @@ random_distr_params:
   beta_params_min: ${env.beta_params_min}
   beta_params_max: ${env.beta_params_max}
   beta_weights: 1.0
-  beta_alpha: 1.0
-  beta_beta: 1.0
+  beta_alpha: 10.0
+  beta_beta: 10.0
   bernoulli_bts_prob: 0.1
   bernoulli_eos_prob: 0.1
 # Buffer
@@ -36,6 +36,6 @@ buffer:
   train: null
   test:
     type: grid
-    n: 1000
+    n: 900
     output_csv: ccube_test.csv
     output_pkl: ccube_test.pkl

From 117be0388e0ca1d5638205e64682480c328f228d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:23:34 -0400
Subject: [PATCH 170/206] Add kappa as attribute and config variable for test
 sets.

---
 config/env/ccube.yaml |  2 ++
 gflownet/envs/cube.py | 36 +++++++++++++++++++++++-------------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 2e60b0bbe..84af0c733 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -8,6 +8,8 @@ continuous: True
 func: corners
 # Dimensions of hypercube
 n_dim: 2
+# Constant to restrict interval of test sets
+kappa: 1e-3
 # Policy
 min_incr: 0.1
 n_comp: 1
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 2cae84b05..45bc44fca 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -43,7 +43,12 @@ class CubeBase(GFlowNetEnv, ABC):
     epsilon : float
         Small constant to control the clamping interval of the inputs to the
         calculation of log probabilities. Clamping interval will be [epsilon, 1 -
-        epsilon].
+        epsilon]. Default: 1e-6.
+
+    kappa : float
+        Small constant to control the intervals of the generated sets of states (in a
+        grid or uniformly). States will be in the interval [kappa, 1 - kappa]. Default:
+        1e-3.
     """
 
     def __init__(
@@ -51,7 +56,8 @@ def __init__(
         n_dim: int = 2,
         min_incr: float = 0.1,
         n_comp: int = 1,
-        epsilon: float: 1e-6
+        epsilon: float = 1e-6,
+        kappa: float = 1e-3,
         fixed_distr_params: dict = {
             "beta_params_min": 0.1,
             "beta_params_max": 1000.0,
@@ -87,6 +93,8 @@ def __init__(
         self.source = [-1 for _ in range(self.n_dim)]
         # Small constant to clamp the inputs to the beta distribution
         self.epsilon = epsilon
+        # Small constant to restrict the interval of (test) sets
+        self.kappa = kappa
         # Conversions: only conversions to policy are implemented and the rest are the
         # same
         self.state2proxy = self.state2policy
@@ -1266,7 +1274,7 @@ def step_backwards(
         return self.state, action, True
 
     def get_grid_terminating_states(
-        self, n_states: int, epsilon: float = 1e-3
+        self, n_states: int, kappa: Optional[float] = None
     ) -> List[List]:
         """
         Constructs a grid of terminating states within the range of the hyper-cube.
@@ -1277,22 +1285,24 @@ def get_grid_terminating_states(
             Requested number of states. The actual number of states will be rounded up
             such that all dimensions have the same number of states.
 
-        epsilon : float
+        kappa : float
             Small constant indicating the distance to the theoretical limits of the
             cube [0, 1], in order to avoid innacuracies in the computation of the log
-            probabilities due to clamping. The grid will thus be in [epsilon, 1 -
-            epsilon]
+            probabilities due to clamping. The grid will thus be in [kappa, 1 -
+            kappa]. If None, self.kappa will be used.
         """
+        if kappa is None:
+            kappa = self.kappa
         n_per_dim = int(np.ceil(n_states ** (1 / self.n_dim)))
         linspaces = [
-            np.linspace(epsilon, 1.0 - epsilon, n_per_dim) for _ in range(self.n_dim)
+            np.linspace(kappa, 1.0 - kappa, n_per_dim) for _ in range(self.n_dim)
         ]
         states = list(itertools.product(*linspaces))
         states = [list(el) for el in states]
         return states
 
     def get_uniform_terminating_states(
-        self, n_states: int, seed: int = None, epsilon: float = 1e-3
+        self, n_states: int, seed: int = None, kappa: Optional[float] = None
     ) -> List[List]:
         """
         Constructs a set of terminating states sampled uniformly within the range of
@@ -1303,16 +1313,16 @@ def get_uniform_terminating_states(
         n_states : int
             Number of states in the returned list.
 
-        epsilon : float
+        kappa : float
             Small constant indicating the distance to the theoretical limits of the
             cube [0, 1], in order to avoid innacuracies in the computation of the log
             probabilities due to clamping. The states will thus be uniformly sampled in
-            [epsilon, 1 - epsilon]
+            [kappa, 1 - kappa]. If None, self.kappa will be used.
         """
+        if kappa is None:
+            kappa = self.kappa
         rng = np.random.default_rng(seed)
-        states = rng.uniform(
-            low=epsilon, high=1.0 - epsilon, size=(n_states, self.n_dim)
-        )
+        states = rng.uniform(low=kappa, high=1.0 - kappa, size=(n_states, self.n_dim))
         return states.tolist()
 
     # TODO: make generic for all environments

From cbbae73c088ebc2bbfc924b8ef12f0543b041e34 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:26:58 -0400
Subject: [PATCH 171/206] Replace print by Exception if FM is used for
 continuous environments.

---
 gflownet/gflownet.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gflownet/gflownet.py b/gflownet/gflownet.py
index 87947f5cb..906936684 100644
--- a/gflownet/gflownet.py
+++ b/gflownet/gflownet.py
@@ -66,13 +66,12 @@ def __init__(
         # Continuous environments
         self.continuous = hasattr(self.env, "continuous") and self.env.continuous
         if self.continuous and optimizer.loss in ["flowmatch", "flowmatching"]:
-            print(
+            raise Exception(
                 """
             Flow matching loss is not available for continuous environments.
-            Trajectory balance will be used instead
+            You may use trajectory balance (gflownet=trajectorybalance) instead.
             """
             )
-            optimizer.loss = "tb"
         # Loss
         if optimizer.loss in ["flowmatch", "flowmatching"]:
             self.loss = "flowmatch"

From 10b04d1a5f0553ba0fb67353f4ce87f07f64cf09 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:44:29 -0400
Subject: [PATCH 172/206] Delete old config files

---
 config/experiments/ccube/ccube_owl.yaml       | 71 -----------------
 .../experiments/ccube/ccube_pigeon_new.yaml   | 76 -------------------
 2 files changed, 147 deletions(-)
 delete mode 100644 config/experiments/ccube/ccube_owl.yaml
 delete mode 100644 config/experiments/ccube/ccube_pigeon_new.yaml

diff --git a/config/experiments/ccube/ccube_owl.yaml b/config/experiments/ccube/ccube_owl.yaml
deleted file mode 100644
index 009d826d6..000000000
--- a/config/experiments/ccube/ccube_owl.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-# @package _global_
-
-defaults:
-   - override /env: ccube
-   - override /gflownet: trajectorybalance
-   - override /proxy: corners
-   - override /logger: wandb
-   - override /user: alex
-
-# Environment
-env:
-  n_comp: 5
-  n_dim: 2
-  beta_params_min: 0.01
-  beta_params_max: 100.0
-  min_incr: 0.1
-  fixed_distr_params:
-    beta_weights: 1.0
-    beta_alpha: 0.01
-    beta_beta: 0.01
-    bernoulli_source_logit: 1.0
-    bernoulli_eos_logit: 1.0
-  random_distr_params:
-    beta_weights: 1.0
-    beta_alpha: 0.01
-    beta_beta: 0.01
-    bernoulli_source_logit: 1.0
-    bernoulli_eos_logit: 1.0
-  reward_func: identity
-
-# GFlowNet hyperparameters
-gflownet:
-  random_action_prob: 0.1
-  optimizer:
-    batch_size:
-      forward: 100
-    lr: 0.0001
-    z_dim: 16
-    lr_z_mult: 100
-    n_train_steps: 10000
-  policy:
-    forward:
-      type: mlp
-      n_hid: 512
-      n_layers: 5
-      checkpoint: forward
-    backward:
-      type: mlp
-      n_hid: 512
-      n_layers: 5
-      shared_weights: False
-      checkpoint: backward
-
-# WandB
-logger:
-  lightweight: True
-  project_name: "GFlowNet Cube"
-  tags: 
-    - gflownet
-    - continuous
-    - ccube
-  test:
-    period: 500
-    n: 1000
-  checkpoints:
-    period: 500
-
-# Hydra
-hydra:
-  run:
-    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}
diff --git a/config/experiments/ccube/ccube_pigeon_new.yaml b/config/experiments/ccube/ccube_pigeon_new.yaml
deleted file mode 100644
index cf31398d4..000000000
--- a/config/experiments/ccube/ccube_pigeon_new.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# @package _global_
-# Like hawk but with logit for EOS from source
-
-defaults:
-   - override /env: ccube
-   - override /gflownet: trajectorybalance
-   - override /proxy: corners
-   - override /logger: wandb
-   - override /user: alex
-
-# Environment
-env:
-  n_comp: 5
-  n_dim: 2
-  beta_params_min: 0.01
-  beta_params_max: 100.0
-  min_incr: 0.1
-  fixed_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
-    beta_weights: 1.0
-    beta_alpha: 88.09
-    beta_beta: 99.34
-    bernoulli_eos_prob: 0.7311
-    bernoulli_bts_prob: 0.7311
-  random_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
-    beta_weights: 1.0
-    beta_alpha: 100.0
-    beta_beta: 100.0
-    bernoulli_eos_prob: 0.7311
-    bernoulli_bts_prob: 0.7311
-  reward_func: identity
-
-# GFlowNet hyperparameters
-gflownet:
-  random_action_prob: 0.1
-  optimizer:
-    batch_size:
-      forward: 100
-    lr: 0.0001
-    z_dim: 16
-    lr_z_mult: 100
-    n_train_steps: 10000
-  policy:
-    forward:
-      type: mlp
-      n_hid: 512
-      n_layers: 5
-      checkpoint: forward
-    backward:
-      type: mlp
-      n_hid: 512
-      n_layers: 5
-      shared_weights: False
-      checkpoint: backward
-
-# WandB
-logger:
-  lightweight: True
-  project_name: "GFlowNet Cube"
-  tags: 
-    - gflownet
-    - continuous
-    - ccube
-  test:
-    period: 500
-    n: 1000
-  checkpoints:
-    period: 500
-
-# Hydra
-hydra:
-  run:
-    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}

From 6156602bec379badbcd446de9ac08a1ba34dd379 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:47:39 -0400
Subject: [PATCH 173/206] Rename variable param_name <- param

---
 gflownet/envs/cube.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 45bc44fca..4a9f92b2b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -262,7 +262,7 @@ def step(
         """
         pass
 
-    def _beta_params_to_policy_outputs(self, param: str, params_dict: dict):
+    def _beta_params_to_policy_outputs(self, param_name: str, params_dict: dict):
         """
         Maps the values of alpha and beta given in the configuration to new values such
         that when passed to _make_increments_distribution, the actual alpha and beta
@@ -270,7 +270,7 @@ def _beta_params_to_policy_outputs(self, param: str, params_dict: dict):
 
         Args
         ----
-        param : str
+        param_name : str
             Name of the parameter to transform: alpha or beta
 
         params_dict : dict
@@ -283,7 +283,7 @@ def _beta_params_to_policy_outputs(self, param: str, params_dict: dict):
         param_min = params_dict["beta_params_min"]
         param_max = params_dict["beta_params_max"]
         param_value = tfloat(
-            params_dict[f"beta_{param}"], float_type=self.float, device=self.device
+            params_dict[f"beta_{param_name}"], float_type=self.float, device=self.device
         )
         return torch.logit((param_value - param_min) / param_max)
 

From 3f38836052f02266ec6746f850ae857d005b6eaa Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:51:05 -0400
Subject: [PATCH 174/206] Assert that increments from actions are finite.

---
 gflownet/envs/cube.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 4a9f92b2b..79351577b 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -984,6 +984,8 @@ def _get_logprobs_forward(
         if torch.any(do_increments):
             # Get absolute increments
             increments = actions[do_increments, :-1]
+            # Make sure increments are finite
+            assert torch.any(torch.isfinite(increments))
             # Compute relative increments from absolute increments if state is not
             # source
             is_relative = ~is_source[do_increments]
@@ -1068,6 +1070,8 @@ def _get_logprobs_backward(
         if torch.any(do_increments):
             # Get absolute increments
             increments = actions[do_increments, :-1]
+            # Make sure increments are finite
+            assert torch.any(torch.isfinite(increments))
             # Compute absolute increments from all sampled relative increments
             increments = self.absolute_to_relative_increments(
                 states_from_tensor[do_increments],

From 7ba15e4d3d3af3f9257effaaa232eb746623c67a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 15:55:14 -0400
Subject: [PATCH 175/206] Configuration files for the hypercube

---
 config/experiments/ccube/corners.yaml         |  77 ++++++
 .../hyperparams_search_20230920_batch1.yaml   | 151 +++++++++++
 .../hyperparams_search_20230920_batch2.yaml   | 151 +++++++++++
 .../hyperparams_search_20230920_batch3.yaml   | 151 +++++++++++
 .../hyperparams_search_20230920_batch4.yaml   | 241 ++++++++++++++++++
 config/experiments/ccube/uniform.yaml         |  77 ++++++
 6 files changed, 848 insertions(+)
 create mode 100644 config/experiments/ccube/corners.yaml
 create mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
 create mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
 create mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
 create mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
 create mode 100644 config/experiments/ccube/uniform.yaml

diff --git a/config/experiments/ccube/corners.yaml b/config/experiments/ccube/corners.yaml
new file mode 100644
index 000000000..d44564e16
--- /dev/null
+++ b/config/experiments/ccube/corners.yaml
@@ -0,0 +1,77 @@
+# @package _global_
+# A configuration that works well with the corners proxy.
+# wandb: https://wandb.ai/alexhg/cube/runs/9u2d3zzh
+
+defaults:
+   - override /env: ccube
+   - override /gflownet: trajectorybalance
+   - override /proxy: corners
+   - override /logger: wandb
+   - override /user: alex
+
+# Environment
+env:
+  n_comp: 5
+  n_dim: 2
+  beta_params_min: 0.1
+  beta_params_max: 100.0
+  min_incr: 0.1
+  fixed_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 10.0
+    beta_beta: 10.0
+    bernoulli_eos_prob: 0.1
+    bernoulli_bts_prob: 0.1
+  random_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 10.0
+    beta_beta: 10.0
+    bernoulli_eos_prob: 0.1
+    bernoulli_bts_prob: 0.1
+  reward_func: identity
+
+# GFlowNet hyperparameters
+gflownet:
+  random_action_prob: 0.1
+  optimizer:
+    batch_size:
+      forward: 100
+    lr: 0.0001
+    z_dim: 16
+    lr_z_mult: 100
+    n_train_steps: 10000
+  policy:
+    forward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      checkpoint: forward
+    backward:
+      type: mlp
+      n_hid: 512
+      n_layers: 5
+      shared_weights: False
+      checkpoint: backward
+
+# WandB
+logger:
+  lightweight: True
+  project_name: "cube"
+  tags: 
+    - gflownet
+    - continuous
+    - ccube
+  test:
+    period: 500
+    n: 1000
+  checkpoints:
+    period: 500
+
+# Hydra
+hydra:
+  run:
+    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
new file mode 100644
index 000000000..6eb8eb575
--- /dev/null
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
@@ -0,0 +1,151 @@
+# Shared config
+shared:
+  slurm: {}
+  script:
+    user: $USER
+    device: cpu
+    logger:
+      project_name: cube
+      do:
+        online: True
+      test:
+        period: 500
+        n: 900
+      checkpoints:
+        period: 10000
+    # Contiunuous Cube environment
+    env:
+      __value__: ccube
+      n_dim: 2
+      # Buffer
+      buffer:
+        data_path: null
+        train: null
+        test:
+          type: grid
+          n: 1000
+          output_csv: ccube_test.csv
+          output_pkl: ccube_test.pkl
+    # Proxy
+    proxy: corners
+    # GFlowNet config
+    gflownet:
+      __value__: trajectorybalance
+      random_action_prob: 0.1
+      optimizer:
+        batch_size:
+          forward: 100
+        lr: 0.0001
+        z_dim: 16
+        lr_z_mult: 100
+        n_train_steps: 10000
+    # Policy
+    +gflownet:
+      policy:
+        forward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: forward
+    # Use + to add new variables
+    +gflownet:
+      policy:
+        backward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: backward
+          shared_weights: False
+
+# Jobs
+jobs:
+  - slurm:
+      job_name: pigeonish
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
+  - slurm:
+      job_name: finch
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
+  - slurm:
+      job_name: dove
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
+  - slurm:
+      job_name: pine
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
+  - slurm:
+      job_name: spruce
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
+  - slurm:
+      job_name: fir
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.7311
+          bernoulli_bts_prob: 0.7311
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
new file mode 100644
index 000000000..3d041b855
--- /dev/null
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
@@ -0,0 +1,151 @@
+# Shared config
+shared:
+  slurm: {}
+  script:
+    user: $USER
+    device: cpu
+    logger:
+      project_name: cube
+      do:
+        online: True
+      test:
+        period: 500
+        n: 900
+      checkpoints:
+        period: 10000
+    # Contiunuous Cube environment
+    env:
+      __value__: ccube
+      n_dim: 2
+      # Buffer
+      buffer:
+        data_path: null
+        train: null
+        test:
+          type: grid
+          n: 1000
+          output_csv: ccube_test.csv
+          output_pkl: ccube_test.pkl
+    # Proxy
+    proxy: corners
+    # GFlowNet config
+    gflownet:
+      __value__: trajectorybalance
+      random_action_prob: 0.1
+      optimizer:
+        batch_size:
+          forward: 100
+        lr: 0.0001
+        z_dim: 16
+        lr_z_mult: 100
+        n_train_steps: 10000
+    # Policy
+    +gflownet:
+      policy:
+        forward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: forward
+    # Use + to add new variables
+    +gflownet:
+      policy:
+        backward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: backward
+          shared_weights: False
+
+# Jobs
+jobs:
+  - slurm:
+      job_name: large
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: cedar
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: hemlock
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: yew
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: cycad
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: palm
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
new file mode 100644
index 000000000..09ff01523
--- /dev/null
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
@@ -0,0 +1,151 @@
+# Shared config
+shared:
+  slurm: {}
+  script:
+    user: $USER
+    device: cpu
+    logger:
+      project_name: cube
+      do:
+        online: True
+      test:
+        period: 500
+        n: 900
+      checkpoints:
+        period: 10000
+    # Contiunuous Cube environment
+    env:
+      __value__: ccube
+      n_dim: 2
+      # Buffer
+      buffer:
+        data_path: null
+        train: null
+        test:
+          type: grid
+          n: 1000
+          output_csv: ccube_test.csv
+          output_pkl: ccube_test.pkl
+    # Proxy
+    proxy: corners
+    # GFlowNet config
+    gflownet:
+      __value__: trajectorybalance
+      random_action_prob: 0.1
+      optimizer:
+        batch_size:
+          forward: 100
+        lr: 0.0001
+        z_dim: 16
+        lr_z_mult: 100
+        n_train_steps: 10000
+    # Policy
+    +gflownet:
+      policy:
+        forward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: forward
+    # Use + to add new variables
+    +gflownet:
+      policy:
+        backward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: backward
+          shared_weights: False
+
+# Jobs
+jobs:
+  - slurm:
+      job_name: papaya
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: mango
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: pineapple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: apple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.01
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: pear
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: avocado
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 1
+          beta_params_max: 1000.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 100.0
+          beta_beta: 100.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
new file mode 100644
index 000000000..06ea9e949
--- /dev/null
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
@@ -0,0 +1,241 @@
+# Shared config
+shared:
+  slurm: {}
+  script:
+    user: $USER
+    device: cpu
+    logger:
+      project_name: cube
+      do:
+        online: True
+      test:
+        period: 500
+        n: 900
+      checkpoints:
+        period: 10000
+    # Contiunuous Cube environment
+    env:
+      __value__: ccube
+      n_dim: 2
+      # Buffer
+      buffer:
+        data_path: null
+        train: null
+        test:
+          type: grid
+          n: 1000
+          output_csv: ccube_test.csv
+          output_pkl: ccube_test.pkl
+    # Proxy
+    proxy: corners
+    # GFlowNet config
+    gflownet:
+      __value__: trajectorybalance
+      random_action_prob: 0.1
+      optimizer:
+        batch_size:
+          forward: 100
+        lr: 0.0001
+        z_dim: 16
+        lr_z_mult: 100
+        n_train_steps: 10000
+    # Policy
+    +gflownet:
+      policy:
+        forward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: forward
+    # Use + to add new variables
+    +gflownet:
+      policy:
+        backward:
+          type: mlp
+          n_hid: 512
+          n_layers: 5
+          checkpoint: backward
+          shared_weights: False
+
+# Jobs
+jobs:
+  - slurm:
+      job_name: papaya
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: mango
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: pineapple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
+  - slurm:
+      job_name: apple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 2
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
+  - slurm:
+      job_name: papaya
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: mango
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: pineapple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
+  - slurm:
+      job_name: apple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 5
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
+  - slurm:
+      job_name: papaya
+    script:
+      env:
+        __value__: ccube
+        n_comp: 1
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: mango
+    script:
+      env:
+        __value__: ccube
+        n_comp: 1
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.1
+          bernoulli_bts_prob: 0.1
+  - slurm:
+      job_name: pineapple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 1
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
+  - slurm:
+      job_name: apple
+    script:
+      env:
+        __value__: ccube
+        n_comp: 1
+        fixed_distr_params:
+          beta_params_min: 0.1
+          beta_params_max: 100.0
+        random_distr_params:
+          beta_weights: 1.0
+          beta_alpha: 10.0
+          beta_beta: 10.0
+          bernoulli_eos_prob: 0.5
+          bernoulli_bts_prob: 0.5
diff --git a/config/experiments/ccube/uniform.yaml b/config/experiments/ccube/uniform.yaml
new file mode 100644
index 000000000..1fcfa4d9a
--- /dev/null
+++ b/config/experiments/ccube/uniform.yaml
@@ -0,0 +1,77 @@
+# @package _global_
+# A configuration that works well with the uniform proxy.
+# wandb: https://wandb.ai/alexhg/cube/runs/1du9iyr5
+
+defaults:
+   - override /env: ccube
+   - override /gflownet: trajectorybalance
+   - override /proxy: uniform
+   - override /logger: wandb
+   - override /user: alex
+
+# Environment
+env:
+  n_comp: 2
+  n_dim: 2
+  beta_params_min: 0.1
+  beta_params_max: 100.0
+  min_incr: 0.1
+  fixed_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 10.0
+    beta_beta: 10.0
+    bernoulli_eos_prob: 0.1
+    bernoulli_bts_prob: 0.1
+  random_distr_params:
+    beta_params_min: ${env.beta_params_min}
+    beta_params_max: ${env.beta_params_max}
+    beta_weights: 1.0
+    beta_alpha: 10.0
+    beta_beta: 10.0
+    bernoulli_eos_prob: 0.1
+    bernoulli_bts_prob: 0.1
+  reward_func: identity
+
+# GFlowNet hyperparameters
+gflownet:
+  random_action_prob: 0.1
+  optimizer:
+    batch_size:
+      forward: 100
+    lr: 0.0001
+    z_dim: 16
+    lr_z_mult: 100
+    n_train_steps: 10000
+  policy:
+    forward:
+      type: mlp
+      n_hid: 256
+      n_layers: 3
+      checkpoint: forward
+    backward:
+      type: mlp
+      n_hid: 256
+      n_layers: 3
+      shared_weights: False
+      checkpoint: backward
+
+# WandB
+logger:
+  lightweight: True
+  project_name: "cube"
+  tags: 
+    - gflownet
+    - continuous
+    - ccube
+  test:
+    period: 500
+    n: 1000
+  checkpoints:
+    period: 500
+
+# Hydra
+hydra:
+  run:
+    dir: ${user.logdir.root}/debug/ccube/${now:%Y-%m-%d_%H-%M-%S}

From eea634e3c747ebe74544bf2581725876a3b0a793 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 20:54:42 -0400
Subject: [PATCH 176/206] Fix test, related to transformation of distr. params

---
 tests/gflownet/envs/test_ccube.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index ff1fa3a51..fcef78b27 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -53,14 +53,20 @@ def policy_output__as_expected(env, policy_outputs, params):
         env._get_policy_betas_weights(policy_outputs) == params["beta_weights"]
     )
     assert torch.all(
-        env._get_policy_betas_alpha(policy_outputs) == params["beta_alpha"]
+        env._get_policy_betas_alpha(policy_outputs)
+        == env._beta_params_to_policy_outputs("alpha", params)
     )
-    assert torch.all(env._get_policy_betas_beta(policy_outputs) == params["beta_beta"])
     assert torch.all(
-        env._get_policy_eos_logit(policy_outputs) == params["bernoulli_eos_logit"]
+        env._get_policy_betas_beta(policy_outputs)
+        == env._beta_params_to_policy_outputs("beta", params)
     )
     assert torch.all(
-        env._get_policy_source_logit(policy_outputs) == params["bernoulli_source_logit"]
+        env._get_policy_eos_logit(policy_outputs)
+        == torch.logit(torch.tensor(params["bernoulli_eos_prob"]))
+    )
+    assert torch.all(
+        env._get_policy_source_logit(policy_outputs)
+        == torch.logit(torch.tensor(params["bernoulli_bts_prob"]))
     )
 
 
@@ -895,6 +901,7 @@ def test__get_logprobs_forward__2d__is_finite(cube2d, states, actions):
     )
     assert torch.all(torch.isfinite(logprobs))
 
+
 @pytest.mark.parametrize(
     "states, actions",
     [

From 2ef82aa5e691d0cd465a36d354149a3457e920b4 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 21:48:27 -0400
Subject: [PATCH 177/206] Fix more tests, related to transformation of distr.
 params

---
 tests/gflownet/envs/test_ccube.py | 97 ++++++++++++++-----------------
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index fcef78b27..6a8e7475e 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -10,12 +10,12 @@
 
 @pytest.fixture
 def cube1d():
-    return ContinuousCube(n_dim=1, n_comp=3, min_incr=0.1, max_val=1.0)
+    return ContinuousCube(n_dim=1, n_comp=3, min_incr=0.1)
 
 
 @pytest.fixture
 def cube2d():
-    return ContinuousCube(n_dim=2, n_comp=3, min_incr=0.1, max_val=1.0)
+    return ContinuousCube(n_dim=2, n_comp=3, min_incr=0.1)
 
 
 @pytest.mark.parametrize(
@@ -508,20 +508,18 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     n_samples = 10000
     beta_params_min = 0.0
     beta_params_max = 10000
-    alpha = 10
-    alphas_presigmoid = alpha * torch.ones(n_samples)
-    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    alpha = 10.0
+    alphas = alpha * torch.ones(n_samples)
     beta = 1.0
-    betas_presigmoid = beta * torch.ones(n_samples)
-    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    betas = beta * torch.ones(n_samples)
     beta_distr = Beta(alphas, betas)
     samples = beta_distr.sample()
     mean_incr_rel = 0.9 * samples.mean()
     min_incr_rel = 0.9 * samples.min()
     max_incr_rel = 1.1 * samples.max()
     # Define Bernoulli parameters for EOS with deterministic probability
-    logit_force_eos = torch.inf
-    logit_force_noeos = -torch.inf
+    prob_force_eos = 1.0
+    prob_force_noeos = 0.0
     # Estimate confident intervals of absolute actions
     states_torch = tfloat(states, float_type=env.float, device=env.device)
     is_source = torch.all(states_torch == -1.0, dim=1)
@@ -551,9 +549,9 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     params = env.fixed_distr_params
     params["beta_alpha"] = alpha
     params["beta_beta"] = beta
-    params["bernoulli_eos_logit"] = logit_force_noeos
+    params["bernoulli_eos_prob"] = prob_force_noeos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    policy_outputs[force_eos, -1] = logit_force_eos
+    policy_outputs[force_eos, -1] = torch.logit(torch.tensor(prob_force_eos))
     # Sample actions
     actions, _ = env.sample_actions_batch(
         policy_outputs, masks, states, is_backward=False
@@ -608,19 +606,17 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
     beta_params_min = 0.0
     beta_params_max = 10000
     alpha = 10
-    alphas_presigmoid = alpha * torch.ones(n_samples)
-    alphas = beta_params_max * torch.sigmoid(alphas_presigmoid) + beta_params_min
+    alphas = alpha * torch.ones(n_samples)
     beta = 1.0
-    betas_presigmoid = beta * torch.ones(n_samples)
-    betas = beta_params_max * torch.sigmoid(betas_presigmoid) + beta_params_min
+    betas = beta * torch.ones(n_samples)
     beta_distr = Beta(alphas, betas)
     samples = beta_distr.sample()
     mean_incr_rel = 0.9 * samples.mean()
     min_incr_rel = 0.9 * samples.min()
     max_incr_rel = 1.1 * samples.max()
     # Define Bernoulli parameters for BTS with deterministic probability
-    logit_force_bts = torch.inf
-    logit_force_nobts = -torch.inf
+    prob_force_bts = 1.0
+    prob_force_nobts = 0.0
     # Estimate confident intervals of absolute actions
     increments_min = torch.full_like(
         states_torch, min_incr_rel, dtype=env.float, device=env.device
@@ -648,9 +644,9 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
     params = env.fixed_distr_params
     params["beta_alpha"] = alpha
     params["beta_beta"] = beta
-    params["bernoulli_source_logit"] = logit_force_nobts
+    params["bernoulli_bts_prob"] = prob_force_nobts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
-    policy_outputs[force_bts, -2] = logit_force_bts
+    policy_outputs[force_bts, -2] = torch.logit(torch.tensor(prob_force_bts))
     # Sample actions
     actions, _ = env.sample_actions_batch(
         policy_outputs, masks, states, is_backward=True
@@ -740,13 +736,12 @@ def test__get_logprobs_forward__2d__eos_actions_return_expected(
     is_near_edge = states_torch > 1.0 - env.min_incr
     is_eos_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for EOS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
+    prob_eos = 0.5
+    distr_eos = Bernoulli(probs=prob_eos)
     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
+    params["bernoulli_eos_prob"] = prob_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -780,23 +775,25 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     masks = tbool(
         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
     )
-    # Define Uniform Beta distribution (large values of alpha and beta and max of 1.0)
-    beta_params_min = 0.0
-    beta_params_max = 1.0
-    alpha_presigmoid = 1000.0
-    betas_presigmoid = 1000.0
+    # Define Uniform Beta distribution (alpha and beta equal to 1.0)
+    beta_params_min = 0.1
+    beta_params_max = 100.0
+    alpha = 1.0
+    beta = 1.0
     # Define Bernoulli parameter for impossible EOS
-    # If Bernouilli has logit -torch.inf, the logprobs are nan
-    logit_force_noeos = -1000
+    # If Bernouilli has probability exactly 0, the logit is -inf.
+    prob_force_noeos = 0.0
     # Reconfigure environment
     env.n_comp = 1
     env.beta_params_min = beta_params_min
     env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
-    params["beta_alpha"] = alpha_presigmoid
-    params["beta_beta"] = betas_presigmoid
-    params["bernoulli_eos_logit"] = logit_force_noeos
+    params["beta_params_min"] = beta_params_min
+    params["beta_params_max"] = beta_params_max
+    params["beta_alpha"] = alpha
+    params["beta_beta"] = beta
+    params["bernoulli_eos_prob"] = prob_force_noeos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -844,12 +841,12 @@ def test__get_logprobs_forward__2d__finite(cube2d, states, actions):
     is_eos_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for EOS
     # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
+    prob_eos = 0.5
+    distr_eos = Bernoulli(probs=prob_eos)
     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
+    params["bernoulli_eos_prob"] = prob_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -887,13 +884,12 @@ def test__get_logprobs_forward__2d__is_finite(cube2d, states, actions):
     is_near_edge = states_torch > 1.0 - env.min_incr
     is_eos_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for EOS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_eos = 1
-    distr_eos = Bernoulli(logits=logit_eos)
+    prob_eos = 0.5
+    distr_eos = Bernoulli(probs=prob_eos)
     logprob_eos = distr_eos.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_eos_logit"] = logit_eos
+    params["bernoulli_eos_prob"] = prob_eos
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -927,13 +923,12 @@ def test__get_logprobs_backward__2d__is_finite(cube2d, states, actions):
         [env.get_mask_invalid_actions_backward(s) for s in states], device=env.device
     )
     # Define Bernoulli parameter for BTS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_bts = 1
-    distr_bts = Bernoulli(logits=logit_bts)
+    prob_bts = 0.5
+    distr_bts = Bernoulli(probs=prob_bts)
     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_source_logit"] = logit_bts
+    params["bernoulli_bts_prob"] = prob_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -1016,13 +1011,12 @@ def test__get_logprobs_backward__2d__bts_actions_return_expected(
     is_near_edge = states_torch < env.min_incr
     is_bts_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for BTS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_bts = 1
-    distr_bts = Bernoulli(logits=logit_bts)
+    prob_bts = 0.5
+    distr_bts = Bernoulli(probs=prob_bts)
     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_source_logit"] = logit_bts
+    params["bernoulli_bts_prob"] = prob_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(
@@ -1062,13 +1056,12 @@ def test__get_logprobs_backward__2d__notnan(cube2d, states, actions):
     is_near_edge = states_torch < env.min_incr
     is_bts_forced = torch.any(is_near_edge, dim=1)
     # Define Bernoulli parameter for BTS
-    # If Bernouilli has logit torch.inf, the logprobs are nan
-    logit_bts = 1
-    distr_bts = Bernoulli(logits=logit_bts)
+    prob_bts = 0.5
+    distr_bts = Bernoulli(probs=prob_bts)
     logprob_bts = distr_bts.log_prob(torch.tensor(1.0))
     # Build policy outputs
     params = env.fixed_distr_params
-    params["bernoulli_source_logit"] = logit_bts
+    params["bernoulli_bts_prob"] = prob_bts
     policy_outputs = torch.tile(env.get_policy_output(params), dims=(n_states, 1))
     # Get log probs
     logprobs = env.get_logprobs(

From 1448bcc7e6fcc8f4aa860137107356d96b018f86 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 21:51:33 -0400
Subject: [PATCH 178/206] Fix default parameters of cube

---
 gflownet/envs/cube.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 79351577b..0cb147acf 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -60,21 +60,21 @@ def __init__(
         kappa: float = 1e-3,
         fixed_distr_params: dict = {
             "beta_params_min": 0.1,
-            "beta_params_max": 1000.0,
+            "beta_params_max": 100.0,
             "beta_weights": 1.0,
-            "beta_alpha": 2.0,
-            "beta_beta": 5.0,
-            "bernoulli_bts_prob": 1.0,
-            "bernoulli_eos_prob": 1.0,
+            "beta_alpha": 10.0,
+            "beta_beta": 10.0,
+            "bernoulli_bts_prob": 0.1,
+            "bernoulli_eos_prob": 0.1,
         },
         random_distr_params: dict = {
             "beta_params_min": 0.1,
-            "beta_params_max": 1000.0,
+            "beta_params_max": 100.0,
             "beta_weights": 1.0,
-            "beta_alpha": 1000.0,
-            "beta_beta": 1000.0,
-            "bernoulli_bts_prob": 1.0,
-            "bernoulli_eos_prob": 1.0,
+            "beta_alpha": 10.0,
+            "beta_beta": 10.0,
+            "bernoulli_bts_prob": 0.1,
+            "bernoulli_eos_prob": 0.1,
         },
         **kwargs,
     ):

From 5f90dc4ba9ec5f2694b3fef4cefc718a0f97733e Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 21 Sep 2023 22:04:16 -0400
Subject: [PATCH 179/206] Make beta_params{min,max} attributes of the class
 instead of being part of the distr params dictionaries.

---
 config/env/ccube.yaml                         |  4 --
 config/experiments/ccube/corners.yaml         |  4 --
 .../hyperparams_search_20230920_batch1.yaml   | 30 ++++------
 .../hyperparams_search_20230920_batch2.yaml   | 30 ++++------
 .../hyperparams_search_20230920_batch3.yaml   | 30 ++++------
 .../hyperparams_search_20230920_batch4.yaml   | 60 ++++++++-----------
 config/experiments/ccube/uniform.yaml         |  4 --
 gflownet/envs/cube.py                         | 14 ++---
 tests/gflownet/envs/test_ccube.py             | 14 -----
 9 files changed, 65 insertions(+), 125 deletions(-)

diff --git a/config/env/ccube.yaml b/config/env/ccube.yaml
index 84af0c733..57efa44ef 100644
--- a/config/env/ccube.yaml
+++ b/config/env/ccube.yaml
@@ -17,16 +17,12 @@ epsilon: 1e-6
 beta_params_min: 0.1
 beta_params_max: 100.0
 fixed_distr_params:
-  beta_params_min: ${env.beta_params_min}
-  beta_params_max: ${env.beta_params_max}
   beta_weights: 1.0
   beta_alpha: 10.0
   beta_beta: 10.0
   bernoulli_bts_prob: 0.1
   bernoulli_eos_prob: 0.1
 random_distr_params:
-  beta_params_min: ${env.beta_params_min}
-  beta_params_max: ${env.beta_params_max}
   beta_weights: 1.0
   beta_alpha: 10.0
   beta_beta: 10.0
diff --git a/config/experiments/ccube/corners.yaml b/config/experiments/ccube/corners.yaml
index d44564e16..e3594ac76 100644
--- a/config/experiments/ccube/corners.yaml
+++ b/config/experiments/ccube/corners.yaml
@@ -17,16 +17,12 @@ env:
   beta_params_max: 100.0
   min_incr: 0.1
   fixed_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
     beta_weights: 1.0
     beta_alpha: 10.0
     beta_beta: 10.0
     bernoulli_eos_prob: 0.1
     bernoulli_bts_prob: 0.1
   random_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
     beta_weights: 1.0
     beta_alpha: 10.0
     beta_beta: 10.0
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
index 6eb8eb575..87e44bfb5 100644
--- a/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
@@ -65,9 +65,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 100.0
+        beta_params_min: 0.01
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -80,9 +79,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -95,9 +93,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 100.0
+        beta_params_min: 1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -110,9 +107,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 1000.0
+        beta_params_min: 0.01
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -125,9 +121,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 1000.0
+        beta_params_min: 0.1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -140,9 +135,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 1000.0
+        beta_params_min: 1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
index 3d041b855..93491e3e9 100644
--- a/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
@@ -65,9 +65,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 100.0
+        beta_params_min: 0.01
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -80,9 +79,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -95,9 +93,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 100.0
+        beta_params_min: 1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -110,9 +107,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 1000.0
+        beta_params_min: 0.01
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -125,9 +121,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 1000.0
+        beta_params_min: 0.1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -140,9 +135,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 1000.0
+        beta_params_min: 1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
index 09ff01523..7912af9b3 100644
--- a/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
@@ -65,9 +65,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 100.0
+        beta_params_min: 0.01
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -80,9 +79,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -95,9 +93,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 100.0
+        beta_params_min: 1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -110,9 +107,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.01
-          beta_params_max: 1000.0
+        beta_params_min: 0.01
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -125,9 +121,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 1000.0
+        beta_params_min: 0.1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
@@ -140,9 +135,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 1
-          beta_params_max: 1000.0
+        beta_params_min: 1
+        beta_params_max: 1000.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 100.0
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
index 06ea9e949..cc82e322c 100644
--- a/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
+++ b/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
@@ -65,9 +65,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -80,9 +79,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -95,9 +93,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -110,9 +107,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 2
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -125,9 +121,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -140,9 +135,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -155,9 +149,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -170,9 +163,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 5
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -185,9 +177,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 1
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -200,9 +191,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 1
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -215,9 +205,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 1
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
@@ -230,9 +219,8 @@ jobs:
       env:
         __value__: ccube
         n_comp: 1
-        fixed_distr_params:
-          beta_params_min: 0.1
-          beta_params_max: 100.0
+        beta_params_min: 0.1
+        beta_params_max: 100.0
         random_distr_params:
           beta_weights: 1.0
           beta_alpha: 10.0
diff --git a/config/experiments/ccube/uniform.yaml b/config/experiments/ccube/uniform.yaml
index 1fcfa4d9a..6970a3e95 100644
--- a/config/experiments/ccube/uniform.yaml
+++ b/config/experiments/ccube/uniform.yaml
@@ -17,16 +17,12 @@ env:
   beta_params_max: 100.0
   min_incr: 0.1
   fixed_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
     beta_weights: 1.0
     beta_alpha: 10.0
     beta_beta: 10.0
     bernoulli_eos_prob: 0.1
     bernoulli_bts_prob: 0.1
   random_distr_params:
-    beta_params_min: ${env.beta_params_min}
-    beta_params_max: ${env.beta_params_max}
     beta_weights: 1.0
     beta_alpha: 10.0
     beta_beta: 10.0
diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0cb147acf..0476fe88f 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -56,11 +56,11 @@ def __init__(
         n_dim: int = 2,
         min_incr: float = 0.1,
         n_comp: int = 1,
+        beta_params_min: float = 0.1,
+        beta_params_max: float = 100.0,
         epsilon: float = 1e-6,
         kappa: float = 1e-3,
         fixed_distr_params: dict = {
-            "beta_params_min": 0.1,
-            "beta_params_max": 100.0,
             "beta_weights": 1.0,
             "beta_alpha": 10.0,
             "beta_beta": 10.0,
@@ -68,8 +68,6 @@ def __init__(
             "bernoulli_eos_prob": 0.1,
         },
         random_distr_params: dict = {
-            "beta_params_min": 0.1,
-            "beta_params_max": 100.0,
             "beta_weights": 1.0,
             "beta_alpha": 10.0,
             "beta_beta": 10.0,
@@ -87,8 +85,8 @@ def __init__(
         self.min_incr = min_incr
         # Parameters of the policy distribution
         self.n_comp = n_comp
-        self.beta_params_min = fixed_distr_params["beta_params_min"]
-        self.beta_params_max = fixed_distr_params["beta_params_max"]
+        self.beta_params_min = beta_params_min
+        self.beta_params_max = beta_params_max
         # Source state is abstract - not included in the cube: -1 for all dimensions.
         self.source = [-1 for _ in range(self.n_dim)]
         # Small constant to clamp the inputs to the beta distribution
@@ -280,12 +278,10 @@ def _beta_params_to_policy_outputs(self, param_name: str, params_dict: dict):
         ---
         _make_increments_distribution()
         """
-        param_min = params_dict["beta_params_min"]
-        param_max = params_dict["beta_params_max"]
         param_value = tfloat(
             params_dict[f"beta_{param_name}"], float_type=self.float, device=self.device
         )
-        return torch.logit((param_value - param_min) / param_max)
+        return torch.logit((param_value - self.beta_params_min) / self.beta_params_max)
 
 
 class ContinuousCube(CubeBase):
diff --git a/tests/gflownet/envs/test_ccube.py b/tests/gflownet/envs/test_ccube.py
index 6a8e7475e..feda16bf3 100644
--- a/tests/gflownet/envs/test_ccube.py
+++ b/tests/gflownet/envs/test_ccube.py
@@ -506,8 +506,6 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     )
     # Define Beta distribution with low variance and get confident range
     n_samples = 10000
-    beta_params_min = 0.0
-    beta_params_max = 10000
     alpha = 10.0
     alphas = alpha * torch.ones(n_samples)
     beta = 1.0
@@ -543,8 +541,6 @@ def test__sample_actions_forward__2d__returns_expected(cube2d, states, force_eos
     increments_max[is_eos] = torch.inf
     # Reconfigure environment
     env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
     params["beta_alpha"] = alpha
@@ -603,8 +599,6 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
     states_torch = tfloat(states, float_type=env.float, device=env.device)
     # Define Beta distribution with low variance and get confident range
     n_samples = 10000
-    beta_params_min = 0.0
-    beta_params_max = 10000
     alpha = 10
     alphas = alpha * torch.ones(n_samples)
     beta = 1.0
@@ -638,8 +632,6 @@ def test__sample_actions_backward__2d__returns_expected(cube2d, states, force_bt
     increments_max[is_bts] = states_torch[is_bts]
     # Reconfigure environment
     env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
     params["beta_alpha"] = alpha
@@ -776,8 +768,6 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
         [env.get_mask_invalid_actions_forward(s) for s in states], device=env.device
     )
     # Define Uniform Beta distribution (alpha and beta equal to 1.0)
-    beta_params_min = 0.1
-    beta_params_max = 100.0
     alpha = 1.0
     beta = 1.0
     # Define Bernoulli parameter for impossible EOS
@@ -785,12 +775,8 @@ def test__get_logprobs_forward__2d__all_actions_from_source_uniform_policy_prob1
     prob_force_noeos = 0.0
     # Reconfigure environment
     env.n_comp = 1
-    env.beta_params_min = beta_params_min
-    env.beta_params_max = beta_params_max
     # Build policy outputs
     params = env.fixed_distr_params
-    params["beta_params_min"] = beta_params_min
-    params["beta_params_max"] = beta_params_max
     params["beta_alpha"] = alpha
     params["beta_beta"] = beta
     params["bernoulli_eos_prob"] = prob_force_noeos

From e19e34f656b0dc98a4ec42b4c4099e626fcca596 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Fri, 22 Sep 2023 21:15:10 -0400
Subject: [PATCH 180/206] Changes in step methods: return invalid if out of
 bounds.

---
 gflownet/envs/cube.py | 60 ++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 0476fe88f..1c2846398 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -2,6 +2,7 @@
 Classes to represent hyper-cube environments
 """
 import itertools
+import warnings
 from abc import ABC, abstractmethod
 from typing import List, Optional, Tuple
 
@@ -337,6 +338,9 @@ def get_action_space(self):
         self.representative_action = tuple([0.0] * actions_dim)
         return [self.representative_action, self.eos]
 
+    def get_max_traj_length(self):
+        return np.ceil(1.0 / self.min_incr) + 2
+
     def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         """
         Defines the structure of the output of the policy model, from which an
@@ -1170,32 +1174,30 @@ def _step(
             root state
         """
         # If forward action is from source, initialize state to all zeros.
-        if not backward and action[-1] == 1:
-            self.state = [0.0 for _ in range(self.n_dim)]
+        if not backward and action[-1] == 1 and self.state == self.source:
+            state = [0.0 for _ in range(self.n_dim)]
+        else:
+            state = copy(self.state)
         # Increment dimensions
         for dim, incr in enumerate(action[:-1]):
             if backward:
-                self.state[dim] -= incr
+                state[dim] -= incr
             else:
-                self.state[dim] += incr
-        # If backward action is to source, set state to source
-        if backward and action[-1] == 1:
-            self.state = self.source
+                state[dim] += incr
+
+        # If state is out of bounds, return invalid
+        if any([s > 1.0 for s in state]) or any([s < 0.0 for s in state]):
+            warnings.warn(
+                f"""
+                State is out of cube bounds.
+                \nCurrent state:\n{self.state}\nAction:\n{action}\nNext state: {state}
+                """
+            )
+            return self.state, action, False
 
-        # Check that state is within bounds
-        if self.state != self.source:
-            assert all(
-                [s <= 1.0 for s in self.state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-            """
-            assert all(
-                [s >= 0.0 for s in self.state]
-            ), f"""
-            State is out of cube bounds.
-            \nState:\n{self.state}\nAction:\n{action}\nIncrement: {incr}
-            """
+        # Otherwise, set self.state as the udpated state and return valid.
+        self.n_actions += 1
+        self.state = state
         return self.state, action, True
 
     # TODO: make generic for continuous environments?
@@ -1231,11 +1233,8 @@ def step(self, action: Tuple[float]) -> Tuple[List[float], Tuple[int, float], bo
             return self.state, self.eos, True
         # Otherwise perform action
         else:
-            self.n_actions += 1
-            self._step(action, backward=False)
-            return self.state, action, True
+            return self._step(action, backward=False)
 
-    # TODO: make generic for continuous environments?
     def step_backwards(
         self, action: Tuple[int, float]
     ) -> Tuple[List[float], Tuple[int, float], bool]:
@@ -1267,11 +1266,14 @@ def step_backwards(
             self.done = False
             self.n_actions += 1
             return self.state, action, True
-        # Otherwise perform action
         assert action != self.eos
-        self.n_actions += 1
-        self._step(action, backward=True)
-        return self.state, action, True
+        # If action is BTS, set source state
+        if action[-1] == 1 and self.state != self.source:
+            self.n_actions += 1
+            self.state = self.source
+            return self.state, action, True
+        # Otherwise perform action
+        return self._step(action, backward=True)
 
     def get_grid_terminating_states(
         self, n_states: int, kappa: Optional[float] = None

From 5b7ce4eab44e4a004e441a479b599fdddb42e803 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sat, 23 Sep 2023 13:29:29 -0400
Subject: [PATCH 181/206] Add common test: forward and backward trajectories
 are reversible.

---
 tests/gflownet/envs/common.py | 36 +++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 44173bc19..701594a7e 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -22,6 +22,7 @@ def test__all_env_common(env):
     test__step_random__does_not_sample_invalid_actions(env)
     test__forward_actions_have_nonzero_backward_prob(env)
     test__backward_actions_have_nonzero_forward_prob(env)
+    test__trajectories_are_reversible(env)
     test__get_parents_step_get_mask__are_compatible(env)
     test__sample_backwards_reaches_source(env)
     test__state2readable__is_reversible(env)
@@ -40,6 +41,7 @@ def test__continuous_env_common(env):
     test__backward_actions_have_nonzero_forward_prob(env)
     test__step__returns_same_state_action_and_invalid_if_done(env)
     test__sample_backwards_reaches_source(env)
+    test__trajectories_are_reversible(env)
 
 
 #     test__gflownet_minimal_runs(env)
@@ -357,6 +359,40 @@ def test__forward_actions_have_nonzero_backward_prob(env):
         assert logprobs_bw > -1e6
 
 
+@pytest.mark.repeat(1000)
+def test__trajectories_are_reversible(env):
+    env = env.reset()
+
+    # Sample random forward trajectory
+    states_trajectory_fw = []
+    actions_trajectory_fw = []
+    while not env.done:
+        state, action, valid = env.step_random(backward=False)
+        if valid:
+            states_trajectory_fw.append(state)
+            actions_trajectory_fw.append(action)
+
+    # Sample backward trajectory with actions in forward trajectory
+    states_trajectory_bw = []
+    actions_trajectory_bw = []
+    actions_trajectory_fw_copy = actions_trajectory_fw.copy()
+    while not env.equal(env.state, env.source) or env.done:
+        state, action, valid = env.step_backwards(actions_trajectory_fw_copy.pop())
+        if valid:
+            states_trajectory_bw.append(state)
+            actions_trajectory_bw.append(action)
+
+    assert all(
+        [
+            env.equal(s_fw, s_bw)
+            for s_fw, s_bw in zip(
+                states_trajectory_fw[:-1], states_trajectory_bw[-2::-1]
+            )
+        ]
+    )
+    assert actions_trajectory_fw == actions_trajectory_bw[::-1]
+
+
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     states = _get_terminating_states(env, n)
     policy_random = torch.unsqueeze(

From b07e8e17a4133aebee7bd8ad111bed1886ab0208 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 21:49:56 -0400
Subject: [PATCH 182/206] Remove old test_cube.py

---
 tests/gflownet/envs/test_cube.py | 97 --------------------------------
 1 file changed, 97 deletions(-)
 delete mode 100644 tests/gflownet/envs/test_cube.py

diff --git a/tests/gflownet/envs/test_cube.py b/tests/gflownet/envs/test_cube.py
deleted file mode 100644
index df7812cd8..000000000
--- a/tests/gflownet/envs/test_cube.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import common
-import numpy as np
-import pytest
-import torch
-
-from gflownet.envs.cube import HybridCube
-
-
-@pytest.fixture
-def env():
-    return HybridCube(n_dim=2, n_comp=3)
-
-
-@pytest.mark.parametrize(
-    "action_space",
-    [
-        [
-            (0, 0.0),
-            (1, 0.0),
-            (2, 0.0),
-        ],
-    ],
-)
-def test__get_action_space__returns_expected(env, action_space):
-    assert set(action_space) == set(env.action_space)
-
-
-def test__get_policy_output__returns_expected(env):
-    assert env.policy_output_dim == env.n_dim * env.n_comp * 3 + env.n_dim + 1
-    fixed_policy_output = env.fixed_policy_output
-    random_policy_output = env.random_policy_output
-    assert torch.all(fixed_policy_output[: env.n_dim + 1] == 1)
-    assert torch.all(random_policy_output[: env.n_dim + 1] == 1)
-    assert torch.all(fixed_policy_output[env.n_dim + 1 :: 3] == 1)
-    assert torch.all(
-        fixed_policy_output[env.n_dim + 2 :: 3] == env.fixed_distr_params["beta_alpha"]
-    )
-    assert torch.all(
-        fixed_policy_output[env.n_dim + 3 :: 3] == env.fixed_distr_params["beta_beta"]
-    )
-    assert torch.all(random_policy_output[env.n_dim + 1 :: 3] == 1)
-    assert torch.all(
-        random_policy_output[env.n_dim + 2 :: 3]
-        == env.random_distr_params["beta_alpha"]
-    )
-    assert torch.all(
-        random_policy_output[env.n_dim + 3 :: 3] == env.random_distr_params["beta_beta"]
-    )
-
-
-@pytest.mark.parametrize(
-    "state, expected",
-    [
-        (
-            [0.0, 0.0],
-            [0.0, 0.0],
-        ),
-        (
-            [1.0, 1.0],
-            [1.0, 1.0],
-        ),
-        (
-            [1.1, 1.00001],
-            [1.0, 1.0],
-        ),
-        (
-            [-0.1, 1.00001],
-            [0.0, 1.0],
-        ),
-        (
-            [0.1, 0.21],
-            [0.1, 0.21],
-        ),
-    ],
-)
-def test__state2policy_returns_expected(env, state, expected):
-    assert env.state2policy(state) == expected
-
-
-@pytest.mark.parametrize(
-    "states, expected",
-    [
-        (
-            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
-            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
-        ),
-    ],
-)
-def test__statebatch_torch2policy_returns_expected(env, states, expected):
-    assert np.equal(env.statebatch2policy(states), np.array(expected)).all()
-    assert torch.equal(
-        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
-    )
-
-
-# def test__continuous_env_common(env):
-#     return common.test__continuous_env_common(env)

From ca4ad8784970ebfd9c0c366b8c688e025373d6a9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 21:53:29 -0400
Subject: [PATCH 183/206] In tree: fixed/random_distribution ->
 fixed/random_distr_params

---
 gflownet/envs/tree.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index c98bf4936..cb4619971 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -158,11 +158,11 @@ def __init__(
         threshold_components: int = 1,
         beta_params_min: float = 0.1,
         beta_params_max: float = 2.0,
-        fixed_distribution: dict = {
+        fixed_distr_params: dict = {
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
         },
-        random_distribution: dict = {
+        random_distr_params: dict = {
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
         },
@@ -294,8 +294,8 @@ def __init__(
         self.statetorch2oracle = self.statetorch2policy
 
         super().__init__(
-            fixed_distribution=fixed_distribution,
-            random_distribution=random_distribution,
+            fixed_distr_params=fixed_distr_params,
+            random_distr_params=random_distr_params,
             continuous=continuous,
             **kwargs,
         )

From d11df932685e8c6032b54e5d1f1709eb4943e289 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 22:00:55 -0400
Subject: [PATCH 184/206] Fix ordering of arguments and add todos to things
 that must be fixed.

---
 gflownet/envs/tree.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index cb4619971..26e6d653c 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -670,6 +670,7 @@ def sample_actions_batch_continuous(
             policy_outputs_discrete = policy_outputs[
                 is_discrete, : self._index_continuous_policy_output
             ]
+            # TODO: mask must be applied to states_from too!
             actions_discrete, logprobs_discrete = super().sample_actions_batch(
                 policy_outputs_discrete,
                 mask[is_discrete, : self._index_continuous_policy_output],
@@ -773,12 +774,13 @@ def get_logprobs_continuous(
             policy_outputs_discrete = policy_outputs[
                 mask_discrete, : self._index_continuous_policy_output
             ]
+            # TODO: mask must be applied to states_from too!
             logprobs_discrete = super().get_logprobs(
                 policy_outputs_discrete,
-                is_backward,
                 actions[mask_discrete],
-                states_from[mask_discrete],
                 mask[mask_discrete, : self._index_continuous_policy_output],
+                states_from,
+                is_backward,
             )
             logprobs[mask_discrete] = logprobs_discrete
         if torch.all(mask_discrete):

From 955f2ca1a5ab045ab93095193e3e6b8bcc6eba79 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 22:56:13 -0400
Subject: [PATCH 185/206] Fix issues in tree.

---
 gflownet/envs/tree.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index 26e6d653c..bc92cda13 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -670,11 +670,11 @@ def sample_actions_batch_continuous(
             policy_outputs_discrete = policy_outputs[
                 is_discrete, : self._index_continuous_policy_output
             ]
-            # TODO: mask must be applied to states_from too!
+            # states_from can be None because it will be ignored
             actions_discrete, logprobs_discrete = super().sample_actions_batch(
                 policy_outputs_discrete,
                 mask[is_discrete, : self._index_continuous_policy_output],
-                states_from,
+                None,
                 is_backward,
                 sampling_method,
                 temperature_logits,
@@ -774,12 +774,12 @@ def get_logprobs_continuous(
             policy_outputs_discrete = policy_outputs[
                 mask_discrete, : self._index_continuous_policy_output
             ]
-            # TODO: mask must be applied to states_from too!
+            # states_from can be None because it will be ignored
             logprobs_discrete = super().get_logprobs(
                 policy_outputs_discrete,
                 actions[mask_discrete],
                 mask[mask_discrete, : self._index_continuous_policy_output],
-                states_from,
+                None,
                 is_backward,
             )
             logprobs[mask_discrete] = logprobs_discrete

From 9cac68ea03fcfea1a2f197a9880aeee0a6190ae3 Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <pierre.luc.carrier@mila.quebec>
Date: Mon, 25 Sep 2023 12:16:36 -0400
Subject: [PATCH 186/206] Skip test if states are none

---
 tests/gflownet/envs/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 44173bc19..6a33aa830 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -359,6 +359,8 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     states = _get_terminating_states(env, n)
+    if states is None:
+        return
     policy_random = torch.unsqueeze(
         tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0
     )

From f2dedead5974b12a0772ea65102fc6f64ae674de Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 12:49:03 -0400
Subject: [PATCH 187/206] Remove old test_cube.py

---
 tests/gflownet/envs/test_cube.py | 97 --------------------------------
 1 file changed, 97 deletions(-)
 delete mode 100644 tests/gflownet/envs/test_cube.py

diff --git a/tests/gflownet/envs/test_cube.py b/tests/gflownet/envs/test_cube.py
deleted file mode 100644
index df7812cd8..000000000
--- a/tests/gflownet/envs/test_cube.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import common
-import numpy as np
-import pytest
-import torch
-
-from gflownet.envs.cube import HybridCube
-
-
-@pytest.fixture
-def env():
-    return HybridCube(n_dim=2, n_comp=3)
-
-
-@pytest.mark.parametrize(
-    "action_space",
-    [
-        [
-            (0, 0.0),
-            (1, 0.0),
-            (2, 0.0),
-        ],
-    ],
-)
-def test__get_action_space__returns_expected(env, action_space):
-    assert set(action_space) == set(env.action_space)
-
-
-def test__get_policy_output__returns_expected(env):
-    assert env.policy_output_dim == env.n_dim * env.n_comp * 3 + env.n_dim + 1
-    fixed_policy_output = env.fixed_policy_output
-    random_policy_output = env.random_policy_output
-    assert torch.all(fixed_policy_output[: env.n_dim + 1] == 1)
-    assert torch.all(random_policy_output[: env.n_dim + 1] == 1)
-    assert torch.all(fixed_policy_output[env.n_dim + 1 :: 3] == 1)
-    assert torch.all(
-        fixed_policy_output[env.n_dim + 2 :: 3] == env.fixed_distr_params["beta_alpha"]
-    )
-    assert torch.all(
-        fixed_policy_output[env.n_dim + 3 :: 3] == env.fixed_distr_params["beta_beta"]
-    )
-    assert torch.all(random_policy_output[env.n_dim + 1 :: 3] == 1)
-    assert torch.all(
-        random_policy_output[env.n_dim + 2 :: 3]
-        == env.random_distr_params["beta_alpha"]
-    )
-    assert torch.all(
-        random_policy_output[env.n_dim + 3 :: 3] == env.random_distr_params["beta_beta"]
-    )
-
-
-@pytest.mark.parametrize(
-    "state, expected",
-    [
-        (
-            [0.0, 0.0],
-            [0.0, 0.0],
-        ),
-        (
-            [1.0, 1.0],
-            [1.0, 1.0],
-        ),
-        (
-            [1.1, 1.00001],
-            [1.0, 1.0],
-        ),
-        (
-            [-0.1, 1.00001],
-            [0.0, 1.0],
-        ),
-        (
-            [0.1, 0.21],
-            [0.1, 0.21],
-        ),
-    ],
-)
-def test__state2policy_returns_expected(env, state, expected):
-    assert env.state2policy(state) == expected
-
-
-@pytest.mark.parametrize(
-    "states, expected",
-    [
-        (
-            [[0.0, 0.0], [1.0, 1.0], [1.1, 1.00001], [-0.1, 1.00001], [0.1, 0.21]],
-            [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.1, 0.21]],
-        ),
-    ],
-)
-def test__statebatch_torch2policy_returns_expected(env, states, expected):
-    assert np.equal(env.statebatch2policy(states), np.array(expected)).all()
-    assert torch.equal(
-        env.statetorch2policy(torch.tensor(states)), torch.tensor(expected)
-    )
-
-
-# def test__continuous_env_common(env):
-#     return common.test__continuous_env_common(env)

From 2fbd36a37f1f1db53971945d8f3acd8cdff13f58 Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <pierre.luc.carrier@mila.quebec>
Date: Mon, 25 Sep 2023 11:56:36 -0400
Subject: [PATCH 188/206] Fix spacegroup _is_compatible()

---
 gflownet/envs/crystals/spacegroup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/crystals/spacegroup.py b/gflownet/envs/crystals/spacegroup.py
index 14bdbf103..10053380a 100644
--- a/gflownet/envs/crystals/spacegroup.py
+++ b/gflownet/envs/crystals/spacegroup.py
@@ -608,7 +608,9 @@ def _is_compatible(
         False otherwise.
         """
         # Get list of space groups compatible with the composition
-        space_groups = [self.n_atoms_compatibility_dict[sg] for sg in self.space_groups]
+        space_groups = [
+            sg for sg in self.space_groups if self.n_atoms_compatibility_dict[sg]
+        ]
 
         # Prune the list of space groups to those compatible with the provided crystal-
         # lattice system

From 553731cd34c2ac17428c027d44d2513ac59d8e4d Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <pierre.luc.carrier@mila.quebec>
Date: Mon, 25 Sep 2023 11:56:36 -0400
Subject: [PATCH 189/206] Fix spacegroup _is_compatible()

---
 gflownet/envs/crystals/spacegroup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gflownet/envs/crystals/spacegroup.py b/gflownet/envs/crystals/spacegroup.py
index 14bdbf103..10053380a 100644
--- a/gflownet/envs/crystals/spacegroup.py
+++ b/gflownet/envs/crystals/spacegroup.py
@@ -608,7 +608,9 @@ def _is_compatible(
         False otherwise.
         """
         # Get list of space groups compatible with the composition
-        space_groups = [self.n_atoms_compatibility_dict[sg] for sg in self.space_groups]
+        space_groups = [
+            sg for sg in self.space_groups if self.n_atoms_compatibility_dict[sg]
+        ]
 
         # Prune the list of space groups to those compatible with the provided crystal-
         # lattice system

From e37cbefc93cba7a0d2f803b113087884875099ce Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 11 Sep 2023 17:25:39 -0400
Subject: [PATCH 190/206] Control number of repetitions and batch size with
 global variables and set lightweight default values.

---
 tests/gflownet/utils/test_batch.py | 54 +++++++++++++++++-------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/tests/gflownet/utils/test_batch.py b/tests/gflownet/utils/test_batch.py
index d7d57ec63..61c77c873 100644
--- a/tests/gflownet/utils/test_batch.py
+++ b/tests/gflownet/utils/test_batch.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
 import torch
-
 from gflownet.envs.ctorus import ContinuousTorus
 from gflownet.envs.grid import Grid
 from gflownet.envs.tetris import Tetris
@@ -19,6 +18,13 @@
     tlong,
 )
 
+# Sets the number of repetitions for the tests. Please increase to ~10 after
+# introducing changes to the Batch class and decrease again to 1 when passed.
+N_REPEATS = 2
+# Sets the batch size for the tests. Please increase to ~10 after introducing changes
+# to the Batch class and decrease again to 5 when passed.
+BATCH_SIZE = 5
+
 
 @pytest.fixture
 def batch():
@@ -64,7 +70,7 @@ def test__len__returnszero_at_init(batch):
     assert len(batch) == 0
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__add_to_batch__single_env_adds_expected(env, batch, request):
@@ -89,7 +95,7 @@ def test__add_to_batch__single_env_adds_expected(env, batch, request):
         assert batch.state_indices[-1] == env.n_actions
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__get_states__single_env_returns_expected(env, batch, request):
@@ -120,7 +126,7 @@ def test__get_states__single_env_returns_expected(env, batch, request):
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__get_parents__single_env_returns_expected(env, batch, request):
@@ -152,7 +158,7 @@ def test__get_parents__single_env_returns_expected(env, batch, request):
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__get_parents_all__single_env_returns_expected(env, batch, request):
@@ -194,7 +200,7 @@ def test__get_parents_all__single_env_returns_expected(env, batch, request):
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__get_masks_forward__single_env_returns_expected(env, batch, request):
@@ -214,7 +220,7 @@ def test__get_masks_forward__single_env_returns_expected(env, batch, request):
     assert torch.equal(masks_forward_batch, tbool(masks_forward, device=batch.device))
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__get_masks_backward__single_env_returns_expected(env, batch, request):
@@ -234,7 +240,7 @@ def test__get_masks_backward__single_env_returns_expected(env, batch, request):
     assert torch.equal(masks_backward_batch, tbool(masks_backward, device=batch.device))
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize(
     "env, proxy",
     [("grid2d", "corners"), ("tetris6x4", "tetris_score"), ("ctorus2d5l", "corners")],
@@ -265,14 +271,14 @@ def test__get_rewards__single_env_returns_expected(env, proxy, batch, request):
     ), (rewards, rewards_batch)
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize(
     "env, proxy",
     [("grid2d", "corners"), ("tetris6x4", "tetris_score"), ("ctorus2d5l", "corners")],
 )
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__forward_sampling_multiple_envs_all_as_expected(env, proxy, batch, request):
-    batch_size = 10
+    batch_size = BATCH_SIZE
     env_ref = request.getfixturevalue(env)
     proxy = request.getfixturevalue(proxy)
     env_ref.proxy = proxy
@@ -444,14 +450,14 @@ def test__forward_sampling_multiple_envs_all_as_expected(env, proxy, batch, requ
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize(
     "env, proxy",
     [("grid2d", "corners"), ("tetris6x4", "tetris_score")],
 )
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__backward_sampling_multiple_envs_all_as_expected(env, proxy, batch, request):
-    batch_size = 10
+    batch_size = BATCH_SIZE
     env_ref = request.getfixturevalue(env)
     proxy = request.getfixturevalue(proxy)
     env_ref.proxy = proxy
@@ -630,7 +636,7 @@ def test__backward_sampling_multiple_envs_all_as_expected(env, proxy, batch, req
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize(
     "env, proxy",
     [("grid2d", "corners"), ("tetris6x4", "tetris_score")],
@@ -662,7 +668,7 @@ def test__mixed_sampling_multiple_envs_all_as_expected(env, proxy, batch, reques
     ### FORWARD ###
 
     # Make list of envs
-    batch_size_forward = 10
+    batch_size_forward = BATCH_SIZE
     envs = []
     for idx in range(batch_size_forward):
         env_aux = env_ref.copy().reset(idx)
@@ -712,7 +718,7 @@ def test__mixed_sampling_multiple_envs_all_as_expected(env, proxy, batch, reques
     ### BACKWARD ###
 
     # Sample terminating states and build list of envs
-    batch_size_backward = 10
+    batch_size_backward = BATCH_SIZE
     x_batch = env_ref.get_random_terminating_states(n_states=batch_size_backward)
     envs = []
     for idx, x in enumerate(x_batch):
@@ -873,7 +879,7 @@ def test__mixed_sampling_multiple_envs_all_as_expected(env, proxy, batch, reques
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize(
     "env, proxy",
     [("grid2d", "corners"), ("tetris6x4", "tetris_score")],
@@ -906,7 +912,7 @@ def test__mixed_sampling_merged_all_as_expected(env, proxy, request):
     ### FORWARD ###
 
     # Make list of envs
-    batch_size_forward = 10
+    batch_size_forward = BATCH_SIZE
     envs = []
     for idx in range(batch_size_forward):
         env_aux = env_ref.copy().reset(idx)
@@ -956,7 +962,7 @@ def test__mixed_sampling_merged_all_as_expected(env, proxy, request):
     ### BACKWARD ###
 
     # Sample terminating states and build list of envs
-    batch_size_backward = 10
+    batch_size_backward = BATCH_SIZE
     x_batch = env_ref.get_random_terminating_states(n_states=batch_size_backward)
     envs = []
     for idx, x in enumerate(x_batch):
@@ -1122,13 +1128,13 @@ def test__mixed_sampling_merged_all_as_expected(env, proxy, request):
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__make_indices_consecutive__shuffled_indices_become_consecutive(
     env, batch, request
 ):
-    batch_size = 10
+    batch_size = BATCH_SIZE
     env_ref = request.getfixturevalue(env)
     batch.set_env(env_ref)
 
@@ -1185,13 +1191,13 @@ def test__make_indices_consecutive__shuffled_indices_become_consecutive(
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__make_indices_consecutive__random_indices_become_consecutive(
     env, batch, request
 ):
-    batch_size = 10
+    batch_size = BATCH_SIZE
     env_ref = request.getfixturevalue(env)
     batch.set_env(env_ref)
 
@@ -1250,13 +1256,13 @@ def test__make_indices_consecutive__random_indices_become_consecutive(
     )
 
 
-@pytest.mark.repeat(10)
+@pytest.mark.repeat(N_REPEATS)
 @pytest.mark.parametrize("env", ["grid2d", "tetris6x4", "ctorus2d5l"])
 # @pytest.mark.skip(reason="skip while developping other tests")
 def test__make_indices_consecutive__multiplied_indices_become_consecutive(
     env, batch, request
 ):
-    batch_size = 10
+    batch_size = BATCH_SIZE
     env_ref = request.getfixturevalue(env)
     batch.set_env(env_ref)
 

From 19c2660a31bd69f07b3a0c9af36425b5dadc0828 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 13:16:37 -0400
Subject: [PATCH 191/206] black

---
 tests/gflownet/utils/test_batch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/gflownet/utils/test_batch.py b/tests/gflownet/utils/test_batch.py
index 61c77c873..b183eadf2 100644
--- a/tests/gflownet/utils/test_batch.py
+++ b/tests/gflownet/utils/test_batch.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 import torch
+
 from gflownet.envs.ctorus import ContinuousTorus
 from gflownet.envs.grid import Grid
 from gflownet.envs.tetris import Tetris

From 993f4568fd8c466244d873de2b7133e9b21a6347 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 13:27:02 -0400
Subject: [PATCH 192/206] Add warning if test is skipped because of None states
 so that it does not go silent.

---
 tests/gflownet/envs/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 6a33aa830..2ec82fbef 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -131,6 +131,7 @@ def test__sampling_forwards_reaches_done_in_finite_steps(env):
 def test__set_state__creates_new_copy_of_state(env):
     states = _get_terminating_states(env, 5)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     envs = []
     for state in states:
@@ -146,6 +147,7 @@ def test__set_state__creates_new_copy_of_state(env):
 def test__sample_actions__backward__returns_eos_if_done(env, n=5):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     # Set states, done and get masks
     masks = []
@@ -170,6 +172,7 @@ def test__sample_actions__backward__returns_eos_if_done(env, n=5):
 def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     # Set states, done and get masks
     masks = []
@@ -199,6 +202,7 @@ def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
 def test__sample_backwards_reaches_source(env, n=100):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     for state in states:
         env.set_state(state, done=True)
@@ -360,6 +364,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     policy_random = torch.unsqueeze(
         tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0

From 90e792ee3cf176499ff7afc1330fe03426257417 Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <pierre.luc.carrier@mila.quebec>
Date: Mon, 25 Sep 2023 12:16:36 -0400
Subject: [PATCH 193/206] Skip test if states are none

---
 tests/gflownet/envs/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 701594a7e..b09d52dc4 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -395,6 +395,8 @@ def test__trajectories_are_reversible(env):
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     states = _get_terminating_states(env, n)
+    if states is None:
+        return
     policy_random = torch.unsqueeze(
         tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0
     )

From 74496766d795a1907463ceee6f4e1de121f5b2f0 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 13:27:02 -0400
Subject: [PATCH 194/206] Add warning if test is skipped because of None states
 so that it does not go silent.

---
 tests/gflownet/envs/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index b09d52dc4..ea0e26f92 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -133,6 +133,7 @@ def test__sampling_forwards_reaches_done_in_finite_steps(env):
 def test__set_state__creates_new_copy_of_state(env):
     states = _get_terminating_states(env, 5)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     envs = []
     for state in states:
@@ -148,6 +149,7 @@ def test__set_state__creates_new_copy_of_state(env):
 def test__sample_actions__backward__returns_eos_if_done(env, n=5):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     # Set states, done and get masks
     masks = []
@@ -172,6 +174,7 @@ def test__sample_actions__backward__returns_eos_if_done(env, n=5):
 def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     # Set states, done and get masks
     masks = []
@@ -201,6 +204,7 @@ def test__get_logprobs__backward__returns_zero_if_done(env, n=5):
 def test__sample_backwards_reaches_source(env, n=100):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     for state in states:
         env.set_state(state, done=True)
@@ -396,6 +400,7 @@ def test__trajectories_are_reversible(env):
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     states = _get_terminating_states(env, n)
     if states is None:
+        warnings.warn("Skipping test because states are None.")
         return
     policy_random = torch.unsqueeze(
         tfloat(env.random_policy_output, float_type=env.float, device=env.device), 0

From aca3a3c7719947bd76c479134d987d232a2707f9 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 16:19:52 -0400
Subject: [PATCH 195/206] Skip test about reversible trajectories for Crystal
 because it's broken.

---
 tests/gflownet/envs/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index ea0e26f92..475745a21 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -365,6 +365,11 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 
 @pytest.mark.repeat(1000)
 def test__trajectories_are_reversible(env):
+    # Skip for ceertain environments until fixed:
+    skip_envs = ["Crystal"]
+    if env.__class__.__name__ in skip_envs:
+        warnings.warn("Skipping test for this specific environment.")
+        return
     env = env.reset()
 
     # Sample random forward trajectory

From 67bdca3400cae18cd85ac4280d3560fcbe1e6910 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 16:28:19 -0400
Subject: [PATCH 196/206] Skip test__backward_actions_have_nonzero_forward_prob
 for LatticeParameters because backward sampling is broken.

---
 tests/gflownet/envs/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 475745a21..9fb00c664 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -403,6 +403,11 @@ def test__trajectories_are_reversible(env):
 
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
+    # Skip for certain environments until fixed:
+    skip_envs = ["LatticeParameters"]
+    if env.__class__.__name__ in skip_envs:
+        warnings.warn("Skipping test for this specific environment.")
+        return
     states = _get_terminating_states(env, n)
     if states is None:
         warnings.warn("Skipping test because states are None.")

From 2f17d048b81054384f62baab804e03af71d4bdc8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 16:28:19 -0400
Subject: [PATCH 197/206] Skip test__backward_actions_have_nonzero_forward_prob
 for LatticeParameters because backward sampling is broken.

---
 tests/gflownet/envs/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 2ec82fbef..9fbb105e1 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -362,6 +362,11 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
+    # Skip for certain environments until fixed:
+    skip_envs = ["LatticeParameters"]
+    if env.__class__.__name__ in skip_envs:
+        warnings.warn("Skipping test for this specific environment.")
+        return
     states = _get_terminating_states(env, n)
     if states is None:
         warnings.warn("Skipping test because states are None.")

From 6fb731803fb122e06e5a6a98a1f8d01f563523b8 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 21:53:29 -0400
Subject: [PATCH 198/206] In tree: fixed/random_distribution ->
 fixed/random_distr_params

---
 gflownet/envs/tree.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index c98bf4936..cb4619971 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -158,11 +158,11 @@ def __init__(
         threshold_components: int = 1,
         beta_params_min: float = 0.1,
         beta_params_max: float = 2.0,
-        fixed_distribution: dict = {
+        fixed_distr_params: dict = {
             "beta_alpha": 2.0,
             "beta_beta": 5.0,
         },
-        random_distribution: dict = {
+        random_distr_params: dict = {
             "beta_alpha": 1.0,
             "beta_beta": 1.0,
         },
@@ -294,8 +294,8 @@ def __init__(
         self.statetorch2oracle = self.statetorch2policy
 
         super().__init__(
-            fixed_distribution=fixed_distribution,
-            random_distribution=random_distribution,
+            fixed_distr_params=fixed_distr_params,
+            random_distr_params=random_distr_params,
             continuous=continuous,
             **kwargs,
         )

From b89c7957dce392919762df14764aab23fe44d3e7 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Sun, 24 Sep 2023 22:56:13 -0400
Subject: [PATCH 199/206] resolve cherry-pick#

---
 gflownet/envs/tree.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gflownet/envs/tree.py b/gflownet/envs/tree.py
index cb4619971..bc92cda13 100644
--- a/gflownet/envs/tree.py
+++ b/gflownet/envs/tree.py
@@ -670,10 +670,11 @@ def sample_actions_batch_continuous(
             policy_outputs_discrete = policy_outputs[
                 is_discrete, : self._index_continuous_policy_output
             ]
+            # states_from can be None because it will be ignored
             actions_discrete, logprobs_discrete = super().sample_actions_batch(
                 policy_outputs_discrete,
                 mask[is_discrete, : self._index_continuous_policy_output],
-                states_from,
+                None,
                 is_backward,
                 sampling_method,
                 temperature_logits,
@@ -773,12 +774,13 @@ def get_logprobs_continuous(
             policy_outputs_discrete = policy_outputs[
                 mask_discrete, : self._index_continuous_policy_output
             ]
+            # states_from can be None because it will be ignored
             logprobs_discrete = super().get_logprobs(
                 policy_outputs_discrete,
-                is_backward,
                 actions[mask_discrete],
-                states_from[mask_discrete],
                 mask[mask_discrete, : self._index_continuous_policy_output],
+                None,
+                is_backward,
             )
             logprobs[mask_discrete] = logprobs_discrete
         if torch.all(mask_discrete):

From 3702057d57bcbd9f3ee04058036c1c896df5d21b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 17:05:46 -0400
Subject: [PATCH 200/206] Add Tree to skipped envs in reversible trajs test.

---
 tests/gflownet/envs/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 9fb00c664..0d8203f39 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -365,8 +365,8 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 
 @pytest.mark.repeat(1000)
 def test__trajectories_are_reversible(env):
-    # Skip for ceertain environments until fixed:
-    skip_envs = ["Crystal"]
+    # Skip for certain environments until fixed:
+    skip_envs = ["Crystal, Tree"]
     if env.__class__.__name__ in skip_envs:
         warnings.warn("Skipping test for this specific environment.")
         return

From 00c0d9a64d250495ac373111ab6579aa874de63f Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 17:30:58 -0400
Subject: [PATCH 201/206] Add more exceptions so that test do not crash.

---
 tests/gflownet/envs/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index 0d8203f39..fd338b30b 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -366,7 +366,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 @pytest.mark.repeat(1000)
 def test__trajectories_are_reversible(env):
     # Skip for certain environments until fixed:
-    skip_envs = ["Crystal, Tree"]
+    skip_envs = ["Crystal, LatticeParameters, Tree"]
     if env.__class__.__name__ in skip_envs:
         warnings.warn("Skipping test for this specific environment.")
         return
@@ -404,7 +404,7 @@ def test__trajectories_are_reversible(env):
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     # Skip for certain environments until fixed:
-    skip_envs = ["LatticeParameters"]
+    skip_envs = ["Crystal, LatticeParameters"]
     if env.__class__.__name__ in skip_envs:
         warnings.warn("Skipping test for this specific environment.")
         return

From 2f762c7646265dc45c33b52c7a915e706044ad35 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 17:34:00 -0400
Subject: [PATCH 202/206] Fix stupid mistake

---
 tests/gflownet/envs/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/gflownet/envs/common.py b/tests/gflownet/envs/common.py
index fd338b30b..67d4b7b9b 100644
--- a/tests/gflownet/envs/common.py
+++ b/tests/gflownet/envs/common.py
@@ -366,7 +366,7 @@ def test__forward_actions_have_nonzero_backward_prob(env):
 @pytest.mark.repeat(1000)
 def test__trajectories_are_reversible(env):
     # Skip for certain environments until fixed:
-    skip_envs = ["Crystal, LatticeParameters, Tree"]
+    skip_envs = ["Crystal", "LatticeParameters", "Tree"]
     if env.__class__.__name__ in skip_envs:
         warnings.warn("Skipping test for this specific environment.")
         return
@@ -404,7 +404,7 @@ def test__trajectories_are_reversible(env):
 
 def test__backward_actions_have_nonzero_forward_prob(env, n=1000):
     # Skip for certain environments until fixed:
-    skip_envs = ["Crystal, LatticeParameters"]
+    skip_envs = ["Crystal", "LatticeParameters"]
     if env.__class__.__name__ in skip_envs:
         warnings.warn("Skipping test for this specific environment.")
         return

From 13f6f0a08ef666b0d50a21201cebc89c2790ad09 Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 18:38:01 -0400
Subject: [PATCH 203/206] flake8 and docstring

---
 gflownet/envs/cube.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 41f88402e..5fb196fd4 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -8,11 +8,9 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-import numpy.typing as npt
-import pandas as pd
 import torch
 from sklearn.neighbors import KernelDensity
-from torch.distributions import Bernoulli, Beta, Categorical, MixtureSameFamily, Uniform
+from torch.distributions import Bernoulli, Beta, Categorical, MixtureSameFamily
 from torchtyping import TensorType
 
 from gflownet.envs.base import GFlowNetEnv
@@ -353,8 +351,8 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         For each dimension d of the hyper-cube and component c of the mixture, the
         output of the policy should return:
           1) the weight of the component in the mixture,
-          2) the logit(alpha) parameter of the Beta distribution to sample the increment,
-          3) the logit(beta) parameter of the Beta distribution to sample the increment.
+          2) the pre-alpha parameter of the Beta distribution to sample the increment,
+          3) the pre-beta parameter of the Beta distribution to sample the increment.
 
         These parameters are the first n_dim * n_comp * 3 of the policy output such
         that the first 3 x C elements correspond to the first dimension, and so on.
@@ -369,6 +367,10 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
+        
+        See
+        ---
+        _beta_params_to_policy_outputs()
         """
         # Parameters for continuous actions
         self._len_policy_output_cont = self.n_dim * self.n_comp * 3

From 0b143e135f7210245f868f74da40fb12ce71581d Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Mon, 25 Sep 2023 18:39:28 -0400
Subject: [PATCH 204/206] black

---
 gflownet/envs/cube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gflownet/envs/cube.py b/gflownet/envs/cube.py
index 5fb196fd4..a66363286 100644
--- a/gflownet/envs/cube.py
+++ b/gflownet/envs/cube.py
@@ -367,7 +367,7 @@ def get_policy_output(self, params: dict) -> TensorType["policy_output_dim"]:
         Therefore, the output of the policy model has dimensionality D x C x 3 + 2,
         where D is the number of dimensions (self.n_dim) and C is the number of
         components (self.n_comp).
-        
+
         See
         ---
         _beta_params_to_policy_outputs()

From 9cd47a6e186f79002d66f00463e231c26b89c65a Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 19 Oct 2023 17:17:07 -0400
Subject: [PATCH 205/206] Remove get_jacobian_diag() from base env.

---
 gflownet/envs/base.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/gflownet/envs/base.py b/gflownet/envs/base.py
index cc6c3c3d0..9b5e81f3a 100644
--- a/gflownet/envs/base.py
+++ b/gflownet/envs/base.py
@@ -552,20 +552,6 @@ def get_logprobs(
         logprobs = self.logsoftmax(logits)[ns_range, action_indices]
         return logprobs
 
-    def get_jacobian_diag(
-        self,
-        states: TensorType["batch_size", "state_dim"],
-        is_backward: bool = False,
-        **kwargs,
-    ):
-        """
-        Computes the logarithm of the determinant of the Jacobian of the sampled
-        actions with respect to the states. In general, the determinant is equal to 1.
-        Environments where this is not the case must implement the computation of the
-        Jacobian for forward and backward transitions.
-        """
-        return torch.ones(states.shape, device=states.device, dtype=self.float)
-
     # TODO: add seed
     def step_random(self, backward: bool = False):
         """

From e0415565543a790b79564a4633c58431f999456b Mon Sep 17 00:00:00 2001
From: alexhernandezgarcia <alexhdezgarcia@gmail.com>
Date: Thu, 19 Oct 2023 17:32:06 -0400
Subject: [PATCH 206/206] Remove yaml files for hyper parameter search

---
 .../hyperparams_search_20230920_batch1.yaml   | 145 -----------
 .../hyperparams_search_20230920_batch2.yaml   | 145 -----------
 .../hyperparams_search_20230920_batch3.yaml   | 145 -----------
 .../hyperparams_search_20230920_batch4.yaml   | 229 ------------------
 4 files changed, 664 deletions(-)
 delete mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
 delete mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
 delete mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
 delete mode 100644 config/experiments/ccube/hyperparams_search_20230920_batch4.yaml

diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
deleted file mode 100644
index 87e44bfb5..000000000
--- a/config/experiments/ccube/hyperparams_search_20230920_batch1.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-# Shared config
-shared:
-  slurm: {}
-  script:
-    user: $USER
-    device: cpu
-    logger:
-      project_name: cube
-      do:
-        online: True
-      test:
-        period: 500
-        n: 900
-      checkpoints:
-        period: 10000
-    # Contiunuous Cube environment
-    env:
-      __value__: ccube
-      n_dim: 2
-      # Buffer
-      buffer:
-        data_path: null
-        train: null
-        test:
-          type: grid
-          n: 1000
-          output_csv: ccube_test.csv
-          output_pkl: ccube_test.pkl
-    # Proxy
-    proxy: corners
-    # GFlowNet config
-    gflownet:
-      __value__: trajectorybalance
-      random_action_prob: 0.1
-      optimizer:
-        batch_size:
-          forward: 100
-        lr: 0.0001
-        z_dim: 16
-        lr_z_mult: 100
-        n_train_steps: 10000
-    # Policy
-    +gflownet:
-      policy:
-        forward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: forward
-    # Use + to add new variables
-    +gflownet:
-      policy:
-        backward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: backward
-          shared_weights: False
-
-# Jobs
-jobs:
-  - slurm:
-      job_name: pigeonish
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.01
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
-  - slurm:
-      job_name: finch
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
-  - slurm:
-      job_name: dove
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
-  - slurm:
-      job_name: pine
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.01
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
-  - slurm:
-      job_name: spruce
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
-  - slurm:
-      job_name: fir
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.7311
-          bernoulli_bts_prob: 0.7311
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
deleted file mode 100644
index 93491e3e9..000000000
--- a/config/experiments/ccube/hyperparams_search_20230920_batch2.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-# Shared config
-shared:
-  slurm: {}
-  script:
-    user: $USER
-    device: cpu
-    logger:
-      project_name: cube
-      do:
-        online: True
-      test:
-        period: 500
-        n: 900
-      checkpoints:
-        period: 10000
-    # Contiunuous Cube environment
-    env:
-      __value__: ccube
-      n_dim: 2
-      # Buffer
-      buffer:
-        data_path: null
-        train: null
-        test:
-          type: grid
-          n: 1000
-          output_csv: ccube_test.csv
-          output_pkl: ccube_test.pkl
-    # Proxy
-    proxy: corners
-    # GFlowNet config
-    gflownet:
-      __value__: trajectorybalance
-      random_action_prob: 0.1
-      optimizer:
-        batch_size:
-          forward: 100
-        lr: 0.0001
-        z_dim: 16
-        lr_z_mult: 100
-        n_train_steps: 10000
-    # Policy
-    +gflownet:
-      policy:
-        forward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: forward
-    # Use + to add new variables
-    +gflownet:
-      policy:
-        backward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: backward
-          shared_weights: False
-
-# Jobs
-jobs:
-  - slurm:
-      job_name: large
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.01
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: cedar
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: hemlock
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: yew
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.01
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: cycad
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: palm
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
deleted file mode 100644
index 7912af9b3..000000000
--- a/config/experiments/ccube/hyperparams_search_20230920_batch3.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-# Shared config
-shared:
-  slurm: {}
-  script:
-    user: $USER
-    device: cpu
-    logger:
-      project_name: cube
-      do:
-        online: True
-      test:
-        period: 500
-        n: 900
-      checkpoints:
-        period: 10000
-    # Contiunuous Cube environment
-    env:
-      __value__: ccube
-      n_dim: 2
-      # Buffer
-      buffer:
-        data_path: null
-        train: null
-        test:
-          type: grid
-          n: 1000
-          output_csv: ccube_test.csv
-          output_pkl: ccube_test.pkl
-    # Proxy
-    proxy: corners
-    # GFlowNet config
-    gflownet:
-      __value__: trajectorybalance
-      random_action_prob: 0.1
-      optimizer:
-        batch_size:
-          forward: 100
-        lr: 0.0001
-        z_dim: 16
-        lr_z_mult: 100
-        n_train_steps: 10000
-    # Policy
-    +gflownet:
-      policy:
-        forward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: forward
-    # Use + to add new variables
-    +gflownet:
-      policy:
-        backward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: backward
-          shared_weights: False
-
-# Jobs
-jobs:
-  - slurm:
-      job_name: papaya
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.01
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: mango
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: pineapple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: apple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.01
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: pear
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: avocado
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 1
-        beta_params_max: 1000.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 100.0
-          beta_beta: 100.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
diff --git a/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml b/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
deleted file mode 100644
index cc82e322c..000000000
--- a/config/experiments/ccube/hyperparams_search_20230920_batch4.yaml
+++ /dev/null
@@ -1,229 +0,0 @@
-# Shared config
-shared:
-  slurm: {}
-  script:
-    user: $USER
-    device: cpu
-    logger:
-      project_name: cube
-      do:
-        online: True
-      test:
-        period: 500
-        n: 900
-      checkpoints:
-        period: 10000
-    # Contiunuous Cube environment
-    env:
-      __value__: ccube
-      n_dim: 2
-      # Buffer
-      buffer:
-        data_path: null
-        train: null
-        test:
-          type: grid
-          n: 1000
-          output_csv: ccube_test.csv
-          output_pkl: ccube_test.pkl
-    # Proxy
-    proxy: corners
-    # GFlowNet config
-    gflownet:
-      __value__: trajectorybalance
-      random_action_prob: 0.1
-      optimizer:
-        batch_size:
-          forward: 100
-        lr: 0.0001
-        z_dim: 16
-        lr_z_mult: 100
-        n_train_steps: 10000
-    # Policy
-    +gflownet:
-      policy:
-        forward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: forward
-    # Use + to add new variables
-    +gflownet:
-      policy:
-        backward:
-          type: mlp
-          n_hid: 512
-          n_layers: 5
-          checkpoint: backward
-          shared_weights: False
-
-# Jobs
-jobs:
-  - slurm:
-      job_name: papaya
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: mango
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: pineapple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5
-  - slurm:
-      job_name: apple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 2
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5
-  - slurm:
-      job_name: papaya
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: mango
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: pineapple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5
-  - slurm:
-      job_name: apple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 5
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5
-  - slurm:
-      job_name: papaya
-    script:
-      env:
-        __value__: ccube
-        n_comp: 1
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: mango
-    script:
-      env:
-        __value__: ccube
-        n_comp: 1
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.1
-          bernoulli_bts_prob: 0.1
-  - slurm:
-      job_name: pineapple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 1
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5
-  - slurm:
-      job_name: apple
-    script:
-      env:
-        __value__: ccube
-        n_comp: 1
-        beta_params_min: 0.1
-        beta_params_max: 100.0
-        random_distr_params:
-          beta_weights: 1.0
-          beta_alpha: 10.0
-          beta_beta: 10.0
-          bernoulli_eos_prob: 0.5
-          bernoulli_bts_prob: 0.5