Commit
Linesight-RL#83 Make terminal_actions int
Warning was:
Expected type 'int', got 'float' instead
Wuodan committed Aug 22, 2024
1 parent 5ec1ecf commit 602455d
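The warning stems from `math.inf` being a float. A quick sketch of why an int sentinel such as `2**31 - 1` silences it (the name `INT32_MAX` is illustrative, not used in the repository):

```python
import math

# math.inf is a float, so assigning it to a field annotated as int triggers
# "Expected type 'int', got 'float' instead" in type checkers.
assert isinstance(math.inf, float)

# The commit swaps in 2**31 - 1, a plain int sentinel.
INT32_MAX = 2**31 - 1  # illustrative name for the literal used in the diff
assert isinstance(INT32_MAX, int)
assert INT32_MAX == 2147483647
```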
Showing 4 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/build/html/main_objects.html
@@ -137,7 +137,7 @@ <h2>Experience<a class="headerlink" href="#experience" title="Link to this headi
<span class="sd"> next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32</span>
<span class="sd">(state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng&#39;s paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf</span>
<span class="sd">action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py</span>
- <span class="sd">terminal_actions is an integer representing the number of steps between &quot;state&quot; and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf</span>
+ <span class="sd">terminal_actions is an integer representing the number of steps between &quot;state&quot; and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains 2^31 - 1</span>
<span class="sd">n_steps How many steps were taken between &quot;state&quot; and &quot;next state&quot;. Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled.</span>
<span class="sd">gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc...</span>
<span class="sd">rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc...</span>
2 changes: 1 addition & 1 deletion docs/source/main_objects.rst
@@ -53,7 +53,7 @@ The class ``Experience`` defined in ``trackmania_rl/experience_replay/`` defines
next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32
(state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng's paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf
action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py
- terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf
+ terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains 2^31 - 1
n_steps How many steps were taken between "state" and "next state". Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled.
gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc...
rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc...
4 changes: 2 additions & 2 deletions trackmania_rl/buffer_management.py
@@ -3,7 +3,7 @@
Its main inputs are a rollout_results object (obtained from a GameInstanceManager object), and a buffer to be filled.
It reassembles the rollout_results object into transitions, as defined in /trackmania_rl/experience_replay/experience_replay_interface.py
"""
- import math
+
import random

import numpy as np
@@ -108,7 +108,7 @@ def fill_buffer_from_rollout_with_n_steps_rule(

# Get action that was played
action = rollout_results["actions"][i]
- terminal_actions = float((n_frames - 1) - i) if "race_time" in rollout_results else math.inf
+ terminal_actions = n_frames - 1 - i if "race_time" in rollout_results else 2**31 - 1
next_state_has_passed_finish = ((i + n_steps) == (n_frames - 1)) and ("race_time" in rollout_results)

if not next_state_has_passed_finish:
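A minimal sketch of the new branch logic above, using hypothetical `rollout_results` dicts (only the `"race_time"` key and the arithmetic follow the diff; everything else is illustrative):

```python
# Hypothetical minimal rollout_results dicts; field names follow the diff.
finished = {"race_time": 42.0, "actions": [0, 1, 2]}
cut_off = {"actions": [0, 1, 2]}  # no "race_time": rollout was cut off early

def terminal_actions(rollout_results, n_frames, i):
    # Steps remaining between frame i and race finish, or an int sentinel
    # when the rollout never finished (previously math.inf, a float).
    return n_frames - 1 - i if "race_time" in rollout_results else 2**31 - 1

assert terminal_actions(finished, n_frames=3, i=0) == 2
assert terminal_actions(cut_off, n_frames=3, i=0) == 2**31 - 1
# Both branches now return int, which is what resolves the warning:
assert all(isinstance(terminal_actions(r, 3, 0), int) for r in (finished, cut_off))
```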
3 changes: 2 additions & 1 deletion trackmania_rl/experience_replay/experience_replay_interface.py
@@ -2,6 +2,7 @@
In this file, we define the Experience type.
This is used to represent a transition sampled from a ReplayBuffer.
"""
+
import numpy.typing as npt


@@ -15,7 +16,7 @@ class Experience:
next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32
(state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng's paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf
action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py
- terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf
+ terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains 2^31 - 1
n_steps How many steps were taken between "state" and "next state". Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled.
gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc...
rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc...
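One consequence for downstream code, sketched under the assumption that finish checks previously relied on `math.isinf`: with an int sentinel, consumers must compare for equality instead. `TERMINAL_SENTINEL` and `rollout_finished` are hypothetical names; the diff uses the literal `2**31 - 1` and does not show the consuming code.

```python
TERMINAL_SENTINEL = 2**31 - 1  # assumed name for the literal used in the diff

def rollout_finished(terminal_actions: int) -> bool:
    # With math.inf, a check like math.isinf(terminal_actions) worked;
    # with an int sentinel, an equality comparison is needed instead.
    return terminal_actions != TERMINAL_SENTINEL

assert rollout_finished(5)
assert not rollout_finished(TERMINAL_SENTINEL)
```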
