diff --git a/docs/build/html/main_objects.html b/docs/build/html/main_objects.html index be830a2f..193025cc 100644 --- a/docs/build/html/main_objects.html +++ b/docs/build/html/main_objects.html @@ -137,7 +137,7 @@

Experience next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32 (state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng's paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py -terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf +terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (i.e., early cutoff), then contains 2^31 - 1 n_steps How many steps were taken between "state" and "next state". Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled. gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc... rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc... 
diff --git a/docs/source/main_objects.rst b/docs/source/main_objects.rst index c9cf3375..7be4f8cd 100644 --- a/docs/source/main_objects.rst +++ b/docs/source/main_objects.rst @@ -53,7 +53,7 @@ The class ``Experience`` defined in ``trackmania_rl/experience_replay/`` defines next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32 (state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng's paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py - terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf + terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (i.e., early cutoff), then contains 2^31 - 1 n_steps How many steps were taken between "state" and "next state". Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled. gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc... rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc... diff --git a/trackmania_rl/buffer_management.py b/trackmania_rl/buffer_management.py index 82e3b955..f8ab06b7 100644 --- a/trackmania_rl/buffer_management.py +++ b/trackmania_rl/buffer_management.py @@ -3,7 +3,7 @@ Its main inputs are a rollout_results object (obtained from a GameInstanceManager object), and a buffer to be filled. 
It reassembles the rollout_results object into transitions, as defined in /trackmania_rl/experience_replay/experience_replay_interface.py """ -import math + import random import numpy as np @@ -108,7 +108,7 @@ def fill_buffer_from_rollout_with_n_steps_rule( # Get action that was played action = rollout_results["actions"][i] - terminal_actions = float((n_frames - 1) - i) if "race_time" in rollout_results else math.inf + terminal_actions = n_frames - 1 - i if "race_time" in rollout_results else 2**31 - 1 next_state_has_passed_finish = ((i + n_steps) == (n_frames - 1)) and ("race_time" in rollout_results) if not next_state_has_passed_finish: diff --git a/trackmania_rl/experience_replay/experience_replay_interface.py b/trackmania_rl/experience_replay/experience_replay_interface.py index af6f43ff..80b9ea6f 100644 --- a/trackmania_rl/experience_replay/experience_replay_interface.py +++ b/trackmania_rl/experience_replay/experience_replay_interface.py @@ -2,6 +2,7 @@ In this file, we define the Experience type. This is used to represent a transition sampled from a ReplayBuffer. """ + import numpy.typing as npt @@ -15,7 +16,7 @@ class Experience: next_state_float is a np.array of shape (config.float_input_dim, ) and dtype np.float32 (state_potential and next_state_potential) are floats, used for reward shaping as per Andrew Ng's paper: https://people.eecs.berkeley.edu/~russell/papers/icml99-shaping.pdf action is an integer representing the action taken for this transition, mapped to config_files/inputs_list.py - terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. If the rollout did not finish (ie: early cutoff), then contains math.inf + terminal_actions is an integer representing the number of steps between "state" and race finish in the rollout from which this transition was extracted. 
If the rollout did not finish (i.e., early cutoff), then contains 2^31 - 1 n_steps How many steps were taken between "state" and "next state". Not all transitions contain the same value, as this may depend on exploration policy. Note that in buffer_collate_function, a transition may be reinterpreted as terminal with a lower n_steps, depending on the random horizon that was sampled. gammas a numpy array of shape (config.n_steps, ) containing the gamma value if steps = 0, 1, 2, etc... rewards a numpy array of shape (config.n_steps, ) containing the reward value if steps = 0, 1, 2, etc...