Why aren't simulations deterministic when the policy is deterministic? #471

itazap · 2024-11-28T16:12:50Z

itazap
Nov 28, 2024

(Sorry I am not sure how to open a discussion, so I am opening an issue)

Is it possible to make 'ppo_balancer/run.py' more deterministic? I am having trouble comparing policies because there is too much randomness between runs (the maximum sagittal push that can be applied without the robot falling varies greatly from one run to the next). I have set deterministic=True and passed the seed to the functions below, but the runs are still not deterministic.

Thank you!

# run.py file changes:

def parse_command_line_arguments() -> argparse.Namespace:
    """!
    Read the agent's options from the command line.

    Two options are declared: an optional positional ``policy`` path and a
    ``--training`` switch that is off unless given.

    @returns Namespace with the parsed ``policy`` and ``training`` values.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # nargs="?" makes the positional optional: it stays None when omitted.
    cli.add_argument(
        "policy", nargs="?", help="path to the policy parameters file"
    )
    cli.add_argument(
        "--training",
        action="store_true",
        default=False,
        help="add noise and actuation lag, as in training",
    )

    namespace = cli.parse_args()
    return namespace


def get_tip_state(
    observation, tip_height: float = 0.58
) -> Tuple[float, float]:
    """!
    Evaluate position and velocity of the virtual tip above the wheels.

    The first four entries of the observation are read as pitch, ground
    position, angular velocity and ground velocity, in that order. The
    result is only meant for logging.

    @param observation Observation vector.
    @param tip_height Height of the virtual tip.
    @returns Pair of tip (position, velocity) in the sagittal plane.
    """
    # Index one entry at a time so that any indexable observation works.
    pitch, wheel_position, pitch_rate, wheel_velocity = (
        observation[index] for index in range(4)
    )

    # Horizontal offset of the tip relative to the ground contact
    position = wheel_position + tip_height * np.sin(pitch)

    # Time derivative of the expression above
    velocity = wheel_velocity + tip_height * pitch_rate * np.cos(pitch)
    return position, velocity


def run_policy(env: gym.Wrapper, policy) -> None:
    """!
    Evaluate the policy with a single push trial.

    The work is delegated to push_balance with a force value of 5; the
    outcome it returns is not used here.

    @param env Upkie environment, wrapped by the agent.
    @param policy MLP policy to follow.
    """
    push_balance(env, policy, force=5)

    # Logging loop that ran here before the push trial, kept for reference:
    #
    # action = np.zeros(env.action_space.shape)
    # observation, info = env.reset()
    # reward = 0.0
    # while True:
    #     action, _ = policy.predict(observation, deterministic=True)
    #     tip_position, tip_velocity = get_tip_state(observation[-1])
    #     env.unwrapped.log("action", action)
    #     env.unwrapped.log("observation", observation[-1])
    #     env.unwrapped.log("reward", reward)
    #     env.unwrapped.log("tip_position", tip_position)
    #     env.unwrapped.log("tip_velocity", tip_velocity)
    #     observation, reward, terminated, truncated, info = env.step(action)
    #     if terminated or truncated:
    #         observation, info = env.reset()


def push_balance(
    env,
    policy,
    force: float,
    check_msfos: bool = False,
    *,
    nb_steps: int = 2000,
    push_start: int = 600,
    push_end: int = 800,
    seed: int = 42,
) -> bool:
    """!
    Run one push trial and report whether the episode survived it.

    The policy is rolled out for ``nb_steps`` steps. On every step in
    ``[push_start, push_end)`` an external force is requested on the torso
    through ``env.unwrapped.bullet_extra`` before the environment is stepped.

    If the episode ends before the push has started, the environment is
    reset with the same seed and the rollout goes on without resetting the
    step counter, unless ``check_msfos`` is set, in which case the trial
    stops there.

    @param env Environment to step; ``env.unwrapped`` must provide
        ``bullet_extra``.
    @param policy Policy whose ``predict`` is queried deterministically.
    @param force Force applied along the first axis of the world frame.
    @param check_msfos If True, return False as soon as an episode ends
        before the push.
    @param nb_steps Total number of environment steps in the trial.
    @param push_start First step index at which the force is applied.
    @param push_end Step index at which the force stops being applied.
    @param seed Seed passed to every ``env.reset`` call.
    @returns True if all steps completed; False if the episode terminated
        or was truncated after the push started, or before it when
        ``check_msfos`` is set.
    """
    # Force vector expressed in the world frame ("local": False below)
    torso_force_in_world = np.zeros(3)
    torso_force_in_world[0] = force
    bullet_action = {
        "external_forces": {
            "torso": {
                "force": torso_force_in_world,
                "local": False,
            }
        }
    }

    observation, _ = env.reset(seed=seed)
    pushed = False
    for step in range(nb_steps):
        if step % 100 == 0:
            print(step)
        action, _ = policy.predict(observation, deterministic=True)
        if push_start <= step < push_end:
            if step % 100 == 0:
                print('PUSHED')
            # Set on every push step, so that detection does not depend on
            # the push window starting on a multiple of 100.
            pushed = True
            env.unwrapped.bullet_extra(bullet_action)  # call before env.step

        observation, _, terminated, truncated, _ = env.step(action)

        if terminated or truncated:
            if pushed:
                # Explicit failure, rather than an implicit None
                return False
            observation, _ = env.reset(seed=seed)
            if check_msfos:
                return False

    return True



def main(policy_path: str, training: bool) -> None:
    """!
    Create the environment, load the policy into it and run the policy.

    @param policy_path Path to policy parameters.
    @param training If True, build the initial robot state with the
        randomization taken from the training settings.
    """
    env_settings = EnvSettings()

    if training:
        init_randomization = RobotStateRandomization(
            **TrainingSettings().init_rand
        )
        init_state = RobotState(randomization=init_randomization)
    else:
        init_state = None

    with gym.make(
        env_settings.env_id,
        frequency=env_settings.agent_frequency,
        init_state=init_state,
        max_ground_velocity=env_settings.max_ground_velocity,
        regulate_frequency=True,
        spine_config=env_settings.spine_config,
    ) as velocity_env:
        # NOTE(review): the wrapper is always built with training=False, so
        # the `training` argument only affects the initial state above.
        # Confirm this is intended rather than `training=training`.
        env = make_ppo_balancer_env(velocity_env, env_settings, training=False)

        # Network architecture comes from the PPO settings
        ppo_settings = PPOSettings()
        net_arch = {
            "pi": ppo_settings.net_arch_pi,
            "vf": ppo_settings.net_arch_vf,
        }
        policy = PPO(
            "MlpPolicy",
            env,
            policy_kwargs={"net_arch": net_arch},
            verbose=0,
        )

        # Overwrite the freshly initialized weights with the saved ones
        policy.set_parameters(policy_path)
        run_policy(env, policy)



if __name__ == "__main__":
    if on_raspi():
        configure_agent_process()

    agent_dir = os.path.abspath(os.path.dirname(__file__))
    args = parse_command_line_arguments()

    # Policy parameters: taken from the command line, falling back to the
    # parameters shipped in the agent directory. A policy trained elsewhere
    # is selected by passing its path as the positional argument rather than
    # by hard-coding a machine-specific path here, which would silently
    # override the command line.
    policy_path = args.policy
    if policy_path is None:
        policy_path = f"{agent_dir}/policy/params.zip"
    if policy_path.endswith(".zip"):
        policy_path = policy_path[:-4]

    # Configuration: read from the same directory as the policy parameters
    policy_dir = os.path.dirname(policy_path)
    config_path = os.path.join(policy_dir, "operative_config.gin")

    logging.info("Loading policy configuration from %s", config_path)
    gin.parse_config_file(config_path)

    try:
        main(policy_path, args.training)
    except KeyboardInterrupt:
        logging.info("Caught a keyboard interrupt")

stephane-caron · 2024-12-03T07:21:46Z

stephane-caron
Dec 3, 2024
Maintainer

When doing reinforcement learning, stochasticity is not a bug, it's a feature 😃

Stochasticity from the physics simulator

Running the same agent again is indeed not deterministic, even when the policy is fully deterministic. This is because there are other sources of stochasticity that we don't control. One of them is the physics simulator, as it performs collision detection and forward dynamics with finite precision. Even when we reset from exactly the same state, as soon as there are some impacts involved, trajectories will have variance:

./start_simulation.sh

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np

import upkie.envs

upkie.envs.register()

# Number of episodes to record, and number of steps in each of them
NB_RUNS = 10
NB_STEPS = 100


if __name__ == "__main__":
    with gym.make("UpkieGroundVelocity-v3", frequency=200.0) as env:
        trajectories = []
        # Zero action with the shape and dtype of the action space
        action = 0.0 * env.action_space.sample()
        for i in range(NB_RUNS):
            observation, _ = env.reset()  # connects to the spine
            trajectories.append([])
            for step in range(NB_STEPS):
                pitch = observation[0]
                ground_pos = observation[1]
                ground_vel = observation[3]
                # Fixed linear feedback: the same deterministic controller is
                # applied in every run.
                action[0] = 10.0 * pitch + 1.0 * ground_pos + 0.1 * ground_vel
                observation, _, _, _, _ = env.step(action)
                trajectories[-1].append(observation[0])

        dt = env.unwrapped.dt
        # Scale an integer range instead of calling np.arange with a float
        # step: the latter can return one sample more or fewer than NB_STEPS
        # because of rounding, which would not match the trajectories below.
        trange = dt * np.arange(NB_STEPS)
        plt.ion()
        plt.grid(True)
        # One column per run after the transpose, so one curve per run
        plt.plot(trange, np.array(trajectories).T)
        plt.ylim(-0.03, 0.03)
        plt.legend(("pitch [rad]",))

Stochasticity from the agent and simulation loops

Another source of stochasticity comes in when the agent and simulation loops are not synchronized. Here is the same example but this time setting --nb-substeps so that the simulator waits for the agent action:

./start_simulation.sh --nb-substeps 5

Still, once the robot makes contact with the environment, there is not one solution but a distribution of them.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Upkie

Why aren't simulations deterministic when the policy is deterministic? #471

{{title}}

Replies: 1 comment

{{title}}

Select a reply

Upkie

Why aren't simulations deterministic when the policy is deterministic? #471

itazap Nov 28, 2024

Replies: 1 comment

stephane-caron Dec 3, 2024 Maintainer

Stochasticity from the physics simulator

Stochasticity from the agent and simulation loops

itazap
Nov 28, 2024

stephane-caron
Dec 3, 2024
Maintainer