SAC/DDPG tweaks (#312)
* sample actions for SacTest agent

* update sac default hyperparameters

* update ddpg default hyperparameters

* fix add_summary bug

* use more standard hyperparameters for ddpg and sac

* remove time body and adjust discount factor

* adjust sac default hyperparameters

* run formatter
cpnota authored Mar 2, 2024
1 parent dc295ab commit 9d06482
Showing 8 changed files with 70 additions and 80 deletions.
all/agents/sac.py: 3 changes (2 additions, 1 deletion)
@@ -134,4 +134,5 @@ def __init__(self, policy):
         self.policy = policy
 
     def act(self, state):
-        return self.policy.eval(state)
+        action, log_prob = self.policy.eval(state)
+        return action
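Background on this change: the soft deterministic policy's eval now returns an (action, log_prob) pair (see the soft_deterministic.py diff below), so the test agent has to unpack the pair and keep only the action. A minimal, self-contained sketch of the pattern, using made-up ToyStochasticPolicy and ToyTestAgent classes rather than the library's real ones:

import torch

class ToyStochasticPolicy:
    def eval(self, state):
        # Sample a squashed Gaussian action and return it with its log-probability.
        dist = torch.distributions.Normal(torch.zeros(2), torch.ones(2))
        raw = dist.sample()
        return torch.tanh(raw), dist.log_prob(raw).sum()

class ToyTestAgent:
    def __init__(self, policy):
        self.policy = policy

    def act(self, state):
        action, log_prob = self.policy.eval(state)  # unpack; log_prob is unused at test time
        return action

agent = ToyTestAgent(ToyStochasticPolicy())
print(agent.act(state=None))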
all/logging/experiment.py: 2 changes (1 addition, 1 deletion)
@@ -40,7 +40,7 @@ def add_summary(self, name, values, step="frame"):
         }
         for aggregator, value in metrics.items():
             super().add_scalar(
-                f"summary/{name}/{aggregator}", value, self._get_step(value)
+                f"summary/{name}/{aggregator}", value, self._get_step(step)
             )
 
         # log summary statistics to file
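In other words, the summary scalar's x-coordinate should be derived from the step argument (e.g. "frame"), not from the metric value itself. A simplified, illustrative sketch of the intent; ToySummaryLogger and its methods are stand-ins, not the library's real logger API:

class ToySummaryLogger:
    def __init__(self):
        self.frame = 0
        self.rows = []

    def _get_step(self, key):
        # Map the step key to the current step counter.
        return self.frame if key == "frame" else key

    def add_scalar(self, name, value, step):
        self.rows.append((name, value, step))

    def add_summary(self, name, values, step="frame"):
        metrics = {"mean": sum(values) / len(values), "max": max(values)}
        for aggregator, value in metrics.items():
            # Passing `step` (not `value`) keeps every aggregate aligned on the same x-axis.
            self.add_scalar(f"summary/{name}/{aggregator}", value, self._get_step(step))

logger = ToySummaryLogger()
logger.frame = 1000
logger.add_summary("returns", [1.0, 2.0, 3.0])
print(logger.rows)  # every row is logged at step 1000, not at the metric value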
all/policies/soft_deterministic.py: 6 changes (2 additions, 4 deletions)
@@ -36,10 +36,8 @@ def __init__(self, model, space):
     def forward(self, state):
         outputs = super().forward(state)
         normal = self._normal(outputs)
-        if self.training:
-            action, log_prob = self._sample(normal)
-            return action, log_prob
-        return self._squash(normal.loc)
+        action, log_prob = self._sample(normal)
+        return action, log_prob
 
     def _normal(self, outputs):
         means = outputs[..., 0 : self._action_dim]
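With the `if self.training:` branch removed, the policy samples in evaluation mode as well, rather than returning the squashed mean. For reference, a generic sketch of squashed-Gaussian sampling with the tanh log-probability correction, the standard SAC formulation; the eps term and the sum over the last dimension are common conventions assumed here, not necessarily this repo's exact _sample:

import torch

def sample_squashed(normal, eps=1e-6):
    raw = normal.rsample()                    # reparameterized sample
    action = torch.tanh(raw)                  # squash into (-1, 1)
    log_prob = normal.log_prob(raw).sum(-1)   # Gaussian log-density
    log_prob = log_prob - torch.log(1 - action.pow(2) + eps).sum(-1)  # tanh change of variables
    return action, log_prob

normal = torch.distributions.Normal(torch.zeros(3), torch.ones(3))
action, log_prob = sample_squashed(normal)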
all/presets/continuous/__init__.py: 1 change (0 additions, 1 deletion)
@@ -1,4 +1,3 @@
-# from .actor_critic import actor_critic
 from .ddpg import ddpg
 from .ppo import ppo
 from .sac import sac
all/presets/continuous/ddpg.py: 33 changes (15 additions, 18 deletions)
@@ -5,7 +5,6 @@
 
 from all.agents import DDPG, DDPGTestAgent
 from all.approximation import PolyakTarget, QContinuous
-from all.bodies import TimeFeature
 from all.logging import DummyLogger
 from all.memory import ExperienceReplayBuffer
 from all.policies import DeterministicPolicy
@@ -15,12 +14,12 @@
 
 default_hyperparameters = {
     # Common settings
-    "discount_factor": 0.98,
+    "discount_factor": 0.99,
     # Adam optimizer settings
-    "lr_q": 1e-3,
-    "lr_pi": 1e-3,
+    "lr_q": 3e-4,
+    "lr_pi": 3e-4,
     # Training settings
-    "minibatch_size": 100,
+    "minibatch_size": 256,
     "update_frequency": 1,
     "polyak_rate": 0.005,
     # Replay Buffer settings
@@ -94,18 +93,16 @@ def agent(self, logger=DummyLogger(), train_steps=float("inf")):
             self.hyperparameters["replay_buffer_size"], device=self.device
         )
 
-        return TimeFeature(
-            DDPG(
-                q,
-                policy,
-                replay_buffer,
-                self.action_space,
-                noise=self.hyperparameters["noise"],
-                replay_start_size=self.hyperparameters["replay_start_size"],
-                discount_factor=self.hyperparameters["discount_factor"],
-                update_frequency=self.hyperparameters["update_frequency"],
-                minibatch_size=self.hyperparameters["minibatch_size"],
-            )
+        return DDPG(
+            q,
+            policy,
+            replay_buffer,
+            self.action_space,
+            noise=self.hyperparameters["noise"],
+            replay_start_size=self.hyperparameters["replay_start_size"],
+            discount_factor=self.hyperparameters["discount_factor"],
+            update_frequency=self.hyperparameters["update_frequency"],
+            minibatch_size=self.hyperparameters["minibatch_size"],
         )
 
     def test_agent(self):
@@ -114,7 +111,7 @@ def test_agent(self):
             None,
             self.action_space,
         )
-        return TimeFeature(DDPGTestAgent(policy))
+        return DDPGTestAgent(policy)
 
 
 ddpg = PresetBuilder("ddpg", default_hyperparameters, DDPGContinuousPreset)
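The polyak_rate of 0.005 is the soft target-network update coefficient consumed by PolyakTarget. A minimal sketch of that update rule in generic PyTorch (not the library's own API):

import torch
import torch.nn as nn

def polyak_update(target_net, online_net, tau=0.005):
    # target <- (1 - tau) * target + tau * online, applied parameter-wise
    with torch.no_grad():
        for t, p in zip(target_net.parameters(), online_net.parameters()):
            t.mul_(1 - tau).add_(tau * p)

online = nn.Linear(4, 2)
target = nn.Linear(4, 2)
target.load_state_dict(online.state_dict())  # start from identical weights
polyak_update(target, online, tau=0.005)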
all/presets/continuous/models/__init__.py: 10 changes (5 additions, 5 deletions)
@@ -14,7 +14,7 @@
 def fc_q(env, hidden1=400, hidden2=300):
     return nn.Sequential(
         nn.Float(),
-        nn.Linear(env.state_space.shape[0] + env.action_space.shape[0] + 1, hidden1),
+        nn.Linear(env.state_space.shape[0] + env.action_space.shape[0], hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -25,7 +25,7 @@ def fc_q(env, hidden1=400, hidden2=300):
 def fc_v(env, hidden1=400, hidden2=300):
     return nn.Sequential(
         nn.Float(),
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(env.state_space.shape[0], hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -36,7 +36,7 @@ def fc_v(env, hidden1=400, hidden2=300):
 def fc_deterministic_policy(env, hidden1=400, hidden2=300):
     return nn.Sequential(
         nn.Float(),
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(env.state_space.shape[0], hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -47,7 +47,7 @@ def fc_deterministic_policy(env, hidden1=400, hidden2=300):
 def fc_soft_policy(env, hidden1=400, hidden2=300):
     return nn.Sequential(
         nn.Float(),
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(env.state_space.shape[0], hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -60,7 +60,7 @@ def __init__(self, env, hidden1=400, hidden2=300):
         super().__init__()
         self.model = nn.Sequential(
             nn.Float(),
-            nn.Linear(env.state_space.shape[0] + 1, hidden1),
+            nn.Linear(env.state_space.shape[0], hidden1),
             nn.Tanh(),
             nn.Linear(hidden1, hidden2),
             nn.Tanh(),
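The "+ 1" removed from every input width corresponds to the TimeFeature body being dropped: the body appended a time-based feature to each observation, so the networks previously expected one extra input dimension. An illustrative sketch of that kind of augmentation (the exact scaling is an assumption, not the library's TimeFeature code):

import torch

def append_time_feature(observation, timestep, scale=0.001):
    # Concatenating a scaled step counter adds one input dimension, hence the old "+ 1".
    time_feature = torch.tensor([timestep * scale], dtype=observation.dtype)
    return torch.cat([observation, time_feature])

obs = torch.zeros(17)                        # e.g. a 17-dimensional observation
print(append_time_feature(obs, 250).shape)   # torch.Size([18])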
all/presets/continuous/ppo.py: 45 changes (21 additions, 24 deletions)
@@ -5,7 +5,6 @@
 
 from all.agents import PPO, PPOTestAgent
 from all.approximation import Identity, VNetwork
-from all.bodies import TimeFeature
 from all.logging import DummyLogger
 from all.optim import LinearScheduler
 from all.policies import GaussianPolicy
@@ -15,7 +14,7 @@
 
 default_hyperparameters = {
     # Common settings
-    "discount_factor": 0.98,
+    "discount_factor": 0.99,
     # Adam optimizer settings
     "lr": 3e-4,  # Adam learning rate
     "eps": 1e-5,  # Adam stability
@@ -112,35 +111,33 @@ def agent(self, logger=DummyLogger(), train_steps=float("inf")):
             scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
         )
 
-        return TimeFeature(
-            PPO(
-                features,
-                v,
-                policy,
-                epsilon=LinearScheduler(
-                    self.hyperparameters["clip_initial"],
-                    self.hyperparameters["clip_final"],
-                    0,
-                    n_updates,
-                    name="clip",
-                    logger=logger,
-                ),
-                epochs=self.hyperparameters["epochs"],
-                minibatches=self.hyperparameters["minibatches"],
-                n_envs=self.hyperparameters["n_envs"],
-                n_steps=self.hyperparameters["n_steps"],
-                discount_factor=self.hyperparameters["discount_factor"],
-                lam=self.hyperparameters["lam"],
-                entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
+        return PPO(
+            features,
+            v,
+            policy,
+            epsilon=LinearScheduler(
+                self.hyperparameters["clip_initial"],
+                self.hyperparameters["clip_final"],
+                0,
+                n_updates,
+                name="clip",
                 logger=logger,
-            )
+            ),
+            epochs=self.hyperparameters["epochs"],
+            minibatches=self.hyperparameters["minibatches"],
+            n_envs=self.hyperparameters["n_envs"],
+            n_steps=self.hyperparameters["n_steps"],
+            discount_factor=self.hyperparameters["discount_factor"],
+            lam=self.hyperparameters["lam"],
+            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
+            logger=logger,
         )
 
     def test_agent(self):
         policy = GaussianPolicy(
             copy.deepcopy(self.policy_model), space=self.action_space
         )
-        return TimeFeature(PPOTestAgent(Identity(self.device), policy))
+        return PPOTestAgent(Identity(self.device), policy)
 
     def parallel_test_agent(self):
         return self.test_agent()
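For reference, the clip_initial -> clip_final schedule above supplies the epsilon of PPO's clipped surrogate objective. A generic sketch of that objective (standard PPO math, not this repo's internal loss code):

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, epsilon):
    # Probability ratio between the updated policy and the behavior policy.
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    # Pessimistic (clipped) surrogate, negated to form a loss.
    return -torch.min(ratio * advantages, clipped * advantages).mean()

advantages = torch.randn(8)
old_log_probs = torch.randn(8)
new_log_probs = old_log_probs + 0.05 * torch.randn(8)
print(ppo_clip_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2))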
all/presets/continuous/sac.py: 50 changes (24 additions, 26 deletions)
@@ -5,7 +5,6 @@
 
 from all.agents import SAC, SACTestAgent
 from all.approximation import PolyakTarget, QContinuous
-from all.bodies import TimeFeature
 from all.logging import DummyLogger
 from all.memory import ExperienceReplayBuffer
 from all.policies.soft_deterministic import SoftDeterministicPolicy
@@ -15,20 +14,20 @@
 
 default_hyperparameters = {
     # Common settings
-    "discount_factor": 0.98,
+    "discount_factor": 0.99,
     # Adam optimizer settings
     "lr_q": 1e-3,
-    "lr_pi": 1e-4,
+    "lr_pi": 3e-4,
     # Training settings
-    "minibatch_size": 100,
-    "update_frequency": 2,
+    "minibatch_size": 256,
+    "update_frequency": 1,
     "polyak_rate": 0.005,
     # Replay Buffer settings
     "replay_start_size": 5000,
     "replay_buffer_size": 1e6,
     # Exploration settings
     "temperature_initial": 0.1,
-    "lr_temperature": 1e-5,
+    "lr_temperature_scaling": 3e-5,
     "entropy_backups": True,
     "entropy_target_scaling": 1.0,
     # Model construction
@@ -111,32 +110,31 @@ def agent(self, logger=DummyLogger(), train_steps=float("inf")):
             self.hyperparameters["replay_buffer_size"], device=self.device
         )
 
-        return TimeFeature(
-            SAC(
-                policy,
-                q1,
-                q2,
-                replay_buffer,
-                temperature_initial=self.hyperparameters["temperature_initial"],
-                entropy_backups=self.hyperparameters["entropy_backups"],
-                entropy_target=(
-                    -self.action_space.shape[0]
-                    * self.hyperparameters["entropy_target_scaling"]
-                ),
-                lr_temperature=self.hyperparameters["lr_temperature"],
-                replay_start_size=self.hyperparameters["replay_start_size"],
-                discount_factor=self.hyperparameters["discount_factor"],
-                update_frequency=self.hyperparameters["update_frequency"],
-                minibatch_size=self.hyperparameters["minibatch_size"],
-                logger=logger,
-            )
+        return SAC(
+            policy,
+            q1,
+            q2,
+            replay_buffer,
+            temperature_initial=self.hyperparameters["temperature_initial"],
+            entropy_backups=self.hyperparameters["entropy_backups"],
+            entropy_target=(
+                -self.action_space.shape[0]
+                * self.hyperparameters["entropy_target_scaling"]
+            ),
+            lr_temperature=self.hyperparameters["lr_temperature_scaling"]
+            / self.action_space.shape[0],
+            replay_start_size=self.hyperparameters["replay_start_size"],
+            discount_factor=self.hyperparameters["discount_factor"],
+            update_frequency=self.hyperparameters["update_frequency"],
+            minibatch_size=self.hyperparameters["minibatch_size"],
+            logger=logger,
         )
 
     def test_agent(self):
         policy = SoftDeterministicPolicy(
             copy.deepcopy(self.policy_model), space=self.action_space
         )
-        return TimeFeature(SACTestAgent(policy))
+        return SACTestAgent(policy)
 
 
 sac = PresetBuilder("sac", default_hyperparameters, SACContinuousPreset)
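A small worked example of the new temperature settings: the entropy target is minus the action dimension times entropy_target_scaling, and the temperature learning rate is lr_temperature_scaling divided by the action dimension (the 6-dimensional action space below is an assumption chosen only for illustration):

action_dim = 6
entropy_target = -action_dim * 1.0     # entropy_target_scaling = 1.0  -> -6.0
lr_temperature = 3e-5 / action_dim     # lr_temperature_scaling = 3e-5 -> 5e-06
print(entropy_target, lr_temperature)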
