Commit
Merge pull request #53 from Toni-SM/develop
Develop
Showing 23 changed files with 1,211 additions and 18 deletions.
@@ -2,6 +2,7 @@ sphinx
sphinx_rtd_theme
sphinx-autobuild
sphinx-tabs==3.2.0
sphinx-copybutton
gym
gymnasium
torch
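This hunk is from the Sphinx documentation requirements; assuming the file keeps that role, installing it with pip (pip install -r requirements.txt) should be all that is needed before building the docs.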
@@ -0,0 +1,115 @@
import torch
import torch.nn as nn

# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed


# set the seed for reproducibility
set_seed(42)


# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
        DeterministicMixin.__init__(self, clip_actions=False)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
                                 nn.ELU(),
                                 nn.Linear(256, 128),
                                 nn.ELU(),
                                 nn.Linear(128, 64),
                                 nn.ELU())

        self.mean_layer = nn.Linear(64, self.num_actions)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        self.value_layer = nn.Linear(64, 1)

    def act(self, inputs, role):
        if role == "policy":
            return GaussianMixin.act(self, inputs, role)
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)

    def compute(self, inputs, role):
        if role == "policy":
            return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
        elif role == "value":
            return self.value_layer(self.net(inputs["states"])), {}


# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Ant-v0")
env = wrap_env(env)

device = env.device


# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)


# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device, clip_actions=True)
models_ppo["value"] = models_ppo["policy"]  # same instance: shared model


# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16  # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4  # 16 * 1024 / 4096
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard and write checkpoints every 40 and 400 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400

agent = PPO(models=models_ppo,
            memory=memory,
            cfg=cfg_ppo,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)


# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)

# start training
trainer.train()
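After training, checkpoints can be loaded back for evaluation with the same components. A minimal sketch, assuming the default skrl experiment layout (the run directory name is generated at runtime, so the path below is only illustrative):

# evaluation sketch: reuse the env, agent and trainer built in the script above
# (assumption: illustrative checkpoint path; skrl writes checkpoints under
# <experiment directory>/checkpoints/)
agent.load("./runs/experiment/checkpoints/best_agent.pt")  # hypothetical run directory

# run the agent in the environment without updating it
trainer.eval()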
@@ -0,0 +1,113 @@
import torch
import torch.nn as nn

# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed


# set the seed for reproducibility
set_seed(42)


# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
        DeterministicMixin.__init__(self, clip_actions=False)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                 nn.ELU(),
                                 nn.Linear(32, 32),
                                 nn.ELU())

        self.mean_layer = nn.Linear(32, self.num_actions)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        self.value_layer = nn.Linear(32, 1)

    def act(self, inputs, role):
        if role == "policy":
            return GaussianMixin.act(self, inputs, role)
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)

    def compute(self, inputs, role):
        if role == "policy":
            return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
        elif role == "value":
            return self.value_layer(self.net(inputs["states"])), {}


# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Cartpole-v0")
env = wrap_env(env)

device = env.device


# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)


# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device, clip_actions=True)
models_ppo["value"] = models_ppo["policy"]  # same instance: shared model


# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16  # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 1  # 16 * 512 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard and write checkpoints every 16 and 80 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 80

agent = PPO(models=models_ppo,
            memory=memory,
            cfg=cfg_ppo,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)


# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)

# start training
trainer.train()
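Both scripts route the "policy" and "value" roles through one shared network, so both heads learn from a common representation and the feature extractor's parameters are not duplicated. For comparison, a sketch of the equivalent non-shared setup with the same skrl mixins (the Policy and Value class names are illustrative; separate networks, no weight sharing):

# sketch of separate (non-shared) policy and value models; with one class
# per role there is no need to override act() to dispatch on the role
class Policy(GaussianMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
        self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(),
                                 nn.Linear(32, 32), nn.ELU(),
                                 nn.Linear(32, self.num_actions))
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}


class Value(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)
        self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(),
                                 nn.Linear(32, 32), nn.ELU(),
                                 nn.Linear(32, 1))

    def compute(self, inputs, role):
        return self.net(inputs["states"]), {}


# instead of the shared instance used above:
models_ppo = {"policy": Policy(env.observation_space, env.action_space, device, clip_actions=True),
              "value": Value(env.observation_space, env.action_space, device)}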