7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/bandit.yaml

This file was deleted.

7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/password.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion mighty/configs/environment/pufferlib_ocean/squared.yaml
@@ -1,7 +1,7 @@
# @package _global_

num_steps: 50_000
env: pufferlib.ocean.squared
env: pufferlib.ocean.puffer_squared
env_kwargs: {}
env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs]
num_envs: 64
7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/stochastic.yaml

This file was deleted.

35 changes: 20 additions & 15 deletions mighty/mighty_agents/base_agent.py
@@ -30,7 +30,7 @@

import gymnasium as gym
from gymnasium.wrappers import RescaleAction
from gymnasium.wrappers.normalize import NormalizeObservation, NormalizeReward
from gymnasium.wrappers.vector import NormalizeObservation, NormalizeReward

try:
import logging
@@ -52,10 +52,6 @@ def seed_env_spaces(env: gym.VectorEnv, seed: int) -> None:
env.single_action_space.seed(seed)
env.observation_space.seed(seed)
env.single_observation_space.seed(seed)
for i in range(len(env.envs)):
env.envs[i].action_space.seed(seed)
env.envs[i].observation_space.seed(seed)


def update_buffer(buffer, new_data):
for k in buffer.keys():
@@ -440,7 +436,7 @@ def make_checkpoint_dir(self, t: int) -> None:
def __del__(self) -> None:
"""Close wandb upon deletion."""
self.env.close() # type: ignore
if self.log_wandb:
if hasattr(self, "log_wandb") and self.log_wandb:
wandb.finish()

def step(self, observation: torch.Tensor, metrics: Dict) -> torch.Tensor:
@@ -650,24 +646,24 @@ def run( # noqa: PLR0915
)

# Main loop: rollouts, training and evaluation
autoresets = np.zeros(self.env.num_envs)
while self.steps < n_steps:
metrics["episode_reward"] = episode_reward

action, log_prob = self.step(curr_s, metrics)
# step the env as usual
next_s, reward, terminated, truncated, infos = self.env.step(action)

# For envs that have had an autoreset, don't add initial transition to the buffer
reset_envs = []
for j in range(self.env.num_envs):
if autoresets[j]:
reset_envs.append(j)

# decide which samples are true “done”
replay_dones = terminated # physics‐failure only
dones = np.logical_or(terminated, truncated)

# Overwrite next_s on truncation
# Based on https://github.com/DLR-RM/stable-baselines3/issues/284
real_next_s = next_s.copy()
# infos["final_observation"] is a list/array of the last real obs
for i, tr in enumerate(truncated):
if tr:
real_next_s[i] = infos["final_observation"][i]
episode_reward += reward

# Log everything
@@ -677,7 +673,7 @@
"reward": reward,
"action": action,
"state": curr_s,
"next_state": real_next_s,
"next_state": next_s,
"terminated": terminated.astype(int),
"truncated": truncated.astype(int),
"dones": replay_dones.astype(int),
@@ -715,6 +711,13 @@ def run( # noqa: PLR0915
for k in self.meta_modules:
self.meta_modules[k].post_step(metrics)

# Replace transitions from autoreset envs with nans so they don't get learned from
for k in ["state", "action", "reward", "next_state", "dones", "log_prob"]:
if k in metrics["transition"]:
metrics["transition"][k] = np.array(metrics["transition"][k]).astype(float)
for j in reset_envs:
metrics["transition"][k][j] = np.ones_like(metrics["transition"][k][0]) * np.nan

transition_metrics = self.process_transition(
metrics["transition"]["state"],
metrics["transition"]["action"],
@@ -726,7 +729,7 @@ def run( # noqa: PLR0915
)
metrics.update(transition_metrics)
self.result_buffer = update_buffer(self.result_buffer, t)

if self.log_wandb:
log_to_wandb(metrics)

@@ -737,6 +740,7 @@ def run( # noqa: PLR0915
for _ in range(len(action)):
progress.advance(steps_task)

print(len(self.buffer))
# Update agent
if (
len(self.buffer) >= self._batch_size # type: ignore
@@ -748,6 +752,7 @@ def run( # noqa: PLR0915
# End step
self.last_state = curr_s
curr_s = next_s
autoresets = np.logical_or(terminated, truncated)

# Evaluate
if eval_every_n_steps and steps_since_eval >= eval_every_n_steps:
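Note: a minimal standalone sketch of the autoreset-handling pattern this file now uses — remember which envs reset on the previous step, then mark their next transition invalid so it is never learned from. The env id, shapes, and where the NaN mask is applied are illustrative assumptions, not code from this PR.

import numpy as np
import gymnasium as gym

env = gym.make_vec("CartPole-v1", num_envs=4)   # assumed env id
obs, _ = env.reset(seed=0)
autoresets = np.zeros(env.num_envs, dtype=bool)

for _ in range(100):
    actions = env.action_space.sample()
    next_obs, rewards, terminated, truncated, infos = env.step(actions)

    # Envs flagged on the previous step return a fresh initial observation here,
    # so this "transition" pairs unrelated states; mark it invalid with NaN.
    rewards = np.where(autoresets, np.nan, rewards)

    # ... store (obs, actions, rewards, next_obs); drop NaN rows before learning ...

    obs = next_obs
    autoresets = np.logical_or(terminated, truncated)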
16 changes: 8 additions & 8 deletions mighty/mighty_agents/dqn.py
@@ -268,6 +268,8 @@ def process_transition( # type: ignore
) -> Dict:
# convert into a transition object
transition = TransitionBatch(curr_s, action, reward, next_s, dones)
if len(transition.observations) == 0:
return metrics

if "rollout_values" not in metrics:
metrics["rollout_values"] = np.empty((0, self.env.single_action_space.n)) # type: ignore
@@ -278,14 +280,12 @@
)

# Compute and add rollout values to metrics
values = (
self.value_function(
torch.as_tensor(transition.observations, dtype=torch.float32)
)
.detach()
.numpy()
.reshape((transition.observations.shape[0], -1))
)
values = self.value_function(
torch.as_tensor(transition.observations, dtype=torch.float32)
).detach().numpy()

if (values.shape[0] != 1 or len(values.shape) != 2):
values = values.reshape((transition.observations.shape[0], -1))

metrics["rollout_values"] = np.append(metrics["rollout_values"], values, axis=0)

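Note: a small sketch of the batching and reshaping guard above — compute Q-values for a batch of observations and keep rollout_values two-dimensional before appending. The network, feature size, and batch are placeholders, not the agent's real value function.

import numpy as np
import torch

n_actions = 3
value_function = torch.nn.Linear(4, n_actions)        # stand-in Q-network
observations = np.random.rand(8, 4).astype(np.float32)

rollout_values = np.empty((0, n_actions))
if len(observations) > 0:  # empty batches are skipped, as in the early return above
    values = value_function(torch.as_tensor(observations)).detach().numpy()
    values = values.reshape((observations.shape[0], -1))
    rollout_values = np.append(rollout_values, values, axis=0)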
9 changes: 9 additions & 0 deletions mighty/mighty_meta/plr.py
@@ -186,6 +186,14 @@ def score_function(self, reward, values, logits):
:param logits: Rollout logits
:return: score
"""
reward = np.array(reward)
reward[np.isnan(reward)] = 0.0 # treat NaN rewards (from autoreset) as zero
values = np.array(values)
values[np.isnan(values)] = 0.0 # treat NaN values (from autoreset) as zero
logits = np.array(logits) if logits is not None else None
if logits is not None:
logits[np.isnan(logits)] = 0.0 # treat NaN logits (from autoreset) as zero

if self.sample_strategy == "random":
score = 1
elif self.sample_strategy == "policy_entropy":
@@ -260,6 +268,7 @@ def _average_entropy(self, episode_logits):
* np.log(1.0 / self.num_actions)
* self.num_actions
)

return (
np.mean(np.sum(-np.exp(episode_logits) * episode_logits, axis=-1))
/ max_entropy
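Note: the NaN scrubbing added to score_function can equivalently be written with np.nan_to_num. A sketch with made-up inputs and a simple value-gap score purely for illustration — the real scores are chosen by self.sample_strategy, not this formula.

import numpy as np

reward = np.array([1.0, np.nan, 0.5])   # NaNs stem from masked autoreset steps
values = np.array([0.2, np.nan, 0.1])
reward = np.nan_to_num(reward, nan=0.0)
values = np.nan_to_num(values, nan=0.0)

score = np.mean(np.abs(reward - values))  # illustrative value-difference score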
14 changes: 12 additions & 2 deletions mighty/mighty_replay/mighty_replay_buffer.py
@@ -126,7 +126,17 @@ def add(self, transition_batch, _):
list(flatten_infos(transition_batch.extra_info))
]

self.index += transition_batch.size
# Remove nan values from transitions (from autoreset envs) so they don't get learned from
transition_batch.observations = transition_batch.observations[~torch.any(transition_batch.observations.isnan(), dim=1)]
transition_batch.next_obs = transition_batch.next_obs[~torch.any(transition_batch.next_obs.isnan(), dim=1)]
if transition_batch.actions.ndim > 1:
transition_batch.actions = transition_batch.actions[~torch.any(transition_batch.actions.isnan(), dim=1)]
else:
transition_batch.actions = transition_batch.actions[~transition_batch.actions.isnan()]
transition_batch.rewards = transition_batch.rewards[~transition_batch.rewards.isnan()]
transition_batch.dones = transition_batch.dones[~transition_batch.dones.isnan()]

self.index += transition_batch.observations.size(0)
if len(self.obs) == 0:
self.obs = transition_batch.observations
self.next_obs = transition_batch.next_obs
@@ -169,7 +179,7 @@ def reset(self):
self.index = 0

def __len__(self):
return len(self.obs)
return len(self.actions)

def __bool__(self):
return bool(len(self))
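Note: a compact sketch of the row-wise NaN filter used in add() above, with toy tensors. Filtering each field independently assumes the NaN rows line up across fields, which holds here because the masking in base_agent.run writes NaNs to every field of an autoreset env's transition.

import torch

observations = torch.tensor([[0.1, 0.2],
                             [float("nan"), float("nan")],
                             [0.3, 0.4]])
rewards = torch.tensor([1.0, float("nan"), 0.5])

keep = ~torch.any(observations.isnan(), dim=1)  # rows without NaNs
observations = observations[keep]
rewards = rewards[~rewards.isnan()]
# Both now hold the same 2 valid transitions.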
2 changes: 2 additions & 0 deletions mighty/mighty_replay/mighty_rollout_buffer.py
@@ -265,7 +265,9 @@ def compute_returns_and_advantage(
next_val = val_slice[step + 1] # [n_envs]

r_t = rew_slice[step] # shape = [n_envs]
r_t[r_t.isnan()] = 0.0 # treat NaN rewards (from autoreset) as zero
v_t = val_slice[step] # shape = [n_envs]
v_t[v_t.isnan()] = 0.0 # treat NaN values (from autoreset) as zero

# standard TD residual
delta = r_t + self.gamma * next_val * next_non_term - v_t # [n_envs]
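Note: a one-step sketch of the TD residual with the NaN guards above. The discount values, tensors, and the GAE recursion line are placeholders, not the buffer's real state.

import torch

gamma, gae_lambda = 0.99, 0.95
r_t = torch.tensor([0.5, float("nan")])       # second env just autoreset
v_t = torch.tensor([1.0, float("nan")])
next_val = torch.tensor([1.1, 0.9])
next_non_term = torch.tensor([1.0, 1.0])
last_gae = torch.zeros(2)

r_t[r_t.isnan()] = 0.0
v_t[v_t.isnan()] = 0.0
delta = r_t + gamma * next_val * next_non_term - v_t
last_gae = delta + gamma * gae_lambda * next_non_term * last_gae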
23 changes: 9 additions & 14 deletions mighty/mighty_utils/envs.py
@@ -14,7 +14,7 @@
CARLVectorEnvSimulator,
ContextualVecEnv,
ProcgenVecEnv,
PufferlibToGymAdapter,
PufferWrapperEnv
)

try:
@@ -89,6 +89,7 @@ def make_carl_env(
"""Make carl environment."""

import carl
import carl.envs # type: ignore
from carl.context.sampler import ContextSampler

env_kwargs = OmegaConf.to_container(cfg.env_kwargs, resolve=True)
@@ -205,7 +206,7 @@ def make_procgen_env(cfg: DictConfig) -> Tuple[type[ProcgenVecEnv], Callable, in
return env, eval_env, eval_default


def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferlibToGymAdapter, Callable, int]:
def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferWrapperEnv, Callable, int]:
"""Make pufferlib environment."""
import pufferlib # type: ignore
import pufferlib.vector # type: ignore
@@ -218,15 +219,15 @@ def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferlibToGymAdapter, Callable
backend = getattr(pufferlib.vector, cfg.env_kwargs["backend"])
else:
backend = pufferlib.vector.Serial
env = PufferlibToGymAdapter(
env = PufferWrapperEnv(
pufferlib.vector.make(make_env, num_envs=cfg.num_envs, backend=backend)
)

def get_eval() -> PufferlibToGymAdapter:
def get_eval() -> PufferWrapperEnv:
env = pufferlib.vector.make(
make_env, num_envs=cfg.n_episodes_eval, backend=backend
)
return PufferlibToGymAdapter(env)
return PufferWrapperEnv(env)

eval_default = cfg.n_episodes_eval
return env, get_eval, eval_default
@@ -236,11 +237,8 @@ def make_gym_env(
cfg: DictConfig,
) -> Tuple[gym.vector.SyncVectorEnv, partial[gym.vector.SyncVectorEnv], int]:
"""Make gymnasium environment."""
make_env = partial(gym.make, cfg.env, **cfg.env_kwargs)
env = gym.vector.SyncVectorEnv([make_env for _ in range(cfg.num_envs)])
eval_env = partial(
gym.vector.SyncVectorEnv, [make_env for _ in range(cfg.n_episodes_eval)]
)
env = gym.make_vec(cfg.env, cfg.num_envs, **cfg.env_kwargs)
eval_env = partial(gym.make_vec, cfg.env, cfg.n_episodes_eval, **cfg.env_kwargs)
eval_default = cfg.n_episodes_eval
return env, eval_env, eval_default

@@ -257,10 +255,7 @@ def make_mighty_env(cfg: DictConfig) -> Tuple[ContextualVecEnv, Callable, int]:
env, eval_env, eval_default = make_pufferlib_env(cfg) # type: ignore
elif ENVPOOL:
env = envpool.make(cfg.env, env_type="gym", **cfg.env_kwargs)
make_env = partial(gym.make, cfg.env, **cfg.env_kwargs)
eval_env = partial(
gym.vector.SyncVectorEnv, [make_env for _ in range(cfg.n_episodes_eval)]
)
eval_env = partial(gym.make_vec, cfg.env, cfg.n_episodes_eval, **cfg.env_kwargs)
eval_default = cfg.n_episodes_eval
else:
env, eval_env, eval_default = make_gym_env(cfg) # type: ignore
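Note: the gym.make_vec calls above replace the manual SyncVectorEnv construction. A minimal usage sketch with an assumed env id; num_envs is make_vec's second positional argument, matching the call sites in this hunk.

from functools import partial
import gymnasium as gym

env = gym.make_vec("CartPole-v1", 4)                 # training vector env
eval_env_factory = partial(gym.make_vec, "CartPole-v1", 10)
eval_env = eval_env_factory()                        # built lazily, as eval_env is in make_gym_env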
2 changes: 1 addition & 1 deletion mighty/mighty_utils/test_helpers.py
@@ -33,7 +33,7 @@ def reset(self, options={}, seed=None):
return self.observation_space.sample(), {}

def step(self, action):
tr = self._np_random.choice([0, 1], p=[0.9, 0.1])
tr = self._np_random.choice([0, 1], p=[0.95, 0.05])
return self.observation_space.sample(), self._np_random.random(), False, tr, {}


61 changes: 12 additions & 49 deletions mighty/mighty_utils/wrappers.py
@@ -3,64 +3,27 @@
from __future__ import annotations

import itertools
from functools import partial

import gymnasium as gym
import numpy as np


class PufferlibToGymAdapter(gym.Wrapper):
"""Adapter for Pufferlib environments to be used with OpenAI Gym."""

def __init__(self, env):
"""Adapter for Pufferlib environments to be used with OpenAI Gym."""
super().__init__(env)
self.metadata = {
"render.modes": ["human", "rgb_array"],
"video.frames_per_second": 60,
}

def reset(self, **kwargs):
"""Reset the environment and return the initial observation."""
if "options" in kwargs:
del kwargs["options"]
obs, info = self.env.reset(**kwargs)
return obs, info


class FlattenVecObs(gym.Wrapper):
"""Flatten observation space of a vectorized environment."""
class PufferWrapperEnv(gym.Env):
"""A dummy environment for testing purposes."""

def __init__(self, env):
"""Flatten observation space of a vectorized environment."""
super().__init__(env)
self.og_single_observation_space = self.env.single_observation_space
self.single_observation_space = gym.spaces.flatten_space(
self.env.single_observation_space
)
"""Initialize the dummy environment."""
super().__init__()
self.puffer_env = env

def reset(self, seed=None, options=None):
"""Reset the environment and return the initial observation."""
if options is None:
options = {}
obs, info = self.env.reset(seed=seed, options=options)
obs = np.array(
list(
map(partial(gym.spaces.flatten, self.og_single_observation_space), obs)
)
)
return obs, info
def __getattr__(self, name):
"""Delegate attribute access to the underlying Pufferlib environment."""
return getattr(self.puffer_env, name)

def reset(self, *, seed=None, options=None):
return self.puffer_env.reset(seed=seed)

def step(self, action):
"""Take a step in the environment."""
obs, reward, te, tr, info = self.env.step(action)
obs = np.array(
list(
map(partial(gym.spaces.flatten, self.og_single_observation_space), obs)
)
)
return obs, reward, te, tr, info

return self.puffer_env.step(action)

class MinigridImgVecObs(gym.Wrapper):
"""Change observation space of a vectorized environment to be an image."""
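Note: PufferWrapperEnv leans on __getattr__ to forward everything it does not define itself (single_observation_space, num_envs, close, ...) to the wrapped Pufferlib vector env. A self-contained sketch of that delegation pattern with dummy classes:

class Inner:
    num_envs = 4

    def step(self, action):
        return action

class Wrapper:
    def __init__(self, inner):
        self.inner = inner

    def __getattr__(self, name):
        # Only called when normal attribute lookup on Wrapper fails.
        return getattr(self.inner, name)

w = Wrapper(Inner())
print(w.num_envs)   # 4 -- resolved on the wrapped object
print(w.step(1))    # 1 -- reached via __getattr__

One consequence of subclassing gym.Env: attributes gym.Env defines at class level (such as metadata, render_mode, spec) resolve on the wrapper itself and are not forwarded to the Pufferlib env.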