7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/bandit.yaml

This file was deleted.

7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/password.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion mighty/configs/environment/pufferlib_ocean/squared.yaml
@@ -1,7 +1,7 @@
# @package _global_

num_steps: 50_000
env: pufferlib.ocean.squared
env: pufferlib.ocean.puffer_squared
env_kwargs: {}
env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs]
num_envs: 64
7 changes: 0 additions & 7 deletions mighty/configs/environment/pufferlib_ocean/stochastic.yaml

This file was deleted.

35 changes: 20 additions & 15 deletions mighty/mighty_agents/base_agent.py
@@ -30,7 +30,7 @@

import gymnasium as gym
from gymnasium.wrappers import RescaleAction
from gymnasium.wrappers.normalize import NormalizeObservation, NormalizeReward
from gymnasium.wrappers.vector import NormalizeObservation, NormalizeReward

try:
import logging
@@ -52,10 +52,6 @@ def seed_env_spaces(env: gym.VectorEnv, seed: int) -> None:
env.single_action_space.seed(seed)
env.observation_space.seed(seed)
env.single_observation_space.seed(seed)
for i in range(len(env.envs)):
env.envs[i].action_space.seed(seed)
env.envs[i].observation_space.seed(seed)


def update_buffer(buffer, new_data):
for k in buffer.keys():
@@ -440,7 +436,7 @@ def make_checkpoint_dir(self, t: int) -> None:
def __del__(self) -> None:
"""Close wandb upon deletion."""
self.env.close() # type: ignore
if self.log_wandb:
if hasattr(self, "log_wandb") and self.log_wandb:
wandb.finish()

def step(self, observation: torch.Tensor, metrics: Dict) -> torch.Tensor:
@@ -650,24 +646,24 @@ def run( # noqa: PLR0915
)

# Main loop: rollouts, training and evaluation
autoresets = np.zeros(self.env.num_envs)
while self.steps < n_steps:
metrics["episode_reward"] = episode_reward

action, log_prob = self.step(curr_s, metrics)
# step the env as usual
next_s, reward, terminated, truncated, infos = self.env.step(action)

# For envs that have had an autoreset, don't add initial transition to the buffer
reset_envs = []
for j in range(self.env.num_envs):
if autoresets[j]:
reset_envs.append(j)

# decide which samples are true “done”
replay_dones = terminated # physics‐failure only
dones = np.logical_or(terminated, truncated)

# Overwrite next_s on truncation
# Based on https://github.com/DLR-RM/stable-baselines3/issues/284
real_next_s = next_s.copy()
# infos["final_observation"] is a list/array of the last real obs
for i, tr in enumerate(truncated):
if tr:
real_next_s[i] = infos["final_observation"][i]
episode_reward += reward

# Log everything
@@ -677,7 +673,7 @@
"reward": reward,
"action": action,
"state": curr_s,
"next_state": real_next_s,
"next_state": next_s,
"terminated": terminated.astype(int),
"truncated": truncated.astype(int),
"dones": replay_dones.astype(int),
@@ -715,6 +711,13 @@ def run( # noqa: PLR0915
for k in self.meta_modules:
self.meta_modules[k].post_step(metrics)

# Replace transitions from autoreset envs with nans so they don't get learned from
for k in ["state", "action", "reward", "next_state", "dones", "log_prob"]:
if k in metrics["transition"]:
metrics["transition"][k] = np.array(metrics["transition"][k]).astype(float)
for j in reset_envs:
metrics["transition"][k][j] = np.ones_like(metrics["transition"][k][0]) * np.nan

transition_metrics = self.process_transition(
metrics["transition"]["state"],
metrics["transition"]["action"],
@@ -726,7 +729,7 @@ def run( # noqa: PLR0915
)
metrics.update(transition_metrics)
self.result_buffer = update_buffer(self.result_buffer, t)

if self.log_wandb:
log_to_wandb(metrics)

@@ -737,6 +740,7 @@ def run( # noqa: PLR0915
for _ in range(len(action)):
progress.advance(steps_task)

print(len(self.buffer))
# Update agent
if (
len(self.buffer) >= self._batch_size # type: ignore
@@ -748,6 +752,7 @@ def run( # noqa: PLR0915
# End step
self.last_state = curr_s
curr_s = next_s
autoresets = np.logical_or(terminated, truncated)

# Evaluate
if eval_every_n_steps and steps_since_eval >= eval_every_n_steps:
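Note: a minimal standalone sketch of the autoreset-handling pattern this file now uses — remember which envs reset on the previous step, then mark their next transition invalid so it is never learned from. The env id, shapes, and where the NaN mask is applied are illustrative assumptions, not code from this PR.

import numpy as np
import gymnasium as gym

env = gym.make_vec("CartPole-v1", num_envs=4)   # assumed env id
obs, _ = env.reset(seed=0)
autoresets = np.zeros(env.num_envs, dtype=bool)

for _ in range(100):
    actions = env.action_space.sample()
    next_obs, rewards, terminated, truncated, infos = env.step(actions)

    # Envs flagged on the previous step return a fresh initial observation here,
    # so this "transition" pairs unrelated states; mark it invalid with NaN.
    rewards = np.where(autoresets, np.nan, rewards)

    # ... store (obs, actions, rewards, next_obs); drop NaN rows before learning ...

    obs = next_obs
    autoresets = np.logical_or(terminated, truncated)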
16 changes: 8 additions & 8 deletions mighty/mighty_agents/dqn.py
@@ -268,6 +268,8 @@ def process_transition( # type: ignore
) -> Dict:
# convert into a transition object
transition = TransitionBatch(curr_s, action, reward, next_s, dones)
if len(transition.observations) == 0:
return metrics

if "rollout_values" not in metrics:
metrics["rollout_values"] = np.empty((0, self.env.single_action_space.n)) # type: ignore
@@ -278,14 +280,12 @@
)

# Compute and add rollout values to metrics
values = (
self.value_function(
torch.as_tensor(transition.observations, dtype=torch.float32)
)
.detach()
.numpy()
.reshape((transition.observations.shape[0], -1))
)
values = self.value_function(
torch.as_tensor(transition.observations, dtype=torch.float32)
).detach().numpy()

if (values.shape[0] != 1 or len(values.shape) != 2):
values = values.reshape((transition.observations.shape[0], -1))

metrics["rollout_values"] = np.append(metrics["rollout_values"], values, axis=0)

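Note: a small sketch of the batching and reshaping guard above — compute Q-values for a batch of observations and keep rollout_values two-dimensional before appending. The network, feature size, and batch are placeholders, not the agent's real value function.

import numpy as np
import torch

n_actions = 3
value_function = torch.nn.Linear(4, n_actions)        # stand-in Q-network
observations = np.random.rand(8, 4).astype(np.float32)

rollout_values = np.empty((0, n_actions))
if len(observations) > 0:  # empty batches are skipped, as in the early return above
    values = value_function(torch.as_tensor(observations)).detach().numpy()
    values = values.reshape((observations.shape[0], -1))
    rollout_values = np.append(rollout_values, values, axis=0)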
9 changes: 9 additions & 0 deletions mighty/mighty_meta/plr.py
@@ -186,6 +186,14 @@ def score_function(self, reward, values, logits):
:param logits: Rollout logits
:return: score
"""
reward = np.array(reward)
reward[np.isnan(reward)] = 0.0 # treat NaN rewards (from autoreset) as zero
values = np.array(values)
values[np.isnan(values)] = 0.0 # treat NaN values (from autoreset) as zero
logits = np.array(logits) if logits is not None else None
if logits is not None:
logits[np.isnan(logits)] = 0.0 # treat NaN logits (from autoreset) as zero

if self.sample_strategy == "random":
score = 1
elif self.sample_strategy == "policy_entropy":
@@ -260,6 +268,7 @@ def _average_entropy(self, episode_logits):
* np.log(1.0 / self.num_actions)
* self.num_actions
)

return (
np.mean(np.sum(-np.exp(episode_logits) * episode_logits, axis=-1))
/ max_entropy
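Note: the NaN scrubbing added to score_function can equivalently be written with np.nan_to_num. A sketch with made-up inputs and a simple value-gap score purely for illustration — the real scores are chosen by self.sample_strategy, not this formula.

import numpy as np

reward = np.array([1.0, np.nan, 0.5])   # NaNs stem from masked autoreset steps
values = np.array([0.2, np.nan, 0.1])
reward = np.nan_to_num(reward, nan=0.0)
values = np.nan_to_num(values, nan=0.0)

score = np.mean(np.abs(reward - values))  # illustrative value-difference score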
14 changes: 12 additions & 2 deletions mighty/mighty_replay/mighty_replay_buffer.py
@@ -126,7 +126,17 @@ def add(self, transition_batch, _):
list(flatten_infos(transition_batch.extra_info))
]

self.index += transition_batch.size
# Remove nan values from transitions (from autoreset envs) so they don't get learned from
transition_batch.observations = transition_batch.observations[~torch.any(transition_batch.observations.isnan(), dim=1)]
transition_batch.next_obs = transition_batch.next_obs[~torch.any(transition_batch.next_obs.isnan(), dim=1)]
if transition_batch.actions.ndim > 1:
transition_batch.actions = transition_batch.actions[~torch.any(transition_batch.actions.isnan(), dim=1)]
else:
transition_batch.actions = transition_batch.actions[~transition_batch.actions.isnan()]
transition_batch.rewards = transition_batch.rewards[~transition_batch.rewards.isnan()]
transition_batch.dones = transition_batch.dones[~transition_batch.dones.isnan()]

self.index += transition_batch.observations.size(0)
if len(self.obs) == 0:
self.obs = transition_batch.observations
self.next_obs = transition_batch.next_obs
@@ -169,7 +179,7 @@ def reset(self):
self.index = 0

def __len__(self):
return len(self.obs)
return len(self.actions)

def __bool__(self):
return bool(len(self))
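Note: a compact sketch of the row-wise NaN filter used in add() above, with toy tensors. Filtering each field independently assumes the NaN rows line up across fields, which holds here because the masking in base_agent.run writes NaNs to every field of an autoreset env's transition.

import torch

observations = torch.tensor([[0.1, 0.2],
                             [float("nan"), float("nan")],
                             [0.3, 0.4]])
rewards = torch.tensor([1.0, float("nan"), 0.5])

keep = ~torch.any(observations.isnan(), dim=1)  # rows without NaNs
observations = observations[keep]
rewards = rewards[~rewards.isnan()]
# Both now hold the same 2 valid transitions.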
2 changes: 2 additions & 0 deletions mighty/mighty_replay/mighty_rollout_buffer.py
@@ -265,7 +265,9 @@ def compute_returns_and_advantage(
next_val = val_slice[step + 1] # [n_envs]

r_t = rew_slice[step] # shape = [n_envs]
r_t[r_t.isnan()] = 0.0 # treat NaN rewards (from autoreset) as zero
v_t = val_slice[step] # shape = [n_envs]
v_t[v_t.isnan()] = 0.0 # treat NaN values (from autoreset) as zero

# standard TD residual
delta = r_t + self.gamma * next_val * next_non_term - v_t # [n_envs]
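Note: a one-step sketch of the TD residual with the NaN guards above. The discount values, tensors, and the GAE recursion line are placeholders, not the buffer's real state.

import torch

gamma, gae_lambda = 0.99, 0.95
r_t = torch.tensor([0.5, float("nan")])       # second env just autoreset
v_t = torch.tensor([1.0, float("nan")])
next_val = torch.tensor([1.1, 0.9])
next_non_term = torch.tensor([1.0, 1.0])
last_gae = torch.zeros(2)

r_t[r_t.isnan()] = 0.0
v_t[v_t.isnan()] = 0.0
delta = r_t + gamma * next_val * next_non_term - v_t
last_gae = delta + gamma * gae_lambda * next_non_term * last_gae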
23 changes: 9 additions & 14 deletions mighty/mighty_utils/envs.py
@@ -14,7 +14,7 @@
CARLVectorEnvSimulator,
ContextualVecEnv,
ProcgenVecEnv,
PufferlibToGymAdapter,
PufferWrapperEnv
)

try:
@@ -89,6 +89,7 @@ def make_carl_env(
"""Make carl environment."""

import carl
import carl.envs # type: ignore
from carl.context.sampler import ContextSampler

env_kwargs = OmegaConf.to_container(cfg.env_kwargs, resolve=True)
@@ -205,7 +206,7 @@ def make_procgen_env(cfg: DictConfig) -> Tuple[type[ProcgenVecEnv], Callable, in
return env, eval_env, eval_default


def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferlibToGymAdapter, Callable, int]:
def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferWrapperEnv, Callable, int]:
"""Make pufferlib environment."""
import pufferlib # type: ignore
import pufferlib.vector # type: ignore
@@ -218,15 +219,15 @@ def make_pufferlib_env(cfg: DictConfig) -> Tuple[PufferlibToGymAdapter, Callable
backend = getattr(pufferlib.vector, cfg.env_kwargs["backend"])
else:
backend = pufferlib.vector.Serial
env = PufferlibToGymAdapter(
env = PufferWrapperEnv(
pufferlib.vector.make(make_env, num_envs=cfg.num_envs, backend=backend)
)

def get_eval() -> PufferlibToGymAdapter:
def get_eval() -> PufferWrapperEnv:
env = pufferlib.vector.make(
make_env, num_envs=cfg.n_episodes_eval, backend=backend
)
return PufferlibToGymAdapter(env)
return PufferWrapperEnv(env)

eval_default = cfg.n_episodes_eval
return env, get_eval, eval_default
@@ -236,11 +237,8 @@ def make_gym_env(
cfg: DictConfig,
) -> Tuple[gym.vector.SyncVectorEnv, partial[gym.vector.SyncVectorEnv], int]:
"""Make gymnasium environment."""
make_env = partial(gym.make, cfg.env, **cfg.env_kwargs)
env = gym.vector.SyncVectorEnv([make_env for _ in range(cfg.num_envs)])
eval_env = partial(
gym.vector.SyncVectorEnv, [make_env for _ in range(cfg.n_episodes_eval)]
)
env = gym.make_vec(cfg.env, cfg.num_envs, **cfg.env_kwargs)
eval_env = partial(gym.make_vec, cfg.env, cfg.n_episodes_eval, **cfg.env_kwargs)
eval_default = cfg.n_episodes_eval
return env, eval_env, eval_default

@@ -257,10 +255,7 @@ def make_mighty_env(cfg: DictConfig) -> Tuple[ContextualVecEnv, Callable, int]:
env, eval_env, eval_default = make_pufferlib_env(cfg) # type: ignore
elif ENVPOOL:
env = envpool.make(cfg.env, env_type="gym", **cfg.env_kwargs)
make_env = partial(gym.make, cfg.env, **cfg.env_kwargs)
eval_env = partial(
gym.vector.SyncVectorEnv, [make_env for _ in range(cfg.n_episodes_eval)]
)
eval_env = partial(gym.make_vec, cfg.env, cfg.n_episodes_eval, **cfg.env_kwargs)
eval_default = cfg.n_episodes_eval
else:
env, eval_env, eval_default = make_gym_env(cfg) # type: ignore
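Note: the gym.make_vec calls above replace the manual SyncVectorEnv construction. A minimal usage sketch with an assumed env id; num_envs is make_vec's second positional argument, matching the call sites in this hunk.

from functools import partial
import gymnasium as gym

env = gym.make_vec("CartPole-v1", 4)                 # training vector env
eval_env_factory = partial(gym.make_vec, "CartPole-v1", 10)
eval_env = eval_env_factory()                        # built lazily, as eval_env is in make_gym_env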
2 changes: 1 addition & 1 deletion mighty/mighty_utils/test_helpers.py
@@ -33,7 +33,7 @@ def reset(self, options={}, seed=None):
return self.observation_space.sample(), {}

def step(self, action):
tr = self._np_random.choice([0, 1], p=[0.9, 0.1])
tr = self._np_random.choice([0, 1], p=[0.95, 0.05])
return self.observation_space.sample(), self._np_random.random(), False, tr, {}


61 changes: 12 additions & 49 deletions mighty/mighty_utils/wrappers.py
@@ -3,64 +3,27 @@
from __future__ import annotations

import itertools
from functools import partial

import gymnasium as gym
import numpy as np


class PufferlibToGymAdapter(gym.Wrapper):
"""Adapter for Pufferlib environments to be used with OpenAI Gym."""

def __init__(self, env):
"""Adapter for Pufferlib environments to be used with OpenAI Gym."""
super().__init__(env)
self.metadata = {
"render.modes": ["human", "rgb_array"],
"video.frames_per_second": 60,
}

def reset(self, **kwargs):
"""Reset the environment and return the initial observation."""
if "options" in kwargs:
del kwargs["options"]
obs, info = self.env.reset(**kwargs)
return obs, info


class FlattenVecObs(gym.Wrapper):
"""Flatten observation space of a vectorized environment."""
class PufferWrapperEnv(gym.Env):
"""A dummy environment for testing purposes."""

def __init__(self, env):
"""Flatten observation space of a vectorized environment."""
super().__init__(env)
self.og_single_observation_space = self.env.single_observation_space
self.single_observation_space = gym.spaces.flatten_space(
self.env.single_observation_space
)
"""Initialize the dummy environment."""
super().__init__()
self.puffer_env = env

def reset(self, seed=None, options=None):
"""Reset the environment and return the initial observation."""
if options is None:
options = {}
obs, info = self.env.reset(seed=seed, options=options)
obs = np.array(
list(
map(partial(gym.spaces.flatten, self.og_single_observation_space), obs)
)
)
return obs, info
def __getattr__(self, name):
"""Delegate attribute access to the underlying Pufferlib environment."""
return getattr(self.puffer_env, name)

def reset(self, *, seed=None, options=None):
return self.puffer_env.reset(seed=seed)

def step(self, action):
"""Take a step in the environment."""
obs, reward, te, tr, info = self.env.step(action)
obs = np.array(
list(
map(partial(gym.spaces.flatten, self.og_single_observation_space), obs)
)
)
return obs, reward, te, tr, info

return self.puffer_env.step(action)

class MinigridImgVecObs(gym.Wrapper):
"""Change observation space of a vectorized environment to be an image."""
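Note: PufferWrapperEnv leans on __getattr__ to forward everything it does not define itself (single_observation_space, num_envs, close, ...) to the wrapped Pufferlib vector env. A self-contained sketch of that delegation pattern with dummy classes:

class Inner:
    num_envs = 4

    def step(self, action):
        return action

class Wrapper:
    def __init__(self, inner):
        self.inner = inner

    def __getattr__(self, name):
        # Only called when normal attribute lookup on Wrapper fails.
        return getattr(self.inner, name)

w = Wrapper(Inner())
print(w.num_envs)   # 4 -- resolved on the wrapped object
print(w.step(1))    # 1 -- reached via __getattr__

One consequence of subclassing gym.Env: attributes gym.Env defines at class level (such as metadata, render_mode, spec) resolve on the wrapper itself and are not forwarded to the Pufferlib env.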