diff --git a/assume/__init__.py b/assume/__init__.py index 359e409ba..01f8839ce 100644 --- a/assume/__init__.py +++ b/assume/__init__.py @@ -5,6 +5,21 @@ from importlib.metadata import version from assume.common import MarketConfig, MarketProduct +from assume.reinforcement_learning import ( + A2CAlgorithm, + DDPG, + LSTMActor, + Learning, + MLPActor, + PPO, + RLAlgorithm, + ReplayBuffer, + ReplayBufferSamples, + RolloutBuffer, + RolloutBufferSamples, + TD3, + actor_architecture_aliases, +) from assume.scenario.loader_csv import ( load_custom_units, load_scenario_folder, @@ -16,3 +31,34 @@ __author__ = "ASSUME Developers: Nick Harder, Kim Miskiw, Florian Maurer, Manish Khanra" __copyright__ = "AGPL-3.0 License" + +__all__ = [ + # Framework version + "__version__", + # World & scenario + "World", + "load_scenario_folder", + "load_custom_units", + "run_learning", + # Market primitives + "MarketConfig", + "MarketProduct", + # RL orchestration + "Learning", + # RL algorithm base classes + "RLAlgorithm", + "A2CAlgorithm", + # RL concrete algorithms + "TD3", + "DDPG", + "PPO", + # RL actor architectures + "MLPActor", + "LSTMActor", + "actor_architecture_aliases", + # RL buffers + "ReplayBuffer", + "ReplayBufferSamples", + "RolloutBuffer", + "RolloutBufferSamples", +] diff --git a/assume/common/base.py b/assume/common/base.py index 44a6ecd14..68ff1c8bd 100644 --- a/assume/common/base.py +++ b/assume/common/base.py @@ -4,7 +4,7 @@ import logging from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime, timedelta import numpy as np @@ -755,6 +755,132 @@ def update_forecasts_if_needed(unit: BaseUnit, *args, **kwargs): unit.forecaster.update(*args, **kwargs) +@dataclass +class AlgorithmConfig: + """ + Base configuration for algorithm-specific parameters. + + Parameters: + batch_size (int): The batch size of experiences sampled from the replay buffer for each training update. + Larger batches provide more stable gradients but require more memory. Default is 128. + gamma (float): The discount factor for future rewards, ranging from 0 to 1. Default is 0.99. + train_freq (str): Defines the frequency at which networks are updated. Default is "24h". + """ + + batch_size: int = 128 + gamma: float = 0.99 + train_freq: str = "24h" + + +# Algorithm category mapping +ALGORITHM_CATEGORIES = { + "mappo": "on-policy", + "matd3": "off-policy", + "maddpg": "off-policy", +} + + +def is_on_policy(algorithm_name: str) -> bool: + """Check if algorithm is on-policy.""" + return ALGORITHM_CATEGORIES.get(algorithm_name) == "on-policy" + + +def is_off_policy(algorithm_name: str) -> bool: + """Check if algorithm is off-policy.""" + return ALGORITHM_CATEGORIES.get(algorithm_name) == "off-policy" + + +@dataclass +class OffPolicyConfig(AlgorithmConfig): + """ + Configuration for off-policy algorithms (MATD3/MADDPG) hyperparameters. + + These parameters control the off-policy actor-critic algorithm behavior such as delayed policy updates, + target network updates, and exploration noise. + + Parameters: + episodes_collecting_initial_experience (int): The number of episodes at the start during which random + actions are chosen instead of using the actor network. Default is 5. + gradient_steps (int): The number of gradient descent steps performed during each training update. Default is 100. + actor_architecture (str): The architecture of the neural networks used for the actors. 
+            Options include "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
+        replay_buffer_size (int): The maximum number of transitions stored in the replay buffer. Default is 50000.
+        policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated.
+            Some algorithms update the critic more frequently than the actor to stabilize training. Default is 2.
+        noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution
+            used to generate exploration noise added to actions. Default is 0.1.
+        noise_scale (int): The scale factor multiplied by the noise drawn from the distribution.
+            Larger values increase exploration. Default is 1.
+        noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how
+            quickly the noise decays over time. Used for noise scheduling. Default is 1.
+        action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear"
+            decay is available, which linearly decreases exploration noise over training. Default is None (no decay).
+        tau (float): The soft update coefficient for updating target networks. Controls how slowly target
+            networks track the main networks. Smaller values mean slower updates. Default is 0.005.
+        target_policy_noise (float): The standard deviation of noise added to target policy actions during
+            critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2.
+        target_noise_clip (float): The maximum absolute value for clipping the target policy noise.
+            Prevents the noise from being too large. Default is 0.5.
+    """
+
+    episodes_collecting_initial_experience: int = 5
+    gradient_steps: int = 100
+    noise_dt: int = 1
+    noise_scale: int = 1
+    noise_sigma: float = 0.1
+    actor_architecture: str = "mlp"
+    action_noise_schedule: str | None = None
+    policy_delay: int = 2
+    tau: float = 0.005
+    target_policy_noise: float = 0.2
+    target_noise_clip: float = 0.5
+    replay_buffer_size: int = 50000
+
+    def __post_init__(self):
+        # Without at least one episode of initial experience the buffer holds no
+        # samples to draw from, so enforce a minimum of one episode.
+        if self.episodes_collecting_initial_experience < 1:
+            logger.warning(
+                f"episodes_collecting_initial_experience needs to be at least 1 to sample from the buffer, got {self.episodes_collecting_initial_experience}; setting to 1"
+            )
+            self.episodes_collecting_initial_experience = 1
+
+        # check that gradient_steps is positive
+        if self.gradient_steps <= 0:
+            raise ValueError(
+                f"gradient_steps needs to be positive, got {self.gradient_steps}"
+            )
+
+
+@dataclass
+class OnPolicyConfig(AlgorithmConfig):
+    """
+    Configuration for on-policy algorithms (PPO/MAPPO) hyperparameters.
+
+    These parameters control the PPO algorithm behavior such as clipping ranges,
+    number of optimization epochs, and loss coefficients.
+
+    Parameters:
+        clip_ratio (float): The clipping ratio for the PPO surrogate objective. Default is 0.1.
+        entropy_coef (float): Coefficient for entropy term in loss. Default is 0.01.
+        gae_lambda (float): Lambda parameter for Generalized Advantage Estimation (GAE). Default is 0.95.
+        max_grad_norm (float): Maximum gradient norm for clipping. Default is 0.5.
+        vf_coef (float): Coefficient for value function term in loss. Default is 0.5.
+        n_epochs (int): Number of optimization epochs per rollout. Default is 10.
+ actor_architecture (str): The architecture of the neural networks used for the actors. Options include + "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp". + """ + + clip_ratio: float = 0.1 + entropy_coef: float = 0.01 + gae_lambda: float = 0.95 + max_grad_norm: float = 0.5 + vf_coef: float = 0.5 + n_epochs: int = 10 + actor_architecture: str = "mlp" + + @dataclass class LearningConfig: """ @@ -779,9 +905,6 @@ class LearningConfig: device (str): The device to use for PyTorch computations. Options include "cpu", "cuda", or specific CUDA devices like "cuda:0". Default is "cpu". - episodes_collecting_initial_experience (int): The number of episodes at the start during which random - actions are chosen instead of using the actor network. This helps populate the replay buffer with - diverse experiences. Default is 5. exploration_noise_std (float): The standard deviation of Gaussian noise added to actions during exploration in the environment. Higher values encourage more exploration. Default is 0.2. training_episodes (int): The number of training episodes, where one episode is the entire simulation @@ -793,8 +916,6 @@ class LearningConfig: batch_size (int): The batch size of experiences sampled from the replay buffer for each training update. Larger batches provide more stable gradients but require more memory. In environments with many leanring agents we advise small batch sizes. Default is 128. - gradient_steps (int): The number of gradient descent steps performed during each training update. - More steps can lead to better learning but increase computation time. Default is 100. learning_rate (float): The learning rate (step size) for the optimizer, which controls how much the policy and value networks are updated during training. Default is 0.001. learning_rate_schedule (str | None): Which learning rate decay schedule to use. Currently only "linear" @@ -806,30 +927,15 @@ class LearningConfig: early stopping. If the reward improvement is less than this threshold over early_stopping_steps, training is terminated early. Default is 0.05. - algorithm (str): Specifies which reinforcement learning algorithm to use. Currently, only "matd3" - (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3". - replay_buffer_size (int): The maximum number of transitions stored in the replay buffer for experience replay. - Larger buffers allow for more diverse training samples. Default is 500000. + algorithm (str): Specifies which reinforcement learning algorithm to use. Options include "matd3" + (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient), "maddpg" (Multi-Agent Deep Deterministic Policy Gradient), and "mappo" (Multi-Agent Proximal Policy Optimization). Default is "matd3". gamma (float): The discount factor for future rewards, ranging from 0 to 1. Higher values give more weight to long-term rewards in decision-making. Default is 0.99. actor_architecture (str): The architecture of the neural networks used for the actors. Options include "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp". - policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated. - TD3 updates the critic more frequently than the actor to stabilize training. Default is 2. - noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution - used to generate exploration noise added to actions. Default is 0.1. 
-        noise_scale (int): The scale factor multiplied by the noise drawn from the distribution.
-            Larger values increase exploration. Default is 1.
-        noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how
-            quickly the noise decays over time. Used for noise scheduling. Default is 1.
-        action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear"
-            decay is available, which linearly decreases exploration noise over training. Default is "linear".
-        tau (float): The soft update coefficient for updating target networks. Controls how slowly target
-            networks track the main networks. Smaller values mean slower updates. Default is 0.005.
-        target_policy_noise (float): The standard deviation of noise added to target policy actions during
-            critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2.
-        target_noise_clip (float): The maximum absolute value for clipping the target policy noise.
-            Prevents the noise from being too large. Default is 0.5.
+
+        off_policy (OffPolicyConfig): Nested configuration for off-policy algorithms (MATD3/MADDPG) hyperparameters.
+        on_policy (OnPolicyConfig): Nested configuration for on-policy algorithms (PPO/MAPPO) hyperparameters.
     """

@@ -843,52 +949,47 @@ class LearningConfig:

     max_bid_price: float | None = 100.0
     device: str = "cpu"
-    episodes_collecting_initial_experience: int = 5
     exploration_noise_std: float = 0.2
     training_episodes: int = 100
     validation_episodes_interval: int = 5
     train_freq: str = "24h"
     batch_size: int = 128
-    gradient_steps: int = 100
     learning_rate: float = 0.001
     learning_rate_schedule: str | None = None
     early_stopping_steps: int | None = None
     early_stopping_threshold: float = 0.05
     algorithm: str = "matd3"
-    replay_buffer_size: int = 50000
     gamma: float = 0.99
     actor_architecture: str = "mlp"
-    policy_delay: int = 2
-    noise_sigma: float = 0.1
-    noise_scale: int = 1
-    noise_dt: int = 1
-    action_noise_schedule: str | None = None
-    tau: float = 0.005
-    target_policy_noise: float = 0.2
-    target_noise_clip: float = 0.5
+
+    # Nested algorithm configurations
+    off_policy: OffPolicyConfig = field(default_factory=OffPolicyConfig)
+    on_policy: OnPolicyConfig = field(default_factory=OnPolicyConfig)

     def __post_init__(self):
         """Calculate defaults that depend on other fields and validate inputs."""
+        # Convert nested dicts to dataclass instances if necessary
+        if isinstance(self.off_policy, dict):
+            self.off_policy = OffPolicyConfig(**self.off_policy)
+        if isinstance(self.on_policy, dict):
+            self.on_policy = OnPolicyConfig(**self.on_policy)
+
+        for config in [self.off_policy, self.on_policy]:
+            if config:
+                config.batch_size = self.batch_size
+                config.gamma = self.gamma
+                config.train_freq = self.train_freq
+
+        self.off_policy.actor_architecture = self.actor_architecture
+        self.on_policy.actor_architecture = self.actor_architecture
+
         if self.early_stopping_steps is None:
             self.early_stopping_steps = int(
                 self.training_episodes / self.validation_episodes_interval + 1
             )
-        # if we do not have initial experience collected we will get an error as no samples are available on the
-        # buffer from which we can draw experience to adapt the strategy, hence we set it to minimum one episode
-        if self.episodes_collecting_initial_experience < 1:
-            logger.warning(
-                f"episodes_collecting_initial_experience need to be at least 1 to sample from buffer, got {self.episodes_collecting_initial_experience}. setting to 1"
-            )
-
-            self.episodes_collecting_initial_experience = 1
-
-        # check that gradient_steps is positive
-        if self.gradient_steps <= 0:
-            raise ValueError(
-                f"gradient_steps need to be positive, got {self.gradient_steps}"
-            )
+        # NOTE: episodes_collecting_initial_experience and gradient_steps are
+        # now validated in OffPolicyConfig.__post_init__


 class LearningStrategy(BaseStrategy):
@@ -930,6 +1031,7 @@ def __init__(
         # access to the learning_role that orchestrates learning
         self.learning_role = learning_role
         self.learning_config = learning_role.learning_config
+        self.algorithm = self.learning_config.algorithm
         self.foresight = foresight
         self.act_dim = act_dim
diff --git a/assume/common/utils.py b/assume/common/utils.py
index 8c6cbe8ea..e4dec8f8d 100644
--- a/assume/common/utils.py
+++ b/assume/common/utils.py
@@ -599,6 +599,34 @@ def rename_study_case(path: str, old_key: str, new_key: str):
         yaml.safe_dump(data, file, sort_keys=False)


+def convert_to_tensors(array: np.ndarray, copy=True, dtype=None, device=None):
+    """Convert a numpy array to a PyTorch tensor.
+
+    Note:
+        The data is copied by default.
+
+    Args:
+        array (numpy.ndarray): The numpy array to convert.
+        copy (bool, optional): Whether to copy the data
+            (useful to avoid mutating the input by reference). Defaults to True.
+        dtype (torch.dtype, optional): Target dtype of the returned tensor. Defaults to None.
+        device (torch.device | str, optional): Device to place the tensor on. Defaults to None.
+
+    Returns:
+        torch.Tensor: The converted PyTorch tensor, or the input array unchanged
+        if PyTorch is not installed.
+    """
+
+    try:
+        import torch as th
+
+        if copy:
+            return th.tensor(array, dtype=dtype, device=device)
+
+        return th.as_tensor(array, dtype=dtype, device=device)
+
+    except ImportError:
+        # If torch is not installed, return the array unchanged
+        return array
+
+
 def convert_tensors(data):
     """
     Recursively checks if the data contains PyTorch tensors and converts them to
diff --git a/assume/reinforcement_learning/__init__.py b/assume/reinforcement_learning/__init__.py
index a10131609..099d8b56e 100644
--- a/assume/reinforcement_learning/__init__.py
+++ b/assume/reinforcement_learning/__init__.py
@@ -2,5 +2,41 @@
 #
 # SPDX-License-Identifier: AGPL-3.0-or-later

-from assume.reinforcement_learning.buffer import ReplayBuffer
+from assume.reinforcement_learning.algorithms import (
+    A2CAlgorithm,
+    DDPG,
+    LSTMActor,
+    MLPActor,
+    PPO,
+    RLAlgorithm,
+    TD3,
+    actor_architecture_aliases,
+)
+from assume.reinforcement_learning.buffer import (
+    ReplayBuffer,
+    ReplayBufferSamples,
+    RolloutBuffer,
+    RolloutBufferSamples,
+)
 from assume.reinforcement_learning.learning_role import Learning
+
+__all__ = [
+    # Learning orchestration
+    "Learning",
+    # Algorithms base classes
+    "RLAlgorithm",
+    "A2CAlgorithm",
+    # Algorithms concrete implementations
+    "TD3",
+    "DDPG",
+    "PPO",
+    # Actor architectures
+    "MLPActor",
+    "LSTMActor",
+    "actor_architecture_aliases",
+    # Buffers
+    "ReplayBuffer",
+    "ReplayBufferSamples",
+    "RolloutBuffer",
+    "RolloutBufferSamples",
+]
diff --git a/assume/reinforcement_learning/algorithms/__init__.py b/assume/reinforcement_learning/algorithms/__init__.py
index 645e5c991..363a58edd 100644
--- a/assume/reinforcement_learning/algorithms/__init__.py
+++ b/assume/reinforcement_learning/algorithms/__init__.py
@@ -5,11 +5,34 @@
 from torch import nn

 from assume.reinforcement_learning.neural_network_architecture import (
-    MLPActor,
     LSTMActor,
+    MLPActor,
 )

 actor_architecture_aliases: dict[str, type[nn.Module]] = {
     "mlp": MLPActor,
     "lstm": LSTMActor,
 }
+
+# Imported after actor_architecture_aliases is defined, because base_algorithm
+# imports the aliases from this package (avoids a circular import).
+from assume.reinforcement_learning.algorithms.base_algorithm import (
+    A2CAlgorithm,
+    RLAlgorithm,
+)
+from assume.reinforcement_learning.algorithms.maddpg import DDPG
+from assume.reinforcement_learning.algorithms.mappo import PPO
+from assume.reinforcement_learning.algorithms.matd3 import TD3
+
+
+__all__ = [
+    # Base classes
+    "RLAlgorithm",
+    "A2CAlgorithm",
+    # Concrete algorithms
+    "TD3",
+    "DDPG",
+    "PPO",
+    # Actor architectures
+    "actor_architecture_aliases",
+    "MLPActor",
+    "LSTMActor",
+]
diff --git a/assume/reinforcement_learning/algorithms/base_algorithm.py b/assume/reinforcement_learning/algorithms/base_algorithm.py
index 44c0f492f..0fcff0019 100644
--- a/assume/reinforcement_learning/algorithms/base_algorithm.py
+++ b/assume/reinforcement_learning/algorithms/base_algorithm.py
@@ -1,29 +1,57 @@
 # SPDX-FileCopyrightText: ASSUME Developers
 #
 # SPDX-License-Identifier: AGPL-3.0-or-later
-
+import json
 import logging
+import os

 import torch as th
+from torch.optim import AdamW

+from assume.common.base import LearningStrategy
 from assume.reinforcement_learning.algorithms import actor_architecture_aliases
+from assume.reinforcement_learning.learning_utils import (
+    transfer_weights,
+)

 logger = logging.getLogger(__name__)


 class RLAlgorithm:
-    """
-    The base RL model class. To implement your own RL algorithm, you need to subclass this class and implement the `update_policy` method.
+    """Base reinforcement learning algorithm class.
+
+    This is the foundation class for all Reinforcement Learning algorithms in the framework.
+    To implement a custom RL algorithm, subclass this class and override the `update_policy` and `get_action` methods.

-    Args:
-        learning_role (Learning Role object): Learning object
+    The class provides common functionality for:
+    - Learning rate scheduling
+    - Parameter saving/loading
+    - Device management
+
+    Attributes:
+        learning_role: The learning role object containing configuration and strategies.
+        learning_config: Configuration parameters from the learning role.
+        device: The computation device (CPU/GPU) for tensors.
+        float_type: The floating point precision type for computations.
+        actor_architecture_class: The actor network architecture class.
+
+    Example:
+        >>> class CustomAlgorithm(RLAlgorithm):
+        ...     def update_policy(self):
+        ...         # Custom policy update logic
+        ...         pass
+        ...     def get_action(self, strategy, obs):
+        ...         # Custom action selection logic
+        ...         pass
     """

-    def __init__(
-        self,
-        # init learning_role as object of Learning class
-        learning_role,
-    ):
+    def __init__(self, learning_role):
+        """Initialize the RL algorithm.
+
+        Args:
+            learning_role: Learning role object containing configuration and strategies.
+                Must be an instance of the Learning class.
+        """
         super().__init__()

         self.learning_role = learning_role
@@ -46,17 +74,23 @@ def update_learning_rate(
         optimizers: list[th.optim.Optimizer] | th.optim.Optimizer,
         learning_rate: float,
     ) -> None:
-        """
-        Update the optimizers learning rate using the current learning rate schedule and the current progress remaining (from 1 to 0).
+        """Update optimizer learning rates.
+
+        Sets the learning rate for one or more optimizers. Handles both single
+        optimizers and lists of optimizers uniformly.

         Args:
-            optimizers (List[th.optim.Optimizer] | th.optim.Optimizer): An optimizer or a list of optimizers.
+            optimizers: A single optimizer or list of optimizers to update.
+            learning_rate: The new learning rate value to set.
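+
+        This is typically called once per policy update, with the new value
+        produced by the learning role's schedule (see `calc_lr_from_progress`).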
Note: - Adapted from SB3: + Adapted from Stable Baselines 3: - https://github.com/DLR-RM/stable-baselines3/blob/512eea923afad6f6da4bb53d72b6ea4c6d856e59/stable_baselines3/common/base_class.py#L286 - https://github.com/DLR-RM/stable-baselines3/blob/512eea923afad6f6da4bb53d72b6ea4c6d856e59/stable_baselines3/common/utils.py#L68 + Example: + >>> optimizer = AdamW(model.parameters(), lr=0.001) + >>> algorithm.update_learning_rate(optimizer, 0.0001) """ if not isinstance(optimizers, list): @@ -65,27 +99,606 @@ def update_learning_rate( for param_group in optimizer.param_groups: param_group["lr"] = learning_rate - def update_policy(self): + def get_action( + self, strategy: "LearningStrategy", obs: th.Tensor + ) -> tuple[th.Tensor, th.Tensor]: + """Sample an action for strategy given observation *obs*. + + Each concrete algorithm overrides this method with its own sampling + logic. + + Args: + strategy: The TorchLearningStrategy instance requesting an action. + obs: Flat observation tensor for a single time-step. + + Returns: + A (action, noise) tuple, both tensors on the same device as strategy. + """ + raise NotImplementedError(f"{type(self).__name__} must implement get_action()") + + def update_policy(self) -> None: + """Update the policy parameters. + + This method must be overridden by subclasses to implement the specific + policy update logic for each RL algorithm. The base implementation raises + an error to enforce this requirement. + + Raises: + NotImplementedError: If called on the base class without override. + + Example: + >>> class CustomAlgorithm(RLAlgorithm): + ... def update_policy(self): + ... # Implement algorithm-specific policy update + ... pass + """ logger.error( - "No policy update function of the used Rl algorithm was defined. Please define how the policies should be updated in the specific algorithm you use" + "No policy update function of the used RL algorithm was defined. " + "Please define how the policies should be updated in the specific " + "algorithm you use." ) def load_obj(self, directory: str): - """ - Load an object from a specified directory. + """Load a serialized object from directory. - This method loads an object, typically saved as a checkpoint file, from the specified - directory and returns it. It uses the `torch.load` function and specifies the device for loading. + Loads a PyTorch serialized object from the specified directory path. + The object is loaded onto the device specified by the algorithm's configuration. Args: - directory (str): The directory from which the object should be loaded. + directory: Path to the directory containing the serialized object. + Should point to a valid .pt file. Returns: - object: The loaded object. + object: The deserialized Python object. + + Example: + >>> model_state = algorithm.load_obj('/path/to/checkpoint.pt') """ return th.load(directory, map_location=self.device, weights_only=True) def load_params(self, directory: str) -> None: + """Load learning parameters from disk. + + Abstract method that should be implemented by subclasses to load + algorithm-specific parameters from the specified directory. + + Args: + directory: Path to the directory containing saved parameters. + + Note: + This is an abstract method that must be overridden by subclasses. + """ + + +class A2CAlgorithm(RLAlgorithm): + """Base actor-critic algorithm class. + + Provides shared functionality for actor-critic reinforcement learning algorithms + including parameter management, network initialization, and saving/loading utilities. 
+ This serves as the foundation for algorithms like MATD3, MADDPG, and MAPPO. + + The class handles: + - Actor and critic network creation and management + - Target network synchronization (when applicable) + - Parameter saving and loading + - Weight transfer between different agent configurations + + Attributes: + uses_target_networks: Whether this algorithm uses target networks. + TD3 and DDPG use target networks (True), PPO does not (False). + + Example: + >>> class ActorCriticAlgorithm(A2CAlgorithm): + ... def update_policy(self): + ... # Custom actor-critic update logic + ... pass + """ + + #: Whether this algorithm uses target networks for stability. + #: TD3 and DDPG use target networks (True), PPO does not (False). + uses_target_networks: bool = True + + def __init__(self, learning_role): + """Initialize the actor-critic algorithm. + + Args: + learning_role: Learning role object containing configuration and strategies. + """ + super().__init__(learning_role) + + def save_params(self, directory: str) -> None: + """Save actor and critic network parameters. + + Saves both actor and critic network parameters to separate subdirectories. + Creates the directory structure if it doesn't exist. + + Args: + directory: Base directory path where parameters will be saved. + Will create 'actors/' and 'critics/' subdirectories. + + Example: + >>> algorithm.save_params('/path/to/save/directory') + # Creates: + # /path/to/save/directory/actors/ + # /path/to/save/directory/critics/ + """ + self.save_critic_params(directory=f"{directory}/critics") + self.save_actor_params(directory=f"{directory}/actors") + + def save_critic_params(self, directory: str) -> None: + """Save critic network parameters. + + Saves critic networks, their optimizers, and target critics (if applicable) + for all registered learning strategies. Also saves agent ID ordering information + to ensure proper loading. + + Args: + directory: Directory path where critic parameters will be saved. + Will be created if it doesn't exist. + + Example: + >>> algorithm.save_critic_params('/path/to/critics/') + """ + os.makedirs(directory, exist_ok=True) + for u_id, strategy in self.learning_role.rl_strats.items(): + obj = { + "critic": strategy.critics.state_dict(), + "critic_optimizer": strategy.critics.optimizer.state_dict(), + } + # Only save target critic if this algorithm uses target networks + if self.uses_target_networks: + obj["critic_target"] = strategy.target_critics.state_dict() + + path = f"{directory}/critic_{u_id}.pt" + th.save(obj, path) + + # record the exact order of u_ids and save it with critics to ensure that the same order is used when loading the parameters + u_id_list = [str(u) for u in self.learning_role.rl_strats.keys()] + mapping = {"u_id_order": u_id_list} + map_path = os.path.join(directory, "u_id_order.json") + with open(map_path, "w") as f: + json.dump(mapping, f, indent=2) + + def save_actor_params(self, directory: str) -> None: + """Save actor network parameters. + + Saves actor networks, their optimizers, and target actors (if applicable) + for all registered learning strategies. + + Args: + directory: Directory path where actor parameters will be saved. + Will be created if it doesn't exist. 
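+
+        Note:
+            One file is written per learning strategy, named `actor_{u_id}.pt`,
+            mirroring the `critic_{u_id}.pt` files written by `save_critic_params`.
+            Target-actor weights are only included when `uses_target_networks` is True.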
+ + Example: + >>> algorithm.save_actor_params('/path/to/actors/') + """ + os.makedirs(directory, exist_ok=True) + for u_id, strategy in self.learning_role.rl_strats.items(): + obj = { + "actor": strategy.actor.state_dict(), + "actor_optimizer": strategy.actor.optimizer.state_dict(), + } + # Only save target actor if this algorithm uses target networks + if self.uses_target_networks: + obj["actor_target"] = strategy.actor_target.state_dict() + + path = f"{directory}/actor_{u_id}.pt" + th.save(obj, path) + + def load_params(self, directory: str) -> None: + """ + Load the parameters of both actor and critic networks. + + This method loads the parameters of both the actor and critic networks associated with the learning role from the specified + directory. It uses the `load_critic_params` and `load_actor_params` methods to load the respective parameters. + + Args: + directory: Base directory containing 'actors/' and 'critics/' subdirectories. + + Example: + >>> algorithm.load_params('/path/to/saved/parameters/') + """ + self.load_critic_params(directory) + self.load_actor_params(directory) + + def load_critic_params(self, directory: str) -> None: + """Load critic network parameters. + + Loads critic networks, target critics (if applicable), and optimizer states + for each registered agent strategy. Handles cases where the number of agents + differs between saved and current models by performing intelligent weight transfer. + + Args: + directory: Base directory containing the 'critics/' subdirectory. + + Note: + Automatically handles agent count mismatches through weight transfer. + Preserves the order of agents using saved mapping information. + + Example: + >>> algorithm.load_critic_params('/path/to/saved/parameters/') + """ + logger.info("Loading critic parameters...") + + if not os.path.exists(directory): + logger.warning( + "Specified directory does not exist. Using randomly initialized critics." + ) + return + + map_path = os.path.join(directory, "critics", "u_id_order.json") + if os.path.exists(map_path): + # read the saved order of u_ids from critics save directory + with open(map_path) as f: + loaded_id_order = json.load(f).get("u_id_order", []) + else: + logger.warning("No u_id_order.json: assuming same order as current.") + loaded_id_order = [str(u) for u in self.learning_role.rl_strats.keys()] + + new_id_order = [str(u) for u in self.learning_role.rl_strats.keys()] + direct_load = loaded_id_order == new_id_order + + if direct_load: + logger.info("Agents order unchanged. Loading critic weights directly.") + else: + logger.info( + f"Agents length and/or order mismatch: n_old={len(loaded_id_order)}, n_new={len(new_id_order)}. Transferring weights for critics and target critics." + ) + + for u_id, strategy in self.learning_role.rl_strats.items(): + critic_path = os.path.join(directory, "critics", f"critic_{u_id}.pt") + if not os.path.exists(critic_path): + logger.warning(f"No saved critic for {u_id}; skipping.") + continue + + try: + critic_params = th.load(critic_path, weights_only=True) + + # Required keys depend on whether algorithm uses target networks + required_keys = ["critic", "critic_optimizer"] + if self.uses_target_networks: + required_keys.append("critic_target") + + for key in required_keys: + if key not in critic_params: + logger.warning( + f"Missing {key} in critic params for {u_id}; skipping." 
+ ) + continue + + if direct_load: + strategy.critics.load_state_dict(critic_params["critic"]) + strategy.critics.optimizer.load_state_dict( + critic_params["critic_optimizer"] + ) + # Only load target critic if this algorithm uses target networks + if self.uses_target_networks and "critic_target" in critic_params: + strategy.target_critics.load_state_dict( + critic_params["critic_target"] + ) + logger.debug(f"Loaded critic for {u_id} directly.") + else: + critic_weights = transfer_weights( + model=strategy.critics, + loaded_state=critic_params["critic"], + loaded_id_order=loaded_id_order, + new_id_order=new_id_order, + obs_base=strategy.obs_dim, + act_dim=strategy.act_dim, + unique_obs=strategy.unique_obs_dim, + ) + + if critic_weights is None: + logger.warning( + f"Critic weights transfer failed for {u_id}; skipping." + ) + continue + + strategy.critics.load_state_dict(critic_weights) + + # Only transfer target critic weights if this algorithm uses target networks + if self.uses_target_networks and "critic_target" in critic_params: + target_critic_weights = transfer_weights( + model=strategy.target_critics, + loaded_state=critic_params["critic_target"], + loaded_id_order=loaded_id_order, + new_id_order=new_id_order, + obs_base=strategy.obs_dim, + act_dim=strategy.act_dim, + unique_obs=strategy.unique_obs_dim, + ) + + if target_critic_weights is None: + logger.warning( + f"Target critic weights transfer failed for {u_id}; skipping." + ) + continue + + strategy.target_critics.load_state_dict(target_critic_weights) + + logger.debug(f"Critic weights transferred for {u_id}.") + + except Exception as e: + logger.warning(f"Failed to load critic for {u_id}: {e}") + + def load_actor_params(self, directory: str) -> None: + """Load actor network parameters. + + Loads actor networks, target actors (if applicable), and optimizer states + for each registered agent strategy from the specified directory. + + Args: + directory: The directory containing the 'actors/' subdirectory where the parameters should be loaded. + + Example: + >>> algorithm.load_actor_params('/path/to/saved/parameters/') """ - Load learning params - abstract method to be implemented by the Learning Algorithm + logger.info("Loading actor parameters...") + if not os.path.exists(directory): + logger.warning( + "Specified directory for loading the actors does not exist! Starting with randomly initialized values!" + ) + return + + for u_id, strategy in self.learning_role.rl_strats.items(): + try: + actor_params = self.load_obj( + directory=f"{directory}/actors/actor_{str(u_id)}.pt" + ) + strategy.actor.load_state_dict(actor_params["actor"]) + strategy.actor.optimizer.load_state_dict( + actor_params["actor_optimizer"] + ) + + # Only load target actor if this algorithm uses target networks + if self.uses_target_networks and "actor_target" in actor_params: + strategy.actor_target.load_state_dict(actor_params["actor_target"]) + + # add a tag to the strategy to indicate that the actor was loaded + strategy.actor.loaded = True + except Exception: + logger.warning(f"No actor values loaded for agent {u_id}") + + def initialize_policy(self, actors_and_critics: dict = None) -> None: """ + Create actor and critic networks for reinforcement learning. + + If `actors_and_critics` is None, this method creates new actor and critic networks. + If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. + + Args: + actors_and_critics: Optional dictionary containing pre-trained networks. + If None, creates new networks. 
+                If provided, assigns existing networks. Expected format includes
+                'actors', 'critics', and optionally 'actor_targets' and 'target_critics' keys.
+
+        Example:
+            >>> # Create new networks
+            >>> algorithm.initialize_policy()
+            >>>
+            >>> # Assign existing networks
+            >>> algorithm.initialize_policy(existing_networks_dict)
+        """
+        if actors_and_critics is None:
+            self.check_strategy_dimensions()
+            self.create_actors()
+            self.create_critics()
+
+        else:
+            for u_id, strategy in self.learning_role.rl_strats.items():
+                strategy.actor = actors_and_critics["actors"][u_id]
+                strategy.critics = actors_and_critics["critics"][u_id]
+
+                if self.uses_target_networks:
+                    strategy.actor_target = actors_and_critics["actor_targets"][u_id]
+                    strategy.target_critics = actors_and_critics["target_critics"][u_id]
+
+            self.obs_dim = actors_and_critics["obs_dim"]
+            self.act_dim = actors_and_critics["act_dim"]
+            self.unique_obs_dim = actors_and_critics["unique_obs_dim"]
+
+    def check_strategy_dimensions(self) -> None:
+        """Validate learning strategy dimensions.
+
+        Ensures all registered learning strategies have consistent dimensional
+        properties required for centralized critic algorithms. Checks:
+        - Observation dimensions
+        - Action dimensions
+        - Unique observation dimensions
+        - Timeseries observation dimensions
+        - Foresight parameters
+
+        A mismatch raises a ValueError, because the centralized critic requires
+        consistent observation and action spaces across all agents.
+
+        Raises:
+            ValueError: If any dimension mismatch is detected across strategies.
+        """
+        foresight_list = []
+        obs_dim_list = []
+        act_dim_list = []
+        unique_obs_dim_list = []
+        num_timeseries_obs_dim_list = []
+
+        for strategy in self.learning_role.rl_strats.values():
+            foresight_list.append(strategy.foresight)
+            obs_dim_list.append(strategy.obs_dim)
+            act_dim_list.append(strategy.act_dim)
+            unique_obs_dim_list.append(strategy.unique_obs_dim)
+            num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim)
+
+        if len(set(foresight_list)) > 1:
+            raise ValueError(
+                f"All foresight values must be the same for all RL agents. The defined learning strategies have the following foresight values: {foresight_list}"
+            )
+        else:
+            self.foresight = foresight_list[0]
+
+        if len(set(act_dim_list)) > 1:
+            raise ValueError(
+                f"All action dimensions must be the same for all RL agents. The defined learning strategies have the following action dimensions: {act_dim_list}"
+            )
+        else:
+            self.act_dim = act_dim_list[0]
+
+        if len(set(unique_obs_dim_list)) > 1:
+            raise ValueError(
+                f"All unique_obs_dim values must be the same for all RL agents. The defined learning strategies have the following unique_obs_dim values: {unique_obs_dim_list}"
+            )
+        else:
+            self.unique_obs_dim = unique_obs_dim_list[0]
+
+        if len(set(num_timeseries_obs_dim_list)) > 1:
+            raise ValueError(
+                f"All num_timeseries_obs_dim values must be the same for all RL agents. The defined learning strategies have the following num_timeseries_obs_dim values: {num_timeseries_obs_dim_list}"
+            )
+        else:
+            self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0]
+
+        # Check last, as other cases should fail before!
+        if len(set(obs_dim_list)) > 1:
+            raise ValueError(
+                f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
+            )
+        else:
+            self.obs_dim = obs_dim_list[0]
+
+    def create_actors(self) -> None:
+        """Create actor networks for all learning strategies.
+
+        This method initializes actor networks and, when the algorithm uses
+        target networks, their corresponding target networks for each registered
+        unit strategy. Actors map observations to actions.
+
+        Note:
+            All strategies must have the same observation dimension due to the
+            centralized critic architecture. Units with different observation
+            dimensions require separate learning roles with different critics.
+
+        Example:
+            >>> algorithm.create_actors()
+            >>> # Creates actor and actor_target for each strategy
+        """
+
+        for strategy in self.learning_role.rl_strats.values():
+            strategy.actor = self.actor_architecture_class(
+                obs_dim=self.obs_dim,
+                act_dim=self.act_dim,
+                float_type=self.float_type,
+                unique_obs_dim=self.unique_obs_dim,
+                num_timeseries_obs_dim=self.num_timeseries_obs_dim,
+            ).to(self.device)
+
+            if self.uses_target_networks:
+                strategy.actor_target = self.actor_architecture_class(
+                    obs_dim=self.obs_dim,
+                    act_dim=self.act_dim,
+                    float_type=self.float_type,
+                    unique_obs_dim=self.unique_obs_dim,
+                    num_timeseries_obs_dim=self.num_timeseries_obs_dim,
+                ).to(self.device)
+
+                strategy.actor_target.load_state_dict(strategy.actor.state_dict())
+                strategy.actor_target.train(mode=False)
+
+            strategy.actor.optimizer = AdamW(
+                strategy.actor.parameters(),
+                lr=self.learning_role.calc_lr_from_progress(
+                    1
+                ),  # 1 = 100% of simulation remaining, uses learning_rate from config as starting point
+            )
+
+            strategy.actor.loaded = False
+
+    def create_critics(self) -> None:
+        """Create critic networks for all learning strategies.
+
+        Initializes critic networks and their corresponding target networks
+        (if applicable) for each registered agent strategy. Critics evaluate
+        state-action pairs.
+
+        Note:
+            All strategies must have the same observation dimension due to the
+            centralized critic architecture. Units with different observation
+            dimensions require separate learning roles with different critics.
+
+        Example:
+            >>> algorithm.create_critics()
+            >>> # Creates critics and target_critics for each strategy
+        """
+        n_agents = len(self.learning_role.rl_strats)
+
+        for strategy in self.learning_role.rl_strats.values():
+            strategy.critics = self.critic_architecture_class(
+                n_agents=n_agents,
+                obs_dim=self.obs_dim,
+                act_dim=self.act_dim,
+                unique_obs_dim=self.unique_obs_dim,
+                float_type=self.float_type,
+            ).to(self.device)
+
+            if self.uses_target_networks:
+                strategy.target_critics = self.critic_architecture_class(
+                    n_agents=n_agents,
+                    obs_dim=self.obs_dim,
+                    act_dim=self.act_dim,
+                    unique_obs_dim=self.unique_obs_dim,
+                    float_type=self.float_type,
+                ).to(self.device)
+
+                strategy.target_critics.load_state_dict(strategy.critics.state_dict())
+                strategy.target_critics.train(mode=False)
+
+            strategy.critics.optimizer = AdamW(
+                strategy.critics.parameters(),
+                lr=self.learning_role.calc_lr_from_progress(
+                    1
+                ),  # 1 = 100% of simulation remaining, uses learning_rate from config as starting point
+            )
+
+    def extract_policy(self) -> dict:
+        """Extract all policy networks.
+
+        Collects actor and critic networks from all learning strategies into
+        a structured dictionary. Target networks are included only when the
+        algorithm uses them.
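+
+        Note:
+            The returned dictionary matches the `actors_and_critics` format
+            accepted by `initialize_policy`, so an extracted policy can be
+            assigned back to an algorithm instance without further conversion.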
+ + Returns: + Dictionary containing all network components organized by type: + - 'actors': Primary actor networks + - 'actor_targets': Target actor networks + - 'critics': Primary critic networks + - 'target_critics': Target critic networks + - Dimension information for reconstruction + + Example: + >>> policy_dict = algorithm.extract_policy() + >>> # Contains all networks ready for saving or transfer + """ + actors = {} + critics = {} + if self.uses_target_networks: + actor_targets = {} + target_critics = {} + + for u_id, strategy in self.learning_role.rl_strats.items(): + actors[u_id] = strategy.actor + critics[u_id] = strategy.critics + if self.uses_target_networks: + actor_targets[u_id] = strategy.actor_target + target_critics[u_id] = strategy.target_critics + + actors_and_critics = { + "actors": actors, + "critics": critics, + "obs_dim": self.obs_dim, + "act_dim": self.act_dim, + "unique_obs_dim": self.unique_obs_dim, + } + + if self.uses_target_networks: + actors_and_critics["actor_targets"] = actor_targets + actors_and_critics["target_critics"] = target_critics + + return actors_and_critics diff --git a/assume/reinforcement_learning/algorithms/maddpg.py b/assume/reinforcement_learning/algorithms/maddpg.py new file mode 100644 index 000000000..7bd1fdad3 --- /dev/null +++ b/assume/reinforcement_learning/algorithms/maddpg.py @@ -0,0 +1,352 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +import logging + +import torch as th +from torch.nn import functional as F + +from assume.common.base import LearningStrategy +from assume.reinforcement_learning.algorithms.base_algorithm import A2CAlgorithm +from assume.reinforcement_learning.learning_utils import ( + polyak_update, +) +from assume.reinforcement_learning.neural_network_architecture import CriticDDPG + +logger = logging.getLogger(__name__) + + +class DDPG(A2CAlgorithm): + """Deep Deterministic Policy Gradient (DDPG) Algorithm. + + An off-policy actor-critic algorithm that uses deterministic policy gradients + for continuous action spaces. DDPG combines Q-learning with policy gradients, + using: + + - A single critic network to estimate Q-values + - Deterministic actor networks that map states to actions + - Target networks updated via Polyak averaging for stability + - Replay buffer for sample efficiency and decorrelation + + Attributes: + n_updates: Counter for gradient updates performed. + grad_clip_norm: Maximum gradient norm for clipping. + critic_architecture_class: Critic network architecture (CriticDDPG). + + Example: + >>> ddpg = DDPG(learning_role) + >>> ddpg.update_policy() # Performs one training iteration + """ + + def __init__(self, learning_role) -> None: + """Initialize the DDPG algorithm. + + Sets up the algorithm with gradient counters, clipping parameters, + and critic architecture. + + Args: + learning_role: Learning role object managing agents and replay buffer. + Must have off-policy configuration. + """ + super().__init__(learning_role) + + # Gradient step counter + self.n_updates = 0 + + # Gradient clipping threshold + self.grad_clip_norm = 1.0 + + # Define the critic architecture class for DDPG (single critic) + self.critic_architecture_class = CriticDDPG + + def get_action( + self, strategy: "LearningStrategy", obs: th.Tensor + ) -> tuple[th.Tensor, th.Tensor]: + """Sample an action using the off-policy strategy. 
+
+        During learning mode the agent either performs pure-noise initial
+        exploration (first N episodes) or uses its deterministic actor plus
+        Gaussian action noise. During evaluation mode the actor is used
+        without any noise.
+
+        This default implementation is shared by TD3 and DDPG. PPO overrides
+        it with its own stochastic Gaussian sampling.
+        """
+        if strategy.learning_mode and not strategy.evaluation_mode:
+            if strategy.collect_initial_experience_mode:
+                # Pure Gaussian noise for initial random exploration
+                noise = th.normal(
+                    mean=0.0,
+                    std=strategy.exploration_noise_std,
+                    size=(strategy.act_dim,),
+                    dtype=strategy.float_type,
+                    device=strategy.device,
+                )
+                return noise, noise
+
+            action = strategy.actor(obs).detach()
+            noise = strategy.action_noise.noise(
+                device=strategy.device, dtype=strategy.float_type
+            )
+            action = th.clamp(
+                action + noise,
+                strategy.actor.min_output,
+                strategy.actor.max_output,
+            )
+            return action, noise
+
+        # Evaluation
+        action = strategy.actor(obs).detach()
+        noise = th.zeros(
+            strategy.act_dim, dtype=strategy.float_type, device=strategy.device
+        )
+        return action, noise
+
+    def update_policy(self) -> None:
+        """Update actor and critic networks using DDPG algorithm.
+
+        Performs one complete training iteration consisting of:
+        1. Sampling batches from replay buffer
+        2. Updating critic networks using MSE loss
+        3. Updating actor networks using policy gradient
+        4. Updating target networks via Polyak averaging
+        """
+        logger.debug("Updating Policy (MADDPG/DDPG)")
+
+        strategies = list(self.learning_role.rl_strats.values())
+        n_rl_agents = len(strategies)
+
+        # Initialize metrics storage for gradient logging
+        unit_params = [
+            {
+                u_id: {
+                    "actor_loss": None,
+                    "actor_total_grad_norm": None,
+                    "actor_max_grad_norm": None,
+                    "critic_loss": None,
+                    "critic_total_grad_norm": None,
+                    "critic_max_grad_norm": None,
+                }
+                for u_id in self.learning_role.rl_strats.keys()
+            }
+            for _ in range(self.learning_config.off_policy.gradient_steps)
+        ]
+
+        # Update noise decay and learning rate based on training progress
+        progress_remaining = self.learning_role.get_progress_remaining()
+        updated_noise_decay = self.learning_role.calc_noise_from_progress(
+            progress_remaining
+        )
+        learning_rate = self.learning_role.calc_lr_from_progress(progress_remaining)
+
+        # Update learning rates and noise schedules for all strategies
+        for strategy in strategies:
+            self.update_learning_rate(
+                [strategy.critics.optimizer, strategy.actor.optimizer],
+                learning_rate=learning_rate,
+            )
+            strategy.action_noise.update_noise_decay(updated_noise_decay)
+
+        # Perform gradient updates for specified number of steps
+        for step in range(self.learning_config.off_policy.gradient_steps):
+            self.n_updates += 1
+
+            # Sample transition batch from replay buffer
+            transitions = self.learning_role.buffer.sample(
+                self.learning_config.batch_size
+            )
+
+            states, actions, next_states, rewards = (
+                transitions.observations,
+                transitions.actions,
+                transitions.next_observations,
+                transitions.rewards,
+            )
+
+            # Compute target actions using target actors
+            with th.no_grad():
+                next_actions = th.stack(
+                    [
+                        strategy.actor_target(next_states[:, i, :]).clamp(-1, 1)
+                        for i, strategy in enumerate(strategies)
+                    ]
+                )
+                next_actions = next_actions.transpose(0, 1).contiguous()
+                next_actions = next_actions.view(-1, n_rl_agents * self.act_dim)
+
+            all_actions = actions.view(self.learning_config.batch_size, -1)
+
+            # Extract unique observations for centralized critic construction
+            unique_obs_from_others = states[
+                :, :, self.obs_dim - self.unique_obs_dim :
+            ].reshape(self.learning_config.batch_size, n_rl_agents, -1)
+
+            next_unique_obs_from_others = next_states[
+                :, :, self.obs_dim - self.unique_obs_dim :
+            ].reshape(self.learning_config.batch_size, n_rl_agents, -1)
+
+            # ------------------------------------------------------------
+            # CRITIC UPDATE PHASE
+            # ------------------------------------------------------------
+            for strategy in strategies:
+                strategy.critics.optimizer.zero_grad(set_to_none=True)
+
+            total_critic_loss = 0.0
+
+            for i, strategy in enumerate(strategies):
+                critic = strategy.critics
+                critic_target = strategy.target_critics
+
+                # Build centralized observation
+                other_unique_obs = th.cat(
+                    (unique_obs_from_others[:, :i], unique_obs_from_others[:, i + 1 :]),
+                    dim=1,
+                )
+                other_next_unique_obs = th.cat(
+                    (
+                        next_unique_obs_from_others[:, :i],
+                        next_unique_obs_from_others[:, i + 1 :],
+                    ),
+                    dim=1,
+                )
+
+                all_states = th.cat(
+                    (
+                        states[:, i, :].reshape(self.learning_config.batch_size, -1),
+                        other_unique_obs.reshape(self.learning_config.batch_size, -1),
+                    ),
+                    dim=1,
+                )
+                all_next_states = th.cat(
+                    (
+                        next_states[:, i, :].reshape(
+                            self.learning_config.batch_size, -1
+                        ),
+                        other_next_unique_obs.reshape(
+                            self.learning_config.batch_size, -1
+                        ),
+                    ),
+                    dim=1,
+                )
+
+                # Compute target Q-value (single critic, no min)
+                with th.no_grad():
+                    next_q_value = critic_target(all_next_states, next_actions)
+                    target_Q_value = (
+                        rewards[:, i].unsqueeze(1)
+                        + self.learning_config.gamma * next_q_value
+                    )
+
+                # Compute current Q-value
+                current_Q_value = critic(all_states, all_actions)
+
+                # MSE loss (single critic)
+                critic_loss = F.mse_loss(current_Q_value, target_Q_value)
+
+                unit_params[step][strategy.unit_id]["critic_loss"] = critic_loss.item()
+                total_critic_loss += critic_loss
+
+            # Backward pass for critics
+            total_critic_loss.backward()
+
+            for strategy in strategies:
+                parameters = list(strategy.critics.parameters())
+                max_grad_norm = max(p.grad.norm() for p in parameters)
+                total_norm = th.nn.utils.clip_grad_norm_(
+                    parameters, max_norm=self.grad_clip_norm
+                )
+                strategy.critics.optimizer.step()
+
+                unit_params[step][strategy.unit_id]["critic_total_grad_norm"] = (
+                    total_norm
+                )
+                unit_params[step][strategy.unit_id]["critic_max_grad_norm"] = (
+                    max_grad_norm
+                )
+
+            # ------------------------------------------------------------
+            # ACTOR UPDATE PHASE (updated every step)
+            # ------------------------------------------------------------
+            for strategy in strategies:
+                strategy.actor.optimizer.zero_grad(set_to_none=True)
+
+            total_actor_loss = 0.0
+
+            for i, strategy in enumerate(strategies):
+                actor = strategy.actor
+                critic = strategy.critics
+
+                state_i = states[:, i, :]
+                action_i = actor(state_i)
+
+                other_unique_obs = th.cat(
+                    (unique_obs_from_others[:, :i], unique_obs_from_others[:, i + 1 :]),
+                    dim=1,
+                )
+                all_states_i = th.cat(
+                    (
+                        state_i.reshape(self.learning_config.batch_size, -1),
+                        other_unique_obs.reshape(self.learning_config.batch_size, -1),
+                    ),
+                    dim=1,
+                )
+
+                all_actions_clone = actions.clone().detach()
+                all_actions_clone[:, i, :] = action_i
+                all_actions_clone = all_actions_clone.view(
+                    self.learning_config.batch_size, -1
+                )
+
+                # Actor loss: maximize Q-value
+                actor_loss = -critic(all_states_i, all_actions_clone).mean()
+
+                unit_params[step][strategy.unit_id]["actor_loss"] = actor_loss.item()
+                total_actor_loss += actor_loss
+
+            # Backward pass for actors
+            total_actor_loss.backward()
+
+            for strategy in strategies:
+                parameters = list(strategy.actor.parameters())
+                max_grad_norm = max(p.grad.norm() for p in parameters)
+                total_norm = th.nn.utils.clip_grad_norm_(
+                    parameters, max_norm=self.grad_clip_norm
+                )
+                strategy.actor.optimizer.step()
+
+                unit_params[step][strategy.unit_id]["actor_total_grad_norm"] = (
+                    total_norm
+                )
+                unit_params[step][strategy.unit_id]["actor_max_grad_norm"] = (
+                    max_grad_norm
+                )
+
+            # ------------------------------------------------------------
+            # TARGET NETWORK UPDATE PHASE (Polyak averaging)
+            # ------------------------------------------------------------
+            all_critic_params = []
+            all_target_critic_params = []
+            all_actor_params = []
+            all_target_actor_params = []
+
+            for strategy in strategies:
+                all_critic_params.extend(strategy.critics.parameters())
+                all_target_critic_params.extend(strategy.target_critics.parameters())
+                all_actor_params.extend(strategy.actor.parameters())
+                all_target_actor_params.extend(strategy.actor_target.parameters())
+
+            polyak_update(
+                all_critic_params,
+                all_target_critic_params,
+                self.learning_config.off_policy.tau,
+            )
+            polyak_update(
+                all_actor_params,
+                all_target_actor_params,
+                self.learning_config.off_policy.tau,
+            )
+
+        # Log gradient parameters and metrics to output
+        self.learning_role.write_rl_grad_params_to_output(learning_rate, unit_params)
diff --git a/assume/reinforcement_learning/algorithms/mappo.py b/assume/reinforcement_learning/algorithms/mappo.py
new file mode 100644
index 000000000..b8b51e5f0
--- /dev/null
+++ b/assume/reinforcement_learning/algorithms/mappo.py
@@ -0,0 +1,521 @@
+# SPDX-FileCopyrightText: ASSUME Developers
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import logging
+
+import numpy as np
+import torch as th
+from torch.nn import functional as F
+from torch.optim import AdamW
+
+from assume.reinforcement_learning.algorithms.base_algorithm import A2CAlgorithm
+from assume.reinforcement_learning.neural_network_architecture import (
+    ActorPPO,
+    CriticPPO,
+    LSTMActorPPO,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PPO(A2CAlgorithm):
+    """
+    Proximal Policy Optimization (PPO) Algorithm.
+
+    A policy gradient method that alternates between sampling data through
+    interaction with the environment, and optimizing a surrogate objective
+    function using stochastic gradient ascent. It is an on-policy algorithm.
+
+    Attributes:
+        clip_range: The epsilon parameter for PPO clipping.
+        clip_range_vf: The epsilon parameter for value function clipping.
+        n_epochs: Number of optimization epochs per rollout.
+        entropy_coef: Coefficient for entropy term in loss calculation.
+        vf_coef: Coefficient for value function term in loss calculation.
+        max_grad_norm: Maximum gradient norm for clipping.
+        n_updates: Counter for gradient updates performed.
+        actor_architecture_class: Actor network architecture class.
+        critic_architecture_class: Critic network architecture class.
+
+    Example:
+        >>> ppo = PPO(learning_role)
+        >>> ppo.update_policy()
+    """
+
+    def __init__(
+        self,
+        learning_role,
+        clip_range=None,
+        clip_range_vf=None,
+        n_epochs=None,
+        entropy_coef=None,
+        vf_coef=None,
+        max_grad_norm=None,
+    ):
+        """Initialize PPO algorithm with specific hyperparameters.
+
+        Args:
+            learning_role: The primary learning role object.
+            clip_range: The epsilon parameter for PPO policy clipping.
+            clip_range_vf: The epsilon parameter for value function clipping.
+            n_epochs: Number of optimization epochs per rollout.
+            entropy_coef: Coefficient for entropy term in loss.
+ vf_coef: Coefficient for value function term in loss. + max_grad_norm: Maximum gradient norm for clipping. + """ + super().__init__(learning_role) + + # Set PPO-specific architecture classes + self.actor_architecture_class = ActorPPO + self.critic_architecture_class = CriticPPO + + config = self.learning_config + on_policy_config = config.on_policy + + # Using on-policy config unless explicitly overridden via constructor args. + self.clip_range = ( + clip_range if clip_range is not None else on_policy_config.clip_ratio + ) + self.clip_range_vf = clip_range_vf + self.n_epochs = n_epochs if n_epochs is not None else on_policy_config.n_epochs + self.entropy_coef = ( + entropy_coef if entropy_coef is not None else on_policy_config.entropy_coef + ) + self.vf_coef = vf_coef if vf_coef is not None else on_policy_config.vf_coef + self.max_grad_norm = ( + max_grad_norm + if max_grad_norm is not None + else on_policy_config.max_grad_norm + ) + + # Update counter + self.n_updates = 0 + + # ========================================================================= + # CHECKPOINT SAVING METHODS + # ========================================================================= + + uses_target_networks: bool = False + + # Note: save_params, save_critic_params, save_actor_params, load_params, + # load_critic_params, load_actor_params, initialize_policy are inherited from A2CAlgorithm + + def get_action(self, strategy, obs: th.Tensor) -> tuple[th.Tensor, th.Tensor]: + """Sample a stochastic action. + + In learning mode the actor's Gaussian policy is sampled and the + log-probability is cached on the strategy for later use in + _store_to_buffer_and_update_sync. In evaluation mode the + deterministic mean action is returned instead. + + PPO does *not* have an initial-exploration phase — the stochastic + policy provides sufficient exploration from the very first episode. + """ + if strategy.learning_mode and not strategy.evaluation_mode: + action, log_prob = strategy.actor.get_action_and_log_prob(obs.unsqueeze(0)) + action = action.squeeze(0).detach() + # Cache log-prob for rollout buffer; value is recomputed centrally + strategy._last_log_prob = log_prob.squeeze(0).detach() + noise = th.zeros_like(action, dtype=strategy.float_type) + return action, noise + + # Evaluation + action = strategy.actor(obs, deterministic=True).detach() + noise = th.zeros_like(action, dtype=strategy.float_type) + return action, noise + + def create_actors(self) -> None: + """Create stochastic actor networks for all agents. + + Initializes the ActorPPO or LSTMActorPPO network based on the configuration, + as well as its optimizer for each agent strategy. 
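+
+        Note:
+            The architecture is read from `learning_config.on_policy.actor_architecture`:
+            "lstm" selects `LSTMActorPPO`, while any other value falls back to `ActorPPO`.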
+ + Example: + >>> ppo.create_actors() + >>> # Creates actor network and optimizer for each strategy + """ + actor_architecture = self.learning_config.on_policy.actor_architecture + + for strategy in self.learning_role.rl_strats.values(): + # Create PPO Actor + if actor_architecture == "lstm": + strategy.actor = LSTMActorPPO( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + float_type=self.float_type, + unique_obs_dim=self.unique_obs_dim, + num_timeseries_obs_dim=strategy.num_timeseries_obs_dim, + ).to(self.device) + else: + strategy.actor = ActorPPO( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + float_type=self.float_type, + ).to(self.device) + + # Create Optimizer + strategy.actor.optimizer = AdamW( + strategy.actor.parameters(), + lr=self.learning_role.calc_lr_from_progress(1), + ) + + strategy.actor.loaded = False + + def create_critics(self) -> None: + """Create value networks for all agents. + + Initializes the CriticPPO network (Centralized Critic) and its optimizer + for each registered agent strategy. + + Example: + >>> ppo.create_critics() + >>> # Creates critic networks and optimizers for each strategy + """ + n_agents = len(self.learning_role.rl_strats) + + for strategy in self.learning_role.rl_strats.values(): + # Create value network + strategy.critics = CriticPPO( + n_agents=n_agents, + obs_dim=self.obs_dim, + unique_obs_dim=self.unique_obs_dim, + float_type=self.float_type, + ).to(self.device) + + # Create optimizer + strategy.critics.optimizer = AdamW( + strategy.critics.parameters(), + lr=self.learning_role.calc_lr_from_progress(1), + ) + + def extract_policy(self) -> dict: + """Extract all actor and critic networks into a dictionary. + + Collects actor and critic networks from all learning strategies into + a structured dictionary. + + Returns: + Dictionary containing all network components organized by type: + - 'actors': Primary actor networks + - 'critics': Primary critic networks + - Dimension information for reconstruction + + Example: + >>> policy_dict = ppo.extract_policy() + >>> # Contains all networks ready for saving or transfer + """ + actors = {} + critics = {} + + for u_id, strategy in self.learning_role.rl_strats.items(): + actors[u_id] = strategy.actor + critics[u_id] = strategy.critics + + return { + "actors": actors, + "critics": critics, + "obs_dim": self.obs_dim, + "act_dim": self.act_dim, + "unique_obs_dim": self.unique_obs_dim, + } + + # ========================================================================= + # CORE TRAINING: POLICY UPDATE + # ========================================================================= + + def update_policy(self) -> None: + """Update actor and critic networks using Proximal Policy Optimization (PPO). + + Performs one complete training iteration consisting of: + 1. Checking if enough data is collected in the rollout buffer. + 2. Computing Generalized Advantage Estimation (GAE) and Returns using the last value estimate. + 3. Updating the Actor and Critic networks over multiple epochs using mini-batches. + 4. Calculating the surrogate objective with clipping. + 5. Calculating value function loss (MSE) and entropy bonus. + 6. Logging metrics and gradients. + 7. Clearing the on-policy buffer after the update. + """ + logger.debug("Updating Policy (PPO)") + + # Keeping strategy order aligned with rollout-buffer column order. + strategies = [strategy for strategy in self.learning_role.rl_strats.values()] + n_rl_agents = len(strategies) + + # Getting the buffer, this will be a RolloutBuffer for on-policy algorithms. 
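+        # (With an off-policy algorithm the same attribute holds a ReplayBuffer
+        # instead; see Learning.intialize_buffer.)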
+ rollout_buffer = self.learning_role.buffer + + # Check if rollout buffer has data + if rollout_buffer is None or rollout_buffer.pos == 0: + logger.debug("Rollout buffer is empty, skipping policy update") + return + + # Require at least two transitions because we reserve the final one + # for bootstrapping V(s_{t+1}) and train on the remaining rollout. + if rollout_buffer.pos < 2: + logger.debug( + "Rollout buffer has fewer than 2 samples, skipping policy update." + ) + return + + # Update learning rate + progress_remaining = self.learning_role.get_progress_remaining() + learning_rate = self.learning_role.calc_lr_from_progress(progress_remaining) + + for strategy in strategies: + for param_group in strategy.critics.optimizer.param_groups: + param_group["lr"] = learning_rate + for param_group in strategy.actor.optimizer.param_groups: + param_group["lr"] = learning_rate + + # Get last values for advantage computation + last_values = np.zeros(n_rl_agents) + dones = np.zeros(n_rl_agents) + + # Get the buffer size to index into the last stored state + buffer_size = ( + rollout_buffer.pos + if not rollout_buffer.full + else rollout_buffer.buffer_size + ) + + if buffer_size > 0: + # Use the LAST observation as the bootstrap for the REST of the buffer. + # We sacrifice the last step (pos-1) to serve as s_{t+1} for the step before it. + # This ensures V(s_{t+1}) is calculated using the REAL next state, not a self- + # referential V(s_{t}). + last_idx = buffer_size - 1 + last_obs = rollout_buffer.observations[last_idx] + + if last_idx > 0: + last_dones = rollout_buffer.dones[last_idx - 1] + else: + last_dones = rollout_buffer.dones[last_idx] + + # Reduce buffer size by 1 so as to not train on the bootstrap step + rollout_buffer.pos -= 1 + if rollout_buffer.full: + rollout_buffer.full = False # If it was full, it's not anymore + + # Prepare unique observations for centralized critic + last_unique_obs = last_obs[:, self.obs_dim - self.unique_obs_dim :] + + with th.no_grad(): + for i, strategy in enumerate(strategies): + # Construct centralized observation + obs_i = last_obs[i : i + 1] + other_unique = np.concatenate( + (last_unique_obs[:i], last_unique_obs[i + 1 :]), axis=0 + ) + centralized_obs = np.concatenate( + (obs_i, other_unique.reshape(1, -1)), axis=1 + ) + + obs_tensor = th.as_tensor( + centralized_obs, + device=self.device, + dtype=self.float_type, + ) + # Get value estimate from critic + last_values[i] = ( + strategy.critics(obs_tensor).cpu().numpy().flatten()[0] + ) + dones[i] = last_dones[i] + + # Compute advantages and returns + rollout_buffer.compute_returns_and_advantages(last_values, dones) + + # Initialize metrics storage + all_actor_losses = [] + all_critic_losses = [] + all_entropy_losses = [] + + # Initialize unit_params for gradient logging + # Use an empty list that will be dynamically extended + unit_params = [] + step_count = 0 + + # Helper to create a new step entry + def create_step_entry(): + return { + u_id: { + "actor_loss": None, + "actor_total_grad_norm": None, + "actor_max_grad_norm": None, + "critic_loss": None, + "critic_total_grad_norm": None, + "critic_max_grad_norm": None, + } + for u_id in self.learning_role.rl_strats.keys() + } + + effective_batch_size = min( + self.learning_config.batch_size, + rollout_buffer.pos + if not rollout_buffer.full + else rollout_buffer.buffer_size, + ) + + for epoch in range(self.n_epochs): + for batch in rollout_buffer.get(effective_batch_size): + current_batch_size = batch.observations.shape[0] + + # Precompute unique observation 
parts for centralized critic
+                unique_obs_from_others = batch.observations[
+                    :, :, self.obs_dim - self.unique_obs_dim :
+                ].reshape(current_batch_size, n_rl_agents, -1)
+
+                for i, strategy in enumerate(strategies):
+                    actor = strategy.actor
+                    critic = strategy.critics
+
+                    obs_i = batch.observations[:, i, :]
+
+                    # Construct centralized state
+                    other_unique_obs = th.cat(
+                        (
+                            unique_obs_from_others[:, :i],
+                            unique_obs_from_others[:, i + 1 :],
+                        ),
+                        dim=1,
+                    )
+                    all_states = th.cat(
+                        (
+                            obs_i.reshape(current_batch_size, -1),
+                            other_unique_obs.reshape(current_batch_size, -1),
+                        ),
+                        dim=1,
+                    )
+
+                    actions_i = batch.actions[:, i, :]
+                    old_log_probs_i = batch.old_log_probs[:, i]
+                    advantages_i = batch.advantages[:, i]
+                    returns_i = batch.returns[:, i]
+                    old_values_i = batch.old_values[:, i]
+
+                    # Normalize advantages within the sampled mini-batch for this
+                    # agent; this keeps the policy-gradient scale stable
+                    advantages_flat = advantages_i.flatten()
+                    advantages_i = (advantages_i - advantages_flat.mean()) / (
+                        advantages_flat.std() + 1e-8
+                    )
+
+                    log_probs, entropy = actor.evaluate_actions(obs_i, actions_i)
+                    values = critic(all_states).flatten()
+
+                    # Importance sampling ratio
+                    ratio = th.exp(log_probs - old_log_probs_i)
+
+                    # Clipped surrogate objective
+                    policy_loss_1 = advantages_i * ratio
+                    policy_loss_2 = advantages_i * th.clamp(
+                        ratio, 1 - self.clip_range, 1 + self.clip_range
+                    )
+                    policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
+
+                    # Entropy loss
+                    entropy_loss = -self.entropy_coef * entropy.mean()
+
+                    if self.clip_range_vf is not None:
+                        # Clipped value function loss
+                        values_clipped = old_values_i + th.clamp(
+                            values - old_values_i,
+                            -self.clip_range_vf,
+                            self.clip_range_vf,
+                        )
+                        value_loss_1 = F.mse_loss(values, returns_i)
+                        value_loss_2 = F.mse_loss(values_clipped, returns_i)
+                        value_loss = th.max(value_loss_1, value_loss_2)
+                    else:
+                        value_loss = F.mse_loss(values, returns_i)
+
+                    loss = policy_loss + entropy_loss + self.vf_coef * value_loss
+
+                    # Joint actor and critic update
+                    actor.optimizer.zero_grad()
+                    critic.optimizer.zero_grad()
+                    loss.backward()
+
+                    # Calculate gradient norms BEFORE clipping
+                    actor_params = list(actor.parameters())
+                    critic_params = list(critic.parameters())
+
+                    actor_max_grad_norm = max(
+                        (
+                            p.grad.norm().item()
+                            for p in actor_params
+                            if p.grad is not None
+                        ),
+                        default=0.0,
+                    )
+                    critic_max_grad_norm = max(
+                        (
+                            p.grad.norm().item()
+                            for p in critic_params
+                            if p.grad is not None
+                        ),
+                        default=0.0,
+                    )
+
+                    # Gradient clipping
+                    actor_total_grad_norm = th.nn.utils.clip_grad_norm_(
+                        actor.parameters(), self.max_grad_norm
+                    )
+                    critic_total_grad_norm = th.nn.utils.clip_grad_norm_(
+                        critic.parameters(), self.max_grad_norm
+                    )
+
+                    actor.optimizer.step()
+                    critic.optimizer.step()
+
+                    # Store metrics
+                    all_actor_losses.append(policy_loss.item())
+                    all_critic_losses.append(value_loss.item())
+                    all_entropy_losses.append(entropy_loss.item())
+
+                    # Ensure we have an entry for this step
+                    if step_count >= len(unit_params):
+                        unit_params.append(create_step_entry())
+
+                    # Store per-unit gradient params for this step
+                    unit_params[step_count][strategy.unit_id]["actor_loss"] = (
+                        policy_loss.item()
+                    )
+                    unit_params[step_count][strategy.unit_id]["critic_loss"] = (
+                        value_loss.item()
+                    )
+                    unit_params[step_count][strategy.unit_id][
+                        "actor_total_grad_norm"
+                    ] = (
+                        actor_total_grad_norm.item()
+                        if isinstance(actor_total_grad_norm, th.Tensor)
+                        else actor_total_grad_norm
+                    )
+                    unit_params[step_count][strategy.unit_id]["actor_max_grad_norm"] = (
actor_max_grad_norm + ) + unit_params[step_count][strategy.unit_id][ + "critic_total_grad_norm" + ] = ( + critic_total_grad_norm.item() + if isinstance(critic_total_grad_norm, th.Tensor) + else critic_total_grad_norm + ) + unit_params[step_count][strategy.unit_id][ + "critic_max_grad_norm" + ] = critic_max_grad_norm + + step_count += 1 + + self.n_updates += 1 + + # Write gradient params to output + self.learning_role.write_rl_grad_params_to_output(learning_rate, unit_params) + + # Clear rollout buffer + rollout_buffer.reset() + + logger.debug( + f"PPO update complete. Actor loss: {np.mean(all_actor_losses):.4f}, " + f"Value loss: {np.mean(all_critic_losses):.4f}" + ) diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index 12d2a9a38..89e7597c3 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -2,25 +2,22 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later -import json import logging -import os import torch as th from torch.nn import functional as F -from torch.optim import AdamW -from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm +from assume.common.base import LearningStrategy +from assume.reinforcement_learning.algorithms.base_algorithm import A2CAlgorithm from assume.reinforcement_learning.learning_utils import ( polyak_update, - transfer_weights, ) from assume.reinforcement_learning.neural_network_architecture import CriticTD3 logger = logging.getLogger(__name__) -class TD3(RLAlgorithm): +class TD3(A2CAlgorithm): """ Twin Delayed Deep Deterministic Policy Gradients (TD3). Addressing Function Approximation Error in Actor-Critic Methods. @@ -28,436 +25,96 @@ class TD3(RLAlgorithm): clipped double Q-Learning, delayed policy update and target policy smoothing. Open AI Spinning guide: https://spinningup.openai.com/en/latest/algorithms/td3.html - Original paper: https://arxiv.org/pdf/1802.09477.pdf - """ - - def __init__(self, learning_role): - super().__init__(learning_role) - - self.n_updates = 0 - self.grad_clip_norm = 1.0 - - def save_params(self, directory): - """ - This method saves the parameters of both the actor and critic networks associated with the learning role. It organizes the - saved parameters into separate directories for critics and actors within the specified base directory. - - Args: - directory (str): The base directory for saving the parameters. - """ - self.save_critic_params(directory=f"{directory}/critics") - self.save_actor_params(directory=f"{directory}/actors") - def save_critic_params(self, directory): - """ - Save the parameters of critic networks. - - This method saves the parameters of the critic networks, including the critic's state_dict, critic_target's state_dict, - and the critic's optimizer state_dict. It organizes the saved parameters into a directory structure specific to the critic - associated with each learning strategy. - - Args: - directory (str): The base directory for saving the parameters. 
- """ - os.makedirs(directory, exist_ok=True) - for u_id, strategy in self.learning_role.rl_strats.items(): - obj = { - "critic": strategy.critics.state_dict(), - "critic_target": strategy.target_critics.state_dict(), - "critic_optimizer": strategy.critics.optimizer.state_dict(), - } - path = f"{directory}/critic_{u_id}.pt" - th.save(obj, path) - - # record the exact order of u_ids and save it with critics to ensure that the same order is used when loading the parameters - u_id_list = [str(u) for u in self.learning_role.rl_strats.keys()] - mapping = {"u_id_order": u_id_list} - map_path = os.path.join(directory, "u_id_order.json") - with open(map_path, "w") as f: - json.dump(mapping, f, indent=2) + Attributes: + n_updates: Counter for gradient updates performed. + grad_clip_norm: Maximum gradient norm for clipping. + critic_architecture_class: Critic network architecture class (CriticTD3). - def save_actor_params(self, directory): - """ - Save the parameters of actor networks. - - This method saves the parameters of the actor networks, including the actor's state_dict, actor_target's state_dict, and - the actor's optimizer state_dict. It organizes the saved parameters into a directory structure specific to the actor - associated with each learning strategy. - - Args: - directory (str): The base directory for saving the parameters. - """ - os.makedirs(directory, exist_ok=True) - for u_id, strategy in self.learning_role.rl_strats.items(): - obj = { - "actor": strategy.actor.state_dict(), - "actor_target": strategy.actor_target.state_dict(), - "actor_optimizer": strategy.actor.optimizer.state_dict(), - } - path = f"{directory}/actor_{u_id}.pt" - th.save(obj, path) + Example: + >>> td3 = TD3(learning_role) + >>> td3.update_policy() + """ - def load_params(self, directory: str) -> None: - """ - Load the parameters of both actor and critic networks. + def __init__(self, learning_role): + """Initialize the TD3 algorithm. - This method loads the parameters of both the actor and critic networks associated with the learning role from the specified - directory. It uses the `load_critic_params` and `load_actor_params` methods to load the respective parameters. + Sets up the algorithm with gradient counters, clipping parameters, + and critic architecture. Args: - directory (str): The directory from which the parameters should be loaded. - """ - self.load_critic_params(directory) - self.load_actor_params(directory) - - def load_critic_params(self, directory: str) -> None: - """ - Load critic, target_critic, and optimizer states for each agent strategy. - If agent count differs between saved and current model, performs weight transfer for both networks. - Args: - directory (str): The directory from which the parameters should be loaded. + learning_role: Learning role object managing agents and replay buffer. + Must have off-policy configuration. """ - logger.info("Loading critic parameters...") - - if not os.path.exists(directory): - logger.warning( - "Specified directory does not exist. Using randomly initialized critics." 
- ) - return - - map_path = os.path.join(directory, "critics", "u_id_order.json") - if os.path.exists(map_path): - # read the saved order of u_ids from critics save directory - with open(map_path) as f: - loaded_id_order = json.load(f).get("u_id_order", []) - else: - logger.warning("No u_id_order.json: assuming same order as current.") - loaded_id_order = [str(u) for u in self.learning_role.rl_strats.keys()] - - new_id_order = [str(u) for u in self.learning_role.rl_strats.keys()] - direct_load = loaded_id_order == new_id_order - - if direct_load: - logger.info("Agents order unchanged. Loading critic weights directly.") - else: - logger.info( - f"Agents length and/or order mismatch: n_old={len(loaded_id_order)}, n_new={len(new_id_order)}. Transferring weights for critics and target critics." - ) - - for u_id, strategy in self.learning_role.rl_strats.items(): - critic_path = os.path.join(directory, "critics", f"critic_{u_id}.pt") - if not os.path.exists(critic_path): - logger.warning(f"No saved critic for {u_id}; skipping.") - continue - - try: - critic_params = th.load(critic_path, weights_only=True) - for key in ("critic", "critic_target", "critic_optimizer"): - if key not in critic_params: - logger.warning( - f"Missing {key} in critic params for {u_id}; skipping." - ) - continue - - if direct_load: - strategy.critics.load_state_dict(critic_params["critic"]) - strategy.target_critics.load_state_dict( - critic_params["critic_target"] - ) - strategy.critics.optimizer.load_state_dict( - critic_params["critic_optimizer"] - ) - logger.debug(f"Loaded critic for {u_id} directly.") - else: - critic_weights = transfer_weights( - model=strategy.critics, - loaded_state=critic_params["critic"], - loaded_id_order=loaded_id_order, - new_id_order=new_id_order, - obs_base=strategy.obs_dim, - act_dim=strategy.act_dim, - unique_obs=strategy.unique_obs_dim, - ) - target_critic_weights = transfer_weights( - model=strategy.target_critics, - loaded_state=critic_params["critic_target"], - loaded_id_order=loaded_id_order, - new_id_order=new_id_order, - obs_base=strategy.obs_dim, - act_dim=strategy.act_dim, - unique_obs=strategy.unique_obs_dim, - ) - - if critic_weights is None or target_critic_weights is None: - logger.warning( - f"Critic weights transfer failed for {u_id}; skipping." - ) - continue + super().__init__(learning_role) - strategy.critics.load_state_dict(critic_weights) - strategy.target_critics.load_state_dict(target_critic_weights) - logger.debug(f"Critic weights transferred for {u_id}.") + self.n_updates = 0 + self.grad_clip_norm = 1.0 - except Exception as e: - logger.warning(f"Failed to load critic for {u_id}: {e}") + # Define the critic architecture class for TD3 + self.critic_architecture_class = CriticTD3 - def load_actor_params(self, directory: str) -> None: - """ - Load the parameters of actor networks from a specified directory. + def get_action( + self, strategy: "LearningStrategy", obs: th.Tensor + ) -> tuple[th.Tensor, th.Tensor]: + """Sample an action using the off-policy strategy. - This method loads the parameters of actor networks, including the actor's state_dict, actor_target's state_dict, and - the actor's optimizer state_dict, from the specified directory. It iterates through the learning strategies associated - with the learning role, loads the respective parameters, and updates the actor and target actor networks accordingly. 
+ During learning mode the agent either performs pure-noise initial + exploration (first N episodes) or uses its deterministic actor plus + Gaussian action noise. During evaluation mode the actor is used + without any noise. - Args: - directory (str): The directory from which the parameters should be loaded. + This default implementation is shared by TD3 and DDPG. PPO overrides + it with its own stochastic Gaussian sampling. """ - logger.info("Loading actor parameters...") - if not os.path.exists(directory): - logger.warning( - "Specified directory for loading the actors does not exist! Starting with randomly initialized values!" - ) - return - - for u_id, strategy in self.learning_role.rl_strats.items(): - try: - actor_params = self.load_obj( - directory=f"{directory}/actors/actor_{str(u_id)}.pt" - ) - strategy.actor.load_state_dict(actor_params["actor"]) - strategy.actor_target.load_state_dict(actor_params["actor_target"]) - strategy.actor.optimizer.load_state_dict( - actor_params["actor_optimizer"] + if strategy.learning_mode and not strategy.evaluation_mode: + if strategy.collect_initial_experience_mode: + # Pure Gaussian noise for initial random exploration + noise = th.normal( + mean=0.0, + std=strategy.exploration_noise_std, + size=(strategy.act_dim,), + dtype=strategy.float_type, + device=strategy.device, ) + return noise, noise - # add a tag to the strategy to indicate that the actor was loaded - strategy.actor.loaded = True - except Exception: - logger.warning(f"No actor values loaded for agent {u_id}") - - def initialize_policy(self, actors_and_critics: dict = None) -> None: - """ - Create actor and critic networks for reinforcement learning. - - If `actors_and_critics` is None, this method creates new actor and critic networks. - If `actors_and_critics` is provided, it assigns existing networks to the respective attributes. - - Args: - actors_and_critics (dict): The actor and critic networks to be assigned. - - """ - if actors_and_critics is None: - self.check_strategy_dimensions() - self.create_actors() - self.create_critics() - - else: - for u_id, strategy in self.learning_role.rl_strats.items(): - strategy.actor = actors_and_critics["actors"][u_id] - strategy.actor_target = actors_and_critics["actor_targets"][u_id] - - strategy.critics = actors_and_critics["critics"][u_id] - strategy.target_critics = actors_and_critics["target_critics"][u_id] - - self.obs_dim = actors_and_critics["obs_dim"] - self.act_dim = actors_and_critics["act_dim"] - self.unique_obs_dim = actors_and_critics["unique_obs_dim"] - - def check_strategy_dimensions(self) -> None: - """ - Iterate over all learning strategies and check if the dimensions of observations and actions are the same. - Also check if the unique observation dimensions are the same. If not, raise a ValueError. - This is important for the TD3 algorithm, as it uses a centralized critic that requires consistent dimensions across all agents. - """ - foresight_list = [] - obs_dim_list = [] - act_dim_list = [] - unique_obs_dim_list = [] - num_timeseries_obs_dim_list = [] - - for strategy in self.learning_role.rl_strats.values(): - foresight_list.append(strategy.foresight) - obs_dim_list.append(strategy.obs_dim) - act_dim_list.append(strategy.act_dim) - unique_obs_dim_list.append(strategy.unique_obs_dim) - num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim) - - if len(set(foresight_list)) > 1: - raise ValueError( - f"All foresight values must be the same for all RL agents. 
The defined learning strategies have the following foresight values: {foresight_list}" - ) - else: - self.foresight = foresight_list[0] - - if len(set(act_dim_list)) > 1: - raise ValueError( - f"All action dimensions must be the same for all RL agents. The defined learning strategies have the following action dimensions: {act_dim_list}" - ) - else: - self.act_dim = act_dim_list[0] - - if len(set(unique_obs_dim_list)) > 1: - raise ValueError( - f"All unique_obs_dim values must be the same for all RL agents. The defined learning strategies have the following unique_obs_dim values: {unique_obs_dim_list}" - ) - else: - self.unique_obs_dim = unique_obs_dim_list[0] - - if len(set(num_timeseries_obs_dim_list)) > 1: - raise ValueError( - f"All num_timeseries_obs_dim values must be the same for all RL agents. The defined learning strategies have the following num_timeseries_obs_dim values: {num_timeseries_obs_dim_list}" - ) - else: - self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0] - - # Check last, as other cases should fail before! - if len(set(obs_dim_list)) > 1: - raise ValueError( - f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}" - ) - else: - self.obs_dim = obs_dim_list[0] - - def create_actors(self) -> None: - """ - Create actor networks for reinforcement learning for each unit strategy. - - This method initializes actor networks and their corresponding target networks for each unit strategy. - The actors are designed to map observations to action probabilities in a reinforcement learning setting. - - The created actor networks are associated with each unit strategy and stored as attributes. - - Note: - The observation dimension need to be the same, due to the centralized criic that all actors share. - If you have units with different observation dimensions. They need to have different critics and hence learning roles. - - """ - - for strategy in self.learning_role.rl_strats.values(): - strategy.actor = self.actor_architecture_class( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - float_type=self.float_type, - unique_obs_dim=self.unique_obs_dim, - num_timeseries_obs_dim=self.num_timeseries_obs_dim, - ).to(self.device) - - strategy.actor_target = self.actor_architecture_class( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - float_type=self.float_type, - unique_obs_dim=self.unique_obs_dim, - num_timeseries_obs_dim=self.num_timeseries_obs_dim, - ).to(self.device) - - strategy.actor_target.load_state_dict(strategy.actor.state_dict()) - strategy.actor_target.train(mode=False) - - strategy.actor.optimizer = AdamW( - strategy.actor.parameters(), - lr=self.learning_role.calc_lr_from_progress( - 1 - ), # 1=100% of simulation remaining, uses learning_rate from config as starting point + action = strategy.actor(obs).detach() + noise = strategy.action_noise.noise( + device=strategy.device, dtype=strategy.float_type ) - - strategy.actor.loaded = False - - def create_critics(self) -> None: - """ - Create critic networks for reinforcement learning. - - This method initializes critic networks for each agent in the reinforcement learning setup. - - Note: - The observation dimension need to be the same, due to the centralized criic that all actors share. - If you have units with different observation dimensions. They need to have different critics and hence learning roles. 
- """ - n_agents = len(self.learning_role.rl_strats) - - for strategy in self.learning_role.rl_strats.values(): - strategy.critics = CriticTD3( - n_agents=n_agents, - obs_dim=self.obs_dim, - act_dim=self.act_dim, - unique_obs_dim=self.unique_obs_dim, - float_type=self.float_type, - ).to(self.device) - - strategy.target_critics = CriticTD3( - n_agents=n_agents, - obs_dim=self.obs_dim, - act_dim=self.act_dim, - unique_obs_dim=self.unique_obs_dim, - float_type=self.float_type, - ).to(self.device) - - strategy.target_critics.load_state_dict(strategy.critics.state_dict()) - strategy.target_critics.train(mode=False) - - strategy.critics.optimizer = AdamW( - strategy.critics.parameters(), - lr=self.learning_role.calc_lr_from_progress( - 1 - ), # 1 = 100% of simulation remaining, uses learning_rate from config as starting point + action = th.clamp( + action + noise, + strategy.actor.min_output, + strategy.actor.max_output, ) + return action, noise - def extract_policy(self) -> dict: - """ - Extract actor and critic networks. - - This method extracts the actor and critic networks associated with each learning strategy and organizes them into a - dictionary structure. The extracted networks include actors, actor_targets, critics, and target_critics. The resulting - dictionary is typically used for saving and sharing these networks. - - Returns: - dict: The extracted actor and critic networks. - """ - actors = {} - actor_targets = {} - - critics = {} - target_critics = {} - - for u_id, strategy in self.learning_role.rl_strats.items(): - actors[u_id] = strategy.actor - actor_targets[u_id] = strategy.actor_target - - critics[u_id] = strategy.critics - target_critics[u_id] = strategy.target_critics - - actors_and_critics = { - "actors": actors, - "actor_targets": actor_targets, - "critics": critics, - "target_critics": target_critics, - "obs_dim": self.obs_dim, - "act_dim": self.act_dim, - "unique_obs_dim": self.unique_obs_dim, - } - - return actors_and_critics + # Evaluation + action = strategy.actor(obs).detach() + noise = th.zeros( + strategy.act_dim, dtype=strategy.float_type, device=strategy.device + ) + return action, noise def update_policy(self): + """Update the policy using the Twin Delayed Deep Deterministic Policy Gradients (TD3). + + This method performs the policy update step, which involves updating the actor + (policy) and critic (Q-function) networks using the TD3 algorithm. It iterates + over the specified number of gradient steps and performs the following for each + learning strategy: + + 1. Sample a batch of transitions from the replay buffer. + 2. Calculate the next actions with added noise using the actor target network. + 3. Compute the target Q-values based on the next states, rewards, and the target critic network. + 4. Compute the critic loss as the mean squared error between current Q-values and target Q-values. + 5. Optimize the critic network by performing a gradient descent step. + 6. Update the actor network if the specified policy delay is reached. + 7. Apply Polyak averaging to update target networks. """ - Update the policy of the reinforcement learning agent using the Twin Delayed Deep Deterministic Policy Gradients (TD3) algorithm. - - Note: - This function performs the policy update step, which involves updating the actor (policy) and critic (Q-function) networks - using TD3 algorithm. It iterates over the specified number of gradient steps and performs the following steps for each - learning strategy: - - 1. Sample a batch of transitions from the replay buffer. - 2. 
Calculate the next actions with added noise using the actor target network. - 3. Compute the target Q-values based on the next states, rewards, and the target critic network. - 4. Compute the critic loss as the mean squared error between current Q-values and target Q-values. - 5. Optimize the critic network by performing a gradient descent step. - 6. Update the actor network if the specified policy delay is reached. - 7. Apply Polyak averaging to update target networks. - - """ - - logger.debug("Updating Policy") + logger.debug("Updating Policy (TD3)") # Stack strategies for easier access strategies = list(self.learning_role.rl_strats.values()) @@ -475,7 +132,7 @@ def update_policy(self): } for u_id in self.learning_role.rl_strats.keys() } - for _ in range(self.learning_config.gradient_steps) + for _ in range(self.learning_config.off_policy.gradient_steps) ] # update noise decay and learning rate @@ -498,7 +155,7 @@ def update_policy(self): ) strategy.action_noise.update_noise_decay(updated_noise_decay) - for step in range(self.learning_config.gradient_steps): + for step in range(self.learning_config.off_policy.gradient_steps): self.n_updates += 1 transitions = self.learning_role.buffer.sample( @@ -514,11 +171,12 @@ def update_policy(self): with th.no_grad(): # Select action according to policy and add clipped noise noise = ( - th.randn_like(actions) * self.learning_config.target_policy_noise + th.randn_like(actions) + * self.learning_config.off_policy.target_policy_noise ) noise = noise.clamp( - -self.learning_config.target_noise_clip, - self.learning_config.target_noise_clip, + -self.learning_config.off_policy.target_noise_clip, + self.learning_config.off_policy.target_noise_clip, ) # Select next actions for all agents @@ -643,7 +301,7 @@ def update_policy(self): ###################################################################### # ACTOR UPDATE (DELAYED): Accumulate losses for all agents in one pass ###################################################################### - if self.n_updates % self.learning_config.policy_delay == 0: + if self.n_updates % self.learning_config.off_policy.policy_delay == 0: # Zero-grad for all actors first for strategy in strategies: strategy.actor.optimizer.zero_grad(set_to_none=True) @@ -743,10 +401,12 @@ def update_policy(self): polyak_update( all_critic_params, all_target_critic_params, - self.learning_config.tau, + self.learning_config.off_policy.tau, ) polyak_update( - all_actor_params, all_target_actor_params, self.learning_config.tau + all_actor_params, + all_target_actor_params, + self.learning_config.off_policy.tau, ) self.learning_role.write_rl_grad_params_to_output(learning_rate, unit_params) diff --git a/assume/reinforcement_learning/buffer.py b/assume/reinforcement_learning/buffer.py index aba424066..3bbcca66d 100644 --- a/assume/reinforcement_learning/buffer.py +++ b/assume/reinforcement_learning/buffer.py @@ -2,12 +2,16 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later +import logging import warnings +from collections.abc import Generator from typing import NamedTuple import numpy as np import torch as th +from assume.common.utils import convert_to_tensors + try: # Check memory used by replay buffer when possible import psutil @@ -15,7 +19,20 @@ psutil = None +logger = logging.getLogger(__name__) + + class ReplayBufferSamples(NamedTuple): + """Container for replay buffer samples. + + + Attributes: + observations: States/observations the agent saw. + actions: Actions the agent took. 
+        next_observations: States/observations the agent saw after taking the action.
+        rewards: Rewards the agent received for taking the action.
+    """
+
     observations: th.Tensor
     actions: th.Tensor
     next_observations: th.Tensor
@@ -32,7 +49,8 @@ def __init__(
         device: str,
         float_type,
     ):
-        """
+        """Initialize the replay buffer.
+
         A class that represents a replay buffer for storing observations, actions, and rewards.
         The replay buffer is implemented as a circular buffer, where the oldest experiences are discarded when the buffer is full.
@@ -91,10 +109,7 @@ def __init__(
         )

     def size(self):
-        # write docstring for this function
-        """
-        Return the current size of the buffer (i.e. number of transitions
-        stored in the buffer).
+        """Return the current size of the buffer.

         Returns:
             buffer_size(int): The current size of the buffer
@@ -102,32 +117,13 @@ def size(self):
         return self.buffer_size if self.full else self.pos

-    def to_torch(self, array: np.array, copy=True):
-        """
-        Converts a numpy array to a PyTorch tensor. Note: It copies the data by default.
-
-        Args:
-            array (numpy.ndarray): The numpy array to convert.
-            copy (bool, optional): Whether to copy the data or not
-                (may be useful to avoid changing things by reference). Defaults to True.
-
-        Returns:
-            torch.Tensor: The converted PyTorch tensor.
-        """
-
-        if copy:
-            return th.tensor(array, dtype=self.th_float_type, device=self.device)
-
-        return th.as_tensor(array, dtype=self.th_float_type, device=self.device)
-
     def add(
         self,
         obs: np.ndarray,
         actions: np.ndarray,
         reward: np.ndarray,
     ):
-        """
-        Adds an observation, action, and reward of all agents to the replay buffer.
+        """Add an observation, action, and reward of all agents to the replay buffer.

         Args:
             obs (numpy.ndarray): The observation to add.
@@ -148,8 +144,7 @@ def add(
             self.pos = 0

     def sample(self, batch_size: int) -> ReplayBufferSamples:
-        """
-        Samples a random batch of experiences from the replay buffer.
+        """Sample a random batch of experiences from the replay buffer.

         Args:
             batch_size (int): The number of experiences to sample.
@@ -173,4 +168,261 @@ def sample(
             self.rewards[batch_inds],
         )

-        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
+        return ReplayBufferSamples(
+            *tuple(convert_to_tensors(array=x, dtype=self.th_float_type, device=self.device) for x in data)
+        )
+
+
+class RolloutBufferSamples(NamedTuple):
+    """Container for rollout buffer samples.
+
+    It holds one batch of training samples from PPO's rollout buffer.
+
+    Attributes:
+        observations: States/observations the agent saw.
+        actions: Actions the agent took.
+        old_values: Critic's value estimates.
+        old_log_probs: Log probability of taking each action.
+        advantages: Generalized advantage estimates.
+        returns: Expected returns.
+    """
+
+    observations: th.Tensor  # states/observations the agent saw
+    actions: th.Tensor  # actions the agent took
+    old_values: th.Tensor  # critic's value estimates
+    old_log_probs: th.Tensor  # log probability of taking each action
+    advantages: th.Tensor  # generalized advantage estimates
+    returns: th.Tensor  # expected returns
+
+
+class RolloutBuffer:
+    """Rollout buffer used in on-policy algorithms like PPO.
+
+    It corresponds to the transitions collected using the current policy.
+    This experience is discarded after the policy is updated.
+    To run PPO, every transition gathered with the current policy must be
+    stored: the observations together with the actions, rewards, value
+    estimates, log probabilities, and done flags for each step.
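+
+    Example:
+        A minimal usage sketch (the dimensions here are illustrative only):
+
+        >>> buffer = RolloutBuffer(buffer_size=24, obs_dim=50, act_dim=2,
+        ...                        n_rl_units=3, device="cpu", float_type=th.float32)
+        >>> # one add(...) per simulation step, then once per training window:
+        >>> # buffer.compute_returns_and_advantages(last_values, dones)
+        >>> # for batch in buffer.get(batch_size): ...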
+ """ + + def __init__( + self, + buffer_size: int, + obs_dim: int, + act_dim: int, + n_rl_units: int, + device: str | th.device, + float_type: th.dtype, + gamma: float = 0.99, + gae_lambda: float = 0.98, + ): + """Initialize the rollout buffer. + + Args: + buffer_size: Max number of elements allowed in the buffer. + obs_dim: Dimension of the observation space. + act_dim: Dimension of the action space. + n_rl_units: Number of RL agents. + device: PyTorch device config. + float_type: Data type for floating point numbers. + gamma: Discount factor. + gae_lambda: bias-variance trade-off factor for Generalized Advantage Estimator. + """ + self.buffer_size = buffer_size + self.obs_dim = obs_dim + self.act_dim = act_dim + self.n_rl_units = n_rl_units + self.device = device + self.float_type = float_type + self.gamma = gamma + self.gae_lambda = gae_lambda + + # Current position and full flag + self.pos = 0 + self.full = False + self.generator_ready = False + + # Allocate buffers + self.reset() + + def reset(self) -> None: + """Reset the rollout buffer. + + Clearing the buffer and allocating new storage. + """ + self.observations = np.zeros( + (self.buffer_size, self.n_rl_units, self.obs_dim), dtype=np.float32 + ) + self.actions = np.zeros( + (self.buffer_size, self.n_rl_units, self.act_dim), dtype=np.float32 + ) + self.rewards = np.zeros((self.buffer_size, self.n_rl_units), dtype=np.float32) + self.values = np.zeros((self.buffer_size, self.n_rl_units), dtype=np.float32) + self.log_probs = np.zeros((self.buffer_size, self.n_rl_units), dtype=np.float32) + self.dones = np.zeros((self.buffer_size, self.n_rl_units), dtype=np.float32) + + # Computed after rollout + self.advantages = np.zeros( + (self.buffer_size, self.n_rl_units), dtype=np.float32 + ) + self.returns = np.zeros((self.buffer_size, self.n_rl_units), dtype=np.float32) + + self.pos = 0 + self.full = False + self.generator_ready = False + + def add( + self, + obs: np.ndarray, + action: np.ndarray, + reward: np.ndarray, + done: np.ndarray, + value: np.ndarray, + log_prob: np.ndarray, + ) -> None: + """Add a transition to the buffer. + + Args: + obs: Observation of the agents. + action: Action taken by the agents. + reward: Reward obtained. + done: Whether the episode ended. + value: Value estimate from the critic. + log_prob: Log probability of the action. + + Raises: + OverflowError: If the buffer is already full. The buffer must be either + resized or cleared before adding another transition. + """ + if self.pos >= self.buffer_size: + self.full = True + logger.error( + "RolloutBuffer is full (size=%d). Refusing to silently drop a " + "transition. Increase buffer_size or call reset() before adding " + "more data.", + self.buffer_size, + ) + raise OverflowError( + f"RolloutBuffer of size {self.buffer_size} is full; cannot add " + "another transition without losing data." + ) + + self.observations[self.pos] = np.array(obs).copy() + self.actions[self.pos] = np.array(action).copy() + self.rewards[self.pos] = np.array(reward).flatten().copy() + self.dones[self.pos] = np.array(done).flatten().copy() + self.values[self.pos] = np.array(value).flatten().copy() + self.log_probs[self.pos] = np.array(log_prob).flatten().copy() + # flattening the rewards, dones, values, log_probs array to (n_units,) size + + self.pos += 1 + if self.pos >= self.buffer_size: + self.full = True + + def compute_returns_and_advantages( + self, last_values: np.ndarray, dones: np.ndarray + ) -> None: + """Use Generalized Advantage Estimation to compute the advantage. 
+
+        To obtain the lambda-return, the advantage is added to the value estimate.
+
+        Args:
+            last_values: Value estimation for the last step.
+            dones: Whether the last step was terminal.
+        """
+        # taking the final value estimates and episode-end flags,
+        # and flattening them to one number per agent.
+        last_values = np.array(last_values).flatten()
+        dones = np.array(dones).flatten()
+
+        # GAE computation
+        # starting with running total of zero for each agent.
+        last_gae_lam = np.zeros(self.n_rl_units, dtype=np.float32)
+        buffer_size = self.pos if not self.full else self.buffer_size
+
+        # backward loop
+        for step in reversed(range(buffer_size)):
+            if step == buffer_size - 1:
+                # if at the last step, use the last_values given as input
+                next_non_terminal = 1.0 - dones
+                next_values = last_values
+            else:
+                # for all the other steps, get the next value and next episode flag.
+                next_non_terminal = 1.0 - self.dones[step + 1]
+                next_values = self.values[step + 1]
+
+            # TD error
+            delta = (
+                self.rewards[step]
+                + self.gamma * next_values * next_non_terminal
+                - self.values[step]
+            )
+
+            # GAE advantage
+            last_gae_lam = (
+                delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
+            )
+            self.advantages[step] = last_gae_lam
+
+        # Returns = advantages + values
+        self.returns = self.advantages + self.values
+        self.generator_ready = True
+
+    def get(
+        self, batch_size: int | None = None
+    ) -> Generator[RolloutBufferSamples, None, None]:
+        """Generate batches of transition samples for training.
+
+        Args:
+            batch_size: Number of samples to be accessed per batch.
+
+        Yields:
+            RolloutBufferSamples batches of at most batch_size transitions.
+        """
+        if not self.generator_ready:
+            raise ValueError(
+                "Must call compute_returns_and_advantages before sampling."
+            )
+
+        buffer_size = self.pos if not self.full else self.buffer_size
+        indices = np.random.permutation(buffer_size)
+
+        if batch_size is None:
+            batch_size = buffer_size
+
+        start_idx = 0
+        while start_idx < buffer_size:
+            batch_indices = indices[start_idx : start_idx + batch_size]
+            yield self.sample(batch_indices)
+            start_idx += batch_size
+
+    def sample(self, indices: np.ndarray) -> RolloutBufferSamples:
+        """Sample data from the buffer for given indices.
+
+        Converts numpy arrays to torch tensors for given indices.
+
+        Args:
+            indices: Indices of the samples to retrieve.
+
+        Returns:
+            The batch of samples converted to PyTorch tensors.
+        """
+        data = (
+            self.observations[indices],
+            self.actions[indices],
+            self.values[indices],
+            self.log_probs[indices],
+            self.advantages[indices],
+            self.returns[indices],
+        )
+
+        return RolloutBufferSamples(
+            *(convert_to_tensors(array=x, dtype=self.float_type, device=self.device) for x in data)
+        )
+
+    def size(self) -> int:
+        """Return the current number of stored transitions.
+
+        Returns:
+            The size of the buffer.
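+
+        Example:
+            >>> buffer.size()  # transitions stored so far, capped at buffer_size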
+ """ + return self.buffer_size if self.full else self.pos diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 9c14db92b..f3e50550f 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -7,19 +7,27 @@ from datetime import datetime from pathlib import Path +import numpy as np import pandas as pd import torch as th from mango import Role -from assume.common.base import LearningConfig, LearningStrategy +from assume.common.base import ( + LearningConfig, + LearningStrategy, + is_off_policy, + is_on_policy, +) from assume.common.utils import ( create_rrule, datetime2timestamp, timestamp2datetime, ) from assume.reinforcement_learning.algorithms.base_algorithm import RLAlgorithm +from assume.reinforcement_learning.algorithms.maddpg import DDPG +from assume.reinforcement_learning.algorithms.mappo import PPO from assume.reinforcement_learning.algorithms.matd3 import TD3 -from assume.reinforcement_learning.buffer import ReplayBuffer +from assume.reinforcement_learning.buffer import ReplayBuffer, RolloutBuffer from assume.reinforcement_learning.learning_utils import ( linear_schedule_func, transform_buffer_data, @@ -30,10 +38,11 @@ class Learning(Role): - """ - This class manages the learning process of reinforcement learning agents, including initializing key components such as - neural networks, replay buffer, and learning hyperparameters. It handles both training and evaluation modes based on - the provided learning configuration. + """Manages the learning process of reinforcement learning agents. + + This class handles the initialization of key components such as neural networks, + replay buffer, and learning hyperparameters. It handles both training and evaluation + modes based on the provided learning configuration. Args: learning_config (LearningConfig): The configuration for the learning process. @@ -50,8 +59,8 @@ def __init__( ): super().__init__() - # how many learning roles do exist and how are they named - self.buffer: ReplayBuffer = None + # Single buffer that can be either ReplayBuffer (off-policy) or RolloutBuffer (on-policy) + self.buffer = None self.episodes_done = 0 self.rl_strats: dict[int, LearningStrategy] = {} self.learning_config = learning_config @@ -91,13 +100,17 @@ def __init__( self.calc_lr_from_progress = ( lambda x: self.learning_config.learning_rate ) - - if self.learning_config.action_noise_schedule == "linear": - self.calc_noise_from_progress = linear_schedule_func( - self.learning_config.noise_dt - ) - else: - self.calc_noise_from_progress = lambda x: self.learning_config.noise_dt + # Only set up noise schedule for off-policy algorithms + if is_off_policy(self.learning_config.algorithm): + if self.learning_config.off_policy.action_noise_schedule == "linear": + self.calc_noise_from_progress = linear_schedule_func( + self.learning_config.off_policy.noise_dt + ) + else: + self.calc_noise_from_progress = ( + lambda x: self.learning_config.off_policy.noise_dt + ) + # For on-policy algorithms, no noise schedule needed self.eval_episodes_done = 0 @@ -122,16 +135,24 @@ def __init__( self.all_rewards = defaultdict(lambda: defaultdict(list)) self.all_regrets = defaultdict(lambda: defaultdict(list)) self.all_profits = defaultdict(lambda: defaultdict(list)) + # On-policy (PPO/MAPPO) only: value estimates, log-probs, and done + # flags collected per time-step for GAE computation. 
+        if is_on_policy(self.learning_config.algorithm):
+            self.all_values = defaultdict(lambda: defaultdict(list))
+            self.all_log_probs = defaultdict(lambda: defaultdict(list))
+            self.all_dones = defaultdict(lambda: defaultdict(list))

     def on_ready(self):
-        """
-        Set up the learning role for reinforcement learning training.
+        """Set up the learning role for reinforcement learning training.

-        Notes:
-            This method prepares the learning role for the reinforcement learning training process. It subscribes to relevant messages
-            for handling the training process and schedules recurrent tasks for policy updates based on the specified training frequency.
-            This cannot happen in the init since the context (compare mango agents) is not yet available there.To avoid inconsistent replay buffer states (e.g. observation and action has been stored but not the reward), this
-            slightly shifts the timing of the buffer updates.
+        Note:
+            This method prepares the learning role for the reinforcement learning training process.
+            It subscribes to relevant messages for handling the training process and schedules
+            recurrent tasks for policy updates based on the specified training frequency.
+            This cannot happen in the init since the context (cf. mango agents) is not
+            yet available there. To avoid inconsistent replay buffer states (e.g. observation
+            and action have been stored but not the reward), this slightly shifts the timing
+            of the buffer updates.
         """
         super().on_ready()
@@ -151,9 +172,46 @@ def on_ready(self):
             src="no_wait",
         )

-    def sync_train_freq_with_simulation_horizon(self) -> str | None:
+    def intialize_buffer(self, time_step, validation_interval):
+        """Create the experience buffer for the configured algorithm class.
+
+        Off-policy algorithms get a ReplayBuffer, while on-policy algorithms
+        get a RolloutBuffer sized to one training window (train_freq divided
+        by time_step, at least 2).
+
+        Args:
+            time_step: Simulation time step, used to size the rollout buffer.
+            validation_interval: Number of episodes between evaluation runs.
+
+        Returns:
+            A (buffer, min_episode_for_eval) tuple.
         """
-        Ensure self.train_freq evenly divides the simulation length.
+        if is_off_policy(self.learning_config.algorithm):
+            buffer = ReplayBuffer(
+                buffer_size=self.learning_config.off_policy.replay_buffer_size,
+                obs_dim=self.rl_algorithm.obs_dim,
+                act_dim=self.rl_algorithm.act_dim,
+                n_rl_units=len(self.rl_strats),
+                device=self.device,
+                float_type=self.float_type,
+            )
+            min_episode_for_eval = (
+                self.learning_config.off_policy.episodes_collecting_initial_experience
+                + validation_interval
+            )
+        else:
+            train_freq = pd.Timedelta(str(self.learning_config.train_freq))
+            time_step = pd.Timedelta(time_step)
+            rollout_buffer_size = max(2, int(train_freq / time_step))
+            buffer = RolloutBuffer(
+                buffer_size=rollout_buffer_size,
+                obs_dim=self.rl_algorithm.obs_dim,
+                act_dim=self.rl_algorithm.act_dim,
+                n_rl_units=len(self.rl_strats),
+                device=self.device,
+                float_type=self.float_type,
+                gamma=self.learning_config.gamma,
+                gae_lambda=self.learning_config.on_policy.gae_lambda,
+            )
+            min_episode_for_eval = validation_interval
+
+        return buffer, min_episode_for_eval
+
+    def sync_train_freq_with_simulation_horizon(self) -> str | None:
+        """Ensure self.train_freq evenly divides the simulation length.
+
         If not, adjust self.train_freq (in-place) and return the new string, otherwise return None.
         Uses self.start_datetime/self.end_datetime when available, otherwise falls back to timestamp fields.
         """
@@ -193,8 +251,7 @@ def sync_train_freq_with_simulation_horizon(self) -> str | None:
         return self.learning_config.train_freq

     def determine_validation_interval(self) -> int:
-        """
-        Compute and validate validation_interval.
+        """Compute and validate validation_interval.
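+
+        The interval is min(training_episodes, default_interval). Off-policy
+        runs must additionally cover the initial-experience episodes plus one
+        validation interval before the first evaluation.
+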
Returns:
            validation_interval (int)
@@ -205,21 +262,30 @@
         training_episodes = self.learning_config.training_episodes
         validation_interval = min(training_episodes, default_interval)

-        min_required_episodes = (
-            self.learning_config.episodes_collecting_initial_experience
-            + validation_interval
-        )
-
-        if self.learning_config.training_episodes < min_required_episodes:
-            raise ValueError(
-                f"Training episodes ({training_episodes}) must be greater than the sum of initial experience episodes ({self.learning_config.episodes_collecting_initial_experience}) and evaluation interval ({validation_interval})."
+        # Only check initial experience episodes for off-policy algorithms
+        if is_off_policy(self.learning_config.algorithm):
+            min_required_episodes = (
+                self.learning_config.off_policy.episodes_collecting_initial_experience
+                + validation_interval
             )
+            if self.learning_config.training_episodes < min_required_episodes:
+                raise ValueError(
+                    f"Training episodes ({training_episodes}) must be at least the sum of initial experience episodes ({self.learning_config.off_policy.episodes_collecting_initial_experience}) and evaluation interval ({validation_interval})."
+                )
+        else:
+            # For on-policy algorithms, no initial experience collection needed
+            min_required_episodes = validation_interval
+
+            if self.learning_config.training_episodes < min_required_episodes:
+                raise ValueError(
+                    f"Training episodes ({training_episodes}) must be at least the evaluation interval ({validation_interval})."
+                )
+
         return validation_interval

     def register_strategy(self, strategy: LearningStrategy) -> None:
-        """
-        Register a learning strategy with this learning role.
+        """Register a learning strategy with this learning role.

         Args:
             strategy (LearningStrategy): The learning strategy to register.
@@ -236,6 +302,15 @@ async def store_to_buffer_and_update(self) -> None:
         current_noises = self.all_noises
         current_regrets = self.all_regrets
         current_profits = self.all_profits
+        # On-policy (PPO/MAPPO) only caches
+        if is_on_policy(self.learning_config.algorithm):
+            current_values = self.all_values
+            current_log_probs = self.all_log_probs
+            current_dones = self.all_dones
+        else:
+            current_values = defaultdict(lambda: defaultdict(list))
+            current_log_probs = defaultdict(lambda: defaultdict(list))
+            current_dones = defaultdict(lambda: defaultdict(list))

         # Reset cache dicts immediately with new defaultdicts
         self.all_obs = defaultdict(lambda: defaultdict(list))
@@ -244,6 +319,10 @@ async def store_to_buffer_and_update(self) -> None:
         self.all_noises = defaultdict(lambda: defaultdict(list))
         self.all_regrets = defaultdict(lambda: defaultdict(list))
         self.all_profits = defaultdict(lambda: defaultdict(list))
+        if is_on_policy(self.learning_config.algorithm):
+            self.all_values = defaultdict(lambda: defaultdict(list))
+            self.all_log_probs = defaultdict(lambda: defaultdict(list))
+            self.all_dones = defaultdict(lambda: defaultdict(list))

         # Get timestamps from cache we took
         all_timestamps = sorted(current_obs.keys())
@@ -257,11 +336,20 @@ async def store_to_buffer_and_update(self) -> None:
         timestamps_to_process = [
             ts for ts in all_timestamps if ts not in incomplete_timestamps
         ]
-        # Carry over incomplete timesteps to new cache dicts
+        # Carry over incomplete timesteps to new cache dicts so they are
+        # not lost when the cache is reset below.
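+        # (A timestep is incomplete when, e.g., an observation and action were
+        # cached but the corresponding reward has not arrived yet.)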
+ on_policy_active = is_on_policy(self.learning_config.algorithm) for ts in incomplete_timestamps: self.all_obs[ts] = current_obs[ts] self.all_actions[ts] = current_actions[ts] self.all_noises[ts] = current_noises[ts] + if on_policy_active: + if ts in current_values: + self.all_values[ts] = current_values[ts] + if ts in current_log_probs: + self.all_log_probs[ts] = current_log_probs[ts] + if ts in current_dones: + self.all_dones[ts] = current_dones[ts] # Create filtered cache (only complete timesteps) cache = { @@ -271,6 +359,9 @@ async def store_to_buffer_and_update(self) -> None: "noises": {t: current_noises[t] for t in timestamps_to_process}, "regret": {t: current_regrets[t] for t in timestamps_to_process}, "profit": {t: current_profits[t] for t in timestamps_to_process}, + "values": {t: current_values[t] for t in timestamps_to_process}, + "log_probs": {t: current_log_probs[t] for t in timestamps_to_process}, + "dones": {t: current_dones[t] for t in timestamps_to_process}, } # write data to output agent @@ -284,10 +375,10 @@ async def store_to_buffer_and_update(self) -> None: logger.warning("No experience retrieved to store in buffer at update step!") async def _store_to_buffer_and_update_sync(self, cache, device) -> None: - """ - This function takes all the information that the strategies wrote into the learning_role cache dicts and post_processes them to fit into the buffer. - Further triggers the next policy update + """Process strategy data into the buffer and trigger policy update. + This function takes all the information that the strategies wrote into the + learning_role cache dicts and post-processes them to fit into the buffer. """ first_start = next(iter(cache["obs"])) for name, buffer in [ @@ -303,40 +394,155 @@ async def _store_to_buffer_and_update_sync(self, cache, device) -> None: ) return - # rewrite dict so that obs.shape == (n_rl_units, obs_dim) and sorted by keys and store in buffer - self.buffer.add( - obs=transform_buffer_data(cache["obs"], device, self.rl_strats.keys()), - actions=transform_buffer_data( - cache["actions"], device, self.rl_strats.keys() - ), - reward=transform_buffer_data( - cache["rewards"], device, self.rl_strats.keys() - ), - ) + # Add data to buffer - type depends on algorithm category + if is_on_policy(self.learning_config.algorithm): + # Using RolloutBuffer for on-policy algorithms (PPO/MAPPO). + unit_id_order = list(self.rl_strats.keys()) + n_rl_agents = len(unit_id_order) + added_timestamps = 0 + + for timestamp in sorted(cache["obs"].keys()): + missing_units = [ + u + for u in unit_id_order + if u not in cache["obs"][timestamp] + or u not in cache["actions"][timestamp] + or u not in cache["rewards"][timestamp] + or u not in cache["log_probs"][timestamp] + or u not in cache["dones"][timestamp] + ] + if missing_units: + logger.warning( + "Skipping on-policy rollout step at %s: missing data for units %s. 
" + "This usually means a learning unit failed to report an " + "observation/action/reward/log_prob/done for this timestep, " + "and we refuse to fill the buffer with zeros.", + timestamp, + missing_units, + ) + continue - if ( - self.episodes_done - >= self.learning_config.episodes_collecting_initial_experience - ): + obs_data = transform_buffer_data( + {timestamp: cache["obs"][timestamp]}, + device, + unit_id_order, + ) + actions_data = transform_buffer_data( + {timestamp: cache["actions"][timestamp]}, + device, + unit_id_order, + ) + rewards_data = transform_buffer_data( + {timestamp: cache["rewards"][timestamp]}, + device, + unit_id_order, + ) + + # Computing MAPPO value targets with the centralized critic + # using the joint observation available at this timestamp. + if self.learning_config.algorithm == "mappo": + values_data = np.zeros((1, n_rl_agents, 1), dtype=np.float32) + obs_step = obs_data[0] + unique_obs_all = obs_step[ + :, + self.rl_algorithm.obs_dim - self.rl_algorithm.unique_obs_dim :, + ] + + with th.no_grad(): + for i, unit_id in enumerate(unit_id_order): + strategy = self.rl_strats[unit_id] + obs_i = obs_step[i : i + 1] + other_unique = np.concatenate( + (unique_obs_all[:i], unique_obs_all[i + 1 :]), + axis=0, + ) + centralized_obs = np.concatenate( + (obs_i, other_unique.reshape(1, -1)), + axis=1, + ) + obs_tensor = th.as_tensor( + centralized_obs, + device=self.device, + dtype=self.float_type, + ) + values_data[0, i, 0] = ( + strategy.critics(obs_tensor) + .cpu() + .numpy() + .reshape(-1)[0] + ) + else: + values_data = transform_buffer_data( + {timestamp: cache["values"][timestamp]}, + device, + unit_id_order, + ) + + log_probs_data = transform_buffer_data( + {timestamp: cache["log_probs"][timestamp]}, + device, + unit_id_order, + ) + + dones_data = transform_buffer_data( + {timestamp: cache["dones"][timestamp]}, + device, + unit_id_order, + ) + + # Adding data to the rollout buffer. + self.buffer.add( + obs=obs_data, + action=actions_data, + reward=rewards_data, + done=dones_data, + value=values_data, + log_prob=log_probs_data, + ) + added_timestamps += 1 + + else: + # Using ReplayBuffer for off-policy algorithms (TD3/DDPG). + # Rewriting the dict so obs.shape == (n_rl_units, obs_dim), used by keys in learning role. + unit_id_order = list(self.rl_strats.keys()) + self.buffer.add( + obs=transform_buffer_data(cache["obs"], device, unit_id_order), + actions=transform_buffer_data( + cache["actions"], device, unit_id_order + ), + reward=transform_buffer_data( + cache["rewards"], device, unit_id_order + ), + ) + + # Only update policy after initial experience for off-policy algorithms + if is_off_policy(self.learning_config.algorithm): + if ( + self.episodes_done + >= self.learning_config.off_policy.episodes_collecting_initial_experience + ): + self.rl_algorithm.update_policy() + else: + # For on-policy algorithms, update policy immediately self.rl_algorithm.update_policy() def add_observation_to_cache(self, unit_id, start, observation) -> None: - """ - Add the observation to the cache dict, per unit_id. + """Add the observation to the cache dict, per unit_id. Args: unit_id (str): The id of the unit. + start: The start time. observation (torch.Tensor): The observation to be added. """ self.all_obs[start][unit_id].append(observation) def add_actions_to_cache(self, unit_id, start, action, noise) -> None: - """ - Add the action and noise to the cache dict, per unit_id. + """Add the action and noise to the cache dict, per unit_id. Args: unit_id (str): The id of the unit. 
+ start: The start time. action (torch.Tensor): The action to be added. noise (torch.Tensor): The noise to be added. @@ -352,26 +558,69 @@ def add_actions_to_cache(self, unit_id, start, action, noise) -> None: self.all_actions[start][unit_id].append(action) self.all_noises[start][unit_id].append(noise) + # For on-policy algorithms (MAPPO), cache PPO metadata at action time so + # rollout entries stay aligned across strategies and timesteps. + if is_on_policy(self.learning_config.algorithm): + strategy = self.rl_strats.get(unit_id) + if strategy is None or not hasattr(strategy, "_last_log_prob"): + return + + # Avoid duplicate appends if add_actions_to_cache is called multiple + # times for the same unit/timestamp during one market step. + if self.all_log_probs[start][unit_id]: + return + + value = getattr(strategy, "_last_value", 0.0) + log_prob = strategy._last_log_prob + + if hasattr(value, "item"): + value = value.item() + if hasattr(log_prob, "item"): + log_prob = log_prob.item() + + self.add_ppo_data_to_cache( + unit_id=unit_id, + start=start, + value=value, + log_prob=log_prob, + done=False, + ) + def add_reward_to_cache(self, unit_id, start, reward, regret, profit) -> None: - """ - Add the reward to the cache dict, per unit_id. + """Add the reward to the cache dict, per unit_id. Args: - unit_id (str): The id of the unit. - reward (float): The reward to be added. - + unit_id: The id of the unit. + start: The start time. + reward: The reward to be added. + regret: The regret to be added. + profit: The profit to be added. """ self.all_rewards[start][unit_id].append(reward) self.all_regrets[start][unit_id].append(regret) self.all_profits[start][unit_id].append(profit) - def load_inter_episodic_data(self, inter_episodic_data): - """ - Load the inter-episodic data from the dict stored across simulation runs. + def add_ppo_data_to_cache( + self, unit_id, start, value, log_prob, done=False + ) -> None: + """Add PPO specific data to the cache dict, per unit_id. Args: - inter_episodic_data (dict): The inter-episodic data to be loaded. + unit_id: The id of the unit. + start: The start time. + value: The value estimate V(s) from the critic. + log_prob: The log probability of the action. + done: Whether a terminal state or not. + """ + self.all_values[start][unit_id].append(value) + self.all_log_probs[start][unit_id].append(log_prob) + self.all_dones[start][unit_id].append(float(done)) + def load_inter_episodic_data(self, inter_episodic_data): + """Load the inter-episodic data from the dict stored across simulation runs. + + Args: + inter_episodic_data: The inter-episodic data to be loaded. 
""" self.episodes_done = inter_episodic_data["episodes_done"] self.eval_episodes_done = inter_episodic_data["eval_episodes_done"] @@ -383,22 +632,24 @@ def load_inter_episodic_data(self, inter_episodic_data): self.initialize_policy(inter_episodic_data["actors_and_critics"]) # Disable initial exploration if initial experience collection is complete - if ( - self.episodes_done - >= self.learning_config.episodes_collecting_initial_experience - ): - self.turn_off_initial_exploration() + # Only for off-policy algorithms + if is_off_policy(self.learning_config.algorithm): + if ( + self.episodes_done + >= self.learning_config.off_policy.episodes_collecting_initial_experience + ): + self.turn_off_initial_exploration() + # For on-policy algorithms, no initial exploration to disable # In continue_learning mode, disable it only for loaded strategies elif self.learning_config.continue_learning: self.turn_off_initial_exploration(loaded_only=True) def get_inter_episodic_data(self): - """ - Dump the inter-episodic data to a dict for storing across simulation runs. + """Dump the inter-episodic data to a dict for storing across simulation runs. Returns: - dict: The inter-episodic data to be stored. + The inter-episodic data to be stored. """ return { @@ -412,14 +663,13 @@ def get_inter_episodic_data(self): } def turn_off_initial_exploration(self, loaded_only=False) -> None: - """ - Disable initial exploration mode. + """Disable initial exploration mode. If `loaded_only=True`, only turn off exploration for strategies that were loaded (used in continue_learning mode). If `loaded_only=False`, turn it off for all strategies. Args: - loaded_only (bool): Whether to disable exploration only for loaded strategies. + loaded_only: Whether to disable exploration only for loaded strategies. """ for strategy in self.rl_strats.values(): if loaded_only: @@ -429,50 +679,61 @@ def turn_off_initial_exploration(self, loaded_only=False) -> None: strategy.collect_initial_experience_mode = False def get_progress_remaining(self) -> float: - """ - Get the remaining learning progress from the simulation run. + """Get the remaining learning progress from the simulation run. + Returns: + The remaining progress as a float between 0 and 1. 
""" total_duration = self.end - self.start elapsed_duration = self.context.current_timestamp - self.start - learning_episodes = ( - self.learning_config.training_episodes - - self.learning_config.episodes_collecting_initial_experience - ) + # Only calculate progress for off-policy algorithms + if is_off_policy(self.learning_config.algorithm): + initial_experience_episodes = ( + self.learning_config.off_policy.episodes_collecting_initial_experience + ) + learning_episodes = ( + self.learning_config.training_episodes - initial_experience_episodes + ) - if ( - self.episodes_done - < self.learning_config.episodes_collecting_initial_experience - ): - progress_remaining = 1 + if self.episodes_done < initial_experience_episodes: + progress_remaining = 1 + else: + progress_remaining = ( + 1 + - ( + (self.episodes_done - initial_experience_episodes) + / learning_episodes + ) + - ((1 / learning_episodes) * (elapsed_duration / total_duration)) + ) else: + # For on-policy algorithms, simpler progress calculation + total_episodes = self.learning_config.training_episodes progress_remaining = ( 1 - - ( - ( - self.episodes_done - - self.learning_config.episodes_collecting_initial_experience - ) - / learning_episodes - ) - - ((1 / learning_episodes) * (elapsed_duration / total_duration)) + - (self.episodes_done / total_episodes) + - (elapsed_duration / total_duration) ) return progress_remaining def create_learning_algorithm(self, algorithm: RLAlgorithm): - """ - Create and initialize the reinforcement learning algorithm. + """Create and initialize the reinforcement learning algorithm. - This method creates and initializes the reinforcement learning algorithm based on the specified algorithm name. The algorithm - is associated with the learning role and configured with relevant hyperparameters. + This method creates and initializes the reinforcement learning algorithm based on + the specified algorithm name. The algorithm is associated with the learning role + and configured with relevant hyperparameters. Args: - algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. + algorithm: The name of the reinforcement learning algorithm. """ if algorithm == "matd3": self.rl_algorithm = TD3(learning_role=self) + elif algorithm == "maddpg": + self.rl_algorithm = DDPG(learning_role=self) + elif algorithm == "mappo": + self.rl_algorithm = PPO(learning_role=self) else: logger.error(f"Learning algorithm {algorithm} not implemented!") @@ -480,9 +741,13 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: """ Initialize the policy of the reinforcement learning agent considering the respective algorithm. - This method initializes the policy (actor) of the reinforcement learning agent. It tests if we want to continue the learning process with - stored policies from a former training process. If so, it loads the policies from the specified directory. Otherwise, it initializes the - respective new policies. + This method initializes the policy (actor) of the reinforcement learning agent. It + tests if we want to continue the learning process with stored policies from a former + training process. If so, it loads the policies from the specified directory. + Otherwise, it initializes the respective new policies. + + Args: + actors_and_critics: The pre-initialized actor and critic policies. 
""" self.rl_algorithm.initialize_policy(actors_and_critics) @@ -501,20 +766,23 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: ) def compare_and_save_policies(self, metrics: dict) -> bool: - """ - Compare evaluation metrics and save policies based on the best achieved performance according to the metrics calculated. + """Compare evaluation metrics and save best performing policies. - This method compares the evaluation metrics, such as reward, profit, and regret, and saves the policies if they achieve the - best performance in their respective categories. It iterates through the specified modes, compares the current evaluation - value with the previous best, and updates the best value if necessary. If an improvement is detected, it saves the policy - and associated parameters. + This method compares the evaluation metrics, such as reward, profit, and regret, + and saves the policies if they achieve the best performance in their respective + categories. It iterates through the specified modes, compares the current evaluation + value with the previous best, and updates the best value if necessary. If an improvement + is detected, it saves the policy and associated parameters. - metrics contain a metric key like "reward" and the current value. - This function stores the policies with the highest metric. - So if minimize is required one should add for example "minus_regret" which is then maximized. + Metrics contain a metric key like "reward" and the current value. This function + stores the policies with the highest metric. If minimize is required, one should + add for example "minus_regret" which is then maximized. + + Args: + metrics: Dictionary of metrics evaluated. Returns: - bool: True if early stopping criteria is triggered. + True if early stopping criteria is triggered, False otherwise. Note: This method is typically used during the evaluation phase to save policies that achieve superior performance. @@ -579,10 +847,10 @@ def compare_and_save_policies(self, metrics: dict) -> bool: ) if ( self.learning_config.learning_rate_schedule - or self.learning_config.action_noise_schedule + or self.learning_config.off_policy.action_noise_schedule ) is not None: logger.info( - f"Learning rate schedule ({self.learning_config.learning_rate_schedule}) or action noise schedule ({self.learning_config.action_noise_schedule}) were scheduled to decay, further learning improvement can be possible. End value of schedule may not have been reached." + f"Learning rate schedule ({self.learning_config.learning_rate_schedule}) or action noise schedule ({self.learning_config.off_policy.action_noise_schedule}) were scheduled to decay, further learning improvement can be possible. End value of schedule may not have been reached." ) self.rl_algorithm.save_params( @@ -601,8 +869,7 @@ def init_logging( output_agent_addr: str, train_start: str, ): - """ - Initialize the logging for the reinforcement learning agent. + """Initialize the logging for the reinforcement learning agent. This method initializes the tensor board logger for the reinforcement learning agent. It also initializes the parameters required for sending data to the output role. 
@@ -623,7 +890,11 @@ def init_logging( evaluation_mode=self.learning_config.evaluation_mode, episode=episode, eval_episode=eval_episode, - episodes_collecting_initial_experience=self.learning_config.episodes_collecting_initial_experience, + episodes_collecting_initial_experience=( + self.learning_config.off_policy.episodes_collecting_initial_experience + if is_off_policy(self.learning_config.algorithm) + else 0 + ), ) # Parameters required for sending data to the output role @@ -634,12 +905,10 @@ def init_logging( self.update_steps = 0 def write_rl_params_to_output(self, cache): - """ - Sends the current rl_strategy update to the output agent. + """Sends the current rl_strategy update to the output agent. Args: - products_index (pandas.DatetimeIndex): The index of all products. - marketconfig (MarketConfig): The market configuration. + cache: The data cache from the strategies. """ output_agent_list = [] @@ -681,41 +950,65 @@ def write_rl_params_to_output(self, cache): def write_rl_grad_params_to_output( self, learning_rate: float, unit_params_list: list[dict] ) -> None: - """ - Writes learning parameters and critic losses to output at specified time intervals. + """Writes learning parameters and critic losses to output at specified intervals. This function processes training metrics for each critic over multiple time steps and sends them to a database for storage. It tracks the learning rate and critic losses across training iterations, associating each record with a timestamp. - Parameters - ---------- - learning_rate : float - The current learning rate used in training. - unit_params_list : list[dict] - A list of dictionaries containing critic losses for each time step. - Each dictionary maps critic names to their corresponding loss values. + Args: + learning_rate: The current learning rate used in training. + unit_params_list: A list of dictionaries containing critic losses for each + time step (mapping critic names to their losses in dict). 
""" # gradient steps performed in previous training episodes - gradient_steps_done = ( - max( - self.episodes_done - - self.learning_config.episodes_collecting_initial_experience, - 0, + if is_off_policy(self.learning_config.algorithm): + gradient_steps_done = ( + max( + self.episodes_done + - self.learning_config.off_policy.episodes_collecting_initial_experience, + 0, + ) + * int( + (timestamp2datetime(self.end) - timestamp2datetime(self.start)) + / pd.Timedelta(self.learning_config.train_freq) + ) + * self.learning_config.off_policy.gradient_steps ) - * int( - (timestamp2datetime(self.end) - timestamp2datetime(self.start)) - / pd.Timedelta(self.learning_config.train_freq) + else: + # For on-policy, no gradient steps concept - use 0 for calculation purposes + gradient_steps_done = 0 + + # Handle different parameter structures for on-policy vs off-policy + if self.learning_config.algorithm == "mappo": + # For PPO/MAPPO: unit_params_list length equals actual update steps + actual_gradient_steps = len(unit_params_list) + gradient_step_range = range(actual_gradient_steps) + # For on-policy, use simple step counting + base_step = self.update_steps * actual_gradient_steps + else: + # For off-policy: use configured gradient_steps + actual_gradient_steps = self.learning_config.off_policy.gradient_steps + gradient_step_range = range(actual_gradient_steps) + + # gradient steps performed in previous training episodes + gradient_steps_done = ( + max( + self.episodes_done + - self.learning_config.off_policy.episodes_collecting_initial_experience, + 0, + ) + * int( + (timestamp2datetime(self.end) - timestamp2datetime(self.start)) + / pd.Timedelta(self.learning_config.train_freq) + ) + * self.learning_config.off_policy.gradient_steps ) - * self.learning_config.gradient_steps - ) + base_step = gradient_steps_done + self.update_steps * actual_gradient_steps output_list = [ { - "step": gradient_steps_done - + self.update_steps - * self.learning_config.gradient_steps # gradient steps performed in current training episode - + gradient_step, + "step": base_step + gradient_step, "unit": u_id, "actor_loss": params["actor_loss"], "actor_total_grad_norm": params["actor_total_grad_norm"], @@ -725,7 +1018,7 @@ def write_rl_grad_params_to_output( "critic_max_grad_norm": params["critic_max_grad_norm"], "learning_rate": learning_rate, } - for gradient_step in range(self.learning_config.gradient_steps) + for gradient_step in gradient_step_range for u_id, params in unit_params_list[gradient_step].items() ] diff --git a/assume/reinforcement_learning/learning_utils.py b/assume/reinforcement_learning/learning_utils.py index e0a9d7899..5afb8aed3 100644 --- a/assume/reinforcement_learning/learning_utils.py +++ b/assume/reinforcement_learning/learning_utils.py @@ -26,12 +26,26 @@ class ObsActRew(TypedDict): Schedule = Callable[[float], float] +class ActivationLimits(TypedDict): + """Output limits for activation functions.""" + + min: float + max: float + func: Callable[[th.Tensor], th.Tensor] + + +activation_function_limit: dict[str, ActivationLimits] = { + "tanh": {"min": -1, "max": 1, "func": th.tanh}, + "sigmoid": {"min": 0, "max": 1, "func": th.sigmoid}, + "relu": {"min": 0, "max": float("inf"), "func": th.nn.functional.relu}, + "softsign": {"min": -1, "max": 1, "func": th.nn.functional.softsign}, +} + + # Ornstein-Uhlenbeck Noise # from https://github.com/songrotek/DDPG/blob/master/ou_noise.py class OUNoise: - """ - A class that implements Ornstein-Uhlenbeck noise. 
- """ + """A class that implements Ornstein-Uhlenbeck noise.""" def __init__(self, action_dimension, mu=0, sigma=0.5, theta=0.15, dt=1e-2): self.action_dimension = action_dimension @@ -60,9 +74,7 @@ def noise(self): class NormalActionNoise: - """ - A Gaussian action noise that supports direct tensor creation on a given device. - """ + """A Gaussian action noise that supports direct tensor creation on a given device.""" def __init__(self, action_dimension, mu=0.0, sigma=0.1, scale=1.0, dt=0.9998): self.act_dimension = action_dimension @@ -72,8 +84,7 @@ def __init__(self, action_dimension, mu=0.0, sigma=0.1, scale=1.0, dt=0.9998): self.dt = dt def noise(self, device=None, dtype=th.float): - """ - Generates noise using torch.normal(), ensuring efficient execution on GPU if needed. + """Generate noise using torch.normal() ensuring efficient execution on GPU if needed. Args: - device (torch.device, optional): Target device (e.g., 'cuda' or 'cpu'). @@ -99,9 +110,9 @@ def update_noise_decay(self, updated_decay: float): def polyak_update(params, target_params, tau: float): - """ - Perform a Polyak average update on ``target_params`` using ``params``: - target parameters are slowly updated towards the main parameters. + """Perform a Polyak average update on ``target_params`` using ``params``. + + Target parameters are slowly updated towards the main parameters. ``tau``, the soft update coefficient controls the interpolation: ``tau=1`` corresponds to copying the parameters to the target ones whereas nothing happens when ``tau=0``. The Polyak update is done in place, with ``no_grad``, and therefore does not create intermediate tensors, @@ -111,9 +122,9 @@ def polyak_update(params, target_params, tau: float): See https://github.com/DLR-RM/stable-baselines3/issues/93 Args: - params: parameters to use to update the target params - target_params: parameters to update - tau: the soft update coefficient ("Polyak update", between 0 and 1) + params: Parameters to use to update the target params. + target_params: Parameters to update. + tau: The soft update coefficient ("Polyak update", between 0 and 1). """ with th.no_grad(): for param, target_param in zip(params, target_params): @@ -123,9 +134,10 @@ def polyak_update(params, target_params, tau: float): def linear_schedule_func( start: float, end: float = 0, end_fraction: float = 1 ) -> Schedule: - """ - Create a function that interpolates linearly between start and end - between ``progress_remaining`` = 1 and ``progress_remaining`` = 1 - ``end_fraction``. + """Create a function that interpolates linearly between start and end. + + Interpolates linearly between start and end between ``progress_remaining`` = 1 + and ``progress_remaining`` = 1 - ``end_fraction``. Args: start: value to start with if ``progress_remaining`` = 1 @@ -135,11 +147,10 @@ def linear_schedule_func( of the complete training process. Returns: - Linear schedule function. + The linear schedule function. Note: Adapted from SB3: https://github.com/DLR-RM/stable-baselines3/blob/512eea923afad6f6da4bb53d72b6ea4c6d856e59/stable_baselines3/common/utils.py#L100 - """ def func(progress_remaining: float) -> float: @@ -152,17 +163,18 @@ def func(progress_remaining: float) -> float: def constant_schedule(val: float) -> Schedule: - """ - Create a function that returns a constant. It is useful for learning rate schedule (to avoid code duplication) + """Create a function that returns a constant. + + It is useful for learning rate schedule (to avoid code duplication). 
@@ -152,17 +163,18 @@ def func(progress_remaining: float) -> float:
     return func


 def constant_schedule(val: float) -> Schedule:
-    """
-    Create a function that returns a constant. It is useful for learning rate schedule (to avoid code duplication)
+    """Create a function that returns a constant.
+
+    It is useful for learning rate schedules (to avoid code duplication).

     Args:
-        val: constant value
+        val: Constant value.
+
     Returns:
         Constant schedule function.

     Note:
         From SB3: https://github.com/DLR-RM/stable-baselines3/blob/512eea923afad6f6da4bb53d72b6ea4c6d856e59/stable_baselines3/common/utils.py#L124
-
     """

     def func(_):
@@ -182,6 +194,11 @@ def get_hidden_sizes(state_dict: dict, prefix: str) -> list[int]:
     return sizes[:-1]  # exclude the final output layer if needed


+def _get_q_prefixes(state_dict: dict) -> list[str]:
+    known = ("q_layers", "q1_layers", "q2_layers")
+    return [p for p in known if f"{p}.0.weight" in state_dict]
+
+
 def copy_layer_data(dst, src):
     for k in dst:
         if k in src and dst[k].shape == src[k].shape:
@@ -197,15 +214,16 @@ def transform_buffer_data(
     Get tensors from GPU to CPU.

     Args:
-        nested_dict: Dict with structure {datetime -> {unit_id -> list[tensor]}}
+        nested_dict: Dict with structure {datetime -> {unit_id -> list[tensor]}}.
+        device: PyTorch device config.
+        keys_unit_order: Ordered iterable of unit ids defining the agent
+            axis of the returned tensor.

     Returns:
-        th.Tensor: Shape (n_timesteps, n_powerplants, feature_dim)
+        np.ndarray: Shape (n_timesteps, n_rl_units, feature_dim).
     """
-    # Get sorted lists of units and timestamps (for consistent ordering)
     all_times = sorted(nested_dict.keys())

-    # Get feature dimension from first non-empty value
     feature_dim = None
     for unit_data in nested_dict.values():
         for values in unit_data.values():
@@ -220,19 +238,20 @@ def transform_buffer_data(

     if feature_dim is None:
         raise ValueError(
-            "Error, while transforming RL data for buffer: No data found to determine feature dimension"
+            "Error while transforming RL data for buffer: no data found "
+            "to determine the feature dimension. Callers must filter out empty "
+            "timesteps before calling transform_buffer_data (see "
+            "learning_role._store_to_buffer_and_update_sync)."
         )

-    # Pre-allocate tensor (keep on same device as input data)
     result = th.zeros(
         (len(all_times), len(keys_unit_order), feature_dim), device=device
     )

-    # Fill tensor with values (stays on same device as input so if on GPU it stays there during filling)
     for t, timestamp in enumerate(all_times):
         for u, unit_id in enumerate(keys_unit_order):
             values = nested_dict[timestamp].get(unit_id, [])
-            if values:  # if we have values for this timestamp
+            if values:
                 result[t, u] = values[0]

     return result.cpu().numpy()
@@ -260,14 +279,24 @@ def transfer_weights(
         act_dim (int): The action dimension size.
         unique_obs (int): The unique observation size per agent, smaller than obs_base
             as these include also shared observation values.

-    returns:
+    Returns:
         dict | None: The updated state dictionary with transferred weights, or None if architecture mismatch.
     """
     # 1) Architecture check
     new_state = model.state_dict()
-    loaded_hidden = get_hidden_sizes(loaded_state, prefix="q1_layers")
-    new_hidden = get_hidden_sizes(new_state, prefix="q1_layers")
+    prefixes = _get_q_prefixes(loaded_state)
+    if not prefixes:
+        logger.warning(
+            "Cannot transfer weights: no recognised Q-network prefix "
+            "(q_layers / q1_layers / q2_layers) found in loaded state dict."
+        )
+        return None
+
+    # Use the first detected prefix for the architecture check.
+ check_prefix = prefixes[0] + loaded_hidden = get_hidden_sizes(loaded_state, prefix=check_prefix) + new_hidden = get_hidden_sizes(new_state, prefix=check_prefix) if loaded_hidden != new_hidden: logger.warning( f"Cannot transfer weights: neural network architecture mismatch.\n" @@ -284,8 +313,7 @@ def transfer_weights( # 3) Clone new state new_state_copy = {k: v.clone() for k, v in new_state.items()} - # 4) Transfer per-prefix - for prefix in ("q1_layers", "q2_layers"): + for prefix in prefixes: w_loaded = loaded_state[f"{prefix}.0.weight"] b_loaded = loaded_state[f"{prefix}.0.bias"] w_new = new_state_copy[f"{prefix}.0.weight"] @@ -325,7 +353,7 @@ def transfer_weights( # actions untouched # d) bias and deeper layers - # copy all other wigths and biases (besides input layer) from loaded to new model + # copy all other weights and biases (besides input layer) from loaded to new model b_new.copy_(b_loaded) for i in range(1, len(new_hidden) + 1): new_state_copy[f"{prefix}.{i}.weight"].copy_( @@ -338,6 +366,38 @@ def transfer_weights( return new_state_copy +def xavier_init_weights(module: th.nn.Module) -> None: + """Apply Xavier uniform initialisation to all Linear layers in *module*. + + Xavier initialisation keeps activation variance roughly constant across + layers, which works well for tanh / softsign activations (TD3/DDPG actors + and all Q-network critics). + + Args: + module: Any ``nn.Module`` whose ``Linear`` sub-layers should be initialised. + """ + if isinstance(module, th.nn.Linear): + th.nn.init.xavier_uniform_(module.weight) + th.nn.init.zeros_(module.bias) + + +def orthogonal_init_weights(module: th.nn.Module, gain: float = 1.0) -> None: + """Apply orthogonal initialisation to a single Linear layer. + + Orthogonal initialisation is the standard choice for PPO because it + preserves gradient norms better than Xavier when combined with ReLU + activations and a Gaussian policy head. + + Args: + module: An ``nn.Linear`` layer to initialise. + gain: Scaling factor for the weight matrix. Common choices: + ``sqrt(2)`` for hidden layers, ``0.01`` for the output / policy head. + """ + if isinstance(module, th.nn.Linear): + th.nn.init.orthogonal_(module.weight, gain=gain) + th.nn.init.zeros_(module.bias) + + def encode_time_features(start: datetime) -> list: """ Encode time features for a given datetime object. diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index a173b4b5c..5b9e94422 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -2,18 +2,28 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later + +import numpy as np import torch as th from torch import nn from torch.nn import functional as F +from assume.reinforcement_learning.learning_utils import ( + activation_function_limit, + orthogonal_init_weights, + xavier_init_weights, +) -class CriticTD3(nn.Module): - """Initialize parameters and build model. + +class Critic(nn.Module): + """Base Critic class handling architecture generation and initialization. 
     Args:
         n_agents (int): Number of agents
-        obs_dim (int): Dimension of each state
-        act_dim (int): Dimension of each action
+        obs_dim (int): Dimension of observation per agent
+        act_dim: Dimension of action per agent
+        float_type: Data type for parameters
+        unique_obs_dim: Dimension of agent-specific observations
     """

     def __init__(
@@ -26,56 +36,71 @@ def __init__(
     ):
         super().__init__()

+        # Calculate total (centralized) dimensions
         self.obs_dim = obs_dim + unique_obs_dim * (n_agents - 1)
         self.act_dim = act_dim * n_agents

-        # Select proper architecture based on `n_agents`
+        self.float_type = float_type
+
+        # Dynamic Architecture Definition
+        self.hidden_sizes = self._get_architecture(n_agents)
+
+    def _get_architecture(self, n_agents: int) -> list[int]:
+        """Returns hidden layer sizes based on the number of agents."""
         if n_agents <= 20:
             hidden_sizes = [256, 128]  # Shallow network for small `n_agents`
         elif n_agents <= 50:
             hidden_sizes = [512, 256, 128]  # Medium network
         else:
             hidden_sizes = [1024, 512, 256, 128]  # Deeper network for large `n_agents`
+        return hidden_sizes

-        # First Q-network (Q1)
-        self.q1_layers = self._build_q_network(hidden_sizes, float_type)
-
-        # Second Q-network (Q2) for double Q-learning
-        self.q2_layers = self._build_q_network(hidden_sizes, float_type)
-
-        # Initialize weights properly
-        self._init_weights()
-
-    def _build_q_network(self, hidden_sizes, float_type):
-        """
-        Dynamically creates a Q-network given the chosen hidden layer sizes.
-        """
+    def _build_q_network(self) -> nn.ModuleList:
+        """Dynamically create a Q-network given the chosen hidden layer sizes."""
         layers = nn.ModuleList()
         input_dim = (
             self.obs_dim + self.act_dim
         )  # Input includes all observations and actions

-        for h in hidden_sizes:
-            layers.append(nn.Linear(input_dim, h, dtype=float_type))
+        for h in self.hidden_sizes:
+            layers.append(nn.Linear(input_dim, h, dtype=self.float_type))
+            layers.append(nn.ReLU())
             input_dim = h

-        layers.append(nn.Linear(input_dim, 1, dtype=float_type))  # Output Q-value
+        layers.append(nn.Linear(input_dim, 1, dtype=self.float_type))  # Output Q-value
         return layers

     def _init_weights(self):
-        """Apply Xavier initialization to all layers."""
-
-        def init_layer(m):
-            if isinstance(m, nn.Linear):
-                nn.init.xavier_uniform_(m.weight)
-                nn.init.zeros_(m.bias)
-
-        self.apply(init_layer)
+        """Apply Xavier uniform initialisation to all Linear layers."""
+        self.apply(xavier_init_weights)

-    def forward(self, obs, actions):
-        """
-        Forward pass through both Q-networks.
-        """
+
+class CriticTD3(Critic):
+    """Initialize parameters and build model.
+
+    Args:
+        n_agents: Number of agents.
+        obs_dim: Dimension of each state.
+        act_dim: Dimension of each action.
+        float_type: Data type for parameters.
+        unique_obs_dim: Dimension of agent-specific observations.
+    """
+
+    def __init__(
+        self, n_agents: int, obs_dim: int, act_dim: int, float_type, unique_obs_dim: int
+    ):
+        super().__init__(n_agents, obs_dim, act_dim, float_type, unique_obs_dim)
+
+        # First Q-network (Q1)
+        self.q1_layers = self._build_q_network()
+
+        # Second Q-network (Q2) for double Q-learning
+        self.q2_layers = self._build_q_network()
+
+        # Initialize weights properly
+        self._init_weights()
+
+    def forward(
+        self, obs: th.Tensor, actions: th.Tensor
+    ) -> tuple[th.Tensor, th.Tensor]:
+        """Forward pass through both Q-networks."""
         xu = th.cat([obs, actions], dim=1)  # Concatenate obs & actions

         # Compute Q1
@@ -92,10 +117,8 @@ def forward(self, obs, actions):

         return x1, x2
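The twin Q-heads exist so that the TD3-style update can bootstrap from the smaller of the two estimates. A hedged sketch of that standard target computation (tensor and network names are invented; the actual update logic lives in the algorithm classes, not in this module)::

    import torch as th

    def td3_target(reward, done, next_obs_critic, next_action_mean,
                   target_critic, gamma=0.99, noise_std=0.2, noise_clip=0.5):
        """Bootstrap target y = r + gamma * (1 - done) * min(Q1', Q2')."""
        with th.no_grad():
            # smooth the target policy with clipped Gaussian noise
            noise = (th.randn_like(next_action_mean) * noise_std).clamp(
                -noise_clip, noise_clip
            )
            next_action = (next_action_mean + noise).clamp(-1.0, 1.0)
            q1_next, q2_next = target_critic(next_obs_critic, next_action)
            return reward + gamma * (1.0 - done) * th.min(q1_next, q2_next)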
- """ + def q1_forward(self, obs: th.Tensor, actions: th.Tensor) -> th.Tensor: + """Compute only Q1 (used during actor updates).""" x = th.cat([obs, actions], dim=1) for layer in self.q1_layers[:-1]: # All hidden layers @@ -106,49 +129,106 @@ def q1_forward(self, obs, actions): return x -class Actor(nn.Module): +class CriticDDPG(Critic): + """Initialize parameters and build model. + + Args: + n_agents (int): Number of agents + obs_dim (int): Dimension of observation per agent + act_dim: Dimension of action per agent + float_type: Data type for parameters + unique_obs_dim: Dimension of agent-specific observations """ - Parent class for actor networks. + + def __init__( + self, + n_agents: int, + obs_dim: int, + act_dim: int, + float_type: th.dtype, + unique_obs_dim: int, + ): + super().__init__(n_agents, obs_dim, act_dim, float_type, unique_obs_dim) + + # Q-network + self.q_layers = self._build_q_network() + + # Initialize weights properly + self._init_weights() + + def forward(self, obs: th.Tensor, actions: th.Tensor) -> th.Tensor: + """Returns Q value.""" + xu = th.cat([obs, actions], dim=1) # Concatenate obs & actions + + # Compute Q + for layer in self.q_layers[:-1]: # All hidden layers + xu = F.relu(layer(xu)) + + x = self.q_layers[-1](xu) + + return x + + +class CriticPPO(Critic): + """Initialize parameters and build PPO value network. + + Args: + n_agents: Number of agents. + obs_dim: Dimension of observation per agent. + float_type: Data type for parameters. + unique_obs_dim: Dimension of agent-specific observations. """ - activation_function_limit = { - "softsign": (-1, 1), - "tanh": (-1, 1), - "sigmoid": (0, 1), - "relu": (0, float("inf")), - } + def __init__(self, n_agents: int, obs_dim: int, float_type, unique_obs_dim: int): + super().__init__( + n_agents=n_agents, + obs_dim=obs_dim, + act_dim=0, + float_type=float_type, + unique_obs_dim=unique_obs_dim, + ) + + # V-network + self.v_layers = self._build_q_network() + + # Initialize weights properly + self._init_weights() + + def _init_weights(self) -> None: + """Apply orthogonal initialisation: sqrt(2) gain for hidden layers, 1.0 for the value head.""" + for layer in self.v_layers: + if isinstance(layer, nn.Linear): + gain = 0.01 if layer.out_features == 1 else np.sqrt(2) + orthogonal_init_weights(layer, gain=gain) + + def forward(self, obs: th.Tensor) -> th.Tensor: + """Returns V value.""" + x = obs + for layer in self.v_layers: + x = layer(x) + return x + - activation_function_map = { - "softsign": F.softsign, - "tanh": th.tanh, - "sigmoid": th.sigmoid, - "relu": F.relu, - } +class Actor(nn.Module): + """Parent class for actor networks.""" def __init__(self): super().__init__() self.activation = "softsign" # or "tanh", "sigmoid", "relu" - if self.activation not in self.activation_function_limit: + if self.activation not in activation_function_limit: raise ValueError( - f"Activation '{self.activation}' not supported! Supported: {list(self.activation_function_limit.keys())}" + f"Activation '{self.activation}' not supported! Supported: {list(activation_function_limit.keys())}" ) - self.min_output, self.max_output = self.activation_function_limit[ - self.activation - ] - self.activation_function = self.activation_function_map.get(self.activation) - if self.activation_function is None: - raise ValueError( - f"Activation '{self.activation}' not implemented in forward pass!" 
- ) + self.min_output = activation_function_limit[self.activation]["min"] + self.max_output = activation_function_limit[self.activation]["max"] + self.activation_function = activation_function_limit[self.activation]["func"] class MLPActor(Actor): - """ - The neurnal network for the MLP actor. - """ + """The neural network for the MLP actor.""" def __init__(self, obs_dim: int, act_dim: int, float_type, *args, **kwargs): super().__init__() @@ -161,14 +241,8 @@ def __init__(self, obs_dim: int, act_dim: int, float_type, *args, **kwargs): self._init_weights() def _init_weights(self): - """Apply Xavier initialization to all layers.""" - - def init_layer(m): - if isinstance(m, nn.Linear): - nn.init.xavier_uniform_(m.weight) - nn.init.zeros_(m.bias) - - self.apply(init_layer) + """Apply Xavier uniform initialisation to all Linear layers.""" + self.apply(xavier_init_weights) def forward(self, obs): """Forward pass for action prediction.""" @@ -180,16 +254,16 @@ def forward(self, obs): class LSTMActor(Actor): - """ - The LSTM recurrent neurnal network for the actor. + """The LSTM recurrent neural network for the actor. Based on "Multi-Period and Multi-Spatial Equilibrium Analysis in Imperfect Electricity Markets" by Ye at al. (2019) - Note: the original source code was not available, therefore this implementation was derived from the published paper. - Adjustments to resemble final layers from MLPActor: - - dense layer 2 was omitted - - single output layer with softsign activation function to output actions directly instead of two output layers for mean and stddev + Note: + The original source code was not available, therefore this implementation was derived from the published paper. + Adjustments to resemble final layers from MLPActor: + - dense layer 2 was omitted + - single output layer with softsign activation function to output actions directly instead of two output layers for mean and stddev """ def __init__( @@ -262,3 +336,314 @@ def forward(self, obs): x = x.squeeze(0) return x + + +class ActorPPO(nn.Module): + """PPO Actor network with stochastic policy (Gaussian).""" + + def __init__( + self, + obs_dim: int, + act_dim: int, + float_type, + log_std_init: float = 0.0, + *args, + **kwargs, + ): + super().__init__() + + self.act_dim = act_dim + self.float_type = float_type + + self.activation = "tanh" # or "softsign", "sigmoid", "relu" + + if self.activation not in activation_function_limit: + raise ValueError( + f"Activation '{self.activation}' not supported! 
Supported: {list(activation_function_limit.keys())}"
+            )
+
+        self.min_output = activation_function_limit[self.activation]["min"]
+        self.max_output = activation_function_limit[self.activation]["max"]
+
+        # Policy network (outputs mean)
+        self.FC1 = nn.Linear(obs_dim, 256, dtype=float_type)
+        self.FC2 = nn.Linear(256, 128, dtype=float_type)
+        self.mean_layer = nn.Linear(128, act_dim, dtype=float_type)
+
+        # Learnable log standard deviation
+        self.log_std = nn.Parameter(th.ones(act_dim, dtype=float_type) * log_std_init)
+
+        self._init_weights()
+
+    def _init_weights(self) -> None:
+        """Apply orthogonal initialisation with appropriate gains."""
+        orthogonal_init_weights(self.FC1, gain=np.sqrt(2))
+        orthogonal_init_weights(self.FC2, gain=np.sqrt(2))
+        orthogonal_init_weights(self.mean_layer, gain=0.01)
+
+    def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor:
+        """Forward pass."""
+        x = F.relu(self.FC1(obs))
+        x = F.relu(self.FC2(x))
+        mean = th.tanh(self.mean_layer(x))  # Bounded to [-1, 1]
+
+        if deterministic:
+            return mean
+
+        # Sample from Gaussian during training
+        log_std = self.log_std.expand_as(mean)
+        std = log_std.exp()
+        noise = th.randn_like(mean)
+        action = mean + std * noise
+
+        # Clamp to valid range
+        return th.clamp(action, -1.0, 1.0)
+
+    def get_distribution(self, obs: th.Tensor) -> tuple[th.Tensor, th.Tensor]:
+        """Get the policy distribution parameters."""
+        x = F.relu(self.FC1(obs))
+        x = F.relu(self.FC2(x))
+        mean = th.tanh(self.mean_layer(x))  # Bounded to [-1, 1]
+        log_std = self.log_std.expand_as(mean)
+
+        return mean, log_std
+
+    def get_action_and_log_prob(
+        self,
+        obs: th.Tensor,
+        deterministic: bool = False,
+    ) -> tuple[th.Tensor, th.Tensor]:
+        """Sample action and compute log probability.
+
+        Args:
+            obs: Observations.
+            deterministic: If True, return mean action.
+
+        Returns:
+            Tuple of (action, log_prob).
+        """
+        mean, log_std = self.get_distribution(obs)
+        std = log_std.exp()
+
+        if deterministic:
+            action = mean
+        else:
+            # Sample from Gaussian
+            noise = th.randn_like(mean)
+            action = mean + std * noise
+
+        # Clamp action to valid range
+        action = th.clamp(action, -1.0, 1.0)
+
+        # Compute log probability
+        log_prob = self._compute_log_prob(action, mean, std)
+
+        return action, log_prob
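These log probabilities feed the importance ratio of the later PPO update. A minimal sketch of the standard clipped surrogate loss (``old_log_prob`` and ``advantage`` are assumed to come from the rollout buffer, and the coefficients from the on-policy config)::

    import torch as th

    def ppo_policy_loss(actor, obs, actions, old_log_prob, advantage,
                        clip_ratio=0.1, entropy_coef=0.01):
        """Clipped PPO surrogate built from evaluate_actions below."""
        new_log_prob, entropy = actor.evaluate_actions(obs, actions)
        ratio = th.exp(new_log_prob - old_log_prob)
        surr1 = ratio * advantage
        surr2 = th.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantage
        return -th.min(surr1, surr2).mean() - entropy_coef * entropy.mean()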
+ """ + mean, log_std = self.get_distribution(obs) + std = log_std.exp() + + # Log probability + log_prob = self._compute_log_prob(actions, mean, std) + + # Entropy for exploration bonus + entropy = 0.5 * (1.0 + th.log(2 * th.pi * std.pow(2))).sum(dim=-1) + + return log_prob, entropy + + def _compute_log_prob( + self, + actions: th.Tensor, + mean: th.Tensor, + std: th.Tensor, + ) -> th.Tensor: + """Compute log probability of actions under Gaussian distribution.""" + distribution = th.distributions.Normal(mean, std) + return distribution.log_prob(actions).sum(dim=-1) + + +class LSTMActorPPO(ActorPPO): + """PPO Actor network with LSTM architecture and stochastic policy (Gaussian).""" + + def __init__( + self, + obs_dim: int, + act_dim: int, + float_type, + unique_obs_dim: int, + num_timeseries_obs_dim: int, + log_std_init: float = 0.0, + *args, + **kwargs, + ): + # Initialize ActorPPO params + nn.Module.__init__(self) # Don't call ActorPPO.__init__ to avoid FC creation + + self.act_dim = act_dim + self.float_type = float_type + self.unique_obs_dim = unique_obs_dim + self.num_timeseries_obs_dim = num_timeseries_obs_dim + + self.activation = "tanh" + self.min_output = activation_function_limit[self.activation]["min"] + self.max_output = activation_function_limit[self.activation]["max"] + self.activation_function = activation_function_limit[self.activation]["func"] + + # Compute timeseries length for LSTM + try: + self.timeseries_len = int( + (obs_dim - unique_obs_dim) / num_timeseries_obs_dim + ) + except Exception as e: + raise ValueError( + f"Using LSTM but not providing correctly shaped timeseries: Expected integer as unique timeseries length, got {(obs_dim - unique_obs_dim) / num_timeseries_obs_dim} instead." + ) from e + + # LSTM Layers + self.LSTM1 = nn.LSTMCell(num_timeseries_obs_dim, 8, dtype=float_type) + self.LSTM2 = nn.LSTMCell(8, 16, dtype=float_type) + + # Fully Connected Layers + self.FC1 = nn.Linear( + self.timeseries_len * 16 + unique_obs_dim, 128, dtype=float_type + ) + self.mean_layer = nn.Linear(128, act_dim, dtype=float_type) + + # Learnable log standard deviation + self.log_std = nn.Parameter(th.ones(act_dim, dtype=float_type) * log_std_init) + + self._init_weights() + + def _init_weights(self) -> None: + """Apply orthogonal initialisation.""" + for m in self.modules(): + if isinstance(m, nn.LSTMCell): + nn.init.orthogonal_(m.weight_ih, gain=1.0) + nn.init.orthogonal_(m.weight_hh, gain=1.0) + nn.init.zeros_(m.bias_ih) + nn.init.zeros_(m.bias_hh) + orthogonal_init_weights(self.FC1, gain=np.sqrt(2)) + orthogonal_init_weights(self.mean_layer, gain=0.01) + + def _compute_mean(self, obs: th.Tensor) -> th.Tensor: + """Compute policy mean action from LSTM features.""" + if obs.dim() not in (1, 2): + raise ValueError( + f"LSTMCell: Expected input to be 1D or 2D, got {obs.dim()}D instead" + ) + + is_batched = obs.dim() == 2 + if not is_batched: + obs = obs.unsqueeze(0) + + # Split observation into time series and stationary parts + x1, x2 = obs.split( + [obs.shape[1] - self.unique_obs_dim, self.unique_obs_dim], dim=1 + ) + x1 = x1.reshape(-1, self.num_timeseries_obs_dim, self.timeseries_len) + + # Initial hidden states + batch_size = x1.size(0) + h_t = th.zeros(batch_size, 8, dtype=self.float_type, device=obs.device) + c_t = th.zeros(batch_size, 8, dtype=self.float_type, device=obs.device) + + h_t2 = th.zeros(batch_size, 16, dtype=self.float_type, device=obs.device) + c_t2 = th.zeros(batch_size, 16, dtype=self.float_type, device=obs.device) + + outputs = [] + + # LSTM Loop + for 
time_step in x1.split(1, dim=2): + # x1 is (Batch, Features, Time) -> split on Time dim=2 + # time_step is (Batch, Features, 1) -> reshape to (Batch, Features) + time_step = time_step.reshape(batch_size, self.num_timeseries_obs_dim) + h_t, c_t = self.LSTM1(time_step, (h_t, c_t)) + h_t2, c_t2 = self.LSTM2(h_t, (h_t2, c_t2)) + outputs.append(h_t2) + + # Concatenate LSTM outputs + outputs = th.cat(outputs, dim=1) + + # Concatenate with stationary observations + x = th.cat((outputs, x2), dim=1) + + # FC Layers + x = F.relu(self.FC1(x)) + mean = th.tanh(self.mean_layer(x)) # Bounded to [-1, 1] + + if not is_batched: + mean = mean.squeeze(0) + + return mean + + def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: + """Forward pass""" + mean = self._compute_mean(obs) + + if deterministic: + return mean + + # Sample from Gaussian during training + log_std = self.log_std.expand_as(mean) + std = log_std.exp() # Ensure positive + # Add small epsilon for numerical stability + std = std + 1e-6 + noise = th.randn_like(mean) + action = mean + std * noise + + # Clamp to valid range + return th.clamp(action, -1.0, 1.0) + + def get_distribution(self, obs: th.Tensor) -> tuple[th.Tensor, th.Tensor]: + """Get policy distribution parameters from LSTM path.""" + mean = self._compute_mean(obs) + log_std = self.log_std.expand_as(mean) + return mean, log_std + + def get_action_and_log_prob( + self, + obs: th.Tensor, + deterministic: bool = False, + ) -> tuple[th.Tensor, th.Tensor]: + """Sample action and compute log probability for LSTM PPO.""" + mean, log_std = self.get_distribution(obs) + std = log_std.exp() + 1e-6 + + if deterministic: + action = mean + else: + noise = th.randn_like(mean) + action = mean + std * noise + + action = th.clamp(action, -1.0, 1.0) + log_prob = self._compute_log_prob(action, mean, std) + + return action, log_prob + + def evaluate_actions( + self, + obs: th.Tensor, + actions: th.Tensor, + ) -> tuple[th.Tensor, th.Tensor]: + """Evaluate log prob and entropy for provided actions.""" + mean, log_std = self.get_distribution(obs) + std = log_std.exp() + 1e-6 + log_prob = self._compute_log_prob(actions, mean, std) + entropy = 0.5 * (1.0 + th.log(2 * th.pi * std.pow(2))).sum(dim=-1) + return log_prob, entropy diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index 2972b132b..16249aeec 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -1056,13 +1056,12 @@ def run_learning( verbose (bool, optional): A flag indicating whether to enable verbose logging. Defaults to False. Note: - - The function uses a ReplayBuffer to store experiences for training the DRL agents. + - The function uses a ReplayBuffer for off-policy algorithms and a RolloutBuffer for on-policy algorithms. - It iterates through training episodes, updating the agents and evaluating their performance at regular intervals. - Initial exploration is active at the beginning and is disabled after a certain number of episodes to improve the performance of DRL algorithms. - Upon completion of training, the function performs an evaluation run using the last policy learned during training. - The best policies are chosen based on the average reward obtained during the evaluation runs, and they are saved for future use. 
""" - from assume.reinforcement_learning.buffer import ReplayBuffer if not verbose: logger.setLevel(logging.WARNING) @@ -1084,17 +1083,23 @@ def run_learning( if os.path.exists(tensorboard_path): shutil.rmtree(tensorboard_path, ignore_errors=True) + validation_interval = world.learning_role.determine_validation_interval() + + # sync train frequency with simulation horizon once at the beginning of training and overwrite scenario data + world.scenario_data["config"]["learning_config"]["train_freq"] = ( + world.learning_role.sync_train_freq_with_simulation_horizon() + ) + + # Build the appropriate buffer for the selected algorithm category. + buffer, min_episode_for_eval = world.learning_role.initialize_buffer( + time_step=world.scenario_data["config"]["time_step"], + validation_interval=validation_interval, + ) + # ----------------------------------------- # Information that needs to be stored across episodes, aka one simulation run inter_episodic_data = { - "buffer": ReplayBuffer( - buffer_size=world.learning_role.learning_config.replay_buffer_size, - obs_dim=world.learning_role.rl_algorithm.obs_dim, - act_dim=world.learning_role.rl_algorithm.act_dim, - n_rl_units=len(world.learning_role.rl_strats), - device=world.learning_role.device, - float_type=world.learning_role.float_type, - ), + "buffer": buffer, "actors_and_critics": None, "max_eval": defaultdict(lambda: -1e9), "all_eval": defaultdict(list), @@ -1105,13 +1110,6 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) - validation_interval = world.learning_role.determine_validation_interval() - - # sync train frequency with simulation horizon once at the beginning of training and overwrite scenario data - world.scenario_data["config"]["learning_config"]["train_freq"] = ( - world.learning_role.sync_train_freq_with_simulation_horizon() - ) - eval_episode = 1 for episode in tqdm( @@ -1137,12 +1135,7 @@ def run_learning( inter_episodic_data["episodes_done"] = episode # evaluation run: - if ( - episode % validation_interval == 0 - and episode - >= world.learning_role.learning_config.episodes_collecting_initial_experience - + validation_interval - ): + if episode % validation_interval == 0 and episode >= min_episode_for_eval: world.reset() # load evaluation run @@ -1186,11 +1179,7 @@ def run_learning( world.reset() # save the policies after each episode in case the simulation is stopped or crashes - if ( - episode - >= world.learning_role.learning_config.episodes_collecting_initial_experience - + validation_interval - ): + if episode >= min_episode_for_eval: world.learning_role.rl_algorithm.save_params( directory=f"{world.learning_role.learning_config.trained_policies_save_path}/last_policies" ) diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index 15b6285eb..c6396b2ec 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -16,6 +16,7 @@ MinMaxStrategy, SupportsMinMax, SupportsMinMaxCharge, + is_off_policy, ) from assume.common.fast_pandas import FastSeries from assume.common.market_objects import MarketConfig, Orderbook, Product @@ -43,6 +44,7 @@ def __init__(self, *args, **kwargs): # tells us whether we are training the agents or just executing per-learning strategies self.learning_mode = self.learning_config.learning_mode self.evaluation_mode = self.learning_config.evaluation_mode + self.algorithm = self.learning_config.algorithm self.actor_architecture = self.learning_config.actor_architecture @@ -67,17 
+69,21 @@ def __init__(self, *args, **kwargs): self.exploration_noise_std = self.learning_config.exploration_noise_std if self.learning_mode or self.evaluation_mode: - # learning role overwrites this if loaded from file or after initial experience episodes - self.collect_initial_experience_mode = True - - self.action_noise = NormalActionNoise( - mu=0.0, - sigma=self.learning_config.noise_sigma, - action_dimension=self.act_dim, - scale=self.learning_config.noise_scale, - dt=self.learning_config.noise_dt, + # Keeping initial random exploration only for off-policy methods. + self.collect_initial_experience_mode = is_off_policy( + self.learning_config.algorithm ) + if is_off_policy(self.learning_config.algorithm): + self.action_noise = NormalActionNoise( + mu=0.0, + sigma=self.learning_config.off_policy.noise_sigma, + action_dimension=self.act_dim, + scale=self.learning_config.off_policy.noise_scale, + dt=self.learning_config.off_policy.noise_dt, + ) + # For on-policy algorithms, no action noise needed - variable remains undefined + self.learning_role.register_strategy(self) # actor policies are only loaded here from file if learning mode is off (otherwise handled by learning_role) @@ -241,8 +247,10 @@ def get_individual_observations( return np.array([]) def get_actions(self, next_observation): - """ - Determines actions based on the current observation, applying noise for exploration if in learning mode. + """Determine action and exploration noise for the current observation. + + All algorithm-specific sampling logic lives in the + algorithm class via get_action. Args ---- @@ -260,48 +268,9 @@ def get_actions(self, next_observation): ----- In learning mode, actions incorporate noise for exploration. Initial exploration relies solely on noise to cover the action space broadly. + For PPO, we also store log_prob and value estimates for later use. 
""" - - # distinction whether we are in learning mode or not to handle exploration realised with noise - if self.learning_mode and not self.evaluation_mode: - # if we are in learning mode the first x episodes we want to explore the entire action space - # to get a good initial experience, in the area around the costs of the agent - if self.collect_initial_experience_mode: - # define current action as solely noise - noise = th.normal( - mean=0.0, - std=self.exploration_noise_std, - size=(self.act_dim,), - dtype=self.float_type, - device=self.device, - ) - - # ============================================================================= - # 2.1 Get Actions and handle exploration - # ============================================================================= - # only use noise as the action to enforce exploration - curr_action = noise - - else: - # if we are not in the initial exploration phase we chose the action with the actor neural net - # and add noise to the action - curr_action = self.actor(next_observation).detach() - noise = self.action_noise.noise( - device=self.device, dtype=self.float_type - ) - curr_action += noise - - # make sure that noise adding does not exceed the actual output of the NN as it pushes results in a direction that actor can't even reach - curr_action = th.clamp( - curr_action, self.actor.min_output, self.actor.max_output - ) - else: - # if we are not in learning mode we just use the actor neural net to get the action without adding noise - curr_action = self.actor(next_observation).detach() - # noise is an tensor with zeros, because we are not in learning mode - noise = th.zeros_like(curr_action, dtype=self.float_type) - - return curr_action, noise + return self.learning_role.rl_algorithm.get_action(self, next_observation) class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy): @@ -349,6 +318,8 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy): Number of time steps for which the agent forecasts market conditions. Defaults to 12. max_bid_price : float Maximum allowable bid price. Defaults to 100. + max_demand : float + Maximum demand capacity of the unit. Defaults to 10e3. device : str Device for computation, such as "cpu" or "cuda". Defaults to "cpu". float_type : str @@ -361,6 +332,8 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy): Class of the neural network architecture used for the actor network. Defaults to MLPActor. actor : torch.nn.Module Actor network for determining actions. + order_types : list[str] + Types of market orders supported by the strategy. Defaults to ["SB"]. action_noise : NormalActionNoise Noise model added to actions during learning to encourage exploration. Defaults to None. 
collect_initial_experience_mode : bool @@ -386,6 +359,9 @@ def __init__(self, *args, **kwargs): **kwargs, ) + # define allowed order types + self.order_types = kwargs.get("order_types", ["SB"]) + def calculate_bids( self, unit: SupportsMinMax, @@ -510,7 +486,9 @@ def get_actions(self, next_observation): curr_action, noise = super().get_actions(next_observation) if self.learning_mode and not self.evaluation_mode: - if self.collect_initial_experience_mode: + if self.collect_initial_experience_mode and is_off_policy( + self.learning_config.algorithm + ): # Assumes last dimension of the observation corresponds to marginal cost marginal_cost = next_observation[ -1 @@ -828,6 +806,8 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy) Number of time steps for forecasting market conditions. Defaults to 24. max_bid_price : float Maximum allowable bid price. Defaults to 100. + max_demand : float + Maximum demand capacity of the storage. Defaults to 10e3. device : str Device used for computation ("cpu" or "cuda"). Defaults to "cpu". float_type : str @@ -840,6 +820,8 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy) Class of the neural network for the actor network. Defaults to MLPActor. actor : torch.nn.Module The neural network used to predict actions. + order_types : list[str] + Types of market orders used by the strategy. Defaults to ["SB"]. action_noise : NormalActionNoise Noise model added to actions during learning for exploration. Defaults to None. collect_initial_experience_mode : bool @@ -865,6 +847,9 @@ def __init__(self, *args, **kwargs): **kwargs, ) + # define allowed order types + self.order_types = kwargs.get("order_types", ["SB"]) + def get_individual_observations( self, unit: SupportsMinMaxCharge, start: datetime, end: datetime ): @@ -1132,6 +1117,8 @@ class RenewableEnergyLearningSingleBidStrategy(EnergyLearningSingleBidStrategy): Class of the neural network for the actor network. Defaults to MLPActor. actor : torch.nn.Module The neural network used to predict actions. + order_types : list[str] + Types of market orders used by the strategy. Defaults to ["SB"]. action_noise : NormalActionNoise Noise model added to actions during learning for exploration. Defaults to None. collect_initial_experience_mode : bool @@ -1157,6 +1144,9 @@ def __init__(self, *args, **kwargs): **kwargs, ) + # define allowed order types + self.order_types = kwargs.get("order_types", ["SB"]) + def get_individual_observations( self, unit: SupportsMinMaxCharge, start: datetime, end: datetime ): diff --git a/docs/source/assume.reinforcement_learning.rst b/docs/source/assume.reinforcement_learning.rst index fe89cfce1..866070f00 100644 --- a/docs/source/assume.reinforcement_learning.rst +++ b/docs/source/assume.reinforcement_learning.rst @@ -48,6 +48,22 @@ assume.reinforcement\_learning.algorithms.matd3 module :undoc-members: :show-inheritance: +assume.reinforcement\_learning.algorithms.maddpg module +------------------------------------------------------- + +.. automodule:: assume.reinforcement_learning.algorithms.maddpg + :members: + :undoc-members: + :show-inheritance: + +assume.reinforcement\_learning.algorithms.mappo module +------------------------------------------------------ + +.. 
automodule:: assume.reinforcement_learning.algorithms.mappo
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 Module contents
 ---------------
diff --git a/docs/source/learning.rst b/docs/source/learning.rst
index 21cfbdd47..a54600d15 100644
--- a/docs/source/learning.rst
+++ b/docs/source/learning.rst
@@ -36,8 +36,8 @@ After taking action :math:`a_i \in A_i` in state :math:`s_i \in S` according to
 Each agent receives a reward :math:`r_i` according to the individual reward function :math:`R_i` and a private observation correlated with the state :math:`o_i: S \rightarrow O_i`.
 Like in a Markov Decision Process, each agent :math:`i` learns an optimal policy :math:`\pi_i^*(s)` that maximizes its expected reward.
 
-To enable multi-agent learning some adjustments are needed within the learning algorithm to get from the TD3 to an MATD3 algorithm.
-Other authors used similar tweaks to improve the MADDPG algorithm and derive the MA-TD3 algorithm.
+To enable multi-agent learning, ASSUME supports three RL algorithms out of the box: **MATD3** (Multi-Agent Twin Delayed DDPG, off-policy), **MADDPG** (Multi-Agent DDPG, off-policy), and **MAPPO** (Multi-Agent PPO, on-policy).
+The algorithm to use is selected via the ``algorithm`` config item.
 We'll start explaining the learning by focusing on a single agent and then extend it to multi-agent learning.
 
 Single-Agent Learning
@@ -96,18 +96,23 @@ Multi-Agent Learning
 In a single-agent setup, the state transition and respective reward depend only on the actions of a single agent. However, in a multi-agent setup, the state
 transitions and rewards depend on the actions of all learning agents. This makes the environment non-stationary for a single agent, violating the Markov property. The convergence guarantees of single-agent RL algorithms are no longer
-valid. To address this, we utilize the framework of centralized training and decentralized execution and expand upon the MADDPG algorithm.
+valid. To address this, we utilize the framework of centralized training and decentralized execution, which is supported in ASSUME (MATD3, MADDPG, MAPPO).
 
 The main idea is to use a centralized critic during the training phase, which has access to the entire state :math:`S`, and all actions :math:`a_1, \ldots, a_N`, thus resolving the issue of non-stationarity.
 Changes in state transitions and rewards can be explained by the actions of other agents.
 Meanwhile, during both training and execution, the actor has access only to its local observations :math:`o_i` derived from the entire state :math:`S`.
 
-For each agent :math:`i`, we train not one but two centralized critics :math:`Q_{i,\theta_1,2}(S, a_1, \ldots, a_N)` together with two target critic networks.
-Similar to TD3, the smaller value of the two critics and target action noise :math:`a_i,k \sim` is used to calculate the target :math:`y_i,k`. This is done to to address the issue of overestimation bias.
+For each agent :math:`i`, MATD3 and MADDPG train centralized critics together with target critic networks, which are used to calculate the target :math:`y_{i,k}`.
+In MATD3, two critics per agent are maintained and the minimum of their values is used (the twin-critic trick) to address overestimation bias.
+In MADDPG, a single critic per agent is used. In MAPPO, a single centralized value network is shared across all agents and updated via GAE-based advantage estimates rather than Bellman targets.
+
+For MATD3, the target uses the twin-critic minimum:
+
 .. math::
 
     y_{i,k} = r_{i,k} + \gamma \, \min_{j=1,2} Q_{i,\theta'_j}\left(S'_k, a_{1,k}, \ldots, a_{N,k}, \pi'(o_{i,k})\right)
 
+For MADDPG, the same formulation is used with a single critic.
+
 where :math:`r_{i,k}` is the reward obtained by agent :math:`i` at time step :math:`k`, :math:`\gamma` is the discount factor,
 :math:`S'_k` is the next state of the environment, and :math:`\pi'(o_{i,k})` is the target policy of agent :math:`i`.
 
@@ -123,8 +128,12 @@ The actor policy of each agent is updated using the deterministic policy gradien
 
     \nabla_a Q_{i,\theta_j}(S_k, a_{1,k}, \ldots, a_{N,k}, \pi(o_{i,k})) \Big|_{a_{i,k}=\pi(o_{i,k})} \, \nabla_\theta \pi(o_{i,k})
 
-The actor is updated similarly using only one critic network :math:`Q_{θ1}`. These changes to the original DDPG algorithm allow increased stability and convergence of the TD3 algorithm. This is especially relevant when approaching a multi-agent RL setup, as discussed in the foregoing section.
-Please note that the actor and critics are updated by sampling experience from the buffer where all interactions of the agents are stored, namely the observations, actions and rewards. There are more complex buffers possible, like those that use importance sampling, but the default buffer is a simple replay buffer. You can find a documentation of the latter in :ref:`replay-buffer`.
+In MATD3, the actor update is delayed relative to the critic (every ``policy_delay`` gradient steps) to improve stability.
+In MADDPG, the actor is updated at every gradient step.
+In MAPPO, the actor is updated using the PPO clipped surrogate objective rather than the deterministic policy gradient.
+Please note that for the off-policy algorithms (MATD3, MADDPG), the actor and critics are updated by sampling experience from the replay buffer, where all interactions of the agents are stored.
+For the on-policy algorithm (MAPPO), a rollout buffer is used instead, and experiences are discarded after each policy update.
+You can find documentation of both buffer types in :ref:`replay-buffer` and :ref:`rollout-buffer`.
 
 .. _learning_implementation:
diff --git a/docs/source/learning_algorithm.rst b/docs/source/learning_algorithm.rst
index d2e853a0d..1b14fa365 100644
--- a/docs/source/learning_algorithm.rst
+++ b/docs/source/learning_algorithm.rst
@@ -26,41 +26,63 @@ The following table shows the options that can be adjusted and gives a short exp
 
-  ======================================== ==========================================================================================================
-  learning config item                     description
-  ======================================== ==========================================================================================================
-  learning_mode                            Should we use learning mode at all? If False, the learning bidding strategy is loaded from trained_policies_load_path and no training occurs. Default is False.
-  evaluation_mode                          This setting is modified internally. Whether to run in evaluation mode. If True, the agent uses the learned policy without exploration noise and no training updates occur. Default is False.
-  continue_learning                        Whether to use pre-learned strategies and then continue learning. If True, loads existing policies from trained_policies_load_path and continues training. Note: Set True when you have a pretrained model and want incremental learning under new data or scenarios. Leave False for clean experiments. Default is False.
-  trained_policies_save_path               The directory path - relative to the scenario's inputs_path - where newly trained RL policies (actor and critic networks) will be saved.
Only needed when learning_mode is True. Value is set in setup_world(). Defaults otherwise to None. - trained_policies_load_path The directory path - relative to the scenario's inputs_path - from which pre-trained policies should be loaded. Needed when continue_learning is True or using pre-trained strategies. Default is None. - min_bid_price The minimum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Best practice is to set this parameter as unconstraining as possible. When agent bid convergence is guaranteed to occur above zero, increasing the minimum bid value can reduce training times. Default is -100.0. - max_bid_price The maximum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Align this with realistic market constraints. Too low = limited strategy space. Too high = noisy learning. Default is 100.0. - device The device to use for PyTorch computations. Options include "cpu", "cuda", or specific CUDA devices like "cuda:0". Default is "cpu". - episodes_collecting_initial_experience The number of episodes at the start during which random actions are chosen instead of using the actor network. This helps populate the replay buffer with diverse experiences. Note: Increase (5–20) for larger environments. Too low causes early high variance and instability; too high wastes time. Default is 5. - exploration_noise_std The standard deviation of Gaussian noise added to actions during exploration in the environment. Higher values encourage more exploration. Default is 0.2. - training_episodes The number of training episodes, where one episode is the entire simulation horizon specified in the general config. Default is 100. - validation_episodes_interval The interval (in episodes) at which validation episodes are run to evaluate the current policy's performance without training updates. Note: With long simulation horizons, choosing this higher will reduce training time. Default is 5. - train_freq Defines the frequency in time steps at which the actor and critic networks are updated. Accepts time strings like "24h" for 24 hours or "1d" for 1 day. Note: Shorter intervals = frequent updates, faster but less stable learning. Longer intervals = slower but more reliable. Use intervals > "72h" for units that require time coupling such as storages. Default is "24h". - batch_size The batch size of experiences sampled from the replay buffer for each training update. Larger batches provide more stable gradients but require more memory. In environments with many learning agents we advise small batch sizes. Default is 128. - gradient_steps The number of gradient descent steps performed during each training update. More steps can lead to better learning but increase computation time. Note: For environments with many agents one should use not many gradient steps, as policies of other agents are updated as well outdating the current best strategy. Default is 100. - learning_rate The learning rate (step size) for the optimizer, which controls how much the policy and value networks are updated during training. Note: Start around 1e-3. Decrease (e.g. 3e-4, 1e-4) if training oscillates or diverges. Default is 0.001. - learning_rate_schedule Which learning rate decay schedule to use. Currently only "linear" decay is available, which linearly decreases the learning rate over time. Default is None (constant learning rate). 
- early_stopping_steps The number of validation steps over which the moving average reward is calculated for early stopping. If the reward doesn't change by early_stopping_threshold over this many steps, training stops. Note: It prevents wasting compute on runs that have plateaued. Higher values are safer for noisy environments to avoid premature stopping; lower values react faster in stable settings. If None, defaults to training_episodes / validation_episodes_interval + 1. - early_stopping_threshold The minimum improvement in moving average reward required to avoid early stopping. If the reward improvement is less than this threshold over early_stopping_steps, training is terminated early. Note: If training stops too early, reduce the threshold. In noisy environments, combine a lower threshold with higher early_stopping_steps. Default is 0.05. - algorithm Specifies which reinforcement learning algorithm to use. Currently, only "matd3" (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3". - replay_buffer_size The maximum number of transitions stored in the replay buffer for experience replay. Larger buffers allow for more diverse training samples. Default is 500000. - gamma The discount factor for future rewards, ranging from 0 to 1. Higher values give more weight to long-term rewards in decision-making, which should be chosen for units with time coupling like storages. Default is 0.99. - actor_architecture The architecture of the neural networks used for the actors. Options include "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp". - policy_delay The frequency (in gradient steps) at which the actor policy is updated. TD3 updates the critic more frequently than the actor to stabilize training. Default is 2. - noise_sigma The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution used to generate exploration noise added to actions. Note: In multi-agent ennvironments high noises are necessary to encourage sufficient exploration. Default is 0.1. - noise_scale The scale factor multiplied by the noise drawn from the distribution. Larger values increase exploration. Default is 1. - noise_dt The time step parameter for the Ornstein-Uhlenbeck process, which determines how quickly the noise decays over time. Used for noise scheduling. Default is 1. - action_noise_schedule Which action noise decay schedule to use. Currently only "linear" decay is available, which linearly decreases exploration noise over training. Default is "linear". - tau The soft update coefficient for updating target networks. Controls how slowly target networks track the main networks. Smaller values mean slower updates. Default is 0.005. - target_policy_noise The standard deviation of noise added to target policy actions during critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2. - target_noise_clip The maximum absolute value for clipping the target policy noise. Prevents the noise from being too large. Default is 0.5. 
- ======================================== ========================================================================================================== +**Common parameters** (apply to all algorithms — set directly under ``learning:`` in the config) + + ================================= ========================================================================================================== + learning config item description + ================================= ========================================================================================================== + learning_mode Should we use learning mode at all? If False, the learning bidding strategy is loaded from trained_policies_load_path and no training occurs. Default is False. + evaluation_mode This setting is modified internally. Whether to run in evaluation mode. If True, the agent uses the learned policy without exploration noise and no training updates occur. Default is False. + continue_learning Whether to use pre-learned strategies and then continue learning. If True, loads existing policies from trained_policies_load_path and continues training. Note: Set True when you have a pretrained model and want incremental learning under new data or scenarios. Leave False for clean experiments. Default is False. + trained_policies_save_path The directory path - relative to the scenario's inputs_path - where newly trained RL policies (actor and critic networks) will be saved. Only needed when learning_mode is True. Value is set in setup_world(). Defaults otherwise to None. + trained_policies_load_path The directory path - relative to the scenario's inputs_path - from which pre-trained policies should be loaded. Needed when continue_learning is True or using pre-trained strategies. Default is None. + min_bid_price The minimum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Best practice is to set this parameter as unconstraining as possible. Default is -100.0. + max_bid_price The maximum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Align this with realistic market constraints. Default is 100.0. + device The device to use for PyTorch computations. Options include "cpu", "cuda", or specific CUDA devices like "cuda:0". Default is "cpu". + exploration_noise_std The standard deviation of Gaussian noise added to actions during exploration in the environment. Higher values encourage more exploration. Default is 0.2. + training_episodes The number of training episodes, where one episode is the entire simulation horizon specified in the general config. Default is 100. + validation_episodes_interval The interval (in episodes) at which validation episodes are run to evaluate the current policy's performance without training updates. Default is 5. + train_freq Defines the frequency in time steps at which the actor and critic networks are updated. Accepts time strings like "24h" or "1d". Default is "24h". + batch_size The batch size of experiences sampled from the buffer for each training update. Default is 128. + learning_rate The learning rate for the optimizer. Note: Start around 1e-3. Decrease (e.g. 3e-4, 1e-4) if training oscillates or diverges. Default is 0.001. + learning_rate_schedule Which learning rate decay schedule to use. Currently only "linear" decay is available. Default is None (constant learning rate). 
+ early_stopping_steps The number of validation steps over which the moving average reward is checked for early stopping. If None, defaults to training_episodes / validation_episodes_interval + 1. + early_stopping_threshold The minimum improvement in moving average reward required to avoid early stopping. Default is 0.05. + algorithm Specifies which reinforcement learning algorithm to use. Options: ``"matd3"`` (Multi-Agent Twin Delayed DDPG, off-policy), ``"maddpg"`` (Multi-Agent DDPG, off-policy), ``"mappo"`` (Multi-Agent PPO, on-policy). Default is ``"matd3"``. + gamma The discount factor for future rewards (0–1). Higher values weight long-term rewards more. Default is 0.99. + actor_architecture The neural network architecture for the actors. Options: ``"mlp"`` (Multi-Layer Perceptron) or ``"lstm"`` (Long Short-Term Memory). Default is ``"mlp"``. + ================================= ========================================================================================================== + +**Off-policy parameters** (apply to ``"matd3"`` and ``"maddpg"`` — set under ``off_policy:`` in the config) + + ========================================= ========================================================================================================== + off_policy config item description + ========================================= ========================================================================================================== + episodes_collecting_initial_experience The number of episodes at the start during which random actions are chosen instead of using the actor network. Helps populate the replay buffer with diverse experiences. Note: Increase (5–20) for larger environments. Default is 5. + gradient_steps The number of gradient descent steps performed during each training update. Note: For environments with many agents, use fewer gradient steps as other agents' policies are updated simultaneously. Default is 100. + replay_buffer_size The maximum number of transitions stored in the replay buffer. Larger buffers allow for more diverse training samples. Default is 50000. + tau The soft update coefficient for updating target networks. Smaller values mean slower target network updates. Default is 0.005. + policy_delay (MATD3 only) The frequency (in gradient steps) at which the actor policy is updated. TD3 updates the critic more frequently than the actor to stabilize training. Default is 2. + noise_sigma The standard deviation of the exploration noise distribution added to actions. Note: In multi-agent environments, higher noise encourages sufficient exploration. Default is 0.1. + noise_scale The scale factor multiplied by the drawn noise. Larger values increase exploration. Default is 1. + noise_dt The time step parameter for the Ornstein-Uhlenbeck process, determining how quickly noise decays. Default is 1. + action_noise_schedule Which action noise decay schedule to use. Currently only ``"linear"`` decay is available. Default is None. + target_policy_noise (MATD3 only) The standard deviation of noise added to target policy actions during critic updates. Helps prevent overfitting to narrow policy peaks. Default is 0.2. + target_noise_clip (MATD3 only) The maximum absolute value for clipping target policy noise. Default is 0.5. 
+  ========================================= ==========================================================================================================
+
+**On-policy parameters** (apply to ``"mappo"`` — set under ``on_policy:`` in the config)
+
+  ====================== ==========================================================================================================
+  on_policy config item  description
+  ====================== ==========================================================================================================
+  clip_ratio             The clipping ratio for the PPO surrogate objective. Controls how far the new policy can deviate from the old one in a single update. Default is 0.1.
+  entropy_coef           Coefficient for the entropy bonus term in the loss. Higher values encourage more exploration. Default is 0.01.
+  gae_lambda             Lambda parameter for Generalized Advantage Estimation (GAE). Controls the bias-variance trade-off. Default is 0.95.
+  max_grad_norm          Maximum gradient norm for gradient clipping. Default is 0.5.
+  vf_coef                Coefficient for the value function loss term. Default is 0.5.
+  n_epochs               Number of optimization epochs performed over each rollout batch. Default is 10.
+  ====================== ==========================================================================================================
 
 Note: We advise against setting a seed in the general config (``seed=null``) when using learning, as it will decrease performance; see https://docs.pytorch.org/docs/stable/notes/randomness.html.
 Completely reproducible results are not guaranteed across different PyTorch versions, hardware, or CUDA configurations.
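+The resulting nested layout can be sketched as follows (a minimal example, not a complete scenario config; the values mirror the example configs shipped with the framework and are not tuned recommendations):
+
+.. code-block:: yaml
+
+    # inside the scenario's learning config section
+    learning_mode: true
+    algorithm: matd3            # or "maddpg" / "mappo"
+    learning_rate: 0.001
+    training_episodes: 100
+    train_freq: 24h
+    batch_size: 128
+    gamma: 0.99
+    device: cpu
+    actor_architecture: mlp
+
+    # read only by the off-policy algorithms ("matd3", "maddpg")
+    off_policy:
+      episodes_collecting_initial_experience: 5
+      gradient_steps: 10
+      noise_sigma: 0.1
+      noise_scale: 1
+      noise_dt: 1
+      replay_buffer_size: 10000
+
+    # read only by the on-policy algorithm ("mappo")
+    on_policy:
+      clip_ratio: 0.2
+      entropy_coef: 0.01
+      vf_coef: 0.5
+      gae_lambda: 0.95
+      n_epochs: 25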
@@ -133,10 +155,60 @@ Note, that the specific implementation of each network architecture is defined i
 
 [2] Y. Ye, D. Qiu, J. Li and G. Strbac, "Multi-Period and Multi-Spatial Equilibrium Analysis in Imperfect Electricity Markets: A Novel Multi-Agent Deep Reinforcement Learning Approach," in IEEE Access, vol. 7, pp. 130515-130529, 2019, doi: 10.1109/ACCESS.2019.2940005.
 
-.. _replay-buffer:
+DDPG (Deep Deterministic Policy Gradient)
+------------------------------------------
+
+DDPG is a single-agent off-policy algorithm that serves as the foundation for TD3. While TD3 improves upon DDPG with twin critics and delayed updates,
+DDPG itself remains a powerful baseline for continuous control tasks.
+
+Original paper: https://arxiv.org/abs/1509.02971
+
+OpenAI Spinning Guide for DDPG: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
+
+DDPG combines the deterministic policy gradient with deep function approximation, using a single critic network and an actor network.
+The algorithm updates the critic towards the Bellman target:
+
+.. math::
+
+    y = r + \gamma Q_{\theta'}(s', \pi_{\phi'}(s'))
+
+and updates the actor using the deterministic policy gradient:
+
+.. math::
+
+    \nabla_\phi J(\phi) = \mathbb{E}\left[\nabla_a Q(s, a)\big|_{a=\pi(s)} \, \nabla_\phi \pi(s)\right]
+
+In ASSUME, DDPG is implemented in a multi-agent setting (MADDPG) with centralized training and decentralized execution.
+The main differences from TD3 are: only one critic network, an actor update at every gradient step (no policy delay), and no target action smoothing.
+The implementation follows the same structure as TD3, with actors and critics initialized via :func:`assume.reinforcement_learning.algorithms.maddpg.DDPG.initialize_policy`
+and policy updates performed in :func:`assume.reinforcement_learning.algorithms.maddpg.DDPG.update_policy`.
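+To make the mechanics concrete, here is a minimal, self-contained single-agent sketch of one DDPG update (an illustration only, not the ASSUME implementation; the network sizes and the random batch are placeholders standing in for a replay-buffer sample):
+
+.. code-block:: python
+
+    import copy
+
+    import torch as th
+    import torch.nn as nn
+
+    obs_dim, act_dim, batch, gamma, tau = 8, 2, 32, 0.99, 0.005
+
+    actor = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_dim), nn.Tanh())
+    critic = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))
+    actor_target, critic_target = copy.deepcopy(actor), copy.deepcopy(critic)
+    actor_opt = th.optim.Adam(actor.parameters(), lr=1e-3)
+    critic_opt = th.optim.Adam(critic.parameters(), lr=1e-3)
+
+    # A random batch stands in for a replay-buffer sample (s, a, r, s').
+    obs, next_obs = th.randn(batch, obs_dim), th.randn(batch, obs_dim)
+    act, rew = th.randn(batch, act_dim), th.randn(batch, 1)
+
+    # Critic step: regress Q(s, a) onto the Bellman target y = r + gamma * Q'(s', pi'(s')).
+    with th.no_grad():
+        y = rew + gamma * critic_target(th.cat([next_obs, actor_target(next_obs)], dim=1))
+    critic_loss = nn.functional.mse_loss(critic(th.cat([obs, act], dim=1)), y)
+    critic_opt.zero_grad()
+    critic_loss.backward()
+    critic_opt.step()
+
+    # Actor step: ascend Q(s, pi(s)), i.e. the deterministic policy gradient.
+    actor_loss = -critic(th.cat([obs, actor(obs)], dim=1)).mean()
+    actor_opt.zero_grad()
+    actor_loss.backward()
+    actor_opt.step()
+
+    # Soft (Polyak) target updates with coefficient tau.
+    with th.no_grad():
+        for net, target in ((actor, actor_target), (critic, critic_target)):
+            for p, tp in zip(net.parameters(), target.parameters()):
+                tp.mul_(1 - tau).add_(tau * p)
+
+In the multi-agent variant, the critic additionally receives the observations and actions of all agents, while each actor still sees only its own observation.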
+
+PPO (Proximal Policy Optimization)
+-----------------------------------
+
+PPO is a state-of-the-art on-policy algorithm that uses a clipped surrogate objective to ensure stable policy updates.
+Unlike off-policy methods such as TD3 and DDPG, PPO requires fresh data collected under the current policy.
+
+Original paper: https://arxiv.org/abs/1707.06347
+
+OpenAI Spinning Guide for PPO: https://spinningup.openai.com/en/latest/algorithms/ppo.html
+
+PPO improves upon policy gradient methods by limiting policy updates with a clipping mechanism:
+
+.. math::
+
+    L^{CLIP}(\theta) = \hat{\mathbb{E}}_t\left[\min\left(r_t(\theta) \hat{A}_t, \; \text{clip}\left(r_t(\theta), 1-\epsilon, 1+\epsilon\right) \hat{A}_t\right)\right]
+
+where :math:`r_t(\theta)` is the importance sampling ratio and :math:`\hat{A}_t` is the advantage estimate obtained via Generalized Advantage Estimation (GAE).
+
+In ASSUME, PPO uses a rollout buffer instead of a replay buffer, collecting on-policy experiences and training over multiple epochs.
+The algorithm maintains both an actor and a centralized critic, similar to the other methods, but updates them using the clipped surrogate objective.
+Network initialization is handled by :func:`assume.reinforcement_learning.algorithms.mappo.PPO.initialize_policy`
+and policy updates occur in :func:`assume.reinforcement_learning.algorithms.mappo.PPO.update_policy`.
+The rollout buffer enables multi-pass training over the same batch of on-policy experiences, improving sample efficiency.
+For more details on how the rollout buffer works, see :ref:`rollout-buffer`.
+
+.. _buffer:
 
 ##############
-Replay Buffer
+Buffer
 ##############
 
 This chapter gives you an insight into the general usage of buffers in reinforcement learning and how they are implemented in ASSUME.
 
@@ -145,25 +217,21 @@ Why do we need buffers?
 =======================
 
-In reinforcement learning, a buffer, often referred to as a replay buffer, is a crucial component in algorithms like for Experience Replay.
-It serves as a memory for the agent's past experiences, storing tuples of observations, actions, rewards, and subsequent observations.
-
-Instead of immediately using each new experience for training, the experiences are stored in the buffer. During the training process,
-a batch of experiences is randomly sampled from the replay buffer. This random sampling breaks the temporal correlation in the data, contributing to a more stable learning process.
-
-The replay buffer improves sample efficiency by allowing the agent to reuse and learn from past experiences multiple times.
-This reduces the reliance on new experiences and makes better use of the available data. It also helps mitigate the effects of non-stationarity in the environment,
-as the agent is exposed to a diverse set of experiences.
+In reinforcement learning, a buffer is a crucial component for storing
+the agent's past experiences as tuples of observations, actions, rewards,
+and subsequent observations.
 
-Overall, the replay buffer is instrumental in stabilizing the learning process in reinforcement learning algorithms,
+Overall, the buffer is instrumental in stabilizing the learning process in reinforcement learning algorithms,
 enhancing their robustness and performance by providing a diverse and non-correlated set of training samples.
 
-How are they used in ASSUME?
-============================
+How are buffers implemented in ASSUME?
+======================================
 
 In principle, ASSUME allows different buffers to be implemented; they just need to adhere to the structure of the base buffer.
 Here we present the buffers already implemented: the replay buffer used by the off-policy algorithms and the rollout buffer used by MAPPO.
+
+.. _replay-buffer:
+
 The simple replay buffer
 ------------------------
 
@@ -173,3 +241,54 @@ Yet, the buffer is quite large to store all observations also from multiple agen
 After a number of training runs, defined in the config file, the RL strategy is updated by calling the update function of the respective algorithm, which in turn calls the sample function of the replay buffer.
 The sample function returns a batch of experiences which is then used to update the RL strategy.
 For more information on the learning capabilities of ASSUME, see :doc:`learning`.
+
+Instead of immediately using each new experience for training, the experiences are stored in the buffer. During the training process,
+a batch of experiences is randomly sampled from the buffer. This random sampling breaks the temporal correlation in the data, contributing to a more stable learning process.
+
+The buffer improves sample efficiency by allowing the agent to reuse and learn from past experiences multiple times.
+This reduces the reliance on new experiences and makes better use of the available data. It also helps mitigate the effects of non-stationarity in the environment,
+as the agent is exposed to a diverse set of experiences.
+
+.. _rollout-buffer:
+
+The rollout buffer
+------------------
+
+A rollout buffer is a specialized type of experience storage designed for on-policy reinforcement learning algorithms like PPO (Proximal Policy Optimization).
+Unlike replay buffers, which store and reuse experiences from multiple past policies, rollout buffers only store experiences collected by the current policy.
+
+The key characteristics of a rollout buffer are:
+
+* **On-policy storage**: Only stores trajectories from the current policy version
+* **Single-use data**: Experiences are used once for training, then discarded
+* **Temporal structure**: Maintains the sequential order of experiences for advantage computation
+* **Additional metadata**: Stores policy-specific information such as old log probabilities and value estimates
+
+This design makes rollout buffers particularly suitable for policy gradient methods that require fresh, on-policy data for stable learning.
+
+The rollout buffer for PPO is implemented as a fixed-size buffer that stores one complete rollout of experiences.
+Unlike the replay buffer, it is completely reset after each training update to ensure that only on-policy data is used.
+
+The buffer stores the following information for each timestep:
+
+* **Observations**: The state observed by each agent
+* **Actions**: The actions taken by each agent
+* **Rewards**: The rewards received by each agent
+* **Old log probabilities**: The log probability of each action under the policy that collected it
+* **Old values**: The value function estimate at that state
+* **Dones**: Whether the episode terminated
+
+After a complete rollout is collected (determined by the ``train_freq`` parameter in the config), the buffer computes:
+
+* **Returns**: The discounted sum of future rewards for each timestep
+* **Advantages**: GAE-based advantage estimates that guide policy improvement
+
+The learning role collects experiences after each environment step by calling the buffer's add function.
+Once the buffer accumulates enough data (specified by ``batch_size``), the PPO algorithm's update function
+is triggered, which retrieves mini-batches from the buffer for multiple training epochs (specified by ``on_policy.n_epochs``).
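+The end-of-rollout computation can be sketched as follows (a single-agent illustration of GAE and the clipped surrogate loss, not the ASSUME implementation; all tensors are random placeholders):
+
+.. code-block:: python
+
+    import torch as th
+
+    T, gamma, lam, clip_ratio = 128, 0.99, 0.95, 0.2
+
+    rewards = th.randn(T)        # r_t collected during the rollout
+    values = th.randn(T)         # V(s_t) recorded at collection time
+    dones = th.zeros(T)          # episode-termination flags
+    old_log_probs = th.randn(T)  # log pi_old(a_t | s_t)
+    last_value = th.tensor(0.0)  # bootstrap value for the state after the rollout
+
+    # Generalized Advantage Estimation, computed backwards through time.
+    advantages = th.zeros(T)
+    gae = th.tensor(0.0)
+    for t in reversed(range(T)):
+        next_value = last_value if t == T - 1 else values[t + 1]
+        non_terminal = 1.0 - dones[t]
+        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
+        gae = delta + gamma * lam * non_terminal * gae
+        advantages[t] = gae
+    returns = advantages + values  # regression targets for the value network
+
+    # One clipped-surrogate policy loss over the rollout; new_log_probs would
+    # come from the current policy, here it is only a placeholder.
+    new_log_probs = old_log_probs + 0.01 * th.randn(T)
+    ratio = th.exp(new_log_probs - old_log_probs)
+    clipped = th.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
+    policy_loss = -th.min(ratio * advantages, clipped * advantages).mean()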
+After training is complete, the buffer is reset, and the cycle begins again with the updated policy.
+This ensures that PPO always learns from fresh, on-policy experiences, which is critical for the algorithm's stability and performance.
+
+For more information on how PPO uses the rollout buffer, see the PPO algorithm documentation above.
diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml
index d1726365e..2254d9d95 100644
--- a/examples/inputs/example_02a/config.yaml
+++ b/examples/inputs/example_02a/config.yaml
@@ -4,7 +4,7 @@ base:
   start_date: 2019-03-01 00:00
-  end_date: 2019-03-31 00:00
+  end_date: 2019-03-30 00:00
   time_step: 1h
   save_frequency_hours: null
   seed: null
@@ -18,18 +18,31 @@ base:
     algorithm: matd3
     learning_rate: 0.001
     training_episodes: 100
-    episodes_collecting_initial_experience: 5
+    actor_architecture: mlp
     train_freq: 100h
-    gradient_steps: 10
     batch_size: 128
     gamma: 0.99
     device: cpu
-    action_noise_schedule: linear
-    noise_sigma: 0.1
-    noise_scale: 1
-    noise_dt: 1
     validation_episodes_interval: 5
 
+    # Off-policy parameters (required for TD3/DDPG algorithms)
+    off_policy:
+      episodes_collecting_initial_experience: 3
+      gradient_steps: 10
+      noise_sigma: 0.1
+      noise_scale: 1
+      action_noise_schedule: linear
+      noise_dt: 1
+      replay_buffer_size: 10000
+
+    # On-policy parameters (required for PPO/MAPPO algorithms)
+    on_policy:
+      clip_ratio: 0.2
+      entropy_coef: 0.01
+      vf_coef: 0.5
+      gae_lambda: 0.95
+      n_epochs: 25
+
   markets_config:
     EOM:
       operator: EOM_operator
@@ -63,20 +76,33 @@ base_lstm:
     algorithm: matd3
     learning_rate: 0.001
     training_episodes: 50
-    episodes_collecting_initial_experience: 5
     train_freq: 24h
-    gradient_steps: 24
     batch_size: 256
     gamma: 0.99
     device: cpu
-    noise_sigma: 0.1
-    noise_scale: 1
-    noise_dt: 1
     validation_episodes_interval: 5
     early_stopping_steps: 10
     early_stopping_threshold: 0.05
     actor_architecture: lstm
 
+    # Off-policy parameters (required for TD3/DDPG algorithms)
+    off_policy:
+      episodes_collecting_initial_experience: 5
+      gradient_steps: 24
+      noise_sigma: 0.1
+      noise_scale: 1
+      action_noise_schedule: linear
+      noise_dt: 1
+      replay_buffer_size: 10000
+
+    # On-policy parameters (required for PPO/MAPPO algorithms)
+    on_policy:
+      clip_ratio: 0.2
+      entropy_coef: 0.01
+      vf_coef: 0.5
+      gae_lambda: 0.95
+      n_epochs: 25
+
   markets_config:
     EOM:
       operator: EOM_operator
@@ -110,18 +136,31 @@ tiny:
     algorithm: matd3
     learning_rate: 0.001
     training_episodes: 10
-    episodes_collecting_initial_experience: 3
     train_freq: 24h
-    gradient_steps: 24
     batch_size: 64
     gamma: 0.99
     device: cpu
-    noise_sigma: 0.1
-    noise_scale: 1
-    noise_dt: 1
     validation_episodes_interval: 5
     actor_architecture: mlp
 
+    # Off-policy parameters (required for TD3/DDPG algorithms)
+    off_policy:
+      episodes_collecting_initial_experience: 3
+      gradient_steps: 24
+      noise_sigma: 0.1
+      noise_scale: 1
+      action_noise_schedule: linear
+      noise_dt: 1
+      replay_buffer_size: 10000
+
+    # On-policy parameters (required for PPO/MAPPO algorithms)
+    on_policy:
+      clip_ratio: 0.2
+      entropy_coef: 0.01
+      vf_coef: 0.5
+      gae_lambda: 0.95
+      n_epochs: 25
+
   markets_config:
     EOM:
       operator: EOM_operator
diff --git a/examples/inputs/example_02b/config.yaml b/examples/inputs/example_02b/config.yaml
index 1f7897ac1..d69f20e29 100644
--- a/examples/inputs/example_02b/config.yaml
+++ b/examples/inputs/example_02b/config.yaml @@ -4,7 +4,7 @@ base: start_date: 2019-03-01 00:00 - end_date: 2019-04-01 00:00 + end_date: 2019-03-29 00:00 time_step: 1h save_frequency_hours: null seed: null @@ -17,19 +17,31 @@ base: max_bid_price: 100 algorithm: matd3 learning_rate: 0.001 - training_episodes: 100 - episodes_collecting_initial_experience: 5 + training_episodes: 150 train_freq: 100h - gradient_steps: 10 batch_size: 128 gamma: 0.99 device: cpu - action_noise_schedule: linear - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 validation_episodes_interval: 5 + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 5 + gradient_steps: 10 + noise_sigma: 0.1 + noise_scale: 1 + action_noise_schedule: linear + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 + markets_config: EOM: operator: EOM_operator @@ -49,7 +61,7 @@ base: base_lstm: start_date: 2019-03-01 00:00 - end_date: 2019-04-01 00:00 + end_date: 2019-03-07 00:00 time_step: 1h save_frequency_hours: null seed: null @@ -62,20 +74,32 @@ base_lstm: algorithm: matd3 actor_architecture: lstm learning_rate: 0.001 - training_episodes: 100 - episodes_collecting_initial_experience: 3 + training_episodes: 30 train_freq: 24h - gradient_steps: 1 batch_size: 256 gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 validation_episodes_interval: 5 early_stopping_steps: 10 early_stopping_threshold: 0.05 + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 3 + gradient_steps: 1 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 + markets_config: EOM: operator: EOM_operator diff --git a/examples/inputs/example_02c/config.yaml b/examples/inputs/example_02c/config.yaml index 78ac4ed7c..de46a341c 100644 --- a/examples/inputs/example_02c/config.yaml +++ b/examples/inputs/example_02c/config.yaml @@ -18,18 +18,30 @@ base: algorithm: matd3 learning_rate: 0.001 training_episodes: 100 - episodes_collecting_initial_experience: 5 train_freq: 100h - gradient_steps: 10 batch_size: 128 gamma: 0.99 device: cpu - action_noise_schedule: linear - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 validation_episodes_interval: 5 + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 5 + gradient_steps: 10 + noise_sigma: 0.1 + noise_scale: 1 + action_noise_schedule: linear + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 + markets_config: EOM: operator: EOM_operator diff --git a/examples/inputs/example_02d/config.yaml b/examples/inputs/example_02d/config.yaml index 78ac4ed7c..de46a341c 100644 --- a/examples/inputs/example_02d/config.yaml +++ b/examples/inputs/example_02d/config.yaml @@ -18,18 +18,30 @@ base: algorithm: matd3 learning_rate: 0.001 training_episodes: 100 - episodes_collecting_initial_experience: 5 train_freq: 100h - gradient_steps: 10 batch_size: 128 gamma: 0.99 device: cpu - action_noise_schedule: linear - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 
validation_episodes_interval: 5 + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 5 + gradient_steps: 10 + noise_sigma: 0.1 + noise_scale: 1 + action_noise_schedule: linear + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 + markets_config: EOM: operator: EOM_operator diff --git a/examples/inputs/example_02e/config.yaml b/examples/inputs/example_02e/config.yaml index 4201ec8ef..249d24d94 100644 --- a/examples/inputs/example_02e/config.yaml +++ b/examples/inputs/example_02e/config.yaml @@ -19,18 +19,30 @@ base: actor_architecture: mlp learning_rate: 0.0003 training_episodes: 30 - episodes_collecting_initial_experience: 5 train_freq: 720h - gradient_steps: 720 batch_size: 256 gamma: 0.999 device: cpu - # action_noise_schedule: linear - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 validation_episodes_interval: 5 + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 5 + gradient_steps: 720 + noise_sigma: 0.1 + noise_scale: 1 + # action_noise_schedule: linear + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 + markets_config: EOM: operator: EOM_operator @@ -65,15 +77,27 @@ tiny: learning_rate: 0.001 training_episodes: 5 validation_episodes_interval: 2 - episodes_collecting_initial_experience: 1 train_freq: 24h - gradient_steps: 24 batch_size: 64 gamma: 0.99 device: cpu - noise_sigma: 0.1 - noise_scale: 1 - noise_dt: 1 + + # Off-policy parameters (required for TD3/DDPG algorithms) + off_policy: + episodes_collecting_initial_experience: 1 + gradient_steps: 24 + noise_sigma: 0.1 + noise_scale: 1 + noise_dt: 1 + replay_buffer_size: 10000 + + # On-policy parameters (required for PPO/MAPPO algorithms) + on_policy: + clip_ratio: 0.2 + entropy_coef: 0.01 + vf_coef: 0.5 + gae_lambda: 0.95 + n_epochs: 25 markets_config: EOM: diff --git a/tests/test_learning_role.py b/tests/test_learning_role.py index f72317184..aecea8a40 100644 --- a/tests/test_learning_role.py +++ b/tests/test_learning_role.py @@ -8,7 +8,7 @@ import pytest try: - from assume.common.base import LearningConfig + from assume.common.base import LearningConfig, OffPolicyConfig from assume.reinforcement_learning.learning_role import ( Learning, LearningStrategy, @@ -37,11 +37,13 @@ def test_learning_init(): learning_mode=True, evaluation_mode=False, training_episodes=3, - episodes_collecting_initial_experience=1, continue_learning=False, trained_policies_save_path=None, early_stopping_steps=10, early_stopping_threshold=0.05, + off_policy=OffPolicyConfig( + episodes_collecting_initial_experience=1, + ), ), } @@ -89,11 +91,13 @@ async def learning_role(): learning_mode=True, evaluation_mode=True, # evaluation mode to skip buffer/policy update training_episodes=3, - episodes_collecting_initial_experience=1, continue_learning=False, trained_policies_save_path=None, early_stopping_steps=10, early_stopping_threshold=0.05, + off_policy=OffPolicyConfig( + episodes_collecting_initial_experience=1, + ), ), } diff --git a/tests/test_maddpg.py b/tests/test_maddpg.py new file mode 100644 index 000000000..4de547496 --- /dev/null +++ b/tests/test_maddpg.py @@ -0,0 +1,512 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# 
+# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import os +from copy import copy, deepcopy +from datetime import datetime + +import pytest + +from assume.common.base import LearningConfig, OffPolicyConfig + +try: + import torch as th + + from assume.common.base import LearningStrategy + from assume.reinforcement_learning.algorithms.maddpg import DDPG + from assume.reinforcement_learning.learning_role import Learning + +except ImportError: + pass + + +start = datetime(2023, 7, 1) +end = datetime(2023, 7, 2) + + +@pytest.fixture +def base_learning_config() -> dict: + foresight = 2 + unique_obs_dim = 2 + num_timeseries_obs_dim = 4 + return { + "foresight": foresight, + "act_dim": 3, + "unique_obs_dim": unique_obs_dim, + "num_timeseries_obs_dim": num_timeseries_obs_dim, + "obs_dim": foresight * num_timeseries_obs_dim + unique_obs_dim, + "learning_config": LearningConfig( + train_freq="1h", + algorithm="maddpg", + actor_architecture="mlp", + learning_mode=True, + evaluation_mode=False, + training_episodes=1, + continue_learning=False, + trained_policies_save_path=None, + early_stopping_steps=10, + early_stopping_threshold=0.05, + learning_rate=1e-4, + batch_size=100, + gamma=0.99, + off_policy=OffPolicyConfig( + episodes_collecting_initial_experience=0, + gradient_steps=1, + tau=0.005, + policy_delay=2, + target_policy_noise=0.2, + target_noise_clip=0.5, + ), + ), + } + + +@pytest.fixture(scope="function") +def learning_role_n(base_learning_config): + config = copy(base_learning_config) + learn = Learning(config["learning_config"], start, end) + learn.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_1"] = LearningStrategy(**config, learning_role=learn) + return learn + + +@pytest.fixture(scope="function") +def learning_role_n_plus_m(base_learning_config): + config = copy(base_learning_config) + learn = Learning(config["learning_config"], start, end) + learn.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_1"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_2"] = LearningStrategy(**config, learning_role=learn) + return learn + + +@pytest.fixture(scope="function") +def saved_n_agent_model(learning_role_n, tmp_path) -> tuple[str, dict]: + learning_role_n.initialize_policy() + save_dir = tmp_path / "saved_model_n" + save_dir.mkdir(parents=True, exist_ok=True) + learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + agent = learning_role_n.rl_strats["agent_0"] + return str(save_dir), { + "critic": agent.critics.state_dict(), + "actor": agent.actor.state_dict(), + "target_critic": agent.target_critics.state_dict(), + "target_actor": agent.actor_target.state_dict(), + "optimizer_critic": agent.critics.optimizer.state_dict(), + "optimizer_actor": agent.actor.optimizer.state_dict(), + } + + +@pytest.fixture(scope="function") +def saved_n_plus_m_agent_model(learning_role_n_plus_m, tmp_path) -> tuple[str, dict]: + learning_role_n_plus_m.initialize_policy() + save_dir = tmp_path / "saved_model_n_plus_m" + save_dir.mkdir(parents=True, exist_ok=True) + learning_role_n_plus_m.rl_algorithm.save_params(directory=str(save_dir)) + agent = learning_role_n_plus_m.rl_strats["agent_0"] + return str(save_dir), { + "critic": agent.critics.state_dict(), + "actor": agent.actor.state_dict(), + } + + +def compare_state_dicts(dict1, dict2) -> bool: + if dict1.keys() != dict2.keys(): + return False + for k in dict1: + v1, v2 = dict1[k], dict2[k] + if isinstance(v1, th.Tensor): + 
if not th.equal(v1, v2): + return False + elif isinstance(v1, dict): + if not compare_state_dicts(v1, v2): + return False + else: + if v1 != v2: + return False + return True + + +@pytest.mark.require_learning +def test_maddpg_algorithm_class(learning_role_n): + learning_role_n.initialize_policy() + assert isinstance(learning_role_n.rl_algorithm, DDPG) + + +@pytest.mark.require_learning +def test_maddpg_save_params_creates_files(learning_role_n, tmp_path): + learning_role_n.initialize_policy() + save_dir = tmp_path / "model_save_test" + + learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + + assert os.path.exists(save_dir / "critics" / "critic_agent_0.pt") + assert os.path.exists(save_dir / "critics" / "critic_agent_1.pt") + assert os.path.exists(save_dir / "actors" / "actor_agent_0.pt") + assert os.path.exists(save_dir / "actors" / "actor_agent_1.pt") + + +@pytest.mark.require_learning +def test_maddpg_save_params_u_id_order(learning_role_n, tmp_path): + learning_role_n.initialize_policy() + save_dir = tmp_path / "u_id_order_test" + + learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + + order_file = save_dir / "critics" / "u_id_order.json" + assert order_file.exists(), "u_id_order.json must be written alongside critic files" + with open(order_file) as f: + mapping = json.load(f) + assert mapping.get("u_id_order") == ["agent_0", "agent_1"] + + +@pytest.mark.require_learning +def test_maddpg_load_matching_n(base_learning_config, saved_n_agent_model): + save_dir, original_states = saved_n_agent_model + + config_new = copy(base_learning_config) + learn_new = Learning(config_new["learning_config"], start, end) + learn_new.rl_strats["agent_0"] = LearningStrategy( + **config_new, learning_role=learn_new + ) + learn_new.rl_strats["agent_1"] = LearningStrategy( + **config_new, learning_role=learn_new + ) + learn_new.initialize_policy() + learn_new.rl_algorithm.load_params(directory=save_dir) + + agent = learn_new.rl_strats["agent_0"] + + assert compare_state_dicts(original_states["critic"], agent.critics.state_dict()) + assert compare_state_dicts(original_states["actor"], agent.actor.state_dict()) + assert compare_state_dicts( + original_states["target_critic"], agent.target_critics.state_dict() + ) + assert compare_state_dicts( + original_states["target_actor"], agent.actor_target.state_dict() + ) + assert compare_state_dicts( + deepcopy(original_states["optimizer_critic"]), + deepcopy(agent.critics.optimizer.state_dict()), + ) + assert compare_state_dicts( + deepcopy(original_states["optimizer_actor"]), + deepcopy(agent.actor.optimizer.state_dict()), + ) + + +def make_state_dicts( + obs_base: int, + act_dim: int, + unique_obs: int, + old_id_order: list[str], + new_id_order: list[str], + hidden_dims: list[int], +): + import torch as th + + class FakeModel: + def __init__(self, sd): + self._sd = sd + + def state_dict(self): + return self._sd + + old_n = len(old_id_order) + new_n = len(new_id_order) + old_input_dim = obs_base + unique_obs * max(0, old_n - 1) + act_dim * old_n + new_input_dim = obs_base + unique_obs * max(0, new_n - 1) + act_dim * new_n + + # Build baseline for new model + baseline_new = {} + prefix = "q_layers" + baseline_new[f"{prefix}.0.weight"] = th.randn(hidden_dims[0], new_input_dim) + baseline_new[f"{prefix}.0.bias"] = th.randn(hidden_dims[0]) + for i in range(1, len(hidden_dims)): + baseline_new[f"{prefix}.{i}.weight"] = th.randn( + hidden_dims[i], hidden_dims[i - 1] + ) + baseline_new[f"{prefix}.{i}.bias"] = th.randn(hidden_dims[i]) + + # 
Build old_state with matching dims + old_state = {} + old_state[f"{prefix}.0.weight"] = th.randn(hidden_dims[0], old_input_dim) + 10.0 + old_state[f"{prefix}.0.bias"] = th.randn(hidden_dims[0]) + 20.0 + for i in range(1, len(hidden_dims)): + old_state[f"{prefix}.{i}.weight"] = baseline_new[f"{prefix}.{i}.weight"].clone() + old_state[f"{prefix}.{i}.bias"] = baseline_new[f"{prefix}.{i}.bias"].clone() + + model = FakeModel(baseline_new) + return model, old_state, baseline_new + + +@pytest.mark.require_learning +def test_ddpg_load_transfer_n_plus_m( + learning_role_n_plus_m, saved_n_agent_model, base_learning_config +): + """Saving a 2-agent DDPG model and loading it into a 3-agent setup must + transfer matching obs and action weight slices while leaving new-agent + slices at their random initialisation. + """ + save_dir, original_states = saved_n_agent_model + n_agents_old = 2 + n_agents_new = 3 + + learning_role_n_plus_m.initialize_policy() + + pre_state = deepcopy( + learning_role_n_plus_m.rl_strats["agent_0"].critics.state_dict() + ) + pre_opt_state = deepcopy( + learning_role_n_plus_m.rl_strats["agent_0"].critics.optimizer.state_dict() + ) + + learning_role_n_plus_m.rl_algorithm.load_params(directory=save_dir) + + post_state = learning_role_n_plus_m.rl_strats["agent_0"].critics.state_dict() + post_target = learning_role_n_plus_m.rl_strats[ + "agent_0" + ].target_critics.state_dict() + post_opt_state = learning_role_n_plus_m.rl_strats[ + "agent_0" + ].critics.optimizer.state_dict() + + assert not compare_state_dicts(pre_state, post_state) + + obs_base = base_learning_config["obs_dim"] + act_dim = base_learning_config["act_dim"] + unique_obs = base_learning_config["unique_obs_dim"] + + old_total_obs = obs_base + unique_obs * (n_agents_old - 1) + new_total_obs = obs_base + unique_obs * (n_agents_new - 1) + copy_agent_count = min(n_agents_old, n_agents_new) + copy_unique_obs_count = unique_obs * (copy_agent_count - 1) + copy_obs_end_idx = obs_base + copy_unique_obs_count + copy_action_count = act_dim * copy_agent_count + + for prefix in ["q_layers"]: + w_key = f"{prefix}.0.weight" + b_key = f"{prefix}.0.bias" + assert th.equal( + post_state[w_key][:, :obs_base], + original_states["critic"][w_key][:, :obs_base], + ) + # Matched unique-obs slices + if copy_obs_end_idx > obs_base: + assert th.equal( + post_state[w_key][:, obs_base:copy_obs_end_idx], + original_states["critic"][w_key][:, obs_base:copy_obs_end_idx], + ) + # Matched action slices + assert th.equal( + post_state[w_key][:, new_total_obs : new_total_obs + copy_action_count], + original_states["critic"][w_key][ + :, old_total_obs : old_total_obs + copy_action_count + ], + ) + + # Target critic must copy critic after transfer + assert compare_state_dicts(post_state, post_target) + # Optimizer state is preserved + assert compare_state_dicts(post_opt_state, pre_opt_state) + + +@pytest.mark.require_learning +def test_ddpg_load_transfer_n_minus_m( + learning_role_n, saved_n_plus_m_agent_model, base_learning_config +): + """Saving a 3-agent DDPG model and loading it into a 2-agent setup must + transfer only the overlapping obs and action weight slices. 
+ """ + save_dir, original_states = saved_n_plus_m_agent_model + n_agents_old = 3 + n_agents_new = 2 + + learning_role_n.initialize_policy() + + pre_state = deepcopy(learning_role_n.rl_strats["agent_0"].critics.state_dict()) + learning_role_n.rl_algorithm.load_params(directory=save_dir) + + post_state = learning_role_n.rl_strats["agent_0"].critics.state_dict() + post_target = learning_role_n.rl_strats["agent_0"].target_critics.state_dict() + + assert not compare_state_dicts(pre_state, post_state) + + obs_base = base_learning_config["obs_dim"] + act_dim = base_learning_config["act_dim"] + unique_obs = base_learning_config["unique_obs_dim"] + + old_total_obs = obs_base + unique_obs * (n_agents_old - 1) + new_total_obs = obs_base + unique_obs * (n_agents_new - 1) + copy_agent_count = min(n_agents_old, n_agents_new) + copy_unique_obs_count = unique_obs * (copy_agent_count - 1) + copy_obs_end_idx = obs_base + copy_unique_obs_count + copy_action_count = act_dim * copy_agent_count + + for prefix in ["q_layers"]: + w_key = f"{prefix}.0.weight" + + assert th.equal( + post_state[w_key][:, :obs_base], + original_states["critic"][w_key][:, :obs_base], + ) + if copy_obs_end_idx > obs_base: + assert th.equal( + post_state[w_key][:, obs_base:copy_obs_end_idx], + original_states["critic"][w_key][:, obs_base:copy_obs_end_idx], + ) + assert th.equal( + post_state[w_key][:, new_total_obs : new_total_obs + copy_action_count], + original_states["critic"][w_key][ + :, old_total_obs : old_total_obs + copy_action_count + ], + ) + + assert compare_state_dicts(post_state, post_target) + + +@pytest.mark.parametrize( + "new_id_order", + [ + ["pp_5", "pp_6", "pp_3", "pp_4", "st_1"], + ["pp_3", "pp_4", "st_1", "pp_5", "pp_6"], + ["pp_3", "pp_5", "pp_4", "pp_6", "st_1"], + ["pp_3", "st_1"], + ], +) +@pytest.mark.require_learning +def test_ddpg_transfer_weights_various_orders(new_id_order): + import torch as th + + from assume.reinforcement_learning.learning_utils import transfer_weights + + obs_base = 10 + act_dim = 3 + unique_obs = 2 + hidden_dims = [5, 4] + old_id_order = ["pp_3", "pp_4", "st_1"] + + model, old_state, baseline = make_state_dicts( + obs_base, act_dim, unique_obs, old_id_order, new_id_order, hidden_dims + ) + new_state = transfer_weights( + model, old_state, old_id_order, new_id_order, obs_base, act_dim, unique_obs + ) + assert isinstance(new_state, dict), "transfer_weights must return a dict for DDPG" + + old_n = len(old_id_order) + new_n = len(new_id_order) + old_obs_tot = obs_base + unique_obs * max(0, old_n - 1) + new_obs_tot = obs_base + unique_obs * max(0, new_n - 1) + + prefix = "q_layers" + w_old = old_state[f"{prefix}.0.weight"] + w_base = baseline[f"{prefix}.0.weight"] + w_new = new_state[f"{prefix}.0.weight"] + + # Shared obs_base copied from old + assert th.equal(w_new[:, :obs_base], w_old[:, :obs_base]) + + # unique_obs slices per agent + for new_idx, u in enumerate(new_id_order): + if new_idx == 0: + continue + s = obs_base + unique_obs * (new_idx - 1) + e = s + unique_obs + if u in old_id_order: + old_idx = old_id_order.index(u) + if old_idx > 0: + os_ = obs_base + unique_obs * (old_idx - 1) + oe = os_ + unique_obs + assert th.equal(w_new[:, s:e], w_old[:, os_:oe]) + else: + assert th.equal(w_new[:, s:e], w_base[:, s:e]) + + # action slices per agent + for new_idx, u in enumerate(new_id_order): + s = new_obs_tot + act_dim * new_idx + e = s + act_dim + if u in old_id_order: + old_idx = old_id_order.index(u) + os_ = old_obs_tot + act_dim * old_idx + oe = os_ + act_dim + assert 
th.equal(w_new[:, s:e], w_old[:, os_:oe]) + else: + assert th.equal(w_new[:, s:e], w_base[:, s:e]) + + +@pytest.mark.require_learning +def test_maddpg_load_corrupted_critic(tmp_path, base_learning_config): + config = copy(base_learning_config) + learning = Learning(config["learning_config"], start, end) + learning.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learning) + learning.initialize_policy() + + original_state = deepcopy(learning.rl_strats["agent_0"].critics.state_dict()) + + corrupted_dir = tmp_path / "critics" + corrupted_dir.mkdir(parents=True, exist_ok=True) + + corrupted_obj = { + "critic": original_state, + "critic_target": { + k: v[:1] if isinstance(v, th.Tensor) and v.ndim > 0 else v + for k, v in original_state.items() + }, + } + th.save(corrupted_obj, corrupted_dir / "critic_agent_0.pt") + learning.rl_algorithm.load_critic_params(directory=str(tmp_path)) + + loaded_state = learning.rl_strats["agent_0"].critics.state_dict() + assert compare_state_dicts(loaded_state, original_state) + + +@pytest.mark.parametrize( + "mod_field, mod_value, expected_error", + [ + ("foresight", 99, "All foresight values must be the same"), + ("act_dim", 99, "All action dimensions must be the same"), + ("unique_obs_dim", 99, "All unique_obs_dim values must be the same"), + ( + "num_timeseries_obs_dim", + 99, + "All num_timeseries_obs_dim values must be the same", + ), + ], +) +@pytest.mark.require_learning +def test_initialize_policy_dimension_mismatch( + base_learning_config, mod_field, mod_value, expected_error +): + config = copy(base_learning_config) + config["num_timeseries_obs_dim"] = 1 + + learn = Learning(config["learning_config"], start, end) + strat_0 = LearningStrategy(**config, learning_role=learn) + + config_mismatch = copy(config) + config_mismatch[mod_field] = mod_value + strat_1 = LearningStrategy(**config_mismatch, learning_role=learn) + + learn.rl_strats["agent_0"] = strat_0 + learn.rl_strats["agent_1"] = strat_1 + + with pytest.raises(ValueError, match=expected_error): + learn.rl_algorithm.initialize_policy() + + +@pytest.mark.require_learning +def test_initialize_policy_all_dimensions_match(base_learning_config): + config = copy(base_learning_config) + config["num_timeseries_obs_dim"] = 1 + + learn = Learning(config["learning_config"], start, end) + for agent_id in ("agent_0", "agent_1", "agent_2"): + learn.rl_strats[agent_id] = LearningStrategy(**config, learning_role=learn) + + try: + learn.rl_algorithm.initialize_policy() + except Exception as e: + pytest.fail(f"initialize_policy raised an unexpected error: {e}") diff --git a/tests/test_mappo.py b/tests/test_mappo.py new file mode 100644 index 000000000..cd69550ef --- /dev/null +++ b/tests/test_mappo.py @@ -0,0 +1,373 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import os +from copy import copy, deepcopy +from datetime import datetime + +import numpy as np +import pytest + +from assume.common.base import LearningConfig, OnPolicyConfig + +try: + import torch as th + + from assume.common.base import LearningStrategy + from assume.reinforcement_learning.algorithms.mappo import PPO + from assume.reinforcement_learning.buffer import RolloutBuffer + from assume.reinforcement_learning.learning_role import Learning + + +except ImportError: + pass + + +start = datetime(2023, 7, 1) +end = datetime(2023, 7, 2) + + +@pytest.fixture +def base_learning_config() -> dict: + foresight = 2 + unique_obs_dim = 2 + num_timeseries_obs_dim = 4 + return { + 
"foresight": foresight, + "act_dim": 3, + "unique_obs_dim": unique_obs_dim, + "num_timeseries_obs_dim": num_timeseries_obs_dim, + "obs_dim": foresight * num_timeseries_obs_dim + unique_obs_dim, + "learning_config": LearningConfig( + train_freq="1h", + algorithm="mappo", + actor_architecture="mlp", + learning_mode=True, + evaluation_mode=False, + training_episodes=10, + continue_learning=False, + trained_policies_save_path=None, + early_stopping_steps=10, + early_stopping_threshold=0.05, + learning_rate=1e-4, + batch_size=10, + gamma=0.99, + on_policy=OnPolicyConfig( + clip_ratio=0.2, + entropy_coef=0.01, + gae_lambda=0.95, + max_grad_norm=0.5, + vf_coef=0.5, + n_epochs=2, + ), + ), + } + + +@pytest.fixture(scope="function") +def learning_role_n(base_learning_config): + config = copy(base_learning_config) + learn = Learning(config["learning_config"], start, end) + for agent_id in ("agent_0", "agent_1"): + strategy = LearningStrategy(**config, learning_role=learn) + strategy.unit_id = agent_id + learn.rl_strats[agent_id] = strategy + return learn + + +@pytest.fixture(scope="function") +def saved_n_agent_model(learning_role_n, tmp_path) -> tuple[str, dict]: + """Save a 2-agent PPO model; return (save_dir, state_dict_snapshot).""" + learning_role_n.initialize_policy() + save_dir = tmp_path / "saved_model_n" + save_dir.mkdir(parents=True, exist_ok=True) + learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + agent = learning_role_n.rl_strats["agent_0"] + return str(save_dir), { + "critic": agent.critics.state_dict(), + "actor": agent.actor.state_dict(), + "optimizer_critic": agent.critics.optimizer.state_dict(), + "optimizer_actor": agent.actor.optimizer.state_dict(), + } + + +def compare_state_dicts(dict1, dict2) -> bool: + if dict1.keys() != dict2.keys(): + return False + for k in dict1: + v1, v2 = dict1[k], dict2[k] + if isinstance(v1, th.Tensor): + if not th.equal(v1, v2): + return False + elif isinstance(v1, dict): + if not compare_state_dicts(v1, v2): + return False + else: + if v1 != v2: + return False + return True + + +def _make_rollout_buffer( + obs_dim: int, + act_dim: int, + n_agents: int, + n_steps: int, + device: str = "cpu", +) -> "RolloutBuffer": + """Building and filling a RolloutBuffer with random data for update_policy tests.""" + buf = RolloutBuffer( + buffer_size=n_steps + 10, + obs_dim=obs_dim, + act_dim=act_dim, + n_rl_units=n_agents, + device=device, + float_type=th.float32, + gamma=0.99, + gae_lambda=0.95, + ) + rng = np.random.default_rng(42) + for _ in range(n_steps): + buf.add( + obs=rng.random((n_agents, obs_dim)).astype(np.float32), + action=rng.random((n_agents, act_dim)).astype(np.float32), + reward=rng.random(n_agents).astype(np.float32), + done=np.zeros(n_agents, dtype=np.float32), + value=rng.random(n_agents).astype(np.float32), + log_prob=(rng.random(n_agents).astype(np.float32) - 1.0), + ) + return buf + + +def _setup_for_update(learning_role) -> None: + """Setting minimal attributes needed.""" + learning_role.update_steps = 0 + learning_role.db_addr = None # disables the context.schedule_instant_message path + + +@pytest.mark.require_learning +def test_mappo_algorithm_class(learning_role_n): + """initialize_policy creates a PPO instance as the rl_algorithm.""" + learning_role_n.initialize_policy() + assert isinstance(learning_role_n.rl_algorithm, PPO) + + +@pytest.mark.require_learning +def test_mappo_save_params_creates_files(learning_role_n, tmp_path): + learning_role_n.initialize_policy() + save_dir = tmp_path / "model_save_test" + + 
learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + + assert os.path.exists(save_dir / "critics" / "critic_agent_0.pt") + assert os.path.exists(save_dir / "critics" / "critic_agent_1.pt") + assert os.path.exists(save_dir / "actors" / "actor_agent_0.pt") + assert os.path.exists(save_dir / "actors" / "actor_agent_1.pt") + + +@pytest.mark.require_learning +def test_mappo_save_params_u_id_order(learning_role_n, tmp_path): + learning_role_n.initialize_policy() + save_dir = tmp_path / "uid_order_test" + learning_role_n.rl_algorithm.save_params(directory=str(save_dir)) + + order_file = save_dir / "critics" / "u_id_order.json" + assert order_file.exists(), "u_id_order.json must be written alongside critic files" + with open(order_file) as f: + mapping = json.load(f) + assert mapping.get("u_id_order") == ["agent_0", "agent_1"] + + +@pytest.mark.require_learning +def test_mappo_load_matching_n(base_learning_config, saved_n_agent_model): + save_dir, original_states = saved_n_agent_model + + config_new = copy(base_learning_config) + learn_new = Learning(config_new["learning_config"], start, end) + learn_new.rl_strats["agent_0"] = LearningStrategy( + **config_new, learning_role=learn_new + ) + learn_new.rl_strats["agent_1"] = LearningStrategy( + **config_new, learning_role=learn_new + ) + learn_new.initialize_policy() + learn_new.rl_algorithm.load_params(directory=save_dir) + + agent = learn_new.rl_strats["agent_0"] + assert compare_state_dicts(original_states["critic"], agent.critics.state_dict()) + assert compare_state_dicts(original_states["actor"], agent.actor.state_dict()) + assert compare_state_dicts( + deepcopy(original_states["optimizer_critic"]), + deepcopy(agent.critics.optimizer.state_dict()), + ) + assert compare_state_dicts( + deepcopy(original_states["optimizer_actor"]), + deepcopy(agent.actor.optimizer.state_dict()), + ) + + +@pytest.mark.require_learning +def test_mappo_initialize_policy_dimension_mismatch(base_learning_config): + config = copy(base_learning_config) + config["num_timeseries_obs_dim"] = 1 + + learn = Learning(config["learning_config"], start, end) + strat_0 = LearningStrategy(**config, learning_role=learn) + + config_bad = copy(config) + config_bad["act_dim"] = 99 + strat_1 = LearningStrategy(**config_bad, learning_role=learn) + + learn.rl_strats["agent_0"] = strat_0 + learn.rl_strats["agent_1"] = strat_1 + + with pytest.raises(ValueError, match="All action dimensions must be the same"): + learn.rl_algorithm.initialize_policy() + + +@pytest.mark.require_learning +def test_mappo_initialize_policy_all_dimensions_match(base_learning_config): + config = copy(base_learning_config) + config["num_timeseries_obs_dim"] = 1 + + learn = Learning(config["learning_config"], start, end) + for agent_id in ("agent_0", "agent_1", "agent_2"): + learn.rl_strats[agent_id] = LearningStrategy(**config, learning_role=learn) + + try: + learn.rl_algorithm.initialize_policy() + except Exception as e: + pytest.fail(f"initialize_policy raised an unexpected error: {e}") + + +@pytest.mark.require_learning +def test_mappo_buffer_storage_uses_rl_strats_order(base_learning_config): + """Regression test for the agent-ordering bug. + + The on-policy buffer-storage path used to call + ``sorted(cache["obs"][timestamp].keys())`` to order agents, while + ``mappo.PPO.update_policy`` iterates ``self.rl_strats.values()``. When + the unit ids do not happen to be alphabetically sorted (e.g. 
+ ``pp_6, pp_7, pp_8, pp_9, pp_10``) the two orders diverge and every + agent is trained on a different agent's observations / actions / values, + silently degrading MAPPO to noise. + + This test pins ``learning_role`` to use the ``rl_strats`` insertion order + when filling the rollout buffer, exactly like the off-policy algorithms + already do. + """ + import asyncio + from collections import defaultdict + + config = copy(base_learning_config) + + learn = Learning(config["learning_config"], start, end) + insertion_order = ("pp_6", "pp_7", "pp_8", "pp_9", "pp_10") + assert sorted(insertion_order) != list(insertion_order), ( + "test scenario must use unit ids whose sort order differs from " + "insertion order; otherwise this regression test is trivially passing" + ) + + for agent_id in insertion_order: + strat = LearningStrategy(**config, learning_role=learn) + strat.unit_id = agent_id + learn.rl_strats[agent_id] = strat + + learn.initialize_policy() + + n_agents = len(insertion_order) + # ``LearningStrategy`` computes ``self.obs_dim`` from + # ``num_timeseries_obs_dim * foresight + unique_obs_dim``, so we must + # match that here for the fake centralized-critic input to align. + obs_dim = ( + config["num_timeseries_obs_dim"] * config["foresight"] + + config["unique_obs_dim"] + ) + act_dim = config["act_dim"] + + # Build a fake rollout buffer large enough to hold one fake timestep. + learn.buffer = RolloutBuffer( + buffer_size=4, + obs_dim=obs_dim, + act_dim=act_dim, + n_rl_units=n_agents, + device="cpu", + float_type=th.float32, + gamma=0.99, + gae_lambda=0.95, + ) + + # Craft a cache where each unit's observation/action/reward is a unique + # constant equal to (1+i)*10, so we can assert that the row for agent i in + # the buffer matches the i-th *insertion-order* unit, not the i-th + # *sorted-order* unit. + timestamp = "2023-07-01 00:00:00" + cache = { + "obs": {timestamp: {}}, + "actions": {timestamp: {}}, + "rewards": {timestamp: {}}, + "noises": {timestamp: {}}, + "regret": {timestamp: {}}, + "profit": {timestamp: {}}, + "values": {timestamp: defaultdict(list)}, + "log_probs": {timestamp: {}}, + "dones": {timestamp: {}}, + } + for i, unit_id in enumerate(insertion_order): + marker = float(i + 1) + cache["obs"][timestamp][unit_id] = [ + th.full((obs_dim,), marker, dtype=th.float32) + ] + cache["actions"][timestamp][unit_id] = [ + th.full((act_dim,), marker, dtype=th.float32) + ] + cache["rewards"][timestamp][unit_id] = [marker] + cache["noises"][timestamp][unit_id] = [ + th.zeros(act_dim, dtype=th.float32) + ] + cache["regret"][timestamp][unit_id] = [0.0] + cache["profit"][timestamp][unit_id] = [0.0] + cache["log_probs"][timestamp][unit_id] = [-marker] + cache["dones"][timestamp][unit_id] = [0.0] + # leave cache["values"][timestamp] empty - mappo recomputes values + + # Stash db_addr/update_steps so the logging path inside the algorithm is + # safe to call. We do NOT need an actual policy update for this test, so + # we monkey-patch update_policy to a no-op. + learn.db_addr = None + learn.update_steps = 0 + learn.rl_algorithm.update_policy = lambda: None + + asyncio.run( + learn._store_to_buffer_and_update_sync(cache, learn.device) + ) + + buf = learn.buffer + # One timestamp -> one row in the buffer. 
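+ # Row i of every stored field must carry marker (i + 1) from the i-th
+ # insertion-order unit; a sorted-order regression would permute the rows.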
+ assert buf.pos == 1, f"expected 1 transition, got {buf.pos}"
+
+ stored_obs = buf.observations[0]
+ stored_actions = buf.actions[0]
+ stored_rewards = buf.rewards[0]
+ stored_log_probs = buf.log_probs[0]
+
+ for i in range(n_agents):
+ expected = float(i + 1)
+ assert np.allclose(stored_obs[i], expected), (
+ f"row {i} of buffer.observations should match insertion-order "
+ f"agent {insertion_order[i]} (value {expected}); got {stored_obs[i]}"
+ )
+ assert np.allclose(stored_actions[i], expected), (
+ f"row {i} of buffer.actions should match insertion-order "
+ f"agent {insertion_order[i]} (value {expected}); got {stored_actions[i]}"
+ )
+ assert np.allclose(stored_rewards[i], expected), (
+ f"row {i} of buffer.rewards should match insertion-order "
+ f"agent {insertion_order[i]} (value {expected}); got {stored_rewards[i]}"
+ )
+ assert np.allclose(stored_log_probs[i], -expected), (
+ f"row {i} of buffer.log_probs should match insertion-order "
+ f"agent {insertion_order[i]} (value {-expected}); got {stored_log_probs[i]}"
+ )
diff --git a/tests/test_matd3.py b/tests/test_matd3.py
index 0471f84eb..1d6fdbf35 100644
--- a/tests/test_matd3.py
+++ b/tests/test_matd3.py
@@ -9,7 +9,7 @@
 import pytest
-from assume.common.base import LearningConfig
+from assume.common.base import LearningConfig, OffPolicyConfig
 try:
 import torch as th
@@ -46,19 +46,21 @@ def base_learning_config() -> dict:
 learning_mode=True,
 evaluation_mode=False,
 training_episodes=1,
- episodes_collecting_initial_experience=0,
 continue_learning=False,
 trained_policies_save_path=None,
 early_stopping_steps=10,
 early_stopping_threshold=0.05,
 learning_rate=1e-4,
 batch_size=100,
- tau=0.005,
 gamma=0.99,
- gradient_steps=1,
- policy_delay=2,
- target_policy_noise=0.2,
- target_noise_clip=0.5,
+ off_policy=OffPolicyConfig(
+ episodes_collecting_initial_experience=0,
+ gradient_steps=1,
+ tau=0.005,
+ policy_delay=2,
+ target_policy_noise=0.2,
+ target_noise_clip=0.5,
+ ),
 ),
 }
diff --git a/tests/test_public_api.py b/tests/test_public_api.py
new file mode 100644
index 000000000..a11e047b8
--- /dev/null
+++ b/tests/test_public_api.py
@@ -0,0 +1,303 @@
+# SPDX-FileCopyrightText: ASSUME Developers
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Tests verifying that the public API symbols are importable consistently.
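+All three layers must hand out the identical class objects; identity is
+checked explicitly in the layer-consistency test below.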
+
+The covered import layers are:
+- assume.reinforcement_learning.algorithms (the algorithm-level package)
+- assume.reinforcement_learning (the RL sub-package)
+- assume (the top-level package)
+"""
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Layer 1 – assume.reinforcement_learning.algorithms
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.require_learning
+class TestAlgorithmsPackageExports:
+ """All algorithm classes and helpers re-exported from the algorithms package."""
+
+ def test_import_rl_algorithm_base(self):
+ from assume.reinforcement_learning.algorithms import RLAlgorithm
+
+ assert RLAlgorithm is not None
+
+ def test_import_a2c_algorithm_base(self):
+ from assume.reinforcement_learning.algorithms import A2CAlgorithm
+
+ assert A2CAlgorithm is not None
+
+ def test_import_td3(self):
+ from assume.reinforcement_learning.algorithms import TD3
+
+ assert TD3 is not None
+
+ def test_import_ddpg(self):
+ from assume.reinforcement_learning.algorithms import DDPG
+
+ assert DDPG is not None
+
+ def test_import_ppo(self):
+ from assume.reinforcement_learning.algorithms import PPO
+
+ assert PPO is not None
+
+ def test_import_mlp_actor(self):
+ from assume.reinforcement_learning.algorithms import MLPActor
+
+ assert MLPActor is not None
+
+ def test_import_lstm_actor(self):
+ from assume.reinforcement_learning.algorithms import LSTMActor
+
+ assert LSTMActor is not None
+
+ def test_import_actor_architecture_aliases(self):
+ from assume.reinforcement_learning.algorithms import actor_architecture_aliases
+
+ assert "mlp" in actor_architecture_aliases
+ assert "lstm" in actor_architecture_aliases
+
+ def test_algorithm_hierarchy(self):
+ """TD3, DDPG, PPO must all be subclasses of A2CAlgorithm → RLAlgorithm."""
+ from assume.reinforcement_learning.algorithms import (
+ DDPG,
+ PPO,
+ TD3,
+ A2CAlgorithm,
+ RLAlgorithm,
+ )
+
+ for cls in (TD3, DDPG, PPO):
+ assert issubclass(cls, A2CAlgorithm), (
+ f"{cls.__name__} not subclass of A2CAlgorithm"
+ )
+ assert issubclass(cls, RLAlgorithm), (
+ f"{cls.__name__} not subclass of RLAlgorithm"
+ )
+
+ def test_actor_aliases_map_to_nn_modules(self):
+ from torch import nn
+
+ from assume.reinforcement_learning.algorithms import actor_architecture_aliases
+
+ for name, cls in actor_architecture_aliases.items():
+ assert issubclass(cls, nn.Module), (
+ f"alias '{name}' does not map to an nn.Module"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Layer 2 – assume.reinforcement_learning
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.require_learning
+class TestRLPackageExports:
+ """All public symbols re-exported from the reinforcement_learning sub-package."""
+
+ def test_import_learning(self):
+ from assume.reinforcement_learning import Learning
+
+ assert Learning is not None
+
+ def test_import_rl_algorithm(self):
+ from assume.reinforcement_learning import RLAlgorithm
+
+ assert RLAlgorithm is not None
+
+ def test_import_a2c_algorithm(self):
+ from assume.reinforcement_learning import A2CAlgorithm
+
+ assert A2CAlgorithm is not None
+
+ def test_import_td3(self):
+ from assume.reinforcement_learning import TD3
+
+ assert TD3 is not None
+
+ def test_import_ddpg(self):
+ from assume.reinforcement_learning import DDPG
+
+ assert DDPG is not None
+
+ def test_import_ppo(self):
+ from assume.reinforcement_learning import PPO
+
+ assert PPO 
is not None + + def test_import_mlp_actor(self): + from assume.reinforcement_learning import MLPActor + + assert MLPActor is not None + + def test_import_lstm_actor(self): + from assume.reinforcement_learning import LSTMActor + + assert LSTMActor is not None + + def test_import_actor_architecture_aliases(self): + from assume.reinforcement_learning import actor_architecture_aliases + + assert isinstance(actor_architecture_aliases, dict) + + def test_import_replay_buffer(self): + from assume.reinforcement_learning import ReplayBuffer + + assert ReplayBuffer is not None + + def test_import_replay_buffer_samples(self): + from assume.reinforcement_learning import ReplayBufferSamples + + assert ReplayBufferSamples is not None + + def test_import_rollout_buffer(self): + from assume.reinforcement_learning import RolloutBuffer + + assert RolloutBuffer is not None + + def test_import_rollout_buffer_samples(self): + from assume.reinforcement_learning import RolloutBufferSamples + + assert RolloutBufferSamples is not None + + def test_all_declared(self): + """Every symbol listed in __all__ must actually be importable.""" + import assume.reinforcement_learning as rl_pkg + + for name in rl_pkg.__all__: + assert hasattr(rl_pkg, name), f"__all__ entry '{name}' missing from module" + + def test_replay_buffer_and_rollout_buffer_are_distinct(self): + from assume.reinforcement_learning import ReplayBuffer, RolloutBuffer + + assert ReplayBuffer is not RolloutBuffer + + def test_buffer_samples_are_distinct(self): + from assume.reinforcement_learning import ( + ReplayBufferSamples, + RolloutBufferSamples, + ) + + assert ReplayBufferSamples is not RolloutBufferSamples + + +# --------------------------------------------------------------------------- +# Layer 3 – assume (top-level package) +# --------------------------------------------------------------------------- + + +@pytest.mark.require_learning +class TestTopLevelPackageRLExports: + """RL symbols must be reachable directly from `import assume`.""" + + def test_import_learning(self): + import assume + + assert hasattr(assume, "Learning") + + def test_import_rl_algorithm(self): + import assume + + assert hasattr(assume, "RLAlgorithm") + + def test_import_a2c_algorithm(self): + import assume + + assert hasattr(assume, "A2CAlgorithm") + + def test_import_td3(self): + import assume + + assert hasattr(assume, "TD3") + + def test_import_ddpg(self): + import assume + + assert hasattr(assume, "DDPG") + + def test_import_ppo(self): + import assume + + assert hasattr(assume, "PPO") + + def test_import_mlp_actor(self): + import assume + + assert hasattr(assume, "MLPActor") + + def test_import_lstm_actor(self): + import assume + + assert hasattr(assume, "LSTMActor") + + def test_import_actor_architecture_aliases(self): + import assume + + assert hasattr(assume, "actor_architecture_aliases") + + def test_import_replay_buffer(self): + import assume + + assert hasattr(assume, "ReplayBuffer") + + def test_import_replay_buffer_samples(self): + import assume + + assert hasattr(assume, "ReplayBufferSamples") + + def test_import_rollout_buffer(self): + import assume + + assert hasattr(assume, "RolloutBuffer") + + def test_import_rollout_buffer_samples(self): + import assume + + assert hasattr(assume, "RolloutBufferSamples") + + def test_all_declared(self): + """Every symbol in top-level __all__ must exist on the module.""" + import assume + + for name in assume.__all__: + assert hasattr(assume, name), f"__all__ entry '{name}' missing from assume" + + def 
test_rl_symbols_consistent_across_layers(self): + """The same class object must be reachable from all three import paths.""" + import assume + import assume.reinforcement_learning as rl + from assume.reinforcement_learning.algorithms import DDPG, PPO, TD3 + + for name, algo_cls in [("TD3", TD3), ("DDPG", DDPG), ("PPO", PPO)]: + assert getattr(rl, name) is algo_cls, ( + f"rl.{name} is not the same object as algorithms.{name}" + ) + assert getattr(assume, name) is algo_cls, ( + f"assume.{name} is not the same object as algorithms.{name}" + ) + + def test_version_still_present(self): + import assume + + assert hasattr(assume, "__version__") + assert isinstance(assume.__version__, str) + + def test_non_rl_symbols_unchanged(self): + """Core non-RL exports (World, MarketConfig, etc.) must still be present.""" + import assume + + for name in ( + "World", + "MarketConfig", + "MarketProduct", + "load_scenario_folder", + "run_learning", + ): + assert hasattr(assume, name), ( + f"Pre-existing export '{name}' missing after __init__ update" + ) diff --git a/tests/test_rl_buffer.py b/tests/test_rl_replaybuffer.py similarity index 100% rename from tests/test_rl_buffer.py rename to tests/test_rl_replaybuffer.py diff --git a/tests/test_rl_rolloutbuffer.py b/tests/test_rl_rolloutbuffer.py new file mode 100644 index 000000000..50410a4be --- /dev/null +++ b/tests/test_rl_rolloutbuffer.py @@ -0,0 +1,504 @@ +# SPDX-FileCopyrightText: ASSUME Developers +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +import numpy as np +import pytest + +try: + import torch as th + + from assume.reinforcement_learning.buffer import ( + RolloutBuffer, + RolloutBufferSamples, + ) +except ImportError: + pass + + +def make_rollout_buffer( + buffer_size=8, + obs_dim=3, + act_dim=2, + n_rl_units=2, + gamma=0.99, + gae_lambda=0.95, +): + return RolloutBuffer( + buffer_size=buffer_size, + obs_dim=obs_dim, + act_dim=act_dim, + n_rl_units=n_rl_units, + device=th.device("cpu"), + float_type=th.float32, + gamma=gamma, + gae_lambda=gae_lambda, + ) + + +def fill_buffer(buf, n_steps=None, seed=0): + rng = np.random.default_rng(seed) + n = n_steps if n_steps is not None else buf.buffer_size + for _ in range(n): + obs = rng.random((buf.n_rl_units, buf.obs_dim)).astype(np.float32) + act = rng.random((buf.n_rl_units, buf.act_dim)).astype(np.float32) + rew = rng.random(buf.n_rl_units).astype(np.float32) + done = np.zeros(buf.n_rl_units, dtype=np.float32) + val = rng.random(buf.n_rl_units).astype(np.float32) + lp = rng.random(buf.n_rl_units).astype(np.float32) - 1.0 + buf.add(obs, act, rew, done, val, lp) + + +@pytest.mark.require_learning +def test_rollout_buffer_init_shapes(): + buf = make_rollout_buffer(buffer_size=10, obs_dim=3, act_dim=2, n_rl_units=4) + assert buf.observations.shape == (10, 4, 3) + assert buf.actions.shape == (10, 4, 2) + assert buf.rewards.shape == (10, 4) + assert buf.values.shape == (10, 4) + assert buf.log_probs.shape == (10, 4) + assert buf.dones.shape == (10, 4) + assert buf.advantages.shape == (10, 4) + assert buf.returns.shape == (10, 4) + + +@pytest.mark.require_learning +def test_rollout_buffer_init_state(): + buf = make_rollout_buffer() + assert buf.pos == 0 + assert buf.full is False + assert buf.generator_ready is False + assert buf.size() == 0 + + +@pytest.mark.require_learning +def test_rollout_buffer_reset_clears_data(): + buf = make_rollout_buffer(buffer_size=4) + fill_buffer(buf, n_steps=4) + assert buf.pos == 4 + + buf.reset() + assert buf.pos == 0 + assert buf.full is False + assert buf.generator_ready is 
False + assert np.all(buf.observations == 0) + assert np.all(buf.rewards == 0) + assert np.all(buf.advantages == 0) + assert np.all(buf.returns == 0) + + +@pytest.mark.require_learning +def test_rollout_buffer_add_increments_pos(): + buf = make_rollout_buffer(buffer_size=5) + obs = np.ones((buf.n_rl_units, buf.obs_dim), dtype=np.float32) + act = np.ones((buf.n_rl_units, buf.act_dim), dtype=np.float32) + rew = np.ones(buf.n_rl_units, dtype=np.float32) + done = np.zeros(buf.n_rl_units, dtype=np.float32) + val = np.ones(buf.n_rl_units, dtype=np.float32) + lp = np.zeros(buf.n_rl_units, dtype=np.float32) + + for i in range(1, 6): + buf.add(obs, act, rew, done, val, lp) + assert buf.pos == i + assert buf.size() == i + + +@pytest.mark.require_learning +def test_rollout_buffer_add_stores_correct_values(): + buf = make_rollout_buffer(buffer_size=4, obs_dim=2, act_dim=2, n_rl_units=1) + obs = np.array([[1.0, 2.0]], dtype=np.float32) + act = np.array([[0.5, -0.5]], dtype=np.float32) + rew = np.array([3.0], dtype=np.float32) + done = np.array([0.0], dtype=np.float32) + val = np.array([0.7], dtype=np.float32) + lp = np.array([-1.2], dtype=np.float32) + + buf.add(obs, act, rew, done, val, lp) + + np.testing.assert_array_almost_equal(buf.observations[0, 0], [1.0, 2.0]) + np.testing.assert_array_almost_equal(buf.actions[0, 0], [0.5, -0.5]) + assert buf.rewards[0, 0] == pytest.approx(3.0) + assert buf.dones[0, 0] == pytest.approx(0.0) + assert buf.values[0, 0] == pytest.approx(0.7) + assert buf.log_probs[0, 0] == pytest.approx(-1.2) + + +@pytest.mark.require_learning +def test_rollout_buffer_add_beyond_capacity_sets_full(): + """The buffer becomes ``full`` after the last in-bounds add and refuses + to silently drop further transitions. + + Silent drops mask configuration errors (e.g. an over-sized rollout window) + and were one of the bugs that hid the MAPPO ordering issue. The buffer + must instead raise ``OverflowError`` so the caller has a chance to react. 
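+
+ Callers make room for the next rollout via ``reset()``, which the
+ full-episode test at the end of this module exercises.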
+ """ + buf = make_rollout_buffer(buffer_size=3) + obs = np.zeros((buf.n_rl_units, buf.obs_dim), dtype=np.float32) + act = np.zeros((buf.n_rl_units, buf.act_dim), dtype=np.float32) + rew = np.zeros(buf.n_rl_units, dtype=np.float32) + done = np.zeros(buf.n_rl_units, dtype=np.float32) + val = np.zeros(buf.n_rl_units, dtype=np.float32) + lp = np.zeros(buf.n_rl_units, dtype=np.float32) + + for _ in range(3): + buf.add(obs, act, rew, done, val, lp) + + assert buf.pos == 3 + assert buf.size() == 3 + assert buf.full is True + + with pytest.raises(OverflowError): + buf.add(obs, act, rew, done, val, lp) + assert buf.full is True + assert buf.size() == 3 + + +@pytest.mark.require_learning +def test_gae_single_step_non_terminal(): + """For 1 step, 1 agent, non-terminal: advantage = TD error.""" + gamma, gae_lambda = 0.99, 0.95 + buf = make_rollout_buffer( + buffer_size=1, + obs_dim=1, + act_dim=1, + n_rl_units=1, + gamma=gamma, + gae_lambda=gae_lambda, + ) + r, v, v_next = 1.0, 0.5, 0.8 + buf.add( + obs=np.array([[0.0]]), + action=np.array([[0.0]]), + reward=np.array([r]), + done=np.array([0.0]), + value=np.array([v]), + log_prob=np.array([0.0]), + ) + + buf.compute_returns_and_advantages( + last_values=np.array([v_next]), + dones=np.array([0.0]), + ) + + expected_advantage = r + gamma * v_next - v + expected_return = expected_advantage + v + + assert buf.advantages[0, 0] == pytest.approx(expected_advantage, abs=1e-5) + assert buf.returns[0, 0] == pytest.approx(expected_return, abs=1e-5) + + +@pytest.mark.require_learning +def test_gae_single_step_terminal(): + """For a terminal episode end, bootstrap value must not propagate.""" + gamma, gae_lambda = 0.99, 0.95 + buf = make_rollout_buffer( + buffer_size=1, + obs_dim=1, + act_dim=1, + n_rl_units=1, + gamma=gamma, + gae_lambda=gae_lambda, + ) + r, v = 2.0, 1.0 + buf.add( + obs=np.array([[0.0]]), + action=np.array([[0.0]]), + reward=np.array([r]), + done=np.array([0.0]), + value=np.array([v]), + log_prob=np.array([0.0]), + ) + + # done=1 — so no bootstrapping from last_values + buf.compute_returns_and_advantages( + last_values=np.array([999.0]), + dones=np.array([1.0]), + ) + + expected_advantage = r - v + expected_return = expected_advantage + v # = r + + assert buf.advantages[0, 0] == pytest.approx(expected_advantage, abs=1e-5) + assert buf.returns[0, 0] == pytest.approx(expected_return, abs=1e-5) + + +@pytest.mark.require_learning +def test_gae_multi_step_manual(): + """Manually verify 2-step GAE for a single agent.""" + gamma, gae_lambda = 0.99, 0.95 + buf = make_rollout_buffer( + buffer_size=2, + obs_dim=1, + act_dim=1, + n_rl_units=1, + gamma=gamma, + gae_lambda=gae_lambda, + ) + r0, v0 = 1.0, 0.4 + r1, v1 = 0.5, 0.6 + v_next = 0.8 + + for r, v in [(r0, v0), (r1, v1)]: + buf.add( + obs=np.array([[0.0]]), + action=np.array([[0.0]]), + reward=np.array([r]), + done=np.array([0.0]), + value=np.array([v]), + log_prob=np.array([0.0]), + ) + + buf.compute_returns_and_advantages( + last_values=np.array([v_next]), + dones=np.array([0.0]), + ) + + delta_1 = r1 + gamma * v_next - v1 + gae_1 = delta_1 + + delta_0 = r0 + gamma * v1 - v0 + gae_0 = delta_0 + gamma * gae_lambda * gae_1 + + assert buf.advantages[0, 0] == pytest.approx(gae_0, abs=1e-5) + assert buf.advantages[1, 0] == pytest.approx(gae_1, abs=1e-5) + assert buf.returns[0, 0] == pytest.approx(gae_0 + v0, abs=1e-5) + assert buf.returns[1, 0] == pytest.approx(gae_1 + v1, abs=1e-5) + + +@pytest.mark.require_learning +def test_gae_lambda_zero_equals_td_error(): + """gae_lambda=0 reduces GAE to a 1-step 
TD advantage per step."""
+ gamma, gae_lambda = 0.99, 0.0
+ buf = make_rollout_buffer(
+ buffer_size=3,
+ obs_dim=1,
+ act_dim=1,
+ n_rl_units=1,
+ gamma=gamma,
+ gae_lambda=gae_lambda,
+ )
+ rewards = [1.0, 0.5, 2.0]
+ values = [0.4, 0.6, 0.3]
+ v_next = 0.8
+
+ for r, v in zip(rewards, values):
+ buf.add(
+ obs=np.array([[0.0]]),
+ action=np.array([[0.0]]),
+ reward=np.array([r]),
+ done=np.array([0.0]),
+ value=np.array([v]),
+ log_prob=np.array([0.0]),
+ )
+
+ buf.compute_returns_and_advantages(
+ last_values=np.array([v_next]),
+ dones=np.array([0.0]),
+ )
+
+ next_vals = [values[1], values[2], v_next]
+ for step, (r, v, nv) in enumerate(zip(rewards, values, next_vals)):
+ expected = r + gamma * nv - v
+ assert buf.advantages[step, 0] == pytest.approx(expected, abs=1e-5)
+
+
+@pytest.mark.require_learning
+def test_gae_lambda_one_gamma_one_monte_carlo():
+ """With gamma=1 and gae_lambda=1 on a terminal episode, returns must equal
+ the undiscounted reward-to-go sums (Monte Carlo returns).
+ """
+ gamma, gae_lambda = 1.0, 1.0
+ T = 4
+ buf = make_rollout_buffer(
+ buffer_size=T,
+ obs_dim=1,
+ act_dim=1,
+ n_rl_units=1,
+ gamma=gamma,
+ gae_lambda=gae_lambda,
+ )
+ rewards = [1.0, 1.0, 1.0, 1.0]
+ values = [0.1] * T
+
+ for r, v in zip(rewards, values):
+ buf.add(
+ obs=np.array([[0.0]]),
+ action=np.array([[0.0]]),
+ reward=np.array([r]),
+ done=np.array([0.0]),
+ value=np.array([v]),
+ log_prob=np.array([0.0]),
+ )
+
+ buf.compute_returns_and_advantages(
+ last_values=np.array([0.0]),
+ dones=np.array([1.0]),
+ )
+
+ for t in range(T):
+ assert buf.returns[t, 0] == pytest.approx(float(T - t), abs=1e-5)
+
+
+@pytest.mark.require_learning
+def test_gae_multi_agent_independence():
+ """One agent's rewards must not leak into another agent's advantages."""
+ gamma, gae_lambda = 0.99, 0.95
+ buf = make_rollout_buffer(
+ buffer_size=3,
+ obs_dim=1,
+ act_dim=1,
+ n_rl_units=2,
+ gamma=gamma,
+ gae_lambda=gae_lambda,
+ )
+
+ for _ in range(3):
+ buf.add(
+ obs=np.zeros((2, 1), dtype=np.float32),
+ action=np.zeros((2, 1), dtype=np.float32),
+ reward=np.array([1.0, 0.0]),
+ done=np.zeros(2, dtype=np.float32),
+ value=np.array([0.5, 0.5]),
+ log_prob=np.zeros(2, dtype=np.float32),
+ )
+
+ buf.compute_returns_and_advantages(
+ last_values=np.array([0.5, 0.5]),
+ dones=np.zeros(2),
+ )
+
+ for t in range(3):
+ assert abs(buf.advantages[t, 1]) < abs(buf.advantages[t, 0]), (
+ f"step {t}: agent-1 advantage {buf.advantages[t, 1]:.4f} should be "
+ f"smaller than agent-0 advantage {buf.advantages[t, 0]:.4f}"
+ )
+
+
+@pytest.mark.require_learning
+def test_gae_returns_equal_advantages_plus_values():
+ """returns == advantages + values for every step and agent."""
+ buf = make_rollout_buffer(buffer_size=6, n_rl_units=3)
+ fill_buffer(buf, n_steps=6)
+
+ last_values = np.random.rand(3).astype(np.float32)
+ buf.compute_returns_and_advantages(last_values, dones=np.zeros(3, dtype=np.float32))
+
+ np.testing.assert_array_almost_equal(
+ buf.returns[: buf.pos],
+ buf.advantages[: buf.pos] + buf.values[: buf.pos],
+ decimal=5,
+ )
+
+
+@pytest.mark.require_learning
+def test_rollout_buffer_get_raises_before_compute():
+ """Calling get() before compute_returns_and_advantages must raise ValueError."""
+ buf = make_rollout_buffer(buffer_size=4)
+ fill_buffer(buf, n_steps=4)
+
+ with pytest.raises(ValueError, match="compute_returns_and_advantages"):
+ next(buf.get(batch_size=2))
+
+
+@pytest.mark.require_learning
+def test_rollout_buffer_get_full_batch():
+ """get(batch_size=None) yields one batch with all steps and correct shapes."""
+ buf = 
make_rollout_buffer(buffer_size=5, obs_dim=3, act_dim=2, n_rl_units=2) + fill_buffer(buf, n_steps=5) + buf.compute_returns_and_advantages( + last_values=np.zeros(2, dtype=np.float32), + dones=np.zeros(2, dtype=np.float32), + ) + + batches = list(buf.get(batch_size=None)) + assert len(batches) == 1 + + batch = batches[0] + assert isinstance(batch, RolloutBufferSamples) + assert batch.observations.shape == (5, 2, 3) + assert batch.actions.shape == (5, 2, 2) + assert batch.old_values.shape == (5, 2) + assert batch.old_log_probs.shape == (5, 2) + assert batch.advantages.shape == (5, 2) + assert batch.returns.shape == (5, 2) + + +@pytest.mark.require_learning +def test_rollout_buffer_get_mini_batches_cover_all_steps(): + """Mini-batch iteration must cover every step exactly once.""" + T = 8 + buf = make_rollout_buffer(buffer_size=T, obs_dim=2, act_dim=1, n_rl_units=1) + fill_buffer(buf, n_steps=T) + buf.compute_returns_and_advantages( + last_values=np.zeros(1, dtype=np.float32), + dones=np.zeros(1, dtype=np.float32), + ) + + total_samples = 0 + for batch in buf.get(batch_size=2): + assert isinstance(batch, RolloutBufferSamples) + total_samples += batch.observations.shape[0] + + assert total_samples == T + + +@pytest.mark.require_learning +def test_rollout_buffer_get_partial_fill(): + """A partially-filled buffer must only yield the filled steps.""" + buf = make_rollout_buffer(buffer_size=10, obs_dim=2, act_dim=1, n_rl_units=1) + fill_buffer(buf, n_steps=4) + buf.compute_returns_and_advantages( + last_values=np.zeros(1, dtype=np.float32), + dones=np.zeros(1, dtype=np.float32), + ) + + batches = list(buf.get(batch_size=None)) + assert batches[0].observations.shape[0] == 4 + + +@pytest.mark.require_learning +def test_full_episode_rollout(): + """fill -> GAE -> mini-batch epochs -> reset""" + T, obs_dim, act_dim, n_agents = 16, 5, 3, 2 + buf = make_rollout_buffer( + buffer_size=T, + obs_dim=obs_dim, + act_dim=act_dim, + n_rl_units=n_agents, + gamma=0.99, + gae_lambda=0.95, + ) + + rng = np.random.default_rng(42) + for _ in range(T): + buf.add( + obs=rng.random((n_agents, obs_dim)).astype(np.float32), + action=rng.random((n_agents, act_dim)).astype(np.float32), + reward=rng.random(n_agents).astype(np.float32), + done=np.zeros(n_agents, dtype=np.float32), + value=rng.random(n_agents).astype(np.float32), + log_prob=-rng.random(n_agents).astype(np.float32), + ) + + assert buf.size() == T + + last_values = rng.random(n_agents).astype(np.float32) + buf.compute_returns_and_advantages(last_values, dones=np.zeros(n_agents)) + + # returns == advantages + values + np.testing.assert_array_almost_equal( + buf.returns, buf.advantages + buf.values, decimal=5 + ) + + # Two PPO epochs over mini-batches of size 4 + for _epoch in range(2): + samples_seen = 0 + for batch in buf.get(batch_size=4): + assert batch.observations.shape == (4, n_agents, obs_dim) + assert batch.actions.shape == (4, n_agents, act_dim) + samples_seen += batch.observations.shape[0] + assert samples_seen == T + + # Reset for next rollout + buf.reset() + assert buf.pos == 0 + assert buf.generator_ready is False + assert buf.size() == 0
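+
+
+@pytest.mark.require_learning
+def test_gae_matches_reference_recursion():
+ """Cross-check compute_returns_and_advantages against a direct
+ backward recursion.
+
+ A minimal sketch that relies only on the buffer API exercised above:
+ for non-terminal steps with bootstrap value ``v_last``,
+
+ delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
+ A_t = delta_t + gamma * gae_lambda * A_{t+1}
+ """
+ gamma, gae_lambda = 0.99, 0.95
+ T = 5
+ buf = make_rollout_buffer(
+ buffer_size=T,
+ obs_dim=1,
+ act_dim=1,
+ n_rl_units=1,
+ gamma=gamma,
+ gae_lambda=gae_lambda,
+ )
+ rng = np.random.default_rng(7)
+ rewards = rng.random(T).astype(np.float32)
+ values = rng.random(T).astype(np.float32)
+ v_last = 0.5
+
+ for r, v in zip(rewards, values):
+ buf.add(
+ obs=np.array([[0.0]]),
+ action=np.array([[0.0]]),
+ reward=np.array([r]),
+ done=np.array([0.0]),
+ value=np.array([v]),
+ log_prob=np.array([0.0]),
+ )
+
+ buf.compute_returns_and_advantages(
+ last_values=np.array([v_last]),
+ dones=np.array([0.0]),
+ )
+
+ # Reference implementation: plain backward recursion over all steps
+ gae = 0.0
+ for t in reversed(range(T)):
+ v_next = v_last if t == T - 1 else values[t + 1]
+ delta = rewards[t] + gamma * v_next - values[t]
+ gae = delta + gamma * gae_lambda * gae
+ assert buf.advantages[t, 0] == pytest.approx(gae, abs=1e-5)
+ assert buf.returns[t, 0] == pytest.approx(gae + values[t], abs=1e-5)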