Commits
50 commits
1fec8d4
DONE: Added DDPG, PPO in multi-agent environment in /reinforcement_le…
Harshul-18 Jan 9, 2026
8385449
DONE: Added DDPG, PPO in multi-agent environment in /reinforcement_le…
Harshul-18 Jan 9, 2026
4064608
Merge branch 'assume-framework:main' into with_PPO_and_DDPG
Harshul-18 Jan 12, 2026
5b9763d
UPDATED ppo-input-pipeline, code-documentation DELETED rollout_buffer…
Harshul-18 Jan 12, 2026
e38cc82
Merge branch 'with_PPO_and_DDPG' of https://github.com/Harshul-18/ass…
Harshul-18 Jan 12, 2026
9082e0b
FIX: initial values_data assignment
Harshul-18 Jan 14, 2026
e6d0056
- started making proper config definition
kim-mskw Jan 14, 2026
537f9e8
outsource activation_function_limit
kim-mskw Jan 14, 2026
afe0077
- make algorithm specific extra info instead of many if mappo statements
kim-mskw Jan 14, 2026
4c76dc4
ad buffer and algo doku
kim-mskw Jan 14, 2026
8d39963
DONE: Added DDPG, PPO in multi-agent environment in /reinforcement_le…
Harshul-18 Jan 9, 2026
7cc7035
DONE: Added DDPG, PPO in multi-agent environment in /reinforcement_le…
Harshul-18 Jan 9, 2026
fc2f6e0
UPDATED ppo-input-pipeline, code-documentation DELETED rollout_buffer…
Harshul-18 Jan 12, 2026
78d8033
FIX: initial values_data assignment
Harshul-18 Jan 14, 2026
36292ac
updated the structure and segmented the parameters for off-policy and…
Harshul-18 Feb 13, 2026
a464155
fixed independent working of on-policy algorithms
Harshul-18 Feb 13, 2026
d5521ab
Improved the parameters flow independently and fixed different buffer…
Harshul-18 Feb 14, 2026
fddaf2c
Updated base_algorithm.py documentation.
Harshul-18 Feb 14, 2026
239fb43
Updated maddpg.py documentation.
Harshul-18 Feb 14, 2026
0c78a24
Merge branch 'with_PPO_and_DDPG' of https://github.com/Harshul-18/ass…
Harshul-18 Feb 16, 2026
19c975a
temporary fix
Harshul-18 Feb 16, 2026
7727cb9
Updated the example_02a config file to test the scenario
Harshul-18 Feb 16, 2026
9435df5
Updated the all example_02 scenario config files according to the upd…
Harshul-18 Feb 28, 2026
e1bbcd6
updated the 04a notebook to work with the updated config pipeline
Harshul-18 Feb 28, 2026
2ba98f7
updated the 04a notebook, documentations of RL_algorithms folder code…
Harshul-18 Feb 28, 2026
171b5e9
fixed the PPO implementation, the policy update logic, corrected the …
Harshul-18 Apr 1, 2026
47349da
added 'get_distribution' method and splitted the forward method into …
Harshul-18 Apr 9, 2026
937bc05
Merge branch 'main' into with_PPO_and_DDPG
Harshul-18 Apr 23, 2026
563ea05
integrated MAPPO pipeline in other files
Harshul-18 Apr 24, 2026
cdfc3d6
Fix Path import bug in learning_role.py — fixing the runtime error fo…
Harshul-18 Apr 25, 2026
90ca112
Fixed assume import logic API
Harshul-18 Apr 26, 2026
c8a2059
Fixed test_matd3.py to use the nested off_policy config structure
Harshul-18 Apr 26, 2026
89eb665
Added Rollout Buffer test file (test_rl_rolloutbuffer.py)
Harshul-18 Apr 26, 2026
24364ef
Added MADDPG test cases (test_maddpg.py file)
Harshul-18 Apr 26, 2026
a1442a8
Updated the docs and completed the MADDPG implementation
Harshul-18 Apr 26, 2026
d34c4d9
Merge branch 'with_PPO_and_DDPG_merge' into with_PPO_and_DDPG
Harshul-18 Apr 26, 2026
0ba82b7
Added MAPPO test file (test_mappo.py)
Harshul-18 Apr 28, 2026
eb6522f
Moved get_action from learning_strategies to RLAlgorithm
Harshul-18 Apr 28, 2026
28cb3a0
Merge remote-tracking branch 'upstream/main' into with_PPO_and_DDPG
Harshul-18 Apr 28, 2026
d2f8f9d
Fixed the test_learning_role.py by updating the OffPolicyConfig param…
Harshul-18 May 7, 2026
798f7a0
Fixed the test_integration_cli.py by adding market_mechanism which go…
Harshul-18 May 8, 2026
3a1366f
- config cleaning and dokstringc chnages of AlgorithmConfig data class
kim-mskw May 8, 2026
fc88fd3
- move off-policy get_actions behavior to respective algorithm file
kim-mskw May 8, 2026
e66e973
- slight changes of error handling to ensure same flow as before
kim-mskw May 8, 2026
e7693c6
- change buffer logic to closely map each other
kim-mskw May 8, 2026
3b60e91
forgot to commit refactored function
kim-mskw May 8, 2026
1609b1d
delete unnecessary defintions and delet rl_strats sorting, as we use …
kim-mskw May 8, 2026
22fcd62
refactor initilze buffer into leanring role
kim-mskw May 8, 2026
e1f358a
ruff formatting
kim-mskw May 8, 2026
b0b8457
Fixed update_policy's strategies listing, temporarily fixed circular …
Harshul-18 May 16, 2026
46 changes: 46 additions & 0 deletions assume/__init__.py
@@ -5,6 +5,21 @@
from importlib.metadata import version

from assume.common import MarketConfig, MarketProduct
from assume.reinforcement_learning import (
A2CAlgorithm,
DDPG,
LSTMActor,
Learning,
MLPActor,
PPO,
RLAlgorithm,
ReplayBuffer,
ReplayBufferSamples,
RolloutBuffer,
RolloutBufferSamples,
TD3,
actor_architecture_aliases,
)
from assume.scenario.loader_csv import (
load_custom_units,
load_scenario_folder,
@@ -16,3 +31,34 @@

__author__ = "ASSUME Developers: Nick Harder, Kim Miskiw, Florian Maurer, Manish Khanra"
__copyright__ = "AGPL-3.0 License"

__all__ = [
# Framework version
"__version__",
# World & scenario
"World",
"load_scenario_folder",
"load_custom_units",
"run_learning",
# Market primitives
"MarketConfig",
"MarketProduct",
# RL orchestration
"Learning",
# RL algorithm base classes
"RLAlgorithm",
"A2CAlgorithm",
# RL concrete algorithms
"TD3",
"DDPG",
"PPO",
# RL actor architectures
"MLPActor",
"LSTMActor",
"actor_architecture_aliases",
# RL buffers
"ReplayBuffer",
"ReplayBufferSamples",
"RolloutBuffer",
"RolloutBufferSamples",
]
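
A minimal import sketch of the widened public surface (hedged: assumes this branch of assume is installed and its torch dependencies are available; the names are taken from the new __all__ list above, not verified at runtime):

# Sketch only: every name below is exported by the updated assume/__init__.py.
from assume import (
    DDPG,
    PPO,
    TD3,
    Learning,
    MLPActor,
    ReplayBuffer,
    RolloutBuffer,
    actor_architecture_aliases,
)

# Assuming actor_architecture_aliases is a name -> actor class mapping,
# the registered architecture keys can be inspected directly:
print(sorted(actor_architecture_aliases))  # e.g. ["lstm", "mlp"]
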
204 changes: 153 additions & 51 deletions assume/common/base.py
@@ -4,7 +4,7 @@

import logging
from collections import defaultdict
from dataclasses import dataclass
from dataclasses import dataclass, field
from datetime import datetime, timedelta

import numpy as np
@@ -755,6 +755,132 @@ def update_forecasts_if_needed(unit: BaseUnit, *args, **kwargs):
unit.forecaster.update(*args, **kwargs)


@dataclass
class AlgorithmConfig:
"""
Base configuration for algorithm-specific parameters.

Parameters:
batch_size (int): The batch size of experiences sampled from the replay buffer for each training update.
Larger batches provide more stable gradients but require more memory. Default is 128.
gamma (float): The discount factor for future rewards, ranging from 0 to 1. Default is 0.99.
train_freq (str): Defines the frequency at which networks are updated. Default is "24h".
"""

batch_size: int = 128
gamma: float = 0.99
train_freq: str = "24h"


# Algorithm category mapping
ALGORITHM_CATEGORIES = {
"mappo": "on-policy",
"matd3": "off-policy",
"maddpg": "off-policy",
}


def is_on_policy(algorithm_name: str) -> bool:
"""Check if algorithm is on-policy."""
return ALGORITHM_CATEGORIES.get(algorithm_name) == "on-policy"


def is_off_policy(algorithm_name: str) -> bool:
"""Check if algorithm is off-policy."""
return ALGORITHM_CATEGORIES.get(algorithm_name) == "off-policy"


@dataclass
class OffPolicyConfig(AlgorithmConfig):
"""
Configuration for off-policy algorithms (MATD3/MADDPG) hyperparameters.

These parameters control the off-policy actor-critic algorithm behavior such as delayed policy updates,
target network updates, and exploration noise.

Parameters:
episodes_collecting_initial_experience (int): The number of episodes at the start during which random
actions are chosen instead of using the actor network. Default is 5.
gradient_steps (int): The number of gradient descent steps performed during each training update. Default is 100.
actor_architecture (str): The architecture of the neural networks used for the actors. Options include
"mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
replay_buffer_size (int): The maximum number of transitions stored in the replay buffer. Default is 50000.
policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated.
Some algorithms update the critic more frequently than the actor to stabilize training. Default is 2.
noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution
used to generate exploration noise added to actions. Default is 0.1.
noise_scale (int): The scale factor multiplied by the noise drawn from the distribution.
Larger values increase exploration. Default is 1.
noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how
quickly the noise decays over time. Used for noise scheduling. Default is 1.
action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear"
decay is available, which linearly decreases exploration noise over training. Default is "linear".
tau (float): The soft update coefficient for updating target networks. Controls how slowly target
networks track the main networks. Smaller values mean slower updates. Default is 0.005.
target_policy_noise (float): The standard deviation of noise added to target policy actions during
critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2.
target_noise_clip (float): The maximum absolute value for clipping the target policy noise.
Prevents the noise from being too large. Default is 0.5.
"""

episodes_collecting_initial_experience: int = 5
gradient_steps: int = 100
noise_dt: int = 1
noise_scale: int = 1
noise_sigma: float = 0.1
actor_architecture: str = "mlp"
action_noise_schedule: str | None = None
policy_delay: int = 2
tau: float = 0.005
target_policy_noise: float = 0.2
target_noise_clip: float = 0.5
replay_buffer_size: int = 50000

def __post_init__(self):
# if we do not have initial experience collected we will get an error as no samples are available on the
# buffer from which we can draw experience to adapt the strategy, hence we set it to minimum one episode
if self.episodes_collecting_initial_experience < 1:
logger.warning(
f"episodes_collecting_initial_experience need to be at least 1 to sample from buffer, got {self.episodes_collecting_initial_experience}. setting to 1"
)

self.episodes_collecting_initial_experience = 1

# check that gradient_steps is positive
if self.gradient_steps <= 0:
raise ValueError(
f"gradient_steps need to be positive, got {self.gradient_steps}"
)


@dataclass
class OnPolicyConfig(AlgorithmConfig):
"""
Configuration for on-policy algorithms (PPO/MAPPO) hyperparameters.

These parameters control the PPO algorithm behavior such as clipping ranges,
number of optimization epochs, and loss coefficients.

Parameters:
clip_ratio (float): The clipping ratio for the PPO surrogate objective. Default is 0.1.
entropy_coef (float): Coefficient for entropy term in loss. Default is 0.01.
gae_lambda (float): Lambda parameter for Generalized Advantage Estimation (GAE). Default is 0.95.
max_grad_norm (float): Maximum gradient norm for clipping. Default is 0.5.
vf_coef (float): Coefficient for value function term in loss. Default is 0.5.
n_epochs (int): Number of optimization epochs per rollout. Default is 10.
actor_architecture (str): The architecture of the neural networks used for the actors. Options include
"mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
"""

clip_ratio: float = 0.1
entropy_coef: float = 0.01
gae_lambda: float = 0.95
max_grad_norm: float = 0.5
vf_coef: float = 0.5
n_epochs: int = 10
actor_architecture: str = "mlp"


@dataclass
class LearningConfig:
"""
@@ -779,9 +905,6 @@ class LearningConfig:

device (str): The device to use for PyTorch computations. Options include "cpu", "cuda", or specific
CUDA devices like "cuda:0". Default is "cpu".
episodes_collecting_initial_experience (int): The number of episodes at the start during which random
actions are chosen instead of using the actor network. This helps populate the replay buffer with
diverse experiences. Default is 5.
exploration_noise_std (float): The standard deviation of Gaussian noise added to actions during
exploration in the environment. Higher values encourage more exploration. Default is 0.2.
training_episodes (int): The number of training episodes, where one episode is the entire simulation
@@ -793,8 +916,6 @@
batch_size (int): The batch size of experiences sampled from the replay buffer for each training update.
Larger batches provide more stable gradients but require more memory. In environments with many learning agents we advise small batch sizes.
Default is 128.
gradient_steps (int): The number of gradient descent steps performed during each training update.
More steps can lead to better learning but increase computation time. Default is 100.
learning_rate (float): The learning rate (step size) for the optimizer, which controls how much the
policy and value networks are updated during training. Default is 0.001.
learning_rate_schedule (str | None): Which learning rate decay schedule to use. Currently only "linear"
@@ -806,30 +927,15 @@
early stopping. If the reward improvement is less than this threshold over early_stopping_steps,
training is terminated early. Default is 0.05.

algorithm (str): Specifies which reinforcement learning algorithm to use. Currently, only "matd3"
(Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3".
replay_buffer_size (int): The maximum number of transitions stored in the replay buffer for experience replay.
Larger buffers allow for more diverse training samples. Default is 500000.
algorithm (str): Specifies which reinforcement learning algorithm to use. Options include "matd3"
(Multi-Agent Twin Delayed Deep Deterministic Policy Gradient), "maddpg" (Multi-Agent Deep Deterministic Policy Gradient), and "mappo" (Multi-Agent Proximal Policy Optimization). Default is "matd3".
gamma (float): The discount factor for future rewards, ranging from 0 to 1. Higher values give more
weight to long-term rewards in decision-making. Default is 0.99.
actor_architecture (str): The architecture of the neural networks used for the actors. Options include
"mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated.
TD3 updates the critic more frequently than the actor to stabilize training. Default is 2.
noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution
used to generate exploration noise added to actions. Default is 0.1.
noise_scale (int): The scale factor multiplied by the noise drawn from the distribution.
Larger values increase exploration. Default is 1.
noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how
quickly the noise decays over time. Used for noise scheduling. Default is 1.
action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear"
decay is available, which linearly decreases exploration noise over training. Default is "linear".
tau (float): The soft update coefficient for updating target networks. Controls how slowly target
networks track the main networks. Smaller values mean slower updates. Default is 0.005.
target_policy_noise (float): The standard deviation of noise added to target policy actions during
critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2.
target_noise_clip (float): The maximum absolute value for clipping the target policy noise.
Prevents the noise from being too large. Default is 0.5.

off_policy (OffPolicyConfig): Nested configuration for off-policy algorithms (MATD3/MADDPG) hyperparameters.
on_policy (OnPolicyConfig): Nested configuration for on-policy algorithms (PPO/MAPPO) hyperparameters.

"""

@@ -843,52 +949,47 @@ class LearningConfig:
max_bid_price: float | None = 100.0

device: str = "cpu"
episodes_collecting_initial_experience: int = 5
exploration_noise_std: float = 0.2
training_episodes: int = 100
validation_episodes_interval: int = 5
train_freq: str = "24h"
batch_size: int = 128
gradient_steps: int = 100
learning_rate: float = 0.001
learning_rate_schedule: str | None = None
early_stopping_steps: int | None = None
early_stopping_threshold: float = 0.05

algorithm: str = "matd3"
replay_buffer_size: int = 50000
gamma: float = 0.99
actor_architecture: str = "mlp"
policy_delay: int = 2
noise_sigma: float = 0.1
noise_scale: int = 1
noise_dt: int = 1
action_noise_schedule: str | None = None
tau: float = 0.005
target_policy_noise: float = 0.2
target_noise_clip: float = 0.5

# Nested algorithm configurations
off_policy: OffPolicyConfig = field(default_factory=OffPolicyConfig)
on_policy: OnPolicyConfig = field(default_factory=OnPolicyConfig)

def __post_init__(self):
"""Calculate defaults that depend on other fields and validate inputs."""
# Convert nested dicts to dataclass instances if necessary
if isinstance(self.off_policy, dict):
self.off_policy = OffPolicyConfig(**self.off_policy)
if isinstance(self.on_policy, dict):
self.on_policy = OnPolicyConfig(**self.on_policy)

for config in [self.off_policy, self.on_policy]:
if config:
config.batch_size = self.batch_size
config.gamma = self.gamma
config.train_freq = self.train_freq

self.off_policy.actor_architecture = self.actor_architecture
self.on_policy.actor_architecture = self.actor_architecture

if self.early_stopping_steps is None:
self.early_stopping_steps = int(
self.training_episodes / self.validation_episodes_interval + 1
)

# if we do not have initial experience collected we will get an error as no samples are available on the
# buffer from which we can draw experience to adapt the strategy, hence we set it to minimum one episode
if self.episodes_collecting_initial_experience < 1:
logger.warning(
f"episodes_collecting_initial_experience need to be at least 1 to sample from buffer, got {self.episodes_collecting_initial_experience}. setting to 1"
)

self.episodes_collecting_initial_experience = 1

# check that gradient_steps is positive
if self.gradient_steps <= 0:
raise ValueError(
f"gradient_steps need to be positive, got {self.gradient_steps}"
)
# check that gradient_steps is positive (now checked in off_policy config)


class LearningStrategy(BaseStrategy):
@@ -930,6 +1031,7 @@ def __init__(
# access to the learning_role that orchestrates learning
self.learning_role = learning_role
self.learning_config = learning_role.learning_config
self.algorithm = self.learning_config.algorithm

self.foresight = foresight
self.act_dim = act_dim
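A short usage sketch of the new nested configuration (hedged: OffPolicyConfig, OnPolicyConfig, and the category helpers are taken from the diff above; LearningConfig itself may require additional fields that are collapsed here):

from assume.common.base import (
    OffPolicyConfig,
    OnPolicyConfig,
    is_off_policy,
    is_on_policy,
)

# Off-policy hyperparameters (MATD3/MADDPG); unspecified fields keep their defaults.
off_cfg = OffPolicyConfig(gradient_steps=50, policy_delay=2, noise_sigma=0.1)

# On-policy hyperparameters (PPO/MAPPO).
on_cfg = OnPolicyConfig(clip_ratio=0.2, n_epochs=10)

# __post_init__ enforces at least one episode of initial experience collection.
assert OffPolicyConfig(episodes_collecting_initial_experience=0).episodes_collecting_initial_experience == 1

# The category helpers resolve an algorithm name to its training regime.
assert is_off_policy("matd3") and is_off_policy("maddpg")
assert is_on_policy("mappo")
assert not is_on_policy("unknown")  # unmapped names fall into neither category

When a LearningConfig is built from a nested dict (for example, a parsed scenario config), its __post_init__ converts the off_policy and on_policy entries into the dataclasses above and copies batch_size, gamma, train_freq, and actor_architecture into both.
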
28 changes: 28 additions & 0 deletions assume/common/utils.py
@@ -599,6 +599,34 @@ def rename_study_case(path: str, old_key: str, new_key: str):
yaml.safe_dump(data, file, sort_keys=False)


def convert_to_tensors(array: np.array, copy=True, dtype=None, device=None):
"""Convert a numpy array to a PyTorch tensor.

Note:
It copies the data by default.

Args:
array (numpy.ndarray): The numpy array to convert.
copy (bool, optional): Whether to copy the data or not
(may be useful to avoid changing things by reference). Defaults to True.

Returns:
torch.Tensor: The converted PyTorch tensor.
"""

try:
import torch as th

if copy:
return th.tensor(array, dtype=dtype, device=device)

return th.as_tensor(array, dtype=dtype, device=device)

except ImportError:
# If torch is not installed, return the array unchanged
return array


def convert_tensors(data):
"""
Recursively checks if the data contains PyTorch tensors and converts them to
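
A brief usage sketch of the new helper (hedged: assumes this branch; when torch is not installed, convert_to_tensors simply returns the numpy array unchanged):

import numpy as np

from assume.common.utils import convert_to_tensors

obs = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)

copied = convert_to_tensors(obs)                  # th.tensor: always copies the data
shared = convert_to_tensors(obs, copy=False)      # th.as_tensor: reuses memory where possible
on_gpu = convert_to_tensors(obs, device="cuda")   # only meaningful if a CUDA device is present

print(type(copied))  # torch.Tensor with torch installed, numpy.ndarray otherwise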