From 0a8416320f0366a890a019f0719bdb834618e00f Mon Sep 17 00:00:00 2001 From: Damon Rand Date: Fri, 9 Jan 2026 04:57:07 +0000 Subject: [PATCH] Fix NaN propagation in rate averages and cost totals (v2.0.1) Problem: When simulating periods with sparse missing data (e.g., 30 half-hour slots out of 17,520 in a full year), the entire summary would show NaN values for rates and costs. This made annual simulations unusable when even 0.17% of imbalance pricing data was missing. Root cause: Neither aggregation path skipped NaN values, so a single NaN propagated to the final result: 1. output.py: safe_average() used np.average() which returns NaN if any input value is NaN 2. breakdown.py: cost totals used .sum(skipna=False).sum(skipna=False) which returns NaN if any cell is NaN Solution: Add a 5% NaN threshold to both aggregation functions: - safe_average(): Filter out NaN values and their weights before calculating weighted average. Only return NaN if >5% of data is missing. - safe_sum(): New helper function that uses np.nansum() to sum valid values. Only return NaN if >5% of data is missing. This allows simulations to complete with valid results when small amounts of data are missing, while still flagging unreliable results when too much data (>5%) is absent. 
Files changed: - output.py: Enhanced safe_average() with NaN threshold - breakdown.py: Added safe_sum() helper, replaced double sum calls - pyproject.toml: Bump version to 2.0.1 --- pyproject.toml | 2 +- .../common/microgrid_analysis/breakdown.py | 25 ++++++++++++-- .../common/microgrid_analysis/output.py | 33 +++++++++++++++---- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 876b2d9..5a5cc97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "skypro" -version = "2.0.0" +version = "2.0.1" description = "Skyprospector by Cepro" authors = ["damonrand "] license = "AGPL-3.0" diff --git a/src/skypro/common/microgrid_analysis/breakdown.py b/src/skypro/common/microgrid_analysis/breakdown.py index 45bb23a..7e34955 100644 --- a/src/skypro/common/microgrid_analysis/breakdown.py +++ b/src/skypro/common/microgrid_analysis/breakdown.py @@ -5,6 +5,27 @@ import pandas as pd +def safe_sum(df: pd.DataFrame, nan_threshold: float = 0.05) -> float: + """ + Sum all values in a DataFrame, handling NaN with a threshold. + + If more than nan_threshold fraction of values are NaN, returns NaN. + Otherwise, sums only the valid values. + + Args: + nan_threshold: Maximum fraction of NaN values allowed (default 5%). + If exceeded, returns NaN to indicate unreliable result. 
+ """ + flat = df.values.flatten() + nan_count = np.isnan(flat).sum() + nan_fraction = nan_count / len(flat) if len(flat) > 0 else 0 + + if nan_fraction > nan_threshold: + return np.nan # Too much missing data + + return np.nansum(flat) + + @dataclass class MicrogridBreakdown: """Summarises key info about a microgrid.""" @@ -129,12 +150,12 @@ def breakdown_microgrid_flows( if np.isnan(result.total_flows[flow_name]): result.total_int_vol_costs[flow_name] = np.nan else: - result.total_int_vol_costs[flow_name] = cost_df.sum(skipna=False).sum(skipna=False) + result.total_int_vol_costs[flow_name] = safe_sum(cost_df) for flow_name, cost_df in result.mkt_vol_costs_dfs.items(): if np.isnan(result.total_flows[flow_name]): result.total_mkt_vol_costs[flow_name] = np.nan else: - result.total_mkt_vol_costs[flow_name] = cost_df.sum(skipna=False).sum(skipna=False) + result.total_mkt_vol_costs[flow_name] = safe_sum(cost_df) result.total_int_bess_gain = - result.total_int_vol_costs["bess_discharge"] - result.total_int_vol_costs["bess_charge"] diff --git a/src/skypro/common/microgrid_analysis/output.py b/src/skypro/common/microgrid_analysis/output.py index 44f484e..066cc52 100644 --- a/src/skypro/common/microgrid_analysis/output.py +++ b/src/skypro/common/microgrid_analysis/output.py @@ -327,17 +327,36 @@ def apply_aggregation_functions(df: pd.DataFrame, agg_rules: Dict) -> pd.DataFra return result_df -def safe_average(a, weights=None): +def safe_average(a, weights=None, nan_threshold=0.05): """ - Wraps np.average and handles the case where weights sum to zero by returning NaN (np.average throws an exception) + Wraps np.average and handles: + - NaN values in the input (excluded if below threshold, otherwise returns NaN) + - Weights that sum to zero (returns 0.0 instead of raising exception) + + Args: + nan_threshold: Maximum fraction of NaN values allowed (default 5%). + If exceeded, returns NaN to indicate unreliable result. 
""" + a = np.array(a) + nan_count = np.isnan(a).sum() + nan_fraction = nan_count / len(a) if len(a) > 0 else 0 - if weights is not None and np.sum(weights) == 0: - ret_val = 0.0 - else: - ret_val = np.average(a, weights=weights) + if nan_fraction > nan_threshold: + return np.nan # Too much missing data - result would be unreliable + + mask = ~np.isnan(a) + + if weights is not None: + weights = np.array(weights)[mask] + if np.sum(weights) == 0: + return 0.0 + + a = a[mask] + + if len(a) == 0: + return np.nan - return ret_val + return np.average(a, weights=weights) def ensure_consistent_value_across_aggregation_window(df: pd.DataFrame, rows_per_agg_window: int):