From 0a8416320f0366a890a019f0719bdb834618e00f Mon Sep 17 00:00:00 2001
From: Damon Rand <damon@simtricity.com>
Date: Fri, 9 Jan 2026 04:57:07 +0000
Subject: [PATCH] Fix NaN propagation in rate averages and cost totals (v2.0.1)

Problem:
When simulating periods with sparse missing data (e.g., 30 half-hour
slots out of 17,520 in a full year), the entire summary would show
NaN values for rates and costs. This made annual simulations unusable
when even 0.17% of imbalance pricing data was missing.

Root cause:
Neither aggregation path tolerated NaN, so a single NaN propagated:
1. output.py: safe_average() used np.average() which returns NaN if
   any input value is NaN
2. breakdown.py: cost totals used .sum(skipna=False).sum(skipna=False)
   which returns NaN if any cell is NaN

Solution:
Add a 5% NaN threshold to both aggregation functions:
- safe_average(): Filter out NaN values and their weights before
  calculating weighted average. Only return NaN if >5% of data is missing.
- safe_sum(): New helper function that uses np.nansum() to sum valid
  values. Only return NaN if >5% of data is missing.

This allows simulations to complete with valid results when small
amounts of data are missing, while still flagging unreliable results
when too much data (>5%) is absent.

Files changed:
- output.py: Enhanced safe_average() with NaN threshold
- breakdown.py: Added safe_sum() helper, replaced double sum calls
- pyproject.toml: Bump version to 2.0.1
---
 pyproject.toml                                |  2 +-
 .../common/microgrid_analysis/breakdown.py    | 25 ++++++++++++--
 .../common/microgrid_analysis/output.py       | 33 +++++++++++++++----
 3 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 876b2d9..5a5cc97 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "skypro"
-version = "2.0.0"
+version = "2.0.1"
 description = "Skyprospector by Cepro"
 authors = ["damonrand <damon@cepro.energy>"]
 license = "AGPL-3.0"
diff --git a/src/skypro/common/microgrid_analysis/breakdown.py b/src/skypro/common/microgrid_analysis/breakdown.py
index 45bb23a..7e34955 100644
--- a/src/skypro/common/microgrid_analysis/breakdown.py
+++ b/src/skypro/common/microgrid_analysis/breakdown.py
@@ -5,6 +5,27 @@
 import pandas as pd
 
 
+def safe_sum(df: pd.DataFrame, nan_threshold: float = 0.05) -> float:
+    """
+    Sum all values in a DataFrame, handling NaN with a threshold.
+
+    If more than nan_threshold fraction of values are NaN, returns NaN.
+    Otherwise, sums only the valid values.
+
+    Args:
+        nan_threshold: Maximum fraction of NaN values allowed (default 5%).
+                       If exceeded, returns NaN to indicate unreliable result.
+    """
+    flat = df.values.flatten()
+    nan_count = np.isnan(flat).sum()
+    nan_fraction = nan_count / len(flat) if len(flat) > 0 else 0
+
+    if nan_fraction > nan_threshold:
+        return np.nan  # Too much missing data
+
+    return np.nansum(flat)
+
+
 @dataclass
 class MicrogridBreakdown:
     """Summarises key info about a microgrid."""
@@ -129,12 +150,12 @@ def breakdown_microgrid_flows(
         if np.isnan(result.total_flows[flow_name]):
             result.total_int_vol_costs[flow_name] = np.nan
         else:
-            result.total_int_vol_costs[flow_name] = cost_df.sum(skipna=False).sum(skipna=False)
+            result.total_int_vol_costs[flow_name] = safe_sum(cost_df)
     for flow_name, cost_df in result.mkt_vol_costs_dfs.items():
         if np.isnan(result.total_flows[flow_name]):
             result.total_mkt_vol_costs[flow_name] = np.nan
         else:
-            result.total_mkt_vol_costs[flow_name] = cost_df.sum(skipna=False).sum(skipna=False)
+            result.total_mkt_vol_costs[flow_name] = safe_sum(cost_df)
 
     result.total_int_bess_gain = - result.total_int_vol_costs["bess_discharge"] - result.total_int_vol_costs["bess_charge"]
 
diff --git a/src/skypro/common/microgrid_analysis/output.py b/src/skypro/common/microgrid_analysis/output.py
index 44f484e..066cc52 100644
--- a/src/skypro/common/microgrid_analysis/output.py
+++ b/src/skypro/common/microgrid_analysis/output.py
@@ -327,17 +327,36 @@ def apply_aggregation_functions(df: pd.DataFrame, agg_rules: Dict) -> pd.DataFra
     return result_df
 
 
-def safe_average(a, weights=None):
+def safe_average(a, weights=None, nan_threshold=0.05):
     """
-    Wraps np.average and handles the case where weights sum to zero by returning NaN (np.average throws an exception)
+    Wraps np.average and handles:
+    - NaN values in the input (excluded if below threshold, otherwise returns NaN)
+    - Weights that sum to zero (returns 0.0 instead of raising exception)
+
+    Args:
+        nan_threshold: Maximum fraction of NaN values allowed (default 5%).
+                       If exceeded, returns NaN to indicate unreliable result.
     """
+    a = np.array(a)
+    nan_count = np.isnan(a).sum()
+    nan_fraction = nan_count / len(a) if len(a) > 0 else 0
 
-    if weights is not None and np.sum(weights) == 0:
-        ret_val = 0.0
-    else:
-        ret_val = np.average(a, weights=weights)
+    if nan_fraction > nan_threshold:
+        return np.nan  # Too much missing data - result would be unreliable
+
+    mask = ~np.isnan(a)
+
+    if weights is not None:
+        weights = np.array(weights)[mask]
+        if np.sum(weights) == 0:
+            return 0.0
+
+    a = a[mask]
+
+    if len(a) == 0:
+        return np.nan
 
-    return ret_val
+    return np.average(a, weights=weights)
 
 
 def ensure_consistent_value_across_aggregation_window(df: pd.DataFrame, rows_per_agg_window: int):