From 6d7af9355167ad26c39970e46bfd8db0aefcc464 Mon Sep 17 00:00:00 2001
From: s-zx <2575376715@qq.com>
Date: Sun, 8 Mar 2026 21:00:31 +0100
Subject: [PATCH 1/3] [Bugfix] Validate fp16.loss_scale is finite in
 DeepSpeedFP16Config

`DeepSpeedFP16Config.loss_scale` accepted `float("inf")` without any
validation, silently allowing the engine to initialise.  During
training this caused all gradients to become NaN because the static
loss scaler multiplied every loss by infinity, while other config
fields such as `stage3_max_live_parameters` already used Pydantic
`ge=` constraints and would correctly raise `ValidationError` for
invalid inputs.

Fix: add `Field(0, ge=0)` so that negative values are rejected at the
Pydantic level, and add a `field_validator` that calls `math.isfinite`
so that `inf` and `nan` are also rejected with a clear error message.
The existing semantic where `loss_scale=0` selects dynamic loss scaling
is preserved unchanged.

Add unit tests to `tests/unit/runtime/test_ds_config_model.py` covering
the valid range (0, 1.0, 128.0, 65536.0) and the three previously
accepted invalid inputs (`inf`, `-inf`, `nan`, `-1.0`).

Fixes #7852
---
 deepspeed/runtime/precision_config.py      | 21 +++++++++++++++--
 tests/unit/runtime/test_ds_config_model.py | 27 ++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index 894ecfbc0409..28e2f7159d31 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -3,6 +3,10 @@
 
 # DeepSpeed Team
 
+import math
+
+from pydantic import Field, field_validator
+
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from .fp16.loss_scaler import (
     INITIAL_LOSS_SCALE,
@@ -108,11 +112,24 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
     Automatically cast inputs to fp16
     """
 
-    loss_scale: float = 0
+    loss_scale: float = Field(0, ge=0)
     """
-    Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
+    Loss scaling value for static loss scaling. Default value of 0 enables
+    dynamic loss scaling instead of a fixed static scale. Must be a finite
+    non-negative number; ``float('inf')`` and ``float('nan')`` are rejected
+    because they cause silent NaN gradients during training.
     """
 
+    @field_validator("loss_scale")
+    @classmethod
+    def loss_scale_must_be_finite(cls, v: float) -> float:
+        if not math.isfinite(v):
+            raise ValueError(
+                f"fp16.loss_scale must be a finite non-negative number (0 for dynamic scaling), "
+                f"got {v!r}. Infinite or NaN values silently corrupt gradients."
+            )
+        return v
+
     initial_scale_power: int = 16
     """
     For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index 4d184b2858a8..a77356334ed7 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -84,3 +84,30 @@ def test_config_base_literalfail(config_dict):
 def test_config_base_deprecatedfail():
     with pytest.raises(AssertionError):
         config = SimpleConf(**{"param_2": ["DS"], "param_2_old": "DS"})
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# DeepSpeedFP16Config.loss_scale validation  (issue #7852)
+# ──────────────────────────────────────────────────────────────────────────────
+
+from deepspeed.runtime.precision_config import DeepSpeedFP16Config
+
+
+@pytest.mark.parametrize("loss_scale", [0, 1.0, 128.0, 65536.0])
+def test_fp16_loss_scale_valid(loss_scale):
+    """0 (dynamic) and finite positive values must be accepted."""
+    cfg = DeepSpeedFP16Config(loss_scale=loss_scale)
+    assert cfg.loss_scale == loss_scale
+
+
+@pytest.mark.parametrize("loss_scale", [float("inf"), float("-inf"), float("nan")])
+def test_fp16_loss_scale_rejects_non_finite(loss_scale):
+    """Non-finite loss_scale values must raise ValidationError (issue #7852)."""
+    with pytest.raises(ValidationError, match="finite"):
+        DeepSpeedFP16Config(loss_scale=loss_scale)
+
+
+def test_fp16_loss_scale_rejects_negative():
+    """Negative loss_scale must raise ValidationError."""
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(loss_scale=-1.0)

From 96cf05af3dc13c844efef14a730c7c7a501ab240 Mon Sep 17 00:00:00 2001
From: s-zx <2575376715@qq.com>
Date: Sun, 8 Mar 2026 21:57:34 +0100
Subject: [PATCH 2/3] test: relax non-finite loss_scale match to accommodate
 Pydantic ge=0 constraint order

Pydantic runs field constraints (ge=0) before the after-mode
field_validator, so for -inf and nan the ValidationError may contain
"greater than or equal to" rather than "finite".  The assertion
`match="finite"` was therefore too strict and would fail in CI even
though the invalid value is correctly rejected.

Drop the match argument; validating that a ValidationError is raised
is sufficient to confirm the fix.
---
 tests/unit/runtime/test_ds_config_model.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index a77356334ed7..a467d473ecc2 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -102,8 +102,14 @@ def test_fp16_loss_scale_valid(loss_scale):
 
 @pytest.mark.parametrize("loss_scale", [float("inf"), float("-inf"), float("nan")])
 def test_fp16_loss_scale_rejects_non_finite(loss_scale):
-    """Non-finite loss_scale values must raise ValidationError (issue #7852)."""
-    with pytest.raises(ValidationError, match="finite"):
+    """Non-finite loss_scale values must raise ValidationError (issue #7852).
+
+    The exact error message is not asserted because Pydantic may run the
+    ``ge=0`` field constraint before the custom ``field_validator``, producing
+    a "greater than or equal to" message for -inf/nan instead of our "finite"
+    message.  What matters is that invalid values are rejected.
+    """
+    with pytest.raises(ValidationError):
         DeepSpeedFP16Config(loss_scale=loss_scale)
 
 

From 76136147314d21760723c0ebb20a27ce9276ccbe Mon Sep 17 00:00:00 2001
From: s-zx <2575376715@qq.com>
Date: Wed, 11 Mar 2026 22:13:21 +0100
Subject: [PATCH 3/3] test: relax non-finite loss_scale assertion to accept
 finite or ge message

Pydantic may run the ge=0 field constraint before the field_validator,
producing a "greater than or equal" message for -inf/nan instead of our
"finite" message. Use a flexible match to accept either.

Addresses Codex review feedback.
---
 tests/unit/runtime/test_ds_config_model.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index a467d473ecc2..36ee4b395734 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -104,12 +104,11 @@ def test_fp16_loss_scale_valid(loss_scale):
 def test_fp16_loss_scale_rejects_non_finite(loss_scale):
     """Non-finite loss_scale values must raise ValidationError (issue #7852).
 
-    The exact error message is not asserted because Pydantic may run the
-    ``ge=0`` field constraint before the custom ``field_validator``, producing
-    a "greater than or equal to" message for -inf/nan instead of our "finite"
-    message.  What matters is that invalid values are rejected.
+    Pydantic may run the ``ge=0`` field constraint before the custom
+    ``field_validator``, producing a "greater than or equal" message for
+    -inf/nan instead of our "finite" message. Accept either.
     """
-    with pytest.raises(ValidationError):
+    with pytest.raises(ValidationError, match=r"finite|greater than or equal"):
         DeepSpeedFP16Config(loss_scale=loss_scale)