Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions deepspeed/runtime/precision_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

# DeepSpeed Team

import math

from pydantic import Field, field_validator

from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from .fp16.loss_scaler import (
INITIAL_LOSS_SCALE,
Expand Down Expand Up @@ -108,11 +112,24 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
Automatically cast inputs to fp16
"""

loss_scale: float = 0
loss_scale: float = Field(0, ge=0)
"""
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
Loss scaling value for static loss scaling. Default value of 0 enables
dynamic loss scaling instead of a fixed static scale. Must be a finite
non-negative number; ``float('inf')`` and ``float('nan')`` are rejected
because they cause silent NaN gradients during training.
"""

@field_validator("loss_scale")
@classmethod
def loss_scale_must_be_finite(cls, v: float) -> float:
    """Reject a non-finite static loss scale.

    ``inf``/``-inf``/``nan`` would silently propagate NaNs into the
    gradients, so they are refused up front; any finite value (including
    0, which selects dynamic scaling) passes through unchanged.
    """
    # isinf-or-isnan is exactly the complement of math.isfinite.
    if math.isinf(v) or math.isnan(v):
        raise ValueError(
            f"fp16.loss_scale must be a finite non-negative number (0 for dynamic scaling), "
            f"got {v!r}. Infinite or NaN values silently corrupt gradients."
        )
    return v

initial_scale_power: int = 16
"""
For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/runtime/test_ds_config_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,35 @@ def test_config_base_literalfail(config_dict):
def test_config_base_deprecatedfail():
    """Passing both a renamed param and its deprecated alias must assert."""
    # Construction itself is expected to raise, so the instance is never
    # usable — don't bind it to an unused local (previously `config = ...`,
    # which lints as F841).
    with pytest.raises(AssertionError):
        SimpleConf(**{"param_2": ["DS"], "param_2_old": "DS"})


# ──────────────────────────────────────────────────────────────────────────────
# DeepSpeedFP16Config.loss_scale validation (issue #7852)
# ──────────────────────────────────────────────────────────────────────────────

from deepspeed.runtime.precision_config import DeepSpeedFP16Config


@pytest.mark.parametrize("loss_scale", [0, 1.0, 128.0, 65536.0])
def test_fp16_loss_scale_valid(loss_scale):
    """Finite non-negative values — including 0 (dynamic) — are accepted as-is."""
    config = DeepSpeedFP16Config(loss_scale=loss_scale)
    assert config.loss_scale == loss_scale


@pytest.mark.parametrize("loss_scale", [float("inf"), float("-inf"), float("nan")])
def test_fp16_loss_scale_rejects_non_finite(loss_scale):
    """Non-finite loss_scale values must raise ValidationError (issue #7852).

    Depending on validator ordering, pydantic may report the ``ge=0``
    field constraint ("greater than or equal") for -inf/nan before the
    custom finiteness validator runs, so either message is acceptable.
    """
    with pytest.raises(ValidationError, match=r"finite|greater than or equal"):
        DeepSpeedFP16Config(loss_scale=loss_scale)


def test_fp16_loss_scale_rejects_negative():
    """A negative static loss scale is rejected (by the ``ge=0`` constraint)."""
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(loss_scale=-1.0)
Loading