diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index 894ecfbc0409..28e2f7159d31 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -3,6 +3,10 @@
 
 # DeepSpeed Team
 
+import math
+
+from pydantic import Field, field_validator
+
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from .fp16.loss_scaler import (
     INITIAL_LOSS_SCALE,
@@ -108,11 +112,24 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
     Automatically cast inputs to fp16
     """
 
-    loss_scale: float = 0
+    loss_scale: float = Field(0, ge=0)
     """
-    Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
+    Loss scaling value for static loss scaling. Default value of 0 enables
+    dynamic loss scaling instead of a fixed static scale. Must be a finite
+    non-negative number; ``float('inf')`` and ``float('nan')`` are rejected
+    because they cause silent NaN gradients during training.
     """
 
+    @field_validator("loss_scale")
+    @classmethod
+    def loss_scale_must_be_finite(cls, v: float) -> float:
+        if not math.isfinite(v):
+            raise ValueError(
+                f"fp16.loss_scale must be a finite non-negative number (0 for dynamic scaling), "
+                f"got {v!r}. Infinite or NaN values silently corrupt gradients."
+            )
+        return v
+
     initial_scale_power: int = 16
     """
     For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index 4d184b2858a8..36ee4b395734 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -84,3 +84,35 @@ def test_config_base_literalfail(config_dict):
 def test_config_base_deprecatedfail():
     with pytest.raises(AssertionError):
         config = SimpleConf(**{"param_2": ["DS"], "param_2_old": "DS"})
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# DeepSpeedFP16Config.loss_scale validation (issue #7852)
+# ──────────────────────────────────────────────────────────────────────────────
+
+from deepspeed.runtime.precision_config import DeepSpeedFP16Config
+
+
+@pytest.mark.parametrize("loss_scale", [0, 1.0, 128.0, 65536.0])
+def test_fp16_loss_scale_valid(loss_scale):
+    """0 (dynamic) and finite positive values must be accepted."""
+    cfg = DeepSpeedFP16Config(loss_scale=loss_scale)
+    assert cfg.loss_scale == loss_scale
+
+
+@pytest.mark.parametrize("loss_scale", [float("inf"), float("-inf"), float("nan")])
+def test_fp16_loss_scale_rejects_non_finite(loss_scale):
+    """Non-finite loss_scale values must raise ValidationError (issue #7852).
+
+    Pydantic may run the ``ge=0`` field constraint before the custom
+    ``field_validator``, producing a "greater than or equal" message for
+    -inf/nan instead of our "finite" message. Accept either.
+    """
+    with pytest.raises(ValidationError, match=r"finite|greater than or equal"):
+        DeepSpeedFP16Config(loss_scale=loss_scale)
+
+
+def test_fp16_loss_scale_rejects_negative():
+    """Negative loss_scale must raise ValidationError."""
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(loss_scale=-1.0)