From 920f6ee533026d3e393f66e43ab1d4568b093f4e Mon Sep 17 00:00:00 2001
From: yuchenwang3 <eang333cms@gmail.com>
Date: Wed, 17 Jun 2026 13:23:59 -0700
Subject: [PATCH 1/3] fix(muon_utils): keep newton_schulz scale-invariant for
 small-norm inputs

`newton_schulz` should be scale-invariant, but the internal
`F.normalize(x, p=2, dim=(-2,-1), eps=1e-7)` divides a small-norm input by
`eps` instead of its norm once `||x||_F < eps`, so the iteration (tuned for
singular values ~1) cannot lift it and the output silently degenerates to a
non-orthogonal matrix.

Lower the `eps` default from 1e-7 to 1e-30 so it acts purely as a
divide-by-zero guard (its documented purpose). Input is enforced fp32, and
1e-30 is itself representable in fp32 (its square is not; PATCH 2/3 raises the
default to 1e-15 for that reason). The new default applies to the non-TP
`F.normalize` path, the TP `distributed_normalize_p2` path, and
`newton_schulz_tp` (which routes through `newton_schulz`).

Add `test_newtonschulz_scale_invariance` regression test.

Fixes #229

Signed-off-by: yuchenwang3 <eang333cms@gmail.com>
---
 .../orthogonalized_optimizers/muon_utils.py   |  9 ++++++--
 tests/test_muon_utils.py                      | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
index b29e02b..c247288 100644
--- a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
+++ b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
@@ -141,7 +141,7 @@ def newton_schulz(
     steps: int,
     coefficient_type: NSCoeffT = "quintic",
     custom_coefficient_sets: list[tuple[float, float, float]] | None = None,
-    eps: float = 1e-7,
+    eps: float = 1e-30,
     transpose: bool | None = None,
     tp_group: torch.distributed.ProcessGroup | None = None,
     use_syrk: bool = False,
@@ -192,7 +192,12 @@ def newton_schulz(
     if transpose:
         x = x.mT
 
-    # Ensure spectral norm is at most 1
+    # Ensure spectral norm is at most 1.
+    # NOTE: ``eps`` is purely a divide-by-zero guard and must stay well below any realistic
+    # ``||x||_F`` (input is fp32). If it is too large, a small-norm input is divided by ``eps``
+    # instead of its norm, so ``||X||_F = ||x||_F / eps << 1`` and the iteration (tuned for
+    # singular values ~1) cannot lift it -> silently degenerate, non-orthogonal output. This
+    # breaks the scale-invariance of orthogonalization. See issue #229.
     if tp_group is not None:
         X = distributed_normalize_p2(x, eps, tp_group)
     else:
diff --git a/tests/test_muon_utils.py b/tests/test_muon_utils.py
index ac58f55..7b7a55a 100644
--- a/tests/test_muon_utils.py
+++ b/tests/test_muon_utils.py
@@ -120,6 +120,29 @@ def test_newtonschulz5_close_to_reference(self, dim1, dim2):
             rtol=1e-7,
         )
 
+    @parameterized.parameters(1e-2, 1e-6, 1e-9, 1e-12)
+    def test_newtonschulz_scale_invariance(self, scale):
+        """Orthogonalization depends only on direction, so scaling the input must not change the output.
+
+        Regression test for issue #229: a too-large ``eps`` in the internal ``F.normalize`` divides
+        small-norm inputs by ``eps`` instead of their norm, silently degenerating the output. The
+        orthogonalized result for ``x`` and ``scale * x`` must match for any ``scale > 0``.
+        """
+        x = torch.randn(256, 256, device=self.device, dtype=torch.float32)
+        x = x / x.norm()  # unit Frobenius norm direction
+        ref = muon_utils.newton_schulz(x, steps=5, coefficient_type="quintic")
+        out = muon_utils.newton_schulz(scale * x, steps=5, coefficient_type="quintic")
+        torch.testing.assert_close(
+            out,
+            ref,
+            atol=1e-4,
+            rtol=1e-5,
+            msg=lambda m: (
+                f"newton_schulz not scale-invariant at input scale {scale}: "
+                f"||out||_F={out.norm().item():.4f} vs ||ref||_F={ref.norm().item():.4f}\n{m}"
+            ),
+        )
+
     @parameterized.parameters(
         (2, 256, 256),
         (4, 128, 256),

From dcf83614eee27b4e51b5247e23246dee029264c5 Mon Sep 17 00:00:00 2001
From: yuchenwang3 <eang333cms@gmail.com>
Date: Sun, 21 Jun 2026 20:48:29 -0700
Subject: [PATCH 2/3] use fp32-safe eps=1e-15 (not 1e-30) per @skyw review

@skyw is right that 1e-30 underflows in fp32 when squared (1e-30**2 = 1e-60).
1e-15 keeps eps**2 representable (1e-15**2 = 1e-30, a normal fp32 value) while
staying far below any realistic ||x||_F, so small-norm inputs are still
normalized by their true norm instead of by the guard. This is within the
1e-12..1e-15 range @skyw suggested; 1e-15, the smallest value in that range,
is chosen so the guard sits as far as possible below realistic input norms.
See #229.

Signed-off-by: yuchenwang3 <eang333cms@gmail.com>
---
 emerging_optimizers/orthogonalized_optimizers/muon_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
index c247288..d72373f 100644
--- a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
+++ b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
@@ -141,7 +141,7 @@ def newton_schulz(
     steps: int,
     coefficient_type: NSCoeffT = "quintic",
     custom_coefficient_sets: list[tuple[float, float, float]] | None = None,
-    eps: float = 1e-30,
+    eps: float = 1e-15,
     transpose: bool | None = None,
     tp_group: torch.distributed.ProcessGroup | None = None,
     use_syrk: bool = False,
@@ -197,7 +197,9 @@ def newton_schulz(
     # ``||x||_F`` (input is fp32). If it is too large, a small-norm input is divided by ``eps``
     # instead of its norm, so ``||X||_F = ||x||_F / eps << 1`` and the iteration (tuned for
     # singular values ~1) cannot lift it -> silently degenerate, non-orthogonal output. This
-    # breaks the scale-invariance of orthogonalization. See issue #229.
+    # breaks the scale-invariance of orthogonalization. It must also stay above ~1e-15 so that
+    # ``eps**2`` is still representable in fp32 (1e-15**2 = 1e-30, a normal float); a much smaller
+    # guard such as 1e-30 would underflow when squared. See issue #229.
     if tp_group is not None:
         X = distributed_normalize_p2(x, eps, tp_group)
     else:

From 620196ca7f02ea39ec38fd83de1948a917378659 Mon Sep 17 00:00:00 2001
From: Yuchen Wang <yw.yy953e@alibaba-inc.com>
Date: Mon, 22 Jun 2026 10:03:25 -0700
Subject: [PATCH 3/3] address review: trim NOTE to a #229 reference, rename
 test to test_newtonschulz_small_eps

Per @skyw review on #230: keep the fp32-safe eps=1e-15 change, shorten
the over-long NOTE to a brief #229 reference, and give the regression
test a more explicit name.

Signed-off-by: Yuchen Wang <yw.yy953e@alibaba-inc.com>
---
 .../orthogonalized_optimizers/muon_utils.py              | 9 ++-------
 tests/test_muon_utils.py                                 | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
index d72373f..521fc71 100644
--- a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
+++ b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
@@ -193,13 +193,8 @@ def newton_schulz(
         x = x.mT
 
     # Ensure spectral norm is at most 1.
-    # NOTE: ``eps`` is purely a divide-by-zero guard and must stay well below any realistic
-    # ``||x||_F`` (input is fp32). If it is too large, a small-norm input is divided by ``eps``
-    # instead of its norm, so ``||X||_F = ||x||_F / eps << 1`` and the iteration (tuned for
-    # singular values ~1) cannot lift it -> silently degenerate, non-orthogonal output. This
-    # breaks the scale-invariance of orthogonalization. It must also stay above ~1e-15 so that
-    # ``eps**2`` is still representable in fp32 (1e-15**2 = 1e-30, a normal float); a much smaller
-    # guard such as 1e-30 would underflow when squared. See issue #229.
+    # NOTE: ``eps`` is a divide-by-zero guard; it must stay well below any realistic ``||x||_F``
+    # yet remain fp32-safe when squared. See issue #229.
     if tp_group is not None:
         X = distributed_normalize_p2(x, eps, tp_group)
     else:
diff --git a/tests/test_muon_utils.py b/tests/test_muon_utils.py
index 7b7a55a..dbc56e6 100644
--- a/tests/test_muon_utils.py
+++ b/tests/test_muon_utils.py
@@ -121,7 +121,7 @@ def test_newtonschulz5_close_to_reference(self, dim1, dim2):
         )
 
     @parameterized.parameters(1e-2, 1e-6, 1e-9, 1e-12)
-    def test_newtonschulz_scale_invariance(self, scale):
+    def test_newtonschulz_small_eps(self, scale):
         """Orthogonalization depends only on direction, so scaling the input must not change the output.
 
         Regression test for issue #229: a too-large ``eps`` in the internal ``F.normalize`` divides