NVIDIA-NeMo · skyw · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · greptile-apps
diff --git a/tests/_comparison.py b/tests/_comparison.py
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Comparison helpers for tests.
+
+``assert_equal`` is :func:`torch.testing.assert_close` with ``atol=rtol=0``: it asserts that two
+tensors are exactly (bitwise) equal. Use it for bit-identity checks instead of repeating
+``atol=0, rtol=0``.
+"""
+
+import functools
+
+import torch
+from torch import testing as torch_testing
+
+
+assert_equal = functools.partial(torch_testing.assert_close, rtol=0, atol=0)
+
+
+def assert_close_to_identity(actual, *, off_diag_atol=0, diag_atol=0):
+    r"""Assert that ``actual`` is close to the identity matrix.
+
+    Checks the identity structure with separate tolerances for the diagonal (compared to ones) and the
+    off-diagonal entries (compared to zeros). This is more informative, and allows looser tolerances on
+    the off-diagonal, than comparing the whole matrix to ``torch.eye`` with a single tolerance.
+
+    Args:
+        actual: A square 2D tensor expected to be (approximately) the identity matrix.
+        off_diag_atol: Absolute tolerance for the off-diagonal entries (compared to 0).
+        diag_atol: Absolute tolerance for the diagonal entries (compared to 1).
+
+    Raises:
+        ValueError: If ``actual`` is not a square 2D matrix.
+        AssertionError: If ``actual`` is not close to the identity matrix.
+    """
+    if actual.ndim != 2 or actual.shape[0] != actual.shape[1]:
+        raise ValueError(f"actual must be a square 2D matrix, got shape {tuple(actual.shape)}")
+
+    n = actual.shape[-1]
+    off_diag_mask = ~torch.eye(n, dtype=torch.bool, device=actual.device)
+    diag = torch.diagonal(actual)
+    off_diag = actual[off_diag_mask]
+    torch_testing.assert_close(
+        diag,
+        torch.ones_like(diag),
+        atol=diag_atol,
+        rtol=0,
+        msg=lambda msg: f"Diagonal is not close to ones.\n\n{msg}",
+    )
+    torch_testing.assert_close(
+        off_diag,
+        torch.zeros_like(off_diag),
+        atol=off_diag_atol,
+        rtol=0,
+        msg=lambda msg: f"Off-diagonal is not close to zeros.\n\n{msg}",
+    )
+
+
+def assert_close_to_orthogonal(actual, *, off_diag_atol=0, diag_atol=0):
+    r"""Assert that a 2D matrix is (semi-)orthogonal.
+
+    Builds the Gram matrix over the smaller dimension (``X @ Xᵀ`` when ``X`` has no more rows than
+    columns, otherwise ``Xᵀ @ X``) and asserts it is close to the identity via
+    :func:`assert_close_to_identity`.
+
+    Args:
+        actual: A 2D tensor expected to have (semi-)orthonormal rows or columns.
+        off_diag_atol: Absolute tolerance for the off-diagonal entries of the Gram matrix.
+        diag_atol: Absolute tolerance for the diagonal entries of the Gram matrix.
+
+    Raises:
+        ValueError: If ``actual`` is not a 2D matrix.
+        AssertionError: If ``actual`` is not (semi-)orthogonal.
+    """
+    if actual.ndim != 2:
+        raise ValueError(f"actual must be a 2D matrix, got shape {tuple(actual.shape)}")
+
+    m, n = actual.shape
+    gram = actual @ actual.mT if m <= n else actual.mT @ actual
+    assert_close_to_identity(gram, off_diag_atol=off_diag_atol, diag_atol=diag_atol)
diff --git a/tests/test_distributed_muon_utils_cpu.py b/tests/test_distributed_muon_utils_cpu.py
@@ -17,6 +17,7 @@
 
 import numpy as np
 import torch
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -195,7 +196,7 @@ def test_fall_back_to_non_tp(self, shape):
         )
         ref_out = muon_utils.newton_schulz(x, steps=5, coefficient_type="quintic")
 
-        torch.testing.assert_close(test_out, ref_out, atol=0, rtol=0)
+        assert_equal(test_out, ref_out)
 
     @parameterized.product(
         shape=((20, 16), (16, 32)),

diff --git a/tests/test_distributed_rekls_cpu.py b/tests/test_distributed_rekls_cpu.py
@@ -16,6 +16,7 @@
 import sys
 
 import torch
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -122,11 +123,9 @@ def test_5steps_matches_non_distributed_rekls(self):
                     ref_local = ref_p.detach()
                 else:
                     ref_local = ref_p.detach().chunk(self.world_size, dim=pd)[self.rank]
-                torch.testing.assert_close(
+                assert_equal(
                     tp_p.detach(),
                     ref_local,
-                    atol=0,
-                    rtol=0,
                 )
 
 

diff --git a/tests/test_distributed_soap_utils_cpu.py b/tests/test_distributed_soap_utils_cpu.py
@@ -16,6 +16,7 @@
 import sys
 
 import torch
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -75,9 +76,9 @@ def test_matches_non_distributed(self, shape, partition_dim):
             tp_group=self.tp_group,
         )
 
-        torch.testing.assert_close(gathered_grad, full_grad, atol=0, rtol=0)
-        torch.testing.assert_close(gathered_factors[0], full_l, atol=0, rtol=0)
-        torch.testing.assert_close(gathered_factors[1], full_r, atol=0, rtol=0)
+        assert_equal(gathered_grad, full_grad)
+        assert_equal(gathered_factors[0], full_l)
+        assert_equal(gathered_factors[1], full_r)
 
     @parameterized.product(
         shape=((16, 32), (32, 16), (96, 200)),
@@ -113,8 +114,8 @@ def test_updated_factors_match_non_distributed(self, shape, partition_dim):
         )
         soap.update_kronecker_factors(gathered_factors, gathered_grad, shampoo_beta=shampoo_beta)
 
-        torch.testing.assert_close(gathered_factors[0], ref_l, atol=0, rtol=0)
-        torch.testing.assert_close(gathered_factors[1], ref_r, atol=0, rtol=0)
+        assert_equal(gathered_factors[0], ref_l)
+        assert_equal(gathered_factors[1], ref_r)
 
 
 if __name__ == "__main__":

diff --git a/tests/test_eig_utils.py b/tests/test_eig_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -145,7 +146,7 @@ def test_conjugate_match_reference(self) -> None:
         _, p = torch.linalg.eigh(a)
 
         ref = p.T @ a @ p
-        torch.testing.assert_close(eig_utils.conjugate(a, p), ref, atol=0, rtol=0)
+        assert_equal(eig_utils.conjugate(a, p), ref)
 
     def test_eigh_with_fallback_reraises_runtime_error_when_force_double(self) -> None:
         """Test that eigh_with_fallback re-raises when force_double=True and eigh fails."""

diff --git a/tests/test_muon_utils.py b/tests/test_muon_utils.py
@@ -16,6 +16,7 @@
 from copy import deepcopy
 
 import torch
+from _comparison import assert_close_to_orthogonal, assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -83,7 +84,7 @@ def tearDown(self):
         (512, 256),
         (256, 512),
     )
-    def test_newtonschulz5_svd_close(self, dim1, dim2):
+    def test_newtonschulz5_close_to_svd(self, dim1, dim2):
         shape = (dim1, dim2)
         x = torch.randn(*shape, device=self.device, dtype=torch.float32)
         out_zeropowerns = muon_utils.newton_schulz(x, steps=5, coefficient_type="quintic")
@@ -262,6 +263,18 @@ def test_polar_express_and_deepseekv4_10steps_better_than_quintic(self, size, co
             f"{coefficient_type} norm is larger than Quintic norm: {l2_norm_diff_polar:.6f} > {l2_norm_diff_quintic:.6f}",
         )
 
+    @parameterized.product(size=[(512, 256), (256, 512)])
+    def test_polar_express_16steps_almost_orthogonal(self, size):
+        """Polar Express Newton-Schulz with enough steps yields an almost-orthogonal matrix.
+
+        The output ``O`` should satisfy ``O Oᵀ ≈ I`` (or ``Oᵀ O ≈ I`` for the tall case), i.e. its
+        smaller-dimension Gram is close to the identity.
+        """
+        dim1, dim2 = size
+        x = torch.randn(dim1, dim2, device=self.device, dtype=torch.float32)
+        out = muon_utils.newton_schulz(x, steps=16, coefficient_type="polar_express")
+        assert_close_to_orthogonal(out, diag_atol=1e-5, off_diag_atol=1e-5)
+
     @parameterized.parameters(
         (511, 513),
         (511, 257),
@@ -461,7 +474,7 @@ def test_match_newton_schulz_step_by_gemm(self, dim1, dim2):
         test_out = muon_utils.newton_schulz_step_tsyrk(x, 2**-1, 2**-2, 2**-3)
         test_ref = muon_utils.newton_schulz_step(x, 2**-1, 2**-2, 2**-3)
 
-        torch.testing.assert_close(test_out, test_ref, atol=0, rtol=0)
+        assert_equal(test_out, test_ref)
 
 
 if __name__ == "__main__":

diff --git a/tests/test_normalized_optimizer.py b/tests/test_normalized_optimizer.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import torch
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -164,22 +165,18 @@ def test_oblique_sgd_momentum_buffer_accumulates_across_steps(self) -> None:
 
         param.grad = first_grad.clone()
         optimizer.step()
-        torch.testing.assert_close(
+        assert_equal(
             optimizer.state[param]["momentum_buffer"],
             first_grad,
-            atol=0,
-            rtol=0,
         )
 
         param.grad = second_grad.clone()
         optimizer.step()
 
         expected_buffer = second_grad + 0.8 * first_grad
-        torch.testing.assert_close(
+        assert_equal(
             optimizer.state[param]["momentum_buffer"],
             expected_buffer,
-            atol=0,
-            rtol=0,
         )
 
     def test_oblique_adam_zero_gradient(self) -> None:

diff --git a/tests/test_orthogonalized_optimizer.py b/tests/test_orthogonalized_optimizer.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 import torch
 import torch.nn as nn
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -94,11 +95,9 @@ def test_orthogonalized_optimizer_core_matches_sgd(self, shape) -> None:
         orthogonalized_opt.step()
         sgd_opt.step()
 
-        torch.testing.assert_close(
+        assert_equal(
             test_param.data,
             ref_param.data,
-            atol=0,
-            rtol=0,
         )
 
     @parameterized.parameters(
@@ -139,11 +138,9 @@ def test_orthogonalized_optimizer_core_matches_sgd_with_momentum(self, shape) ->
             orthogonalized_opt.step()
             sgd_opt.step()
 
-        torch.testing.assert_close(
+        assert_equal(
             test_param.data,
             ref_param.data,
-            atol=0,
-            rtol=0,
         )
 
     @parameterized.parameters(
@@ -206,11 +203,9 @@ def dummy_interleaved_split_orth_fn(x: torch.Tensor) -> torch.Tensor:
         assert not torch.allclose(test_param, test_param.grad)
 
         ref_out = dummy_interleaved_split_orth_fn(test_param.grad)
-        torch.testing.assert_close(
+        assert_equal(
             test_param,
             ref_out,
-            atol=0,
-            rtol=0,
         )
 
     def test_non_2d_param_raises_value_error(self) -> None:
@@ -292,11 +287,9 @@ def test_use_independent_wd(self) -> None:
         )
         muon_opt_indep.step()
 
-        torch.testing.assert_close(
+        assert_equal(
             test_param,
             expected_param,
-            atol=0,
-            rtol=0,
         )
 
     def test_zero_num_ns_steps_raises_value_error(self) -> None:

diff --git a/tests/test_polargrad.py b/tests/test_polargrad.py
@@ -16,6 +16,7 @@
 
 import torch
 import torch.nn as nn
+from _comparison import assert_equal
 from absl import flags, logging
 from absl.testing import absltest, parameterized
 
@@ -74,11 +75,9 @@ def test_orthogonalize_fn_matches_ref(self, shape, extra_scale_factor) -> None:
 
         test_out = polargrad_opt.scaled_orthogonalize_fn(dummy_grad)
 
-        torch.testing.assert_close(
+        assert_equal(
             ref_out,
             test_out,
-            atol=0,
-            rtol=0,
         )
 
     def test_negative_num_ns_steps_raises_value_error(self) -> None:
@@ -104,8 +103,8 @@ def test_right_orthogonal_equivariance(self, shape, center_rows) -> None:
         torch.testing.assert_close(
             rotated,
             expected,
-            atol=1e-5,
-            rtol=1e-5,
+            atol=1e-4,
+            rtol=1e-4,
         )
 
     def test_usable_as_scaled_orthogonalize_fn(self) -> None: