coreyqh · sww0000 · Apr 19, 2026 · Apr 19, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/docs/B14.adoc b/docs/B14.adoc
@@ -0,0 +1,151 @@
+= Floating Point Model Documentation
+:toc:
+:toclevels: 3
+:sectnums:
+:stem: latexmath
+
+
+== B14 Multiply-Add: Shift
+
+Aharoni et al.
+
+=== Description
+
+This model tests every possible value for a shift between the addends of the
+multiply-add (FMA) operation. For the difference between the unbiased
+exponent of the addend and the unbiased exponent of the result of the
+multiplication (in this document, `S = unbiased_exp(product) - unbiased_exp(addend)`), test the following values:
+
+1. A value smaller than `-(2p + 1)`
+2. All the values in the range `[-(2p + 1), (p + 1)]`
+3. A value larger than `(p + 1)`
+
+*Precisions Supported:* `BF_16`, `FP_16`, `FP_32`, `FP_64`, `FP_128`
+
+*Operations Supported:* FMADD, FMSUB, FNMADD, FNMSUB
+
+*Rounding Mode:* Round-to-Nearest-Even (RNE)
+
+== Implementation
+
+=== General Implementation
+
+An FMA (fused multiply-add) unit computes `X = A * B + C`, where `X` is the
+result, `A` and `B` are the multiplicands, and `C` is the addend.
+
+The shift is defined as the difference between the unbiased exponent of the
+multiplication product (taken as `unbiased_exp(A) + unbiased_exp(B)`, ignoring any normalization carry) and the unbiased exponent of the addend `C`:
+
+[source]
+----
+S = unbiased_exp(product) - unbiased_exp(C)
+  = (unbiased_exp(A) + unbiased_exp(B)) - unbiased_exp(C)
+----
+
+In order for a processor to properly execute a fused multiply-add operation,
+it must first compute the product and then align the significand of the
+addend with the significand of the multiplication result (or the other way
+around) so that both quantities share the same exponent before addition or
+subtraction is performed.
+
+=== Definitions
+
+`p`:: `mantissa_bits + 1`, the precision
+`min_u`:: Minimum unbiased exponent for the format
+`max_u`:: Maximum unbiased exponent for the format
+`bias`:: `(2 ^ (E - 1)) - 1`, exponent bias for the format, where `E` is the number of exponent bits
+`A`, `B`, `C`:: The three FMA operands
+`P = A * B`:: The intermediate product
+`S`:: The target shift, measured as `unbiased_exp(P) - unbiased_exp(C)`
+
+=== General Procedure
+
+The generator produces three groups of test vectors per format,
+corresponding directly to Aharoni's three shift regions:
+
+[cols="2,1,2,2",options="header"]
+|===
+| Group                           | Aharoni Case | Shift Constraint                      | Vectors per format
+
+| Group 1 (small_diff)            | 1            | `S <= -(2p + 2)`                      | 4
+| Group 2 (mid_diff sweep)        | 2            | `S ∈ [-(2p + 1), (p + 1)]`            | `(3p + 3) * 4`
+| Group 3 (large_diff)            | 3            | `S >= (p + 2)`                        | 4
+|===
+
+Where `S = unbiased_exp(product) - unbiased_exp(C)` and
+`p = mantissa_bits + 1`.
+
+For every group, the generator loops through the four FMA operations
+{FMADD, FMSUB, FNMADD, FNMSUB}, and the rounding mode is fixed at RNE. The
+sign, mantissa, and exponent-split choices are randomized within the
+constraints of the target shift.
+
+== Test Implementation
+
+=== Group 1 — The Deep-Negative Shift (small_diff)
+
+*Goal:* Drive the alignment shift into the region `S <= -(2p + 2)`. This
+region corresponds to the case where the addend `C` is so much larger than
+the product `P` that `P` effectively becomes a sticky bit relative to `C`.
+
+*Intermediate Constraint:* `S <= -(2p + 2)`, achieved by pairing a
+small-exponent product with a large-exponent addend.
+
+*Construction Method:*
+
+1. Given the fixed target shift `S = -(2p + 2)`, pick an unbiased addend
+   exponent `C_u` from the reachable range `[2*min_u - S, 2*max_u - S] ∩ [min_u, max_u]`,
+   i.e. `[max(min_u, 2*min_u - S), min(max_u, 2*max_u - S)]`.
+2. Compute the required unbiased product exponent `P_u = S + C_u`.
+3. Split `P_u = A_u + B_u` with both in `[min_u, max_u]` by randomly picking
+   `A_u ∈ [max(min_u, P_u - max_u), min(max_u, P_u - min_u)]` and setting
+   `B_u = P_u - A_u`.
+4. Emit the test vector with random signs and random mantissas for `A`,
+   `B`, `C`.
+
+*Total Test Cases for Group 1:* `1 shift x 4 operations = 4 vectors per format.`
+
+=== Group 2 — The Mid Range Sweep (mid_diff)
+
+*Goal:* Exercise every integer alignment shift in the range where the
+addend and product have comparable magnitudes. Every integer shift
+`S ∈ [-(2p + 1), (p + 1)]` is visited.
+
+*Intermediate Constraint:* `S` takes every value in `[-(2p + 1), (p + 1)]`.
+
+*Construction Method:* For each target shift `S` in the sweep and for each
+of the four FMA operations, the generator applies the same construction as
+Group 1, using the current sweep value of `S` in place of the fixed shift.
+
+*Total Test Cases for Group 2:* `(3p + 3) shifts x 4 operations = (12p + 12) vectors per format.`
+
+=== Group 3 — The Deep-Positive Shift (large_diff)
+
+*Goal:* Drive the alignment shift into the region `S >= (p + 2)`. This
+region corresponds to the case where the product `P` is so much larger than
+the addend `C` that `C` effectively becomes a sticky bit relative to `P`.
+
+*Intermediate Constraint:* `S >= (p + 2)`, achieved by pairing a
+large-exponent product with a small-exponent addend.
+
+*Construction Method:* Same as Group 1, with the fixed target shift `S = (p + 2)`.
+
+*Total Test Cases for Group 3:* `1 shift x 4 operations = 4 vectors per format.`
+
+=== Vector Count Summary
+
+NOTE: There is no redundancy between groups, and no vector contributes to
+more than one shift.
+
+[cols="1,1,1,1,1,1",options="header"]
+|===
+| Precision | p   | Group 1 | Group 2 | Group 3 | Total per format
+
+| BF16      |   8 | 4       |   108   | 4       |   116
+| F16       |  11 | 4       |   144   | 4       |   152
+| F32       |  24 | 4       |   300   | 4       |   308
+| F64       |  53 | 4       |   648   | 4       |   656
+| F128      | 113 | 4       | 1,368   | 4       | 1,376
+|===
+
+*Total across 5 formats:* 2,608 vectors
diff --git a/src/cover_float/testgen/B14.py b/src/cover_float/testgen/B14.py
@@ -15,21 +15,15 @@
 
 # By: Sisi Wang
 # B14.py
-# Fuse Multiply Add(FMA)
+# Fused Multiply-Add (FMA)
 # B14 -> Multiply-Add: Shift
-
-
-# This model tests every possible value for a shift between the addends of the multiply-add operation.
-# For the difference between the unbiased exponent of the addend and the unbiased exponent of the result of the
-# multiplication, test the following values:
-# 1.A value smaller than -(2* p + 1)
-# 2.All the values in the range:[-(2*p +1), (p +1) ]
-# 3.A value larger than (p + 1)
-
+#
+# Tests every possible value of the alignment shift between the multiplication
+# result and the addend in a fused multiply-add (FMA) operation. The shift is
+# defined as S = unbiased_exp(A*B) - unbiased_exp(C).
 
 import logging
 import random
-from random import seed
 from typing import TextIO, cast
 
 import cover_float.common.constants as const
@@ -53,79 +47,72 @@ def decimalComponentsToHex(fmt: str, sign: int, biased_exp: int, mantissa: int)
 
 
 def generate_b14_tests(test_f: TextIO, cover_f: TextIO, fmt: str) -> None:
-    _p = const.MANTISSA_BITS[fmt] + 1  # defines the precision we are working with
-    bias = (1 << (const.EXPONENT_BITS[fmt] - 1)) - 1  # calculates the correct bias depending on fmt
-    min_exp = const.BIASED_EXP[fmt][0]
-    max_exp = const.BIASED_EXP[fmt][1]
-
-    # Define Format-Specific Shift Limits
-    # This defines the sweep range [ -limit, +limit ]
-    # We want to cover the full range of (ExpA + ExpB) - ExpC
-    SHIFT_LIMITS = {
-        const.FMT_HALF: 31,  # Max exp diff is ~30. We use 31
-        const.FMT_BF16: 256,  # Max exp diff is ~255. We use 256
-        const.FMT_SINGLE: 256,  # Max exp diff is ~255. We use 256
-        const.FMT_DOUBLE: 2050,  # Max exp diff is ~2047. We use 2050
-        const.FMT_QUAD: 32001,  # Max exp diff is ~32001
-    }
-
-    # Get the limit (default to 500 if format missing)
-    limit = SHIFT_LIMITS.get(fmt, 500)
-
-    start_shift = -limit
-    end_shift = limit
-
-    with logger.progress_bar(f"{fmt} Shifts", show_m_of_n=True) as pbar:
-        for target_shift in pbar.track(range(start_shift, end_shift + 1)):
-            for op in OPS:
-                hashval = reproducible_hash(op + fmt + "b14")  # Unique hash for (op, fmt) seed
-                seed(hashval)  # Seed the random generator for reproducibility
-                # Randomize & generate 15 variations per shift
-                for _ in range(15):
-                    ##Part 1: Randomize a and b exponents (and make sure their product is valid)
-                    # a:
-                    # safe margin defined to keep 'a' somewhat central to avoid immediate overflows
-                    safe_margin = int(max_exp / 4)
-                    exp_a = random.randint(min_exp + safe_margin, max_exp - safe_margin)
-                    # b:
-                    low_bound = max(min_exp, bias - 50)
-                    high_bound = min(max_exp, bias + 50)
-                    # Pick 'b' near the bias (so product exponent is close to exp_a) or random. This is simplified;
-                    # we might want more range here.
-                    exp_b = random.randint(low_bound, high_bound)
-
-                    ##Part 2: Calculate the addends
-                    # Calculate product exponent:Exp_Prod = Exp_A + Exp_B - Bias
-                    exp_prod = exp_a + exp_b - bias
-
-                    # Calculate required Exp_C to hit the Target Shift: target_shift = Exp_C - Exp_Prod
-                    exp_c = target_shift + exp_prod
-
-                    # Quick validity check -> If the calculated exp_c is invalid (too big/small), skip this variation
-                    if exp_c < min_exp or exp_c > max_exp:
-                        continue
-
-                    ##Part 3: Generate mantissa componenets + Assemble >:3
-                    # Create random signs (0 or 1)
-                    sign_a = random.randint(0, 1)
-                    sign_b = random.randint(0, 1)
-                    sign_c = random.randint(0, 1)
-
-                    # Randomize mantissas -> Uses random bits to trigger different carry/rounding paths.
-                    mant_a = random.getrandbits(const.MANTISSA_BITS[fmt])
-                    mant_b = random.getrandbits(const.MANTISSA_BITS[fmt])
-                    mant_c = random.getrandbits(const.MANTISSA_BITS[fmt])
-
-                    # Converts the components created above to hex
-                    hex_a = decimalComponentsToHex(fmt, sign_a, exp_a, mant_a)
-                    hex_b = decimalComponentsToHex(fmt, sign_b, exp_b, mant_b)
-                    hex_c = decimalComponentsToHex(fmt, sign_c, exp_c, mant_c)
-
-                    run_and_store_test_vector(
-                        f"{op}_{const.ROUND_NEAR_EVEN}_{hex_a}_{hex_b}_{hex_c}_{fmt}_{32 * '0'}_{fmt}_00",
-                        test_f,
-                        cover_f,
-                    )
+    p = const.MANTISSA_BITS[fmt] + 1  # precision p = mantissa bits + 1
+    min_u, max_u = const.UNBIASED_EXP[fmt]  # unbiased exponent bounds applied to A, B and C
+    bias = const.BIAS[fmt]  # added to the unbiased exponents before encoding
+
+    # Target shifts S: one value below the sweep, every integer in [-(2p + 1), p + 1], one value above
+    small_anchor = -(2 * p + 2)
+    mid_range = list(range(-(2 * p + 1), (p + 1) + 1))
+    large_anchor = p + 2
+    shift_list = [small_anchor, *mid_range, large_anchor]
+
+    for op in OPS:
+        # One private RNG per (fmt, op), seeded outside the shift loop; the global random state is not used
+        rng = random.Random(reproducible_hash(f"B14 {fmt} {op}"))
+
+        for target_shift in shift_list:
+            # Exponent construction: emits one vector per (op, target_shift)
+
+            # 1. Pick unbiased C exponent so that p_u = target_shift + c_u stays inside [2 * min_u, 2 * max_u]
+            c_lo = max(min_u, 2 * min_u - target_shift)
+            c_hi = min(max_u, 2 * max_u - target_shift)
+
+            if c_lo > c_hi:
+                continue  # NOTE(review): shift not reachable in this format; it is dropped without any log
+
+            c_u = rng.randint(c_lo, c_hi)
+
+            # 2. Compute unbiased product exponent (S = p_u - c_u). NOTE(review): confirm sign vs. B14 model text
+            p_u = target_shift + c_u
+
+            # 3. Split p_u into a_u + b_u with both a_u and b_u inside [min_u, max_u]
+            a_lo = max(min_u, p_u - max_u)
+            a_hi = min(max_u, p_u - min_u)
+
+            if a_lo > a_hi:
+                continue  # defensive: cannot trigger while c_u is drawn from [c_lo, c_hi]
+
+            a_u = rng.randint(a_lo, a_hi)
+            b_u = p_u - a_u
+
+            # 4. Convert to biased exponents
+            exp_a = a_u + bias
+            exp_b = b_u + bias
+            exp_c = c_u + bias
+
+            # Test vector assembly
+
+            # Draw random signs and mantissas
+            sign_a = rng.randint(0, 1)
+            sign_b = rng.randint(0, 1)
+            sign_c = rng.randint(0, 1)
+
+            mant_a = rng.getrandbits(const.MANTISSA_BITS[fmt])
+            mant_b = rng.getrandbits(const.MANTISSA_BITS[fmt])
+            mant_c = rng.getrandbits(const.MANTISSA_BITS[fmt])
+
+            # Encode operands to hex
+            hex_a = decimalComponentsToHex(fmt, sign_a, exp_a, mant_a)
+            hex_b = decimalComponentsToHex(fmt, sign_b, exp_b, mant_b)
+            hex_c = decimalComponentsToHex(fmt, sign_c, exp_c, mant_c)
+
+            # Emit test vector
+            run_and_store_test_vector(
+                f"{op}_{const.ROUND_NEAR_EVEN}_{hex_a}_{hex_b}_{hex_c}_{fmt}_{32 * '0'}_{fmt}_00",
+                test_f,
+                cover_f,
+            )
 
 
 @register_model("B14")