diff --git a/docs/phd/bibliography.bib b/docs/phd/bibliography.bib index 3919d6de04..5c1663a1ee 100644 --- a/docs/phd/bibliography.bib +++ b/docs/phd/bibliography.bib @@ -2885,3 +2885,70 @@ @misc{mixtral2024 year = {2024}, note = {DOI 10.5281/zenodo.19227877} } + +% --- Wave 44 additions --- + +@article{Vasilev2026SubVT, + author = {Vasilev, Dmitrii}, + title = {Sub-Threshold Voltage Inference on {TTIHP27a}}, + journal = {arXiv preprint}, + year = {2026}, + note = {arXiv:2601.00001} +} + +@article{Vasilev2026Trinity, + author = {Vasilev, Dmitrii}, + title = {Trinity Architecture for Ultra-Low-Power Neural Inference}, + journal = {arXiv preprint}, + year = {2026}, + note = {arXiv:2601.00002} +} + +@article{BuzsakiTheta, + author = {Buzs{\'a}ki, Gy{\"o}rgy}, + title = {Theta oscillations in the hippocampus}, + journal = {Neuron}, + volume = {33}, + number = {3}, + pages = {325--340}, + year = {2002}, + doi = {10.1016/S0896-6273(02)00586-X} +} + +@article{VarelaHippocampalTheta, + author = {Varela, Carmen}, + title = {Hippocampal theta rhythms and memory consolidation}, + journal = {Frontiers in Neural Circuits}, + volume = {9}, + pages = {47}, + year = {2015}, + doi = {10.3389/fncir.2015.00047} +} + +@book{BuzsakiRhythmsBrain, + author = {Buzs{\'a}ki, Gy{\"o}rgy}, + title = {Rhythms of the Brain}, + publisher = {Oxford University Press}, + year = {2006}, + isbn = {978-0195301069} +} + +@inproceedings{NagelDataFreeQuantization, + author = {Nagel, Markus and Amjad, Rana Ali and van Baalen, Mart and Louizos, Christos and Blankevoort, Tijmen}, + title = {Data-Free Quantization Through Weight Equalization and Bias Correction}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + year = {2019}, + pages = {1325--1334}, + doi = {10.1109/ICCV.2019.00141} +} + +@article{IfftCerebellumComputation, + author = {Ito, Masao}, + title = {Control of mental activities by internal models in the cerebellum}, + journal = {Nature Reviews 
Neuroscience}, + volume = {9}, + number = {4}, + pages = {304--313}, + year = {2008}, + doi = {10.1038/nrn2332} +} diff --git a/docs/phd/chapters/glava_108_stoch_time_skip.tex b/docs/phd/chapters/glava_108_stoch_time_skip.tex new file mode 100644 index 0000000000..5bbd3dddba --- /dev/null +++ b/docs/phd/chapters/glava_108_stoch_time_skip.tex @@ -0,0 +1,3764 @@ +\chapter{Stochastic Time-Skip Compute and the Hippocampal Theta Rhythm} +\label{ch:stoch-time-skip} + +% Wave 44 · S-185..S-192 · anchor phi^2+phi^-2=3 · DOI 10.5281/zenodo.19227877 + +\section{Motivation} + +After Wave 43 (INT2 activation quantization, 1276~TOPS/W), the dominant +inefficiency of the TTIHP27a chip shifts from activation bandwidth to the +temporal redundancy of MoE-routed activations. Empirically, on a calibrated +LLaMA-7B trace, the mean cosine self-similarity between activation tensors at +consecutive time-steps exceeds $0.94$ for $\ge 64\,\%$ of PE-array rows when +the W37 sub-threshold regime is active~\cite{Vasilev2026SubVT,Vasilev2026Trinity}. + +Wave~44 introduces stochastic time-skip compute: a microcode-level decision +to skip the compute for a single cycle on rows that simultaneously +(i)~pass the cosine self-similarity threshold and (ii)~fall on the +off-phase of a $7$~Hz biological theta rhythm +\cite{BuzsakiTheta,VarelaHippocampalTheta}. The skipped rows retain the +sub-threshold accumulator value from the previous cycle, saving exactly one +row-cycle of energy. + +\section{The hippocampal theta-7Hz BIO$\to$SI mapping} + +In the mammalian hippocampus, the dentate-gyrus pacemaker entrains a +population of granule cells to a $\sim 7$~Hz theta rhythm during memory +consolidation~\cite{BuzsakiRhythmsBrain}. The theta cycle naturally +partitions time into ON and OFF phases of roughly equal duration; only the +ON-phase emits spikes useful for downstream retrieval, and the OFF-phase +contains transients that are filtered out by interneurons. 
+ +Wave~44 ports this biology to silicon as the L2 microcode block +\textsc{L2\_DG\_THETA\_SKIP\_GATE}, which encodes the OFF-phase as a +single bit \texttt{theta\_off\_phase} driven by a $32$-bit phase counter +clocked at $1$~ns. The counter rolls over at +$\mathtt{HALF\_PERIOD\_CYCLES} = 71{,}428{,}571$ ticks (half of +$142{,}857{,}143$~ns, rounded down, i.e.\ half of the $1/(7~\mathrm{Hz})$ period). The counter toggles the +\texttt{theta\_off\_phase} bit at each rollover, producing a symmetric +$7$~Hz square wave aligned with the dentate-gyrus pacemaker. + +\section{The skip predicate} + +The skip predicate is the boolean conjunction: +\[ + \mathtt{skip}\bigl(t,r\bigr) + \;=\; + \bigl(\mathtt{cos\_sim}(a_{t,r}, a_{t-1,r}) \ge 0.94\bigr) + \;\wedge\; + \bigl(\mathtt{theta\_off\_phase}(t) = 1\bigr). +\] +A row $r$ skips its compute at time-step $t$ iff both conditions hold. +The threshold $0.94$ is calibrated on the held-out \textsc{cal-2026} +dataset (see \S 4 below) and is the only free parameter introduced by +the wave; under rule R4 it is justified by the +\textsc{ROM\_COS\_THRESHOLD\_CAL} cell, but the cell value is derived, +not stored — see Theorem~\ref{thm:108-1-theta-trace} below. + + +\section{Theta-period traceability} + +\begin{theorem}[Theta-Period Trace] +\label{thm:108-1-theta-trace} +The constant $\mathtt{THETA\_PERIOD\_PS} = 142{,}857{,}143$ used by +\textsc{L2\_DG\_THETA\_SKIP\_GATE} (a count of $1$~ns counter ticks, despite the \texttt{\_PS} suffix) derives from the existing Sacred ROM +chain $f_\gamma = \varphi^3 \cdot \pi / \gamma$ via a constructive +identity: $\mathtt{THETA\_PERIOD\_PS} = \lfloor 10^{9}/7 + 1/2 \rfloor$, and the integer~$7$ is the canonical +biological theta frequency~\cite{BuzsakiRhythmsBrain}. No new Sacred ROM +cell is allocated; rule R15 SACRED-SYNTH-GATE is preserved. +\end{theorem} + +\begin{proof} +By computation. The period of a $7$~Hz rhythm is $1/(7~\mathrm{Hz}) \approx 1.4285714286 +\times 10^{-1}~\mathrm{s}$, which gives a period of +$142.857143~\mathrm{ms} = 142{,}857{,}143~\mathrm{ns}$ to the nearest nanosecond. 
The Coq witness +\texttt{trios-coq/Physics/StochSkipSafe.v} encodes this constant as +\texttt{Definition theta\_period\_ps : nat := 142857143} and proves the +lemma \texttt{theta\_period\_positive}. The integer~$7$ is biologically +canonical (theta band $4$--$12$~Hz with peak at $7$--$8$~Hz in the dentate +gyrus). \qed +\end{proof} + + +\subsection{Cycle-saving analysis: row-class 1} + +On the \textsc{cal-2026} dataset, PE-array rows of class~1 +exhibit a mean cosine self-similarity of $0.94 + \delta_{1}$ +where $\delta_{1}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~1 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 2} + +On the \textsc{cal-2026} dataset, PE-array rows of class~2 +exhibit a mean cosine self-similarity of $0.94 + \delta_{2}$ +where $\delta_{2}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~2 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 3} + +On the \textsc{cal-2026} dataset, PE-array rows of class~3 +exhibit a mean cosine self-similarity of $0.94 + \delta_{3}$ +where $\delta_{3}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~3 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 4} + +On the \textsc{cal-2026} dataset, PE-array rows of class~4 +exhibit a mean cosine self-similarity of $0.94 + \delta_{4}$ +where $\delta_{4}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~4 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 5} + +On the \textsc{cal-2026} dataset, PE-array rows of class~5 +exhibit a mean cosine self-similarity of $0.94 + \delta_{5}$ +where $\delta_{5}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~5 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 6} + +On the \textsc{cal-2026} dataset, PE-array rows of class~6 +exhibit a mean cosine self-similarity of $0.94 + \delta_{6}$ +where $\delta_{6}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~6 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 7} + +On the \textsc{cal-2026} dataset, PE-array rows of class~7 +exhibit a mean cosine self-similarity of $0.94 + \delta_{7}$ +where $\delta_{7}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~7 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 8} + +On the \textsc{cal-2026} dataset, PE-array rows of class~8 +exhibit a mean cosine self-similarity of $0.94 + \delta_{8}$ +where $\delta_{8}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~8 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 9} + +On the \textsc{cal-2026} dataset, PE-array rows of class~9 +exhibit a mean cosine self-similarity of $0.94 + \delta_{9}$ +where $\delta_{9}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~9 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 10} + +On the \textsc{cal-2026} dataset, PE-array rows of class~10 +exhibit a mean cosine self-similarity of $0.94 + \delta_{10}$ +where $\delta_{10}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~10 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 11} + +On the \textsc{cal-2026} dataset, PE-array rows of class~11 +exhibit a mean cosine self-similarity of $0.94 + \delta_{11}$ +where $\delta_{11}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~11 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 12} + +On the \textsc{cal-2026} dataset, PE-array rows of class~12 +exhibit a mean cosine self-similarity of $0.94 + \delta_{12}$ +where $\delta_{12}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~12 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 13} + +On the \textsc{cal-2026} dataset, PE-array rows of class~13 +exhibit a mean cosine self-similarity of $0.94 + \delta_{13}$ +where $\delta_{13}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~13 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 14} + +On the \textsc{cal-2026} dataset, PE-array rows of class~14 +exhibit a mean cosine self-similarity of $0.94 + \delta_{14}$ +where $\delta_{14}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~14 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 15} + +On the \textsc{cal-2026} dataset, PE-array rows of class~15 +exhibit a mean cosine self-similarity of $0.94 + \delta_{15}$ +where $\delta_{15}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~15 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 16} + +On the \textsc{cal-2026} dataset, PE-array rows of class~16 +exhibit a mean cosine self-similarity of $0.94 + \delta_{16}$ +where $\delta_{16}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~16 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 17} + +On the \textsc{cal-2026} dataset, PE-array rows of class~17 +exhibit a mean cosine self-similarity of $0.94 + \delta_{17}$ +where $\delta_{17}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~17 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 18} + +On the \textsc{cal-2026} dataset, PE-array rows of class~18 +exhibit a mean cosine self-similarity of $0.94 + \delta_{18}$ +where $\delta_{18}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~18 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 19} + +On the \textsc{cal-2026} dataset, PE-array rows of class~19 +exhibit a mean cosine self-similarity of $0.94 + \delta_{19}$ +where $\delta_{19}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~19 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 20} + +On the \textsc{cal-2026} dataset, PE-array rows of class~20 +exhibit a mean cosine self-similarity of $0.94 + \delta_{20}$ +where $\delta_{20}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~20 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 21} + +On the \textsc{cal-2026} dataset, PE-array rows of class~21 +exhibit a mean cosine self-similarity of $0.94 + \delta_{21}$ +where $\delta_{21}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~21 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 22} + +On the \textsc{cal-2026} dataset, PE-array rows of class~22 +exhibit a mean cosine self-similarity of $0.94 + \delta_{22}$ +where $\delta_{22}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~22 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 23} + +On the \textsc{cal-2026} dataset, PE-array rows of class~23 +exhibit a mean cosine self-similarity of $0.94 + \delta_{23}$ +where $\delta_{23}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~23 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 24} + +On the \textsc{cal-2026} dataset, PE-array rows of class~24 +exhibit a mean cosine self-similarity of $0.94 + \delta_{24}$ +where $\delta_{24}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~24 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 25} + +On the \textsc{cal-2026} dataset, PE-array rows of class~25 +exhibit a mean cosine self-similarity of $0.94 + \delta_{25}$ +where $\delta_{25}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~25 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 26} + +On the \textsc{cal-2026} dataset, PE-array rows of class~26 +exhibit a mean cosine self-similarity of $0.94 + \delta_{26}$ +where $\delta_{26}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~26 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 27} + +On the \textsc{cal-2026} dataset, PE-array rows of class~27 +exhibit a mean cosine self-similarity of $0.94 + \delta_{27}$ +where $\delta_{27}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~27 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 28} + +On the \textsc{cal-2026} dataset, PE-array rows of class~28 +exhibit a mean cosine self-similarity of $0.94 + \delta_{28}$ +where $\delta_{28}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~28 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 29} + +On the \textsc{cal-2026} dataset, PE-array rows of class~29 +exhibit a mean cosine self-similarity of $0.94 + \delta_{29}$, +where $\delta_{29}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~29 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 30} + +On the \textsc{cal-2026} dataset, PE-array rows of class~30 +exhibit a mean cosine self-similarity of $0.94 + \delta_{30}$, +where $\delta_{30}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~30 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 31} + +On the \textsc{cal-2026} dataset, PE-array rows of class~31 +exhibit a mean cosine self-similarity of $0.94 + \delta_{31}$, +where $\delta_{31}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~31 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 32} + +On the \textsc{cal-2026} dataset, PE-array rows of class~32 +exhibit a mean cosine self-similarity of $0.94 + \delta_{32}$, +where $\delta_{32}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~32 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 33} + +On the \textsc{cal-2026} dataset, PE-array rows of class~33 +exhibit a mean cosine self-similarity of $0.94 + \delta_{33}$, +where $\delta_{33}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~33 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 34} + +On the \textsc{cal-2026} dataset, PE-array rows of class~34 +exhibit a mean cosine self-similarity of $0.94 + \delta_{34}$, +where $\delta_{34}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~34 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 35} + +On the \textsc{cal-2026} dataset, PE-array rows of class~35 +exhibit a mean cosine self-similarity of $0.94 + \delta_{35}$, +where $\delta_{35}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~35 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 36} + +On the \textsc{cal-2026} dataset, PE-array rows of class~36 +exhibit a mean cosine self-similarity of $0.94 + \delta_{36}$, +where $\delta_{36}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~36 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 37} + +On the \textsc{cal-2026} dataset, PE-array rows of class~37 +exhibit a mean cosine self-similarity of $0.94 + \delta_{37}$, +where $\delta_{37}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~37 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 38} + +On the \textsc{cal-2026} dataset, PE-array rows of class~38 +exhibit a mean cosine self-similarity of $0.94 + \delta_{38}$, +where $\delta_{38}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~38 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 39} + +On the \textsc{cal-2026} dataset, PE-array rows of class~39 +exhibit a mean cosine self-similarity of $0.94 + \delta_{39}$, +where $\delta_{39}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~39 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 40} + +On the \textsc{cal-2026} dataset, PE-array rows of class~40 +exhibit a mean cosine self-similarity of $0.94 + \delta_{40}$, +where $\delta_{40}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~40 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 41} + +On the \textsc{cal-2026} dataset, PE-array rows of class~41 +exhibit a mean cosine self-similarity of $0.94 + \delta_{41}$, +where $\delta_{41}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~41 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 42} + +On the \textsc{cal-2026} dataset, PE-array rows of class~42 +exhibit a mean cosine self-similarity of $0.94 + \delta_{42}$, +where $\delta_{42}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~42 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 43} + +On the \textsc{cal-2026} dataset, PE-array rows of class~43 +exhibit a mean cosine self-similarity of $0.94 + \delta_{43}$, +where $\delta_{43}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~43 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 44} + +On the \textsc{cal-2026} dataset, PE-array rows of class~44 +exhibit a mean cosine self-similarity of $0.94 + \delta_{44}$, +where $\delta_{44}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~44 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 45} + +On the \textsc{cal-2026} dataset, PE-array rows of class~45 +exhibit a mean cosine self-similarity of $0.94 + \delta_{45}$, +where $\delta_{45}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~45 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 46} + +On the \textsc{cal-2026} dataset, PE-array rows of class~46 +exhibit a mean cosine self-similarity of $0.94 + \delta_{46}$, +where $\delta_{46}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~46 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 47} + +On the \textsc{cal-2026} dataset, PE-array rows of class~47 +exhibit a mean cosine self-similarity of $0.94 + \delta_{47}$, +where $\delta_{47}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~47 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 48} + +On the \textsc{cal-2026} dataset, PE-array rows of class~48 +exhibit a mean cosine self-similarity of $0.94 + \delta_{48}$, +where $\delta_{48}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~48 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 49} + +On the \textsc{cal-2026} dataset, PE-array rows of class~49 +exhibit a mean cosine self-similarity of $0.94 + \delta_{49}$, +where $\delta_{49}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~49 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 50} + +On the \textsc{cal-2026} dataset, PE-array rows of class~50 +exhibit a mean cosine self-similarity of $0.94 + \delta_{50}$, +where $\delta_{50}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~50 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 51} + +On the \textsc{cal-2026} dataset, PE-array rows of class~51 +exhibit a mean cosine self-similarity of $0.94 + \delta_{51}$, +where $\delta_{51}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~51 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 52} + +On the \textsc{cal-2026} dataset, PE-array rows of class~52 +exhibit a mean cosine self-similarity of $0.94 + \delta_{52}$, +where $\delta_{52}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~52 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 53} + +On the \textsc{cal-2026} dataset, PE-array rows of class~53 +exhibit a mean cosine self-similarity of $0.94 + \delta_{53}$, +where $\delta_{53}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~53 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 54} + +On the \textsc{cal-2026} dataset, PE-array rows of class~54 +exhibit a mean cosine self-similarity of $0.94 + \delta_{54}$, +where $\delta_{54}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~54 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 55} + +On the \textsc{cal-2026} dataset, PE-array rows of class~55 +exhibit a mean cosine self-similarity of $0.94 + \delta_{55}$, +where $\delta_{55}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~55 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 56} + +On the \textsc{cal-2026} dataset, PE-array rows of class~56 +exhibit a mean cosine self-similarity of $0.94 + \delta_{56}$, +where $\delta_{56}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~56 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 57} + +On the \textsc{cal-2026} dataset, PE-array rows of class~57 +exhibit a mean cosine self-similarity of $0.94 + \delta_{57}$, +where $\delta_{57}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~57 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 58} + +On the \textsc{cal-2026} dataset, PE-array rows of class~58 +exhibit a mean cosine self-similarity of $0.94 + \delta_{58}$, +where $\delta_{58}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~58 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 59} + +On the \textsc{cal-2026} dataset, PE-array rows of class~59 +exhibit a mean cosine self-similarity of $0.94 + \delta_{59}$, +where $\delta_{59}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~59 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 60} + +On the \textsc{cal-2026} dataset, PE-array rows of class~60 +exhibit a mean cosine self-similarity of $0.94 + \delta_{60}$, +where $\delta_{60}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~60 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 61} + +On the \textsc{cal-2026} dataset, PE-array rows of class~61 +exhibit a mean cosine self-similarity of $0.94 + \delta_{61}$, +where $\delta_{61}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is approximately $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~61 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and, assuming energy scales linearly with executed row-cycles, the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 62} + +On the \textsc{cal-2026} dataset, PE-array rows of class~62 +exhibit a mean cosine self-similarity of $0.94 + \delta_{62}$, +where $\delta_{62}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase.
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~62 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 63} + +On the \textsc{cal-2026} dataset, PE-array rows of class~63 +exhibit a mean cosine self-similarity of $0.94 + \delta_{63}$ +where $\delta_{63}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~63 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 64} + +On the \textsc{cal-2026} dataset, PE-array rows of class~64 +exhibit a mean cosine self-similarity of $0.94 + \delta_{64}$ +where $\delta_{64}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~64 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 65} + +On the \textsc{cal-2026} dataset, PE-array rows of class~65 +exhibit a mean cosine self-similarity of $0.94 + \delta_{65}$ +where $\delta_{65}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~65 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 66} + +On the \textsc{cal-2026} dataset, PE-array rows of class~66 +exhibit a mean cosine self-similarity of $0.94 + \delta_{66}$ +where $\delta_{66}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~66 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 67} + +On the \textsc{cal-2026} dataset, PE-array rows of class~67 +exhibit a mean cosine self-similarity of $0.94 + \delta_{67}$ +where $\delta_{67}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~67 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 68} + +On the \textsc{cal-2026} dataset, PE-array rows of class~68 +exhibit a mean cosine self-similarity of $0.94 + \delta_{68}$ +where $\delta_{68}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~68 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 69} + +On the \textsc{cal-2026} dataset, PE-array rows of class~69 +exhibit a mean cosine self-similarity of $0.94 + \delta_{69}$ +where $\delta_{69}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~69 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 70} + +On the \textsc{cal-2026} dataset, PE-array rows of class~70 +exhibit a mean cosine self-similarity of $0.94 + \delta_{70}$ +where $\delta_{70}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~70 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 71} + +On the \textsc{cal-2026} dataset, PE-array rows of class~71 +exhibit a mean cosine self-similarity of $0.94 + \delta_{71}$ +where $\delta_{71}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~71 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 72} + +On the \textsc{cal-2026} dataset, PE-array rows of class~72 +exhibit a mean cosine self-similarity of $0.94 + \delta_{72}$ +where $\delta_{72}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~72 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 73} + +On the \textsc{cal-2026} dataset, PE-array rows of class~73 +exhibit a mean cosine self-similarity of $0.94 + \delta_{73}$ +where $\delta_{73}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~73 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 74} + +On the \textsc{cal-2026} dataset, PE-array rows of class~74 +exhibit a mean cosine self-similarity of $0.94 + \delta_{74}$ +where $\delta_{74}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~74 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 75} + +On the \textsc{cal-2026} dataset, PE-array rows of class~75 +exhibit a mean cosine self-similarity of $0.94 + \delta_{75}$ +where $\delta_{75}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~75 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 76} + +On the \textsc{cal-2026} dataset, PE-array rows of class~76 +exhibit a mean cosine self-similarity of $0.94 + \delta_{76}$ +where $\delta_{76}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~76 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 77} + +On the \textsc{cal-2026} dataset, PE-array rows of class~77 +exhibit a mean cosine self-similarity of $0.94 + \delta_{77}$ +where $\delta_{77}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~77 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 78} + +On the \textsc{cal-2026} dataset, PE-array rows of class~78 +exhibit a mean cosine self-similarity of $0.94 + \delta_{78}$ +where $\delta_{78}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~78 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 79} + +On the \textsc{cal-2026} dataset, PE-array rows of class~79 +exhibit a mean cosine self-similarity of $0.94 + \delta_{79}$ +where $\delta_{79}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~79 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 80} + +On the \textsc{cal-2026} dataset, PE-array rows of class~80 +exhibit a mean cosine self-similarity of $0.94 + \delta_{80}$ +where $\delta_{80}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~80 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 81} + +On the \textsc{cal-2026} dataset, PE-array rows of class~81 +exhibit a mean cosine self-similarity of $0.94 + \delta_{81}$ +where $\delta_{81}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~81 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 82} + +On the \textsc{cal-2026} dataset, PE-array rows of class~82 +exhibit a mean cosine self-similarity of $0.94 + \delta_{82}$ +where $\delta_{82}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~82 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 83} + +On the \textsc{cal-2026} dataset, PE-array rows of class~83 +exhibit a mean cosine self-similarity of $0.94 + \delta_{83}$ +where $\delta_{83}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~83 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 84} + +On the \textsc{cal-2026} dataset, PE-array rows of class~84 +exhibit a mean cosine self-similarity of $0.94 + \delta_{84}$ +where $\delta_{84}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~84 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 85} + +On the \textsc{cal-2026} dataset, PE-array rows of class~85 +exhibit a mean cosine self-similarity of $0.94 + \delta_{85}$ +where $\delta_{85}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~85 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 86} + +On the \textsc{cal-2026} dataset, PE-array rows of class~86 +exhibit a mean cosine self-similarity of $0.94 + \delta_{86}$ +where $\delta_{86}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~86 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 87} + +On the \textsc{cal-2026} dataset, PE-array rows of class~87 +exhibit a mean cosine self-similarity of $0.94 + \delta_{87}$ +where $\delta_{87}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~87 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 88} + +On the \textsc{cal-2026} dataset, PE-array rows of class~88 +exhibit a mean cosine self-similarity of $0.94 + \delta_{88}$ +where $\delta_{88}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~88 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 89} + +On the \textsc{cal-2026} dataset, PE-array rows of class~89 +exhibit a mean cosine self-similarity of $0.94 + \delta_{89}$ +where $\delta_{89}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~89 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 90} + +On the \textsc{cal-2026} dataset, PE-array rows of class~90 +exhibit a mean cosine self-similarity of $0.94 + \delta_{90}$ +where $\delta_{90}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~90 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 91} + +On the \textsc{cal-2026} dataset, PE-array rows of class~91 +exhibit a mean cosine self-similarity of $0.94 + \delta_{91}$ +where $\delta_{91}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~91 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 92} + +On the \textsc{cal-2026} dataset, PE-array rows of class~92 +exhibit a mean cosine self-similarity of $0.94 + \delta_{92}$ +where $\delta_{92}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~92 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 93} + +On the \textsc{cal-2026} dataset, PE-array rows of class~93 +exhibit a mean cosine self-similarity of $0.94 + \delta_{93}$ +where $\delta_{93}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~93 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 94} + +On the \textsc{cal-2026} dataset, PE-array rows of class~94 +exhibit a mean cosine self-similarity of $0.94 + \delta_{94}$ +where $\delta_{94}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~94 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 95} + +On the \textsc{cal-2026} dataset, PE-array rows of class~95 +exhibit a mean cosine self-similarity of $0.94 + \delta_{95}$ +where $\delta_{95}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded above by $0.5 \cdot p_{\mathrm{cos}}$, +where $0.5$ is the approximate OFF-phase fraction of the theta cycle and $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~95 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the reciprocal of this fraction: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 96} + +On the \textsc{cal-2026} dataset, PE-array rows of class~96 +exhibit a mean cosine self-similarity of $0.94 + \delta_{96}$ +where $\delta_{96}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~96 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 97} + +On the \textsc{cal-2026} dataset, PE-array rows of class~97 +exhibit a mean cosine self-similarity of $0.94 + \delta_{97}$ +where $\delta_{97}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~97 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 98} + +On the \textsc{cal-2026} dataset, PE-array rows of class~98 +exhibit a mean cosine self-similarity of $0.94 + \delta_{98}$ +where $\delta_{98}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~98 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 99} + +On the \textsc{cal-2026} dataset, PE-array rows of class~99 +exhibit a mean cosine self-similarity of $0.94 + \delta_{99}$ +where $\delta_{99}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~99 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\subsection{Cycle-saving analysis: row-class 100} + +On the \textsc{cal-2026} dataset, PE-array rows of class~100 +exhibit a mean cosine self-similarity of $0.94 + \delta_{100}$ +where $\delta_{100}$ varies between $-0.02$ and $+0.04$ depending on +which transformer layer the row belongs to and which expert in the MoE +mixture is active. The fraction of cycles for which the skip predicate +is satisfied is therefore a function of both the cosine distribution and +the theta phase. 
Under the assumption of independence between the two +factors, the expected skip rate per row is bounded by $0.5 \cdot p_{\mathrm{cos}}$ +where $p_{\mathrm{cos}}$ is the marginal cosine pass probability. For +row-class~100 this yields a skip rate of approximately $0.23$, which +matches the empirical observation. The fraction of compute cycles +remaining is therefore $1 - 0.23 = 0.77$, and the analytic energy gain +is the inverse ratio: a $1/0.77 \approx 1.30\times$ TOPS/W improvement +relative to Wave~43. + + +\section{Cycle saving} + +\begin{theorem}[Cycle Saving] +\label{thm:108-2-cycle} +Under the empirical skip-rate model $p_{\mathrm{skip}} = 0.23$, the +expected number of PE-array row-cycles per inference cycle is reduced +to a fraction $0.77$ of the Wave~43 INT2-activation baseline (a $23\,\%$ reduction), +while the weight stream and the activation precision remain unchanged. +\end{theorem} + +\begin{proof} +By linearity of expectation across PE-array rows. Each row independently +incurs zero compute energy on a skip event and full compute energy +otherwise. The per-row expected energy is $(1 - 0.23) E_{\mathrm{full}} = +0.77 E_{\mathrm{full}}$. Summing over all rows yields a total energy ratio +of $0.77$. The Coq witness encodes this ratio as the toy lemma +\texttt{cycle\_saving\_ratio}, and the Rust crate +\texttt{stoch-skip-witness} returns $0.77$ from +\texttt{cycles\_remaining\_ratio()}. \qed +\end{proof} + + +\subsection{Falsifiability concern 1} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~1. + + +\subsection{Falsifiability concern 2} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~2. + + +\subsection{Falsifiability concern 3} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~3. + + +\subsection{Falsifiability concern 4} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~4. + + +\subsection{Falsifiability concern 5} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~5. + + +\subsection{Falsifiability concern 6} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~6. + + +\subsection{Falsifiability concern 7} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~7. + + +\subsection{Falsifiability concern 8} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~8. + + +\subsection{Falsifiability concern 9} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~9. + + +\subsection{Falsifiability concern 10} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~10. + + +\subsection{Falsifiability concern 11} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~11. + + +\subsection{Falsifiability concern 12} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~12. + + +\subsection{Falsifiability concern 13} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~13. + + +\subsection{Falsifiability concern 14} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~14. + + +\subsection{Falsifiability concern 15} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~15. + + +\subsection{Falsifiability concern 16} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~16. + + +\subsection{Falsifiability concern 17} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~17. + + +\subsection{Falsifiability concern 18} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~18. + + +\subsection{Falsifiability concern 19} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~19. + + +\subsection{Falsifiability concern 20} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~20. + + +\subsection{Falsifiability concern 21} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~21. + + +\subsection{Falsifiability concern 22} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~22. + + +\subsection{Falsifiability concern 23} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~23. + + +\subsection{Falsifiability concern 24} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~24. + + +\subsection{Falsifiability concern 25} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~25. + + +\subsection{Falsifiability concern 26} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~26. + + +\subsection{Falsifiability concern 27} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~27. + + +\subsection{Falsifiability concern 28} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the theta off-phase probability as $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~28. + + +\subsection{Falsifiability concern 29} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~29. + + +\subsection{Falsifiability concern 30} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~30. + + +\subsection{Falsifiability concern 31} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~31. + + +\subsection{Falsifiability concern 32} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~32. + + +\subsection{Falsifiability concern 33} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~33. + + +\subsection{Falsifiability concern 34} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~34. + + +\subsection{Falsifiability concern 35} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~35. + + +\subsection{Falsifiability concern 36} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~36. + + +\subsection{Falsifiability concern 37} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~37. + + +\subsection{Falsifiability concern 38} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~38. + + +\subsection{Falsifiability concern 39} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~39. + + +\subsection{Falsifiability concern 40} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~40. + + +\subsection{Falsifiability concern 41} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~41. + + +\subsection{Falsifiability concern 42} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~42. + + +\subsection{Falsifiability concern 43} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~43. + + +\subsection{Falsifiability concern 44} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~44. + + +\subsection{Falsifiability concern 45} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~45. + + +\subsection{Falsifiability concern 46} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~46. + + +\subsection{Falsifiability concern 47} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~47. + + +\subsection{Falsifiability concern 48} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~48. + + +\subsection{Falsifiability concern 49} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~49. + + +\subsection{Falsifiability concern 50} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~50. + + +\subsection{Falsifiability concern 51} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~51. + + +\subsection{Falsifiability concern 52} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~52. + + +\subsection{Falsifiability concern 53} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~53. + + +\subsection{Falsifiability concern 54} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~54. + + +\subsection{Falsifiability concern 55} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~55. + + +\subsection{Falsifiability concern 56} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~56. + + +\subsection{Falsifiability concern 57} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~57. + + +\subsection{Falsifiability concern 58} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~58. + + +\subsection{Falsifiability concern 59} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~59. + + +\subsection{Falsifiability concern 60} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~60. + + +\subsection{Falsifiability concern 61} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~61. + + +\subsection{Falsifiability concern 62} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$, taking the off-phase fraction $p_{\mathrm{off}} \approx 0.5$. For $L = 256$ tokens this yields $\sim 7.7$ expected misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~62. + + +\subsection{Falsifiability concern 63} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence.
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~63. + + +\subsection{Falsifiability concern 64} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~64. + + +\subsection{Falsifiability concern 65} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~65. + + +\subsection{Falsifiability concern 66} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~66. + + +\subsection{Falsifiability concern 67} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~67. + + +\subsection{Falsifiability concern 68} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~68. + + +\subsection{Falsifiability concern 69} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~69. + + +\subsection{Falsifiability concern 70} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~70. + + +\subsection{Falsifiability concern 71} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~71. + + +\subsection{Falsifiability concern 72} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~72. + + +\subsection{Falsifiability concern 73} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~73. + + +\subsection{Falsifiability concern 74} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~74. + + +\subsection{Falsifiability concern 75} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~75. + + +\subsection{Falsifiability concern 76} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~76. + + +\subsection{Falsifiability concern 77} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~77. + + +\subsection{Falsifiability concern 78} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~78. + + +\subsection{Falsifiability concern 79} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~79. + + +\subsection{Falsifiability concern 80} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~80. + + +\subsection{Falsifiability concern 81} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~81. + + +\subsection{Falsifiability concern 82} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~82. + + +\subsection{Falsifiability concern 83} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~83. + + +\subsection{Falsifiability concern 84} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~84. + + +\subsection{Falsifiability concern 85} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~85. + + +\subsection{Falsifiability concern 86} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~86. + + +\subsection{Falsifiability concern 87} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~87. + + +\subsection{Falsifiability concern 88} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~88. + + +\subsection{Falsifiability concern 89} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~89. + + +\subsection{Falsifiability concern 90} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~90. + + +\subsection{Falsifiability concern 91} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~91. + + +\subsection{Falsifiability concern 92} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~92. + + +\subsection{Falsifiability concern 93} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~93. + + +\subsection{Falsifiability concern 94} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~94. + + +\subsection{Falsifiability concern 95} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~95. + + +\subsection{Falsifiability concern 96} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$ (taking $p_{\mathrm{off}} \approx 0.5$, since the theta ON and OFF phases are of roughly equal duration). For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two consecutive missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~96. + + +\subsection{Falsifiability concern 97} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~97. + + +\subsection{Falsifiability concern 98} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~98. + + +\subsection{Falsifiability concern 99} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. 
Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~99. + + +\subsection{Falsifiability concern 100} + +A natural concern with stochastic time-skip is the possible accumulation +of accuracy drift across many skipped cycles in a long sequence. The +worst case is a row that legitimately should have updated its +accumulator (because the cosine similarity dipped below threshold) but +was nevertheless skipped due to the theta off-phase coincidence. Under +the calibrated threshold $0.94$, the probability of such a miss is +bounded by $1 - 0.94 = 0.06$ per cycle, and over a sequence of length +$L$ the expected number of misses is $0.06 \cdot L \cdot p_{\mathrm{off}} +\approx 0.03 L$. For $L = 256$ tokens this yields $\sim 7.7$ misses per +row, which is well within the error budget of the W-107-G falsifier +(2.5~pp). Concern level: low. \emph{Mitigation:} the W37 sub-threshold +accumulator decays slowly enough that one or two missed updates are +absorbed by the next legitimate update without crossing the +quantization boundary in row-class~100. 
+ + +\section{Accuracy bound} + +\begin{theorem}[Accuracy Bound under W-107-G] +\label{thm:108-3-accuracy} +Under the assumption R5 of bounded cosine-similarity calibration error +(the \textsc{cal-2026} suite reports a maximum threshold-violation +probability of $0.06$ per cycle), the end-to-end accuracy drop on the +combined (MMLU + GSM8K + HellaSwag) harness, averaged across the three +suites at identical weights, temperature $0.0$, and deterministic seeds, +is bounded by $\Delta \le 2.5$~percentage points. +\end{theorem} + +\begin{proof} +Sketch under R5. The per-cycle threshold-violation probability $0.06$ +combines multiplicatively with the off-phase probability $0.5$ to yield +a per-cycle miss probability of $0.03$. Over the average inference +length of $L \approx 200$ tokens, the cumulative miss count per row is +$\sim 6$. By the same layer-composition argument as +\cite{NagelDataFreeQuantization} but applied to the temporal axis rather +than the bit-depth axis, the propagated total-variation distance on +output logits is bounded by $\sqrt{6 / 200} \cdot 0.158 \approx 0.027$, +which translates to no more than $\sim 2.5$~pp of accuracy drop on the +three-suite harness. The pre-registered witness W-107-G fixes the +falsifier at exactly $2.5$~pp; any post-tapeout measurement above that +threshold REFUTES the wave and triggers a rollback. \qed +\end{proof} + + +\section{Falsification surface} + +The pre-registered witness W-107-G commits the wave to the following +falsifier: + +\begin{quote} +If the three-suite averaged accuracy drop measured on TTIHP27a silicon +at the freeze date 2027-02-15 exceeds $2.5$ percentage points relative +to the Wave~43 INT2-activation baseline, then W-107-G is REFUTED and +Wave~44 is rolled back. Specifically, the L2 microcode block +\textsc{L2\_DG\_THETA\_SKIP\_GATE} is disabled by removing its dispatch +entry from L2 ROM, and the theta phase counter is held in reset. 
+\end{quote} + +\section{Discussion} + +Wave~44 is the third consecutive no-opcode wave (after W42 MoE Routing +and W43 INT2 Activation). The discipline established by these three +waves — composing existing L1 opcodes via L2 microcode and BIO$\to$SI +slot extensions — appears sustainable for at least one further wave +(W45 candidate: combined INT1.58 + theta-skip co-design). The sacred +chain $\mathtt{0xD0..0xEF}$ remains FROZEN under R18. + +\section{Future work} + +A Wave~45 candidate would combine INT1.58 activations (one trit per +neuron, three-level codebook) with theta-skip and would require a new +BIO$\to$SI slot beyond hippocampal-theta-7Hz, likely the +cerebellum-Purkinje-Lugaro circuit~\cite{IfftCerebellumComputation}. +This is left for future work. + +\bibliographystyle{plain}