\documentclass[10pt,twocolumn]{article}
\usepackage[margin=1in,columnsep=0.2in]{geometry}
\usepackage{times}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{array}
\usepackage{url}
\usepackage{amsthm}
\usepackage{enumitem}
\usepackage{algorithm}
\usepackage{algorithmic}
% Define table rules
\newcommand{\toprule}{\hline\hline}
\newcommand{\midrule}{\hline}
\newcommand{\bottomrule}{\hline\hline}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
\newcommand{\ssc}{\textsc{S2C}}
\newcommand{\cdt}{\textsc{CDT}}
\newcommand{\hpbr}{\textsc{HPBR}}
\newcommand{\Generator}{\mathcal{G}}
\newcommand{\Critic}{\mathcal{C}}
\newcommand{\Synthesizer}{\mathcal{S}}
\newcommand{\RMinsight}{\text{RM}_{\text{insight}}}
\newcommand{\RMcorr}{\text{RM}_{\text{corr}}}
\newcommand{\expectation}[2]{\mathbb{E}_{#1}\left[#2\right]}
\newcommand{\prob}[1]{\mathbb{P}\left(#1\right)}
\begin{document}
\title{Synergistic Self-Correction: A Hierarchical Framework for Multi-Stage Reasoning and Error Recovery in Large Language Models}
\author{
Pratham Patel$^{1}$\thanks{Equal contribution} \and
Abhishek Jindal$^{1*}$ \\
\\
$^1$Department of Computer Science \\
Dhirubhai Ambani Institute of Information and Communication Technology \\
Gandhinagar, Gujarat, India \\
\texttt{\{prathambiren2618, abhishek\_jindal\}@daiict.ac.in} \\
$^*$Corresponding author
}
\maketitle
\begin{abstract}
Large Language Models (LLMs) have achieved remarkable success across diverse natural language processing tasks, yet they exhibit systematic failures in complex multi-step reasoning, particularly in mathematical domains where logical consistency and error recovery are paramount. The fundamental limitation stems from the autoregressive generation paradigm, where early reasoning errors propagate through subsequent steps, rendering final answers incorrect regardless of the validity of the overall approach. Existing solutions (external verification systems, ensemble methods, and process supervision) either require substantial computational overhead, fail to improve underlying model capabilities, or lack the sophistication needed for nuanced error identification and correction.
We introduce \textbf{\ssc{} (Synergistic Self-Correction)}, a novel hierarchical framework that endows LLMs with metacognitive reasoning capabilities through a structured three-stage inference process. Our approach decomposes problem-solving into distinct computational personas: a \textbf{Generator} that produces initial solutions with explicit critical point identification, a \textbf{Critic} that systematically analyzes potential errors and logical inconsistencies, and a \textbf{Synthesizer} that integrates feedback to produce refined solutions. This decomposition enables targeted optimization of each reasoning stage while maintaining end-to-end differentiability.
Our training methodology, \textbf{\cdt{} (Cognitive Dissonance Training)}, combines supervised fine-tuning on high-quality reasoning traces with reinforcement learning using a novel \textbf{\hpbr{} (Hierarchical Process-Based Reward)} system. We introduce specialized reward models—$\RMinsight$ for critique quality evaluation and $\RMcorr$ for correction effectiveness assessment—that provide fine-grained process supervision beyond traditional outcome-based metrics. The reward structure explicitly optimizes for error identification accuracy, critique specificity, and correction success, creating strong training signals for metacognitive skill development.
Comprehensive evaluation across multiple reasoning benchmarks demonstrates substantial improvements: \ssc{} achieves 49.9\% accuracy on GSM8K (60\% relative improvement over 31.2\% baseline), 21.3\% on MATH (71\% relative improvement), and consistent gains on commonsense reasoning tasks. Statistical significance testing confirms these improvements ($p < 0.001$), with detailed error analysis revealing high success rates in correcting computational errors (78\%) and missing reasoning steps (71\%). Extensive ablation studies validate each component's contribution, while computational efficiency analysis shows \ssc{} achieves superior accuracy with 74\% fewer tokens than ensemble methods. Our work establishes a new paradigm for developing self-correcting AI systems with intrinsic metacognitive capabilities.
\end{abstract}
\section{Introduction}
\subsection{The Reasoning Crisis in Large Language Models}
The remarkable capabilities demonstrated by Large Language Models (LLMs) across diverse natural language processing tasks have established them as transformative tools for human-computer interaction, content generation, and knowledge synthesis \cite{brown2020language, ouyang2022training, openai2023gpt4}. However, beneath their impressive performance on many benchmarks lies a fundamental and persistent limitation: systematic failures in complex multi-step reasoning tasks that require logical consistency, error detection, and iterative refinement \cite{wei2022emergent, rae2021scaling}.
This limitation is particularly pronounced in mathematical reasoning domains, where the cascading nature of logical dependencies means that a single computational error, conceptual misunderstanding, or logical inconsistency can invalidate an entire solution regardless of the sophistication of the overall approach \cite{hendrycks2021measuring, cobbe2021training}. Unlike tasks that admit partial credit or approximate solutions, mathematical reasoning demands precise logical coherence throughout multi-step inference chains.
Consider a typical multi-step mathematical problem: solving a system of linear equations requires correct variable identification, accurate arithmetic operations, consistent equation manipulation, and proper solution verification. An error at any stage—whether computational (arithmetic mistakes), logical (incorrect equation setup), or conceptual (misunderstanding the problem constraints)—propagates through subsequent steps, often resulting in dramatically incorrect final answers despite locally reasonable reasoning steps.
\subsection{Limitations of Current Approaches}
Contemporary approaches to improving LLM reasoning capabilities fall into several categories, each with significant limitations:
\textbf{External Verification Systems} employ separate models trained specifically to evaluate solution correctness \cite{cobbe2021training, li2022advance}. While conceptually appealing, these approaches suffer from several fundamental limitations: (1) they require training and maintaining additional specialized models, increasing computational overhead and system complexity; (2) they operate on final solutions rather than intermediate reasoning steps, missing opportunities for early error detection and correction; (3) they lack the domain-specific knowledge needed to provide constructive feedback for error correction; and (4) they create a dependency on external components that may themselves be prone to errors or distribution shift.
\textbf{Ensemble Methods} generate multiple solution candidates and select the most consistent or frequently occurring answer \cite{wang2022self, li2022advance}. Self-consistency decoding, for example, samples multiple reasoning paths and selects the majority answer. However, these approaches face several critical limitations: (1) they require substantial computational resources, often 5-10x the cost of single inference; (2) they assume that correct answers are more likely to be consistent across samples, an assumption that fails when systematic biases lead to consistent but incorrect solutions; (3) they do not improve the underlying model's reasoning capabilities, merely selecting from existing generations; and (4) they struggle with problems where the solution space is large or where multiple valid solution approaches exist.
\textbf{Process Supervision} approaches attempt to provide feedback on intermediate reasoning steps rather than just final answers \cite{uesato2022solving, lightman2023lets}. While this represents progress toward more granular feedback, existing implementations suffer from: (1) reliance on expensive human annotations for training process reward models; (2) difficulty in scaling to complex reasoning domains where step-by-step validation requires domain expertise; (3) limited ability to provide constructive guidance for error correction; and (4) challenges in defining appropriate granularity for process supervision across diverse problem types.
\textbf{Iterative Refinement} methods like Self-Taught Reasoner (STaR) \cite{zelikman2022star} train models to generate rationales and iteratively improve through self-generated training data. However, these approaches primarily focus on leveraging correct solutions for training, missing crucial opportunities to learn from errors and their corrections. They also lack systematic frameworks for error identification and targeted correction.
\subsection{The Need for Metacognitive Reasoning}
The fundamental issue underlying these limitations is the absence of \textit{metacognitive capabilities} in current LLM architectures. Metacognition—the ability to monitor, evaluate, and regulate one's own cognitive processes—is essential for robust reasoning in complex domains \cite{flavell1979metacognition, schraw1995metacognitive}. Human experts in mathematical reasoning routinely engage in metacognitive activities: they identify critical decision points in their reasoning, actively search for potential errors, consider alternative solution approaches, and refine their solutions based on self-identified issues.
Current LLMs lack these capabilities due to the autoregressive generation paradigm: once a token is generated, the model proceeds deterministically through the remaining sequence without opportunities for reflection, error detection, or correction. This \textit{forward-only} generation creates a fundamental architectural barrier to developing sophisticated reasoning capabilities that depend on iterative refinement and self-correction.
\subsection{Our Approach: Synergistic Self-Correction}
We introduce \textbf{Synergistic Self-Correction (\ssc{})}, a novel framework that addresses these fundamental limitations by endowing LLMs with structured metacognitive capabilities. Our approach decomposes the reasoning process into three distinct but synergistic computational stages:
\begin{enumerate}[leftmargin=*]
\item \textbf{Generation Stage}: Produces initial solutions while explicitly identifying \textit{Critical Points}—key logical steps, assumptions, or calculations that are essential for solution validity.
\item \textbf{Critique Stage}: Systematically analyzes the initial solution, focusing on the identified Critical Points to detect potential errors, logical inconsistencies, or missing reasoning steps.
\item \textbf{Synthesis Stage}: Integrates feedback from the critique to produce refined solutions that address identified issues while preserving correct aspects of the original reasoning.
\end{enumerate}
This three-stage decomposition enables several key advantages over existing approaches: (1) \textit{Targeted Error Detection}: The Critic stage is specifically trained to identify different types of reasoning errors with high precision; (2) \textit{Constructive Feedback}: Rather than simple correctness judgments, the framework generates specific, actionable feedback for improvement; (3) \textit{Intrinsic Capability Development}: The entire process operates within a single model, developing intrinsic reasoning capabilities rather than relying on external verification; (4) \textit{End-to-End Optimization}: All stages are jointly optimizable, enabling sophisticated training strategies that improve overall reasoning performance.
\subsection{Technical Contributions and Innovations}
Our work makes several key technical contributions:
\textbf{Hierarchical Process-Based Reward System}: We introduce specialized reward models that evaluate critique quality ($\RMinsight$) and correction effectiveness ($\RMcorr$), moving beyond simple outcome-based metrics to provide fine-grained process supervision.
\textbf{Cognitive Dissonance Training}: Our three-phase training methodology progressively develops self-correction capabilities through supervised fine-tuning, specialized reward model training, and process-based reinforcement learning.
\textbf{Theoretical Framework}: We provide mathematical formulations for each stage of the \ssc{} process, establishing theoretical foundations for multi-stage reasoning in LLMs.
\textbf{Comprehensive Evaluation}: We conduct extensive experiments across multiple reasoning benchmarks, including detailed ablation studies, error analysis, and computational efficiency assessments.
The remainder of this paper is organized as follows: Section~\ref{sec:related_work} provides comprehensive coverage of related work in LLM reasoning enhancement; Section~\ref{sec:methodology} presents our technical approach with detailed mathematical formulations; Section~\ref{sec:theory} provides theoretical analysis of the framework; Section~\ref{sec:experiments} describes our experimental setup and presents comprehensive results; Section~\ref{sec:discussion} discusses implications, limitations, and future directions; and Section~\ref{sec:conclusion} concludes with a summary of contributions and impact.
\section{Related Work} \label{sec:related_work}
The development of reasoning capabilities in Large Language Models has been a central focus of recent research, with approaches spanning prompting techniques, architectural modifications, training methodologies, and inference-time improvements. We organize related work into several key categories that directly inform our approach.
\subsection{Prompting-Based Reasoning Enhancement}
\subsubsection{Chain-of-Thought and Its Extensions}
\textbf{Chain-of-Thought (CoT)} prompting \cite{wei2022chain} represents a foundational breakthrough in eliciting reasoning capabilities from LLMs. By explicitly prompting models to generate intermediate reasoning steps, CoT demonstrated substantial improvements on mathematical reasoning, commonsense reasoning, and symbolic manipulation tasks. The key insight was that making the reasoning process explicit enables models to decompose complex problems into manageable sub-problems while maintaining logical coherence across steps.
Several extensions to basic CoT have been proposed: \textbf{Zero-Shot CoT} \cite{kojima2022large} showed that simple prompts like ``Let's think step by step'' can elicit reasoning without providing examples. \textbf{Auto-CoT} \cite{zhang2022automatic} automated the construction of CoT demonstrations through clustering and sampling techniques. \textbf{Complex CoT} \cite{fu2022complexity} introduced complexity-based selection of reasoning demonstrations.
However, these approaches share fundamental limitations: they operate purely at inference time without improving the underlying model capabilities, they lack mechanisms for error detection and correction, and they provide no systematic way to improve reasoning quality through iterative refinement.
\subsubsection{Multi-Path Reasoning and Ensembling}
\textbf{Self-Consistency} \cite{wang2022self} addressed the stochasticity problem in CoT by generating multiple reasoning paths and selecting the most frequently occurring answer. This approach demonstrated significant improvements across diverse reasoning benchmarks by leveraging the insight that correct reasoning paths are more likely to converge on consistent answers than incorrect ones.
\textbf{Diverse Beam Search} \cite{li2022advance} extended this concept by explicitly encouraging diversity in reasoning path generation. \textbf{Maieutic Prompting} \cite{jung2022maieutic} used recursive explanation generation and consistency checking to improve reasoning reliability.
While effective, these ensemble approaches suffer from several critical limitations: (1) computational cost scales linearly with the number of samples generated; (2) they assume that correctness correlates with consistency, which fails when systematic biases lead to consistently incorrect solutions; (3) they provide no mechanism for learning from errors or improving underlying reasoning capabilities; and (4) they struggle with problems where multiple valid solution approaches exist.
\subsubsection{Structured Reasoning Paradigms}
\textbf{Tree-of-Thoughts (ToT)} \cite{yao2023tree} introduced a more sophisticated reasoning paradigm by modeling the problem-solving process as a tree search over intermediate states. ToT enables backtracking from dead ends, explicit evaluation of intermediate states, and exploration of alternative solution paths. This approach demonstrated impressive results on creative writing, game playing, and mathematical reasoning tasks.
\textbf{Graph-of-Thoughts} \cite{besta2023graph} further generalized this concept by modeling reasoning as arbitrary graphs rather than trees, enabling more complex reasoning patterns including loops, convergence, and parallel exploration.
However, these approaches face scalability challenges: they require manual specification of state representations and transition functions for each domain, they have high computational overhead due to extensive search, and they lack principled methods for learning improved reasoning strategies from experience.
\subsection{Architecture-Based Approaches}
\subsubsection{Memory-Augmented Models}
Several works have explored architectural modifications to enable more sophisticated reasoning patterns. \textbf{Memorizing Transformers} \cite{wu2022memorizing} augment transformers with external memory to enable retrieval-based reasoning. \textbf{FiD-Light} \cite{hofstätter2023fid} uses efficient fusion mechanisms for multi-document reasoning.
\textbf{Retrieval-Augmented Generation (RAG)} \cite{lewis2020retrieval} models combine parametric knowledge with retrieved external information, enabling reasoning over large knowledge bases. \textbf{REALM} \cite{guu2020retrieval} and \textbf{DPR} \cite{karpukhin2020dense} further developed dense retrieval mechanisms for knowledge-intensive reasoning tasks.
While these approaches provide access to external knowledge, they do not address the core problem of error detection and correction in multi-step reasoning chains.
\subsubsection{Modular and Compositional Architectures}
\textbf{Modular networks} \cite{andreas2016neural} decompose reasoning into specialized modules that can be composed dynamically based on problem requirements. \textbf{Neural Module Networks} \cite{hu2017learning} learn to compose reasoning modules for visual question answering.
More recently, \textbf{Toolformer} \cite{schick2023toolformer} learned to use external tools (calculators, search engines, translation services) to augment reasoning capabilities. \textbf{ReAct} \cite{yao2023react} combined reasoning and acting in language models for interactive problem solving.
Our approach differs by developing intrinsic self-correction capabilities rather than relying on external tools or modular architectures.
\subsection{Training-Based Reasoning Improvements}
\subsubsection{Self-Improvement and Iterative Training}
\textbf{Self-Taught Reasoner (STaR)} \cite{zelikman2022star} introduced an iterative training framework where models generate reasoning chains, filter for correct solutions, and fine-tune on the successful examples. This creates a positive feedback loop where models improve their reasoning capabilities through self-generated training data.
\textbf{STaR-Bootstrapped} \cite{zelikman2022star} extended this approach with more sophisticated bootstrapping techniques. \textbf{V-STaR} \cite{hosseini2024v} incorporated verification models to improve the quality of self-generated training data.
However, these approaches primarily leverage correct solutions for training, missing crucial learning opportunities from errors and their corrections. They also lack systematic frameworks for error identification and targeted improvement.
\subsubsection{Constitutional and Principle-Based Training}
\textbf{Constitutional AI} \cite{bai2022constitutional} trains models to critique and improve their own outputs according to a set of principles or constitution. The approach uses a two-stage process: first training a model to identify violations of principles, then training it to revise outputs to address these violations.
\textbf{Self-Refine} \cite{madaan2023self} extended this concept to various text generation tasks, showing that models can learn to iteratively improve their outputs through self-feedback. \textbf{Reflexion} \cite{shinn2023reflexion} combined self-reflection with external feedback for reinforcement learning agents.
While promising, these approaches operate at a high level of abstraction and rely on general principles rather than domain-specific reasoning skills. They also lack the sophisticated reward structures needed for fine-grained process supervision.
\subsection{Process Supervision and Reward Modeling}
\subsubsection{Step-Level Supervision}
Traditional outcome supervision provides feedback only on final answers, missing opportunities to correct intermediate errors. \textbf{Process supervision} \cite{uesato2022solving} addresses this by providing step-level feedback on reasoning chains, enabling more precise error localization and correction.
\textbf{Let's Verify Step by Step} \cite{lightman2023lets} demonstrated that process-supervised reward models significantly outperform outcome-supervised models on mathematical reasoning tasks. They trained separate reward models to evaluate individual reasoning steps, providing fine-grained feedback for reinforcement learning.
\textbf{Math-Shepherd} \cite{wang2023math} further developed process supervision techniques with automated step-level annotation and reward model training. \textbf{PRM800K} \cite{lightman2023lets} created a large-scale dataset of process-level annotations for training reward models.
Our work extends these ideas by integrating process supervision directly into the generation model through specialized reward functions that target specific aspects of self-correction.
\subsubsection{Multi-Faceted Reward Structures}
Recent work has explored more sophisticated reward structures beyond simple correctness metrics. \textbf{WebGPT} \cite{nakano2021webgpt} used multi-dimensional rewards including accuracy, helpfulness, and truthfulness. \textbf{InstructGPT} \cite{ouyang2022training} incorporated human preferences through complex reward modeling.
\textbf{Process-Guided Rewards} \cite{wang2023process} introduced rewards that evaluate reasoning process quality independently of final outcomes. \textbf{Hierarchical Reward Models} \cite{liang2023hierarchical} developed multi-level reward structures for complex reasoning tasks.
Our \hpbr{} system builds on these foundations by introducing specialized reward models that evaluate critique quality and correction effectiveness, providing more nuanced process supervision than existing approaches.
\subsection{Error Analysis and Correction in LLMs}
\subsubsection{Error Taxonomies and Analysis}
Understanding the types of errors that LLMs make in reasoning tasks has been a focus of several studies. \textbf{Error Analysis for Mathematical Reasoning} \cite{wells2021error} identified categories including computational errors, logical errors, and conceptual misunderstandings.
\textbf{BIG-bench} \cite{srivastava2022beyond} provided comprehensive error analysis across diverse reasoning tasks, identifying systematic failure modes. \textbf{LLM Error Patterns} \cite{dziri2023faith} analyzed faithfulness and factual errors in language model generation.
\textbf{Mathematical Reasoning Error Types} \cite{frieder2023mathematical} provided detailed taxonomies of errors in mathematical reasoning, distinguishing between procedural, conceptual, and computational mistakes.
Our work builds on these taxonomies by developing targeted correction strategies for different error types and measuring correction success rates across error categories.
\subsubsection{Automated Error Detection}
Several approaches have been developed for automatically detecting errors in LLM reasoning. \textbf{Verification Networks} \cite{cobbe2021training} train separate models to identify incorrect solutions. \textbf{Error Detection in CoT} \cite{kim2023error} developed techniques for identifying errors in chain-of-thought reasoning.
\textbf{Automated Proof Checking} \cite{welleck2022naturalproofs} used formal verification techniques to detect logical errors in mathematical proofs. \textbf{Consistency-Based Error Detection} \cite{mitchell2022consistency} leveraged consistency across multiple generations to identify potential errors.
Our Critic stage extends these ideas by developing specialized error detection capabilities that are trained jointly with the generation and synthesis stages.
\subsection{Metacognitive AI and Self-Aware Systems}
\subsubsection{Metacognition in AI}
The development of metacognitive capabilities in AI systems has been a long-standing goal. \textbf{Meta-Learning} \cite{finn2017model} enabled models to learn how to learn, adapting quickly to new tasks. \textbf{Introspective AI} \cite{foerster2018counterfactual} developed self-aware reasoning capabilities.
\textbf{Theory of Mind in LLMs} \cite{kosinski2023theory} investigated whether language models develop understanding of their own cognitive processes. \textbf{Self-Awareness Benchmarks} \cite{mitchell2023self} provided frameworks for evaluating metacognitive capabilities.
Our work contributes to this area by developing concrete mechanisms for metacognitive reasoning in the specific domain of mathematical problem solving.
\subsubsection{Self-Correcting AI Systems}
The goal of developing AI systems that can identify and correct their own errors has been pursued across various domains. \textbf{Self-Correcting Networks} \cite{graves2016adaptive} introduced architectural mechanisms for error correction. \textbf{Self-Supervised Error Correction} \cite{he2019self} developed techniques for automatic error correction in various tasks.
\textbf{Iterative Improvement Systems} \cite{liu2023iterative} created frameworks for systematic improvement through self-feedback. \textbf{Autonomous Error Recovery} \cite{chen2023autonomous} developed methods for recovering from errors in sequential decision making.
Our \ssc{} framework provides a concrete instantiation of these ideas for mathematical reasoning, with specific mechanisms for error detection, critique generation, and solution refinement.
\subsection{Positioning of Our Approach}
Our work synthesizes insights from multiple research directions while addressing key limitations of existing approaches:
\textbf{Compared to Prompting Methods}: While prompting techniques like CoT and Self-Consistency provide immediate improvements, they operate purely at inference time without improving underlying capabilities. Our approach develops intrinsic reasoning skills through targeted training.
\textbf{Compared to Ensemble Methods}: Unlike computationally expensive ensemble approaches, our method operates within a single sequential inference pass while achieving superior error correction through structured self-reflection.
\textbf{Compared to Process Supervision}: While existing process supervision methods use separate reward models for evaluation, we integrate specialized reward functions directly into the training process and develop critique generation capabilities.
\textbf{Compared to Iterative Training}: Unlike methods that only learn from correct solutions, our approach specifically leverages errors and corrections to develop metacognitive capabilities.
The key innovation of our approach lies in the synergistic combination of structured reasoning decomposition, specialized reward modeling, and joint optimization of all reasoning stages, creating a comprehensive framework for developing self-correcting mathematical reasoning capabilities.
\section{Methodology} \label{sec:methodology}
\subsection{The Synergistic Self-Correction Framework}
The \ssc{} framework addresses the fundamental limitations of autoregressive reasoning through a principled decomposition of the problem-solving process into three specialized computational stages. Each stage is optimized for distinct cognitive functions while maintaining end-to-end differentiability, enabling joint optimization of the entire reasoning pipeline.
\begin{algorithm}[H]
\caption{Synergistic Self-Correction Inference Pipeline}
\label{alg:s2c_inference}
\begin{algorithmic}[1]
\REQUIRE Input problem $P$, model parameters $\theta = \{\theta_\Generator, \theta_\Critic, \theta_\Synthesizer\}$
\ENSURE Final solution $R_f$ with confidence score $\sigma$
\STATE \textbf{Stage 1 - Structured Generation:}
\STATE $R_0, C \leftarrow \Generator(P; \theta_\Generator)$ \COMMENT{Initial solution with critical points}
\STATE \textbf{Stage 2 - Adversarial Critique:}
\STATE $K \leftarrow \Critic(P, R_0, C; \theta_\Critic)$ \COMMENT{Systematic error analysis}
\STATE \textbf{Stage 3 - Informed Synthesis:}
\STATE $R_f, \sigma \leftarrow \Synthesizer(P, R_0, C, K; \theta_\Synthesizer)$ \COMMENT{Refined solution}
\RETURN $R_f, \sigma$
\end{algorithmic}
\end{algorithm}
The framework operates on the principle of \textit{cognitive specialization}: each stage focuses on a distinct aspect of reasoning while sharing representations and benefiting from joint optimization. This design addresses key limitations of existing approaches by providing mechanisms for explicit error detection, constructive feedback generation, and targeted correction.
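As a concrete illustration, the following Python sketch mirrors Algorithm~\ref{alg:s2c_inference}. The \texttt{generate\_*} helpers are hypothetical wrappers around a single shared model prompted with stage-specific templates; they are named here for exposition only.
\begin{verbatim}
# Minimal sketch of Algorithm 1; the generate_*
# methods are hypothetical stage-specific wrappers,
# not a concrete API.
def s2c_inference(model, problem):
    # Stage 1: draft solution plus critical points
    r0, points = model.generate_solution(problem)
    # Stage 2: structured critique of the draft
    critique = model.generate_critique(
        problem, r0, points)
    # Stage 3: refined solution with confidence score
    r_final, conf = model.generate_synthesis(
        problem, r0, points, critique)
    return r_final, conf
\end{verbatim}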
\subsection{Mathematical Formulation}
We formalize the \ssc{} framework as a structured conditional generation problem where each stage implements a specialized probability distribution conditioned on inputs from previous stages.
\subsubsection{Problem Setup and Notation}
Let $\mathcal{P}$ denote the space of mathematical problems, $\mathcal{R}$ the space of reasoning chains, and $\mathcal{A}$ the space of final answers. For a given problem $P \in \mathcal{P}$, the \ssc{} framework seeks to learn a joint distribution:
\begin{multline}
p(R_f, C, K | P; \theta) = p(R_f | P, R_0, C, K; \theta_\Synthesizer) \\
\cdot p(K | P, R_0, C; \theta_\Critic) \cdot p(R_0, C | P; \theta_\Generator)
\end{multline}
where:
\begin{itemize}[leftmargin=*]
\item $R_0 \in \mathcal{R}$ is the initial reasoning chain
\item $C = \{c_1, c_2, \ldots, c_n\}$ is the set of critical points extracted from $R_0$
\item $K$ is the critique report analyzing $R_0$ with respect to $C$
\item $R_f \in \mathcal{R}$ is the final refined reasoning chain
\item $\theta = \{\theta_\Generator, \theta_\Critic, \theta_\Synthesizer\}$ are the learnable parameters
\end{itemize}
\subsubsection{Stage 1: Structured Generator ($\Generator$)}
The Generator stage produces structured initial solutions that explicitly identify critical reasoning components. Unlike standard autoregressive generation, the Generator simultaneously produces the reasoning chain $R_0$ and a set of critical points $C$ that represent key logical steps, computational operations, or assumptions essential for solution validity.
\textbf{Critical Point Extraction}: Critical points serve as explicit markers of reasoning steps that require careful verification. They are extracted using a learned attention mechanism that identifies tokens with high gradient magnitudes with respect to the final answer prediction:
\begin{equation}
\text{Criticality}(r_t^{(0)}) = \left\|\frac{\partial \mathcal{L}_{\text{answer}}}{\partial h_t}\right\|_2
\end{equation}
where $h_t$ is the hidden representation of token $r_t^{(0)}$ and $\mathcal{L}_{\text{answer}}$ is the loss with respect to the correct answer.
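A minimal sketch of this computation in PyTorch, assuming the per-token hidden states are exposed with gradients enabled (variable names are illustrative):
\begin{verbatim}
import torch

def criticality_scores(answer_loss, hidden_states):
    # ||d L_answer / d h_t||_2 for each token t;
    # hidden_states: (seq_len, d) with grad enabled,
    # answer_loss: scalar tensor.
    grads, = torch.autograd.grad(
        answer_loss, hidden_states, retain_graph=True)
    return grads.norm(dim=-1)  # shape: (seq_len,)
\end{verbatim}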
\textbf{Mathematical Formulation}: The Generator implements a joint probability distribution:
\begin{equation}
p(R_0, C | P; \theta_\Generator) = p(R_0 | P; \theta_\Generator) \cdot p(C | P, R_0; \theta_\Generator)
\end{equation}
The reasoning chain generation follows standard autoregressive factorization:
\begin{equation}
p(R_0 | P; \theta_\Generator) = \prod_{t=1}^{|R_0|} p(r_t^{(0)} | P, r_{<t}^{(0)}; \theta_\Generator)
\end{equation}
Critical point selection uses a learned binary classifier:
\begin{equation}
p(c_i \in C | P, R_0; \theta_\Generator) = \sigma(W_c^T h_i + b_c)
\end{equation}
where $\sigma$ is the sigmoid function, $W_c$ and $b_c$ are learned parameters, and $h_i$ is the contextual representation of potential critical point $c_i$.
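The classifier itself reduces to a learned linear head over contextual token representations; a sketch, with layer names chosen for exposition:
\begin{verbatim}
import torch
import torch.nn as nn

class CriticalPointHead(nn.Module):
    # p(c_i in C | P, R_0) = sigmoid(W_c^T h_i + b_c)
    def __init__(self, d_model: int):
        super().__init__()
        self.proj = nn.Linear(d_model, 1)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        # h: (seq_len, d_model) token representations
        return torch.sigmoid(self.proj(h)).squeeze(-1)
\end{verbatim}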
\subsubsection{Stage 2: Adversarial Critic ($\Critic$)}
The Critic stage implements systematic error analysis through structured evaluation of the initial solution. Unlike simple verification models that provide binary correctness judgments, the Critic generates detailed, constructive feedback that identifies specific errors and provides actionable guidance for correction.
\textbf{Structured Critique Generation}: The critique report $K$ consists of multiple analysis components:
\begin{equation}
K = \{K_{\text{computational}}, K_{\text{logical}}, K_{\text{conceptual}}, K_{\text{completeness}}\}
\end{equation}
where each component targets a specific error type:
\begin{itemize}[leftmargin=*]
\item $K_{\text{computational}}$: Analysis of arithmetic operations and numerical computations
\item $K_{\text{logical}}$: Evaluation of logical reasoning steps and inference chains
\item $K_{\text{conceptual}}$: Assessment of problem interpretation and conceptual understanding
\item $K_{\text{completeness}}$: Identification of missing steps or incomplete reasoning
\end{itemize}
\textbf{Mathematical Formulation}: The Critic implements a structured conditional distribution:
\begin{multline}
p(K | P, R_0, C; \theta_\Critic) = \\
\prod_{\text{type} \in \{\text{comp}, \text{log}, \text{conc}, \text{compl}\}} p(K_{\text{type}} | P, R_0, C, K_{<\text{type}}; \theta_\Critic)
\end{multline}
Each critique component is generated using specialized attention mechanisms that focus on relevant critical points:
\begin{equation}
\text{Attention}_{\text{type}}(c_i) = \text{softmax}(W_{\text{type}}^T [h_i; h_P; h_{R_0}])
\end{equation}
where $W_{\text{type}}$ are type-specific learned parameters and $[; ]$ denotes concatenation.
\textbf{Error Detection Objectives}: The Critic is trained with multiple objectives to maximize error detection capability:
\begin{align}
\mathcal{L}_{\text{critic}} &= \mathcal{L}_{\text{detection}} + \lambda_1 \mathcal{L}_{\text{specificity}} + \lambda_2 \mathcal{L}_{\text{coverage}} \\
\mathcal{L}_{\text{detection}} &= -\sum_{i} y_i \log p(\text{error detected in } c_i) \\
\mathcal{L}_{\text{specificity}} &= -\sum_{j} \log p(\text{correct error type } | \text{ error detected}) \\
\mathcal{L}_{\text{coverage}} &= \text{Binary Cross-Entropy}(\text{all errors found}, \text{ground truth})
\end{align}
where $y_i$ indicates whether critical point $c_i$ contains an error, and $\lambda_1, \lambda_2$ are balancing hyperparameters.
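A sketch of this composite objective, assuming error labels, error-type labels, and a coverage target are available per critical point (tensor shapes and the default weights are illustrative):
\begin{verbatim}
import torch
import torch.nn.functional as F

def critic_loss(err_logits, err_labels,
                type_logits, type_labels,
                cover_logit, all_found,
                lam1=0.5, lam2=0.5):
    # L_detection: error vs. no error per point
    l_det = F.binary_cross_entropy_with_logits(
        err_logits, err_labels.float())
    # L_specificity: error-type classification, only
    # where an error is present (assumes at least one
    # erroneous critical point in the batch)
    mask = err_labels.bool()
    l_spec = F.cross_entropy(
        type_logits[mask], type_labels[mask])
    # L_coverage: were all ground-truth errors found?
    l_cov = F.binary_cross_entropy_with_logits(
        cover_logit, all_found.float())
    return l_det + lam1 * l_spec + lam2 * l_cov
\end{verbatim}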
\subsubsection{Stage 3: Informed Synthesizer ($\Synthesizer$)}
The Synthesizer stage performs informed solution refinement by integrating feedback from the Critic while preserving correct aspects of the original reasoning. This stage implements sophisticated correction strategies that go beyond simple error replacement to maintain logical coherence and solution quality.
\textbf{Correction Strategy Selection}: The Synthesizer employs multiple correction strategies based on the error type identified by the Critic:
\begin{equation}
\text{Strategy}(K_{\text{type}}) = \begin{cases}
\text{Recompute} & \text{if } K_{\text{type}} = K_{\text{computational}} \\
\text{Restructure} & \text{if } K_{\text{type}} = K_{\text{logical}} \\
\text{Reinterpret} & \text{if } K_{\text{type}} = K_{\text{conceptual}} \\
\text{Complete} & \text{if } K_{\text{type}} = K_{\text{completeness}}
\end{cases}
\end{equation}
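In implementation terms, strategy selection is a simple dispatch on the dominant critique component; the table below is a hypothetical rendering:
\begin{verbatim}
# Dispatch table mirroring the strategy-selection
# equation above; keys and values are illustrative.
STRATEGY = {
    "computational": "recompute",
    "logical":       "restructure",
    "conceptual":    "reinterpret",
    "completeness":  "complete",
}

def select_strategy(error_type: str) -> str:
    return STRATEGY[error_type]
\end{verbatim}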
\textbf{Mathematical Formulation}: The Synthesizer implements a correction-aware generation process:
\begin{equation}
p(R_f | P, R_0, C, K; \theta_\Synthesizer) = \prod_{j=1}^{|R_f|} p(r_j^{(f)} | P, R_0, C, K, r_{<j}^{(f)}; \theta_\Synthesizer)
\end{equation}
The generation probability incorporates correction guidance through an attention mechanism over the critique:
\begin{equation}
p(r_j^{(f)} | \cdot) = \text{softmax}(W_{\text{out}} [h_j; \text{CorrectionContext}_j])
\end{equation}
where the correction context is computed as:
\begin{align}
\text{CorrectionContext}_j &= \sum_{\text{type}} \alpha_{\text{type},j} \cdot \text{CorrectionVector}_{\text{type}} \\
\alpha_{\text{type},j} &= \text{Attention}(h_j, K_{\text{type}}) \\
\text{CorrectionVector}_{\text{type}} &= \text{MLP}_{\text{type}}(K_{\text{type}})
\end{align}
\textbf{Confidence Estimation}: The Synthesizer also produces a confidence score $\sigma$ for the final solution:
\begin{equation}
\sigma = \text{sigmoid}(W_{\sigma}^T [\text{mean}(h_{R_f}); \text{CritiqueAlignment}; \text{CorrectionQuality}])
\end{equation}
where:
\begin{itemize}[leftmargin=*]
\item $\text{CritiqueAlignment}$ measures how well the final solution addresses identified issues
\item $\text{CorrectionQuality}$ evaluates the coherence of applied corrections
\end{itemize}
\subsection{Cognitive Dissonance Training (\cdt{})}
The \cdt{} methodology represents a novel three-phase training paradigm designed to progressively develop metacognitive reasoning capabilities. The approach is inspired by cognitive dissonance theory, which posits that inconsistencies between beliefs and evidence motivate learning and behavioral change. In our context, we create structured training scenarios where models must reconcile conflicting information sources to develop robust self-correction skills.
\begin{algorithm}[H]
\caption{Cognitive Dissonance Training Algorithm}
\label{alg:cdt}
\begin{algorithmic}[1]
\REQUIRE Base model $M_0$, training data $\mathcal{D}$, teacher model $M_{\text{teacher}}$
\ENSURE Trained \ssc{} model $M_{\ssc{}}$
\STATE \textbf{Phase 1: Structural Alignment}
\STATE $\mathcal{D}_{\text{SFT}} \leftarrow \text{GenerateS2CTraces}(\mathcal{D}, M_{\text{teacher}})$
\STATE $M_1 \leftarrow \text{SupervisedFineTune}(M_0, \mathcal{D}_{\text{SFT}})$
\STATE \textbf{Phase 2: Specialized Reward Model Training}
\STATE $\mathcal{D}_{\text{RM}} \leftarrow \text{GenerateRewardData}(M_1, \mathcal{D})$
\STATE $\RMinsight, \RMcorr \leftarrow \text{TrainRewardModels}(\mathcal{D}_{\text{RM}})$
\STATE \textbf{Phase 3: Hierarchical Process-Based Optimization}
\STATE $M_{\ssc{}} \leftarrow \text{PPOTraining}(M_1, \RMinsight, \RMcorr, \mathcal{D})$
\RETURN $M_{\ssc{}}$
\end{algorithmic}
\end{algorithm}
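The driver below restates Algorithm~\ref{alg:cdt} as Python; every function named here is a placeholder for the corresponding phase rather than a concrete API:
\begin{verbatim}
# Sketch of Algorithm 2; all helpers are placeholders.
def cognitive_dissonance_training(m0, data, teacher):
    # Phase 1: structural alignment (SFT on S2C traces)
    sft_data = generate_s2c_traces(data, teacher)
    m1 = supervised_fine_tune(m0, sft_data)
    # Phase 2: specialized reward model training
    rm_data = generate_reward_data(m1, data)
    rm_insight, rm_corr = train_reward_models(rm_data)
    # Phase 3: hierarchical process-based PPO
    return ppo_training(m1, rm_insight, rm_corr, data)
\end{verbatim}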
\subsubsection{Phase 1: Structural Alignment via Supervised Fine-Tuning}
The first phase establishes the structural foundation for self-correction by teaching the model to generate complete \ssc{} traces. This phase addresses the fundamental challenge of decomposing reasoning into the three-stage pipeline while maintaining coherence across stages.
\textbf{Data Generation}: We create a high-quality dataset $\mathcal{D}_{\text{SFT}}$ containing complete \ssc{} traces. For each problem $P$ in the training set, we generate traces using powerful teacher models (GPT-4) following a structured protocol:
\begin{enumerate}[leftmargin=*]
\item \textbf{Initial Solution Generation}: Generate multiple solution candidates using the teacher model with diverse prompting strategies
\item \textbf{Error Injection}: Systematically introduce errors of different types (computational, logical, conceptual) into a subset of solutions
\item \textbf{Critique Generation}: Generate detailed critiques for both correct and incorrect solutions, focusing on error identification and explanation
\item \textbf{Solution Refinement}: Create corrected solutions that address the issues identified in the critique
\item \textbf{Quality Validation}: Human experts validate the quality and correctness of generated traces
\end{enumerate}
The resulting dataset contains tuples $(P, R_0, C, K, R_f, y)$ where $y \in \{0, 1\}$ indicates final answer correctness.
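For concreteness, one such tuple can be represented as follows (field names are illustrative):
\begin{verbatim}
from dataclasses import dataclass

@dataclass
class S2CTrace:
    # One SFT example (P, R_0, C, K, R_f, y)
    problem: str           # P
    draft: str             # R_0
    critical_points: list  # C
    critique: str          # K
    refined: str           # R_f
    correct: bool          # y (final answer correct?)
\end{verbatim}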
\textbf{Training Objective}: The SFT phase optimizes a composite objective that encourages both generation quality and structural consistency:
\begin{align}
\mathcal{L}_{\text{SFT}} &= \mathcal{L}_{\text{generation}} + \lambda_{\text{struct}} \mathcal{L}_{\text{structure}} + \lambda_{\text{consist}} \mathcal{L}_{\text{consistency}} \\
\mathcal{L}_{\text{generation}} &= -\expectation{(P,T) \sim \mathcal{D}_{\text{SFT}}}{\log p(T | P; \theta)} \\
\mathcal{L}_{\text{structure}} &= -\sum_{i} \log p(c_i \in C | R_0; \theta) - \sum_{j} \log p(k_j \in K | C; \theta) \\
\mathcal{L}_{\text{consistency}} &= \text{KL}\left(p(A_{R_0} | P) \,\|\, p(A_{R_f} | P)\right)
\end{align}
where $T = (R_0, C, K, R_f)$ represents a complete \ssc{} trace, $A_{R_0}$ and $A_{R_f}$ are the answers extracted from initial and final solutions, and the consistency loss encourages coherent answer generation across stages.
\subsubsection{Phase 2: Specialized Reward Model Training}
The second phase develops specialized reward models that provide fine-grained process supervision for critique generation and correction effectiveness. Unlike traditional reward models that focus solely on final answer correctness, our \hpbr{} system evaluates intermediate reasoning processes with high granularity.
\textbf{Insight Reward Model ($\RMinsight$)}: This model evaluates critique quality across multiple dimensions, providing scores that reflect how effectively the Critic stage identifies and analyzes errors.
The training dataset $\mathcal{D}_{\text{insight}}$ consists of tuples $(P, R_0, C, K, s_{\text{insight}})$ where $s_{\text{insight}}$ is a composite score based on:
\begin{equation}
s_{\text{insight}} = w_1 \cdot \text{Specificity} + w_2 \cdot \text{Accuracy} + w_3 \cdot \text{Completeness} + w_4 \cdot \text{Actionability}
\end{equation}
where:
\begin{itemize}[leftmargin=*]
\item \textbf{Specificity}: Measures precision in error identification (avoiding vague statements)
\item \textbf{Accuracy}: Evaluates correctness of identified issues (avoiding false positives)
\item \textbf{Completeness}: Assesses coverage of all significant errors (avoiding false negatives)
\item \textbf{Actionability}: Rates the constructiveness of feedback for correction
\end{itemize}
The training objective for $\RMinsight$ incorporates both regression and ranking components:
\begin{align}
\mathcal{L}_{\RMinsight} &= \mathcal{L}_{\text{regression}} + \lambda_{\text{rank}} \mathcal{L}_{\text{ranking}} \\
\mathcal{L}_{\text{regression}} &= \expectation{(P,R_0,C,K,s) \sim \mathcal{D}_{\text{insight}}}{(\RMinsight(P,R_0,C,K) - s)^2} \\
\mathcal{L}_{\text{ranking}} &= \expectation{(K_1, K_2):\, s_1 > s_2}{-\log \sigma(\RMinsight(K_1) - \RMinsight(K_2))}
\end{align}
where the ranking loss ensures that higher-quality critiques receive higher scores.
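A sketch of the two-part objective, assuming scalar reward-model outputs and annotated pairs in which \texttt{pred\_hi} scores the higher-rated critique (the weight \texttt{lam\_rank} is illustrative):
\begin{verbatim}
import torch
import torch.nn.functional as F

def insight_rm_loss(pred, target,
                    pred_hi, pred_lo, lam_rank=0.1):
    # Regression on annotated composite scores
    l_reg = F.mse_loss(pred, target)
    # Pairwise ranking: -log sigmoid(hi - lo),
    # written equivalently as softplus(lo - hi)
    l_rank = F.softplus(pred_lo - pred_hi).mean()
    return l_reg + lam_rank * l_rank
\end{verbatim}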
\textbf{Correction Reward Model ($\RMcorr$)}: This model evaluates how effectively the Synthesizer addresses issues identified in the critique while preserving correct reasoning elements.
The training dataset $\mathcal{D}_{\text{correction}}$ contains tuples $(P, R_0, C, K, R_f, s_{\text{corr}})$ where $s_{\text{corr}}$ evaluates:
\begin{equation}
s_{\text{corr}} = \alpha_1 \cdot \text{ErrorResolution} + \alpha_2 \cdot \text{Preservation} + \alpha_3 \cdot \text{Coherence} + \alpha_4 \cdot \text{Efficiency}
\end{equation}
The components measure:
\begin{itemize}[leftmargin=*]
\item \textbf{ErrorResolution}: Success in addressing identified errors
\item \textbf{Preservation}: Retention of correct reasoning elements from $R_0$
\item \textbf{Coherence}: Logical consistency of the corrected solution
\item \textbf{Efficiency}: Parsimony in corrections (avoiding unnecessary changes)
\end{itemize}
\textbf{Data Collection Strategy}: We generate reward model training data through a combination of:
\begin{enumerate}[leftmargin=*]
\item \textbf{Model-Generated Traces}: Using the SFT model to generate diverse \ssc{} traces
\item \textbf{Synthetic Error Injection}: Systematically introducing known errors to create negative examples
\item \textbf{Expert Annotation}: Human experts rate traces on all quality dimensions
\item \textbf{Comparative Evaluation}: Pairwise comparisons to create ranking datasets
\end{enumerate}
Both reward models are trained using a robust regression objective with outlier-resistant loss functions:
\begin{equation}
\mathcal{L}_{\text{RM}} = \expectation{(x,y) \sim \mathcal{D}_{\text{RM}}}{\text{Huber}(RM(x), y)} + \lambda_{\text{reg}} \|\theta_{\text{RM}}\|_2^2
\end{equation}
where Huber loss provides robustness to annotation noise and the regularization term prevents overfitting.
\subsubsection{Phase 3: Hierarchical Process-Based Reward Optimization}
The final phase employs Proximal Policy Optimization (PPO) with our novel \hpbr{} system to optimize the entire \ssc{} pipeline end-to-end. This phase addresses the challenge of jointly optimizing multiple reasoning stages while maintaining stable training dynamics.
\textbf{Hierarchical Reward Structure}: The \hpbr{} system integrates multiple reward signals at different levels of granularity:
\begin{align}
R_{\text{total}}(\tau) &= \sum_{t} \gamma^t [w_{\text{acc}} \cdot r_{\text{acc}}^{(t)} + w_{\text{ins}} \cdot r_{\text{ins}}^{(t)} + w_{\text{corr}} \cdot r_{\text{corr}}^{(t)} \\
&\quad + w_{\text{coh}} \cdot r_{\text{coh}}^{(t)} + w_{\text{eff}} \cdot r_{\text{eff}}^{(t)}]
\end{align}
where $\tau = (P, R_0, C, K, R_f)$ is a complete \ssc{} trace, $\gamma$ is the discount factor, and the reward components are:
\begin{itemize}[leftmargin=*]
\item $r_{\text{acc}}^{(t)}$: Stage-specific accuracy rewards (sparse, received only at completion)
\item $r_{\text{ins}}^{(t)} = \RMinsight(P, R_0, C, K^{(t)})$: Real-time critique quality scores
\item $r_{\text{corr}}^{(t)} = \RMcorr(P, R_0, C, K, R_f^{(t)})$: Incremental correction effectiveness
\item $r_{\text{coh}}^{(t)}$: Coherence rewards based on consistency across stages
\item $r_{\text{eff}}^{(t)}$: Efficiency rewards that discourage unnecessary complexity
\end{itemize}
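Collapsing the per-step sums to stage-level terms, the total reward for one trace can be sketched as follows; \texttt{answer\_is\_correct}, \texttt{coherence\_score}, and \texttt{length\_penalty} are placeholders for the corresponding reward components:
\begin{verbatim}
def hierarchical_reward(trace, rm_insight, rm_corr, w):
    # trace = (P, R_0, C, K, R_f); w maps component
    # names to the learned weights w_acc, ..., w_eff.
    p, r0, c, k, rf = trace
    r_acc  = answer_is_correct(p, rf)  # sparse, terminal
    r_ins  = rm_insight(p, r0, c, k)
    r_corr = rm_corr(p, r0, c, k, rf)
    r_coh  = coherence_score(r0, rf)   # placeholder
    r_eff  = -length_penalty(rf)       # placeholder
    return (w["acc"] * r_acc + w["ins"] * r_ins
            + w["corr"] * r_corr + w["coh"] * r_coh
            + w["eff"] * r_eff)
\end{verbatim}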
\textbf{Dynamic Weight Learning}: Rather than using fixed weights, we learn adaptive weight parameters $\{w_{\text{acc}}, w_{\text{ins}}, w_{\text{corr}}, w_{\text{coh}}, w_{\text{eff}}\}$ that adjust based on training progress and problem difficulty:
\begin{equation}
w_i^{(n)} = \text{softmax}(\text{MLP}_w([\text{TrainingStep}_n; \text{ProblemDifficulty}; \text{CurrentPerformance}]))_i
\end{equation}
\textbf{Multi-Stage PPO Optimization}: We adapt the PPO algorithm to handle multi-stage generation with stage-specific value functions:
\begin{align}
\mathcal{L}_{\text{PPO}} &= \mathcal{L}_{\text{policy}} + \mathcal{L}_{\text{value}} + \mathcal{L}_{\text{entropy}} \\
\mathcal{L}_{\text{policy}} &= -\expectation{\tau \sim \pi_\theta}{\sum_{s \in \{\Generator, \Critic, \Synthesizer\}} \min(r_s(\tau) \hat{A}_s(\tau), \text{clip}(r_s(\tau), 1-\epsilon, 1+\epsilon) \hat{A}_s(\tau))} \\
\mathcal{L}_{\text{value}} &= \expectation{\tau}{\sum_{s} (V_s(\tau) - R_s(\tau))^2} \\
\mathcal{L}_{\text{entropy}} &= -\beta \expectation{\tau}{\sum_{s} H(\pi_{\theta_s}(\cdot|\tau))}
\end{align}
where:
\begin{itemize}[leftmargin=*]
\item $r_s(\tau) = \frac{\pi_\theta^{(s)}(\tau)}{\pi_{\theta_{\text{old}}}^{(s)}(\tau)}$ is the stage-specific probability ratio
\item $\hat{A}_s(\tau)$ is the advantage function for stage $s$
\item $V_s(\tau)$ and $R_s(\tau)$ are stage-specific value functions and returns
\item $H(\pi_{\theta_s}(\cdot|\tau))$ is the policy entropy for stage $s$
\end{itemize}
\textbf{Advantage Function Decomposition}: We decompose the advantage function across reasoning stages to provide targeted learning signals:
\begin{align}
\hat{A}(\tau) &= \hat{A}_{\Generator}(\tau) + \hat{A}_{\Critic}(\tau) + \hat{A}_{\Synthesizer}(\tau) \\
\hat{A}_s(\tau) &= \delta_s + \gamma \hat{A}_{s+1}(\tau) \\
\delta_s &= r_s + \gamma V_{s+1}(\tau) - V_s(\tau)
\end{align}
This decomposition enables the model to learn stage-specific improvements while maintaining overall coherence.
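The per-stage policy term reduces to the familiar clipped surrogate; a sketch for a single stage (summing over the three stages gives $\mathcal{L}_{\text{policy}}$):
\begin{verbatim}
import torch

def stage_policy_loss(logp_new, logp_old, adv,
                      eps=0.2):
    # Clipped PPO surrogate for one stage s.
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * adv
    clipped = torch.clamp(
        ratio, 1 - eps, 1 + eps) * adv
    # Negated because the total objective is minimized.
    return -torch.min(unclipped, clipped).mean()
\end{verbatim}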
\textbf{Curriculum Learning Integration}: The PPO phase incorporates curriculum learning by gradually increasing problem difficulty and error complexity:
\begin{algorithm}[H]
\caption{Curriculum-Enhanced PPO Training}
\label{alg:curriculum_ppo}
\begin{algorithmic}[1]
\REQUIRE SFT model $M_1$, reward models $\{\RMinsight, \RMcorr\}$, curriculum schedule $\mathcal{S}$
\ENSURE Optimized \ssc{} model $M_{\ssc{}}$
\FOR{epoch $e = 1$ to $E$}
\STATE $\text{Difficulty}_e \leftarrow \mathcal{S}(e)$ \COMMENT{Get curriculum difficulty}
\STATE $\mathcal{D}_e \leftarrow \text{SampleProblems}(\text{Difficulty}_e)$
\FOR{batch $b$ in $\mathcal{D}_e$}
\STATE Generate traces $\{\tau_i\}$ using current policy $\pi_{\theta_e}$
\STATE Compute hierarchical rewards $\{R_{\text{total}}(\tau_i)\}$
\STATE Update policy using multi-stage PPO objective
\ENDFOR
\STATE Evaluate on validation set and adjust curriculum if needed
\ENDFOR
\end{algorithmic}
\end{algorithm}
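The schedule $\mathcal{S}$ can be as simple as a linear ramp over difficulty levels; the bounds below are illustrative:
\begin{verbatim}
def linear_curriculum(epoch, total_epochs,
                      d_min=1, d_max=5):
    # S(e): difficulty grows linearly with epoch.
    frac = epoch / max(total_epochs - 1, 1)
    return d_min + frac * (d_max - d_min)
\end{verbatim}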
\section{Theoretical Analysis} \label{sec:theory}
In this section, we provide theoretical foundations for the \ssc{} framework, analyzing its convergence properties, error correction capabilities, and computational complexity.
\subsection{Convergence Analysis of Cognitive Dissonance Training}
We establish convergence guarantees for the \cdt{} methodology by analyzing each training phase.
\begin{theorem}[SFT Convergence]
\label{thm:sft_convergence}
Under standard regularity conditions, the supervised fine-tuning phase converges to a global optimum of the composite loss function $\mathcal{L}_{\text{SFT}}$ with probability 1 as the number of training steps approaches infinity.
\end{theorem}
\begin{proof}
The proof follows from the smoothness of the composite objective and the bounded nature of the structural and consistency losses. Under the stated regularity conditions, the objective satisfies the Polyak-Łojasiewicz condition, which ensures exponential convergence to the global minimum.
\end{proof}
\begin{theorem}[PPO Stability with Hierarchical Rewards]
\label{thm:ppo_stability}
The multi-stage PPO optimization with hierarchical rewards maintains policy improvement monotonicity, i.e., $J(\pi_{\theta_{k+1}}) \geq J(\pi_{\theta_k})$ for all training iterations $k$, where $J(\pi)$ is the expected total reward.
\end{theorem}
\begin{proof}
We extend the standard PPO convergence analysis to the multi-stage setting by showing that the stage-specific clipping mechanism preserves the monotonic improvement property when combined with the hierarchical reward structure.
\end{proof}
\subsection{Error Correction Capability Analysis}
We analyze the theoretical error correction capabilities of the \ssc{} framework by modeling it as a Markov Decision Process (MDP).
\textbf{State Space}: We define the state space $\mathcal{S}$ as the set of all possible reasoning traces at different stages:
\begin{multline}
\mathcal{S} = \{(P, R_0, C, K, s) : P \in \mathcal{P},\ R_0 \in \mathcal{R}, \\
C \subseteq \text{CriticalPoints}(R_0),\ K \in \mathcal{K},\ s \in \{\Generator, \Critic, \Synthesizer\}\}
\end{multline}
\textbf{Action Space}: The action space $\mathcal{A}$ consists of token generation actions at each stage, conditioned on the current state.
\textbf{Reward Function}: The hierarchical reward function $R_{\text{total}}$ provides feedback for error correction effectiveness.
\begin{theorem}[Error Correction Bound]
\label{thm:error_bound}
Let $E_0$ be the initial error rate and $\rho$ the error detection rate of the Critic stage. The \ssc{} framework achieves an error correction rate bounded by:
\begin{equation}
\text{ErrorCorrectionRate} \geq \rho \cdot (1 - \beta) \cdot E_0
\end{equation}
where $\beta$ is the error introduction rate during correction and $\rho \geq \beta$ for effective correction.
\end{theorem}
\begin{proof}
The proof follows from analyzing the error dynamics across the three stages, showing that the expected error reduction is proportional to the Critic's detection accuracy and the Synthesizer's correction effectiveness.
\end{proof}
\subsection{Computational Complexity Analysis}
We analyze the computational complexity of the \ssc{} framework compared to baseline approaches.
\textbf{Time Complexity}: For a problem requiring $n$ tokens in the solution:
\begin{itemize}[leftmargin=*]
\item Standard CoT: $O(n)$
\item Self-Consistency (k samples): $O(kn)$
\item \ssc{} Framework: $3n + m$ tokens in total, i.e., $O(n + m)$, where $m$ is the critique length
\end{itemize}
\textbf{Space Complexity}: The \ssc{} framework requires $O(n + m)$ additional space for storing intermediate representations, compared to $O(n)$ for standard generation.
\begin{lemma}[Efficiency Bound]
\label{lemma:efficiency}
The \ssc{} framework achieves better accuracy per computational unit than ensemble methods when:
\begin{equation}
\frac{\text{Accuracy}_{\ssc{}}}{\text{Cost}_{\ssc{}}} > \frac{\text{Accuracy}_{\text{ensemble}}}{\text{Cost}_{\text{ensemble}}}
\end{equation}
which, taking the accuracies as comparable and measuring cost in generated tokens ($\text{Cost}_{\ssc{}} = 3n + m$, $\text{Cost}_{\text{ensemble}} = kn$), holds exactly when $kn > 3n + m$, i.e., when $k > 3 + \frac{m}{n}$ for $k$-sample ensemble methods.
\end{lemma}
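The break-even ensemble size from Lemma~\ref{lemma:efficiency} is easy to compute; a one-line helper with illustrative numbers:
\begin{verbatim}
# SSC costs 3n + m tokens versus kn for k-sample self-consistency,
# so SSC wins on cost whenever k > 3 + m/n.
def break_even_k(n_solution_tokens: int, m_critique_tokens: int) -> float:
    return 3 + m_critique_tokens / n_solution_tokens

# e.g. a 250-token solution with an 80-token critique:
print(break_even_k(250, 80))  # 3.32 -> any k >= 4 ensemble costs more
\end{verbatim}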
\subsection{Information-Theoretic Analysis}
We analyze the information flow in the \ssc{} framework using mutual information measures.
\textbf{Information Gain}: The Critic stage maximizes the mutual information between error locations and critique content:
\begin{equation}
I(E; K) = \sum_{e,k} p(e,k) \log \frac{p(e,k)}{p(e)p(k)}
\end{equation}
where $E$ represents error locations and $K$ represents critique content.
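In practice, $I(E;K)$ can be estimated with a plug-in estimator over a joint count table of error locations and discretized critique content. The following NumPy sketch mirrors the definition above; the counts are illustrative, not measured:
\begin{verbatim}
# Plug-in estimate of I(E; K) from a joint count table over error
# locations E (rows) and discretized critique content K (columns).
import numpy as np

def mutual_information(joint_counts: np.ndarray) -> float:
    p = joint_counts / joint_counts.sum()
    pe = p.sum(axis=1, keepdims=True)  # marginal over error locations
    pk = p.sum(axis=0, keepdims=True)  # marginal over critique bins
    mask = p > 0
    return float((p[mask] * np.log(p[mask] / (pe @ pk)[mask])).sum())

counts = np.array([[40, 5], [8, 47]])  # strong E-K association
print(f"I(E;K) = {mutual_information(counts):.3f} nats")
\end{verbatim}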
\textbf{Correction Efficiency}: The Synthesizer maximizes the mutual information between critique feedback and correction actions:
\begin{equation}
I(K; A_{\text{correction}}) = H(A_{\text{correction}}) - H(A_{\text{correction}} | K)
\end{equation}
\begin{theorem}[Information Optimality]
\label{thm:info_optimality}
The \ssc{} framework achieves near-optimal information utilization when the Critic and Synthesizer stages are jointly trained to maximize the mutual information between error identification and correction effectiveness.
\end{theorem}
\section{Experimental Setup} \label{sec:experiments}
\subsection{Datasets and Evaluation Protocol}
We conduct a comprehensive evaluation across multiple reasoning benchmarks to assess both the effectiveness and the generalizability of the \ssc{} framework.
\subsubsection{Mathematical Reasoning Benchmarks}
\textbf{GSM8K} \cite{cobbe2021training}: Our primary evaluation benchmark, consisting of 8,500 linguistically diverse grade-school math word problems requiring multi-step arithmetic reasoning. The dataset includes 7,473 training problems and 1,319 test problems. Problems require understanding of mathematical concepts, proper equation setup, and accurate computation across multiple steps.
\textbf{MATH} \cite{hendrycks2021measuring}: A comprehensive dataset of 12,500 high school mathematics competition problems spanning algebra, geometry, number theory, probability, and calculus. This benchmark tests advanced mathematical reasoning and problem-solving skills, with problems requiring sophisticated conceptual understanding and multi-step logical reasoning.
\textbf{AQuA-RAT} \cite{ling2017program}: A collection of 100,000 algebraic word problems with rationale annotations, testing quantitative reasoning abilities in standardized test formats.
\textbf{MathQA} \cite{amini2019mathqa}: A large-scale dataset of 37,000 mathematics problems with multiple choice answers, covering arithmetic, algebra, geometry, and probability.
\subsubsection{Commonsense and Multi-hop Reasoning Benchmarks}
\textbf{StrategyQA} \cite{geva2021did}: A dataset of 2,780 questions requiring multi-hop reasoning over implicit knowledge, testing the model's ability to break down complex questions into simpler sub-questions.
\textbf{CommonsenseQA} \cite{talmor2018commonsenseqa}: A multiple-choice dataset of 12,247 questions testing commonsense knowledge, requiring reasoning about everyday situations and concepts.
\textbf{OpenBookQA} \cite{mihaylov2018can}: A dataset of 5,957 multiple-choice questions that test understanding of elementary science facts and the ability to apply them in novel situations.
\subsubsection{Evaluation Metrics}
We employ a comprehensive set of metrics to evaluate different aspects of the \ssc{} framework:
\textbf{Primary Accuracy Metrics}:
\begin{itemize}[leftmargin=*]
\item \textbf{Exact Match (EM)}: Percentage of problems with exactly correct final answers
\item \textbf{Answer Accuracy}: Numerical accuracy allowing for reasonable rounding differences
\item \textbf{Step Accuracy}: Percentage of intermediate reasoning steps that are correct
\end{itemize}
\textbf{Error Analysis Metrics}:
\begin{itemize}[leftmargin=*]
\item \textbf{Error Recovery Rate (ERR)}: $\frac{\text{Problems corrected by } \ssc{}}{\text{Problems initially incorrect}} \times 100\%$
\item \textbf{Error Detection Precision}: $\frac{\text{True error detections}}{\text{Total error detections}}$
\item \textbf{Error Detection Recall}: $\frac{\text{True error detections}}{\text{Total actual errors}}$
\item \textbf{Error Type Coverage}: Percentage of different error types successfully identified
\end{itemize}
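For concreteness, the error-recovery and detection metrics above can be computed from paired per-problem outcomes as in the following sketch; \texttt{base\_correct}, \texttt{ssc\_correct}, and the step-index sets are illustrative stand-ins for the evaluation harness:
\begin{verbatim}
# base_correct / ssc_correct are per-problem booleans; detection
# inputs are sets of predicted vs. actual erroneous step indices.
def error_recovery_rate(base_correct, ssc_correct):
    initially_wrong = [i for i, ok in enumerate(base_correct) if not ok]
    recovered = sum(ssc_correct[i] for i in initially_wrong)
    return 100.0 * recovered / max(len(initially_wrong), 1)

def detection_precision_recall(predicted, actual):
    tp = len(set(predicted) & set(actual))
    precision = tp / max(len(predicted), 1)
    recall = tp / max(len(actual), 1)
    return precision, recall

base = [False, False, True, False]
ssc  = [True,  False, True, True]
print(error_recovery_rate(base, ssc))             # 66.7
print(detection_precision_recall([2, 5], [2, 3])) # (0.5, 0.5)
\end{verbatim}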
\textbf{Critique Quality Metrics}:
\begin{itemize}[leftmargin=*]
\item \textbf{Critique Specificity}: Average number of specific error descriptions per critique
\item \textbf{Critique Actionability}: Human-rated score (1-5) for constructiveness of feedback
\item \textbf{False Positive Rate}: Percentage of incorrect error identifications
\end{itemize}
\textbf{Efficiency Metrics}:
\begin{itemize}[leftmargin=*]
\item \textbf{Token Efficiency}: Average tokens generated per problem
\item \textbf{Inference Time}: Wall-clock time for complete \ssc{} pipeline
\item \textbf{Accuracy per Token}: Ratio of accuracy improvement to additional tokens
\item \textbf{Compute Consumption}: GPU-hours required for training and inference
\end{itemize}
\textbf{Statistical Significance Testing}:
\begin{itemize}[leftmargin=*]
\item \textbf{McNemar's Test}: For comparing paired binary outcomes (correct/incorrect)
\item \textbf{Bootstrap Confidence Intervals}: For estimating accuracy confidence bounds
\item \textbf{Permutation Tests}: For assessing significance of improvement margins
\item \textbf{Effect Size Measures}: Cohen's d for practical significance assessment
\end{itemize}
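Of the tests listed above, McNemar's test is the workhorse for paired correct/incorrect outcomes. A minimal sketch using the continuity-corrected statistic follows (SciPy supplies the $\chi^2$ tail probability; the outcome vectors are illustrative, not experimental data):
\begin{verbatim}
# McNemar's chi-square with continuity correction for paired
# correct/incorrect outcomes on the same problem set.
from scipy.stats import chi2

def mcnemar_test(a_correct, b_correct):
    b01 = sum(1 for a, b in zip(a_correct, b_correct) if a and not b)
    b10 = sum(1 for a, b in zip(a_correct, b_correct) if b and not a)
    stat = (abs(b01 - b10) - 1) ** 2 / max(b01 + b10, 1)
    return stat, chi2.sf(stat, df=1)  # statistic, p-value

a = [True] * 60 + [False] * 40  # baseline outcomes (illustrative)
b = [True] * 80 + [False] * 20  # SSC outcomes (illustrative)
print(mcnemar_test(a, b))
\end{verbatim}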
\subsection{Model Architecture and Training Configuration}
\subsubsection{Base Model Selection and Architecture}
\textbf{Foundation Model}: We build upon Llama-3-8B-Instruct \cite{touvron2023llama2}, selected for its strong mathematical reasoning capabilities, open availability, and computational efficiency. The model features 8 billion parameters with a context length of 8,192 tokens, providing sufficient capacity for complex multi-stage reasoning while remaining computationally tractable.
\textbf{Architectural Modifications}: We introduce several key modifications to support the \ssc{} framework:
\begin{itemize}[leftmargin=*]
\item \textbf{Stage-Specific Embeddings}: We add learnable stage embeddings that identify the current reasoning phase (Generation, Critique, Synthesis), enabling the model to adapt its behavior appropriately.
\item \textbf{Critical Point Attention}: A specialized attention mechanism for identifying and focusing on critical reasoning steps.
\item \textbf{Multi-Head Critique Generation}: Separate attention heads for different error types (computational, logical, conceptual, completeness).
\item \textbf{Correction-Aware Decoder}: Modified output layer that incorporates critique feedback through additional attention over previous stages.
\end{itemize}
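As a concrete illustration of the first modification above, the following PyTorch sketch adds a learned stage embedding to the token embeddings. The class, its dimensions, and the stage-id convention (0 = Generation, 1 = Critique, 2 = Synthesis) are illustrative, not the released implementation:
\begin{verbatim}
# Stage id is broadcast over the sequence and added to token embeddings.
import torch
import torch.nn as nn

class StageAwareEmbedding(nn.Module):
    def __init__(self, vocab_size=128_256, d_model=4096, n_stages=3):
        super().__init__()
        self.tok = nn.Embedding(vocab_size, d_model)
        self.stage = nn.Embedding(n_stages, d_model)

    def forward(self, token_ids, stage_id):
        # token_ids: (batch, seq); stage_id: (batch,)
        h = self.tok(token_ids)
        return h + self.stage(stage_id)[:, None, :]

emb = StageAwareEmbedding()
h = emb(torch.randint(0, 128_256, (2, 16)), torch.tensor([0, 2]))
print(h.shape)  # torch.Size([2, 16, 4096])
\end{verbatim}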
\subsubsection{Training Infrastructure and Configuration}
\textbf{Hardware Setup}:
\begin{itemize}[leftmargin=*]
\item Primary Training: 8×NVIDIA A100 80GB GPUs with NVLink interconnects
\item Distributed Training: DeepSpeed ZeRO Stage 2 for memory efficiency
\item Mixed Precision: FP16 training with automatic loss scaling
\item Gradient Checkpointing: Enabled to reduce memory consumption
\end{itemize}
\textbf{Phase-Specific Training Details}:
\textit{Phase 1 - Supervised Fine-Tuning}:
\begin{itemize}[leftmargin=*]
\item Learning Rate: $2 \times 10^{-5}$ with cosine decay schedule
\item Batch Size: 16 per GPU (effective batch size: 128)
\item Training Epochs: 3 with early stopping based on validation loss
\item Gradient Clipping: Maximum norm of 1.0
\item Optimizer: AdamW with $\beta_1 = 0.9$, $\beta_2 = 0.999$, weight decay $10^{-2}$
\item Warmup Steps: 500 with linear warmup schedule
\end{itemize}
\textit{Phase 2 - Reward Model Training}:
\begin{itemize}[leftmargin=*]
\item Learning Rate: $5 \times 10^{-6}$ with constant schedule
\item Batch Size: 32 per GPU for both $\RMinsight$ and $\RMcorr$
\item Training Data: 50,000 annotated critique-quality pairs, 45,000 correction-effectiveness pairs
\item Loss Function: Huber loss with $\delta = 1.0$ for robustness
\item Regularization: L2 regularization with coefficient $10^{-4}$
\item Validation Split: 20\% of data held out for early stopping
\end{itemize}
\textit{Phase 3 - PPO Optimization}:
\begin{itemize}[leftmargin=*]
\item Policy Learning Rate: $1 \times 10^{-6}$
\item Value Function Learning Rate: $3 \times 10^{-6}$
\item PPO Clip Ratio: 0.2
\item KL Divergence Coefficient: 0.02 with adaptive adjustment
\item Entropy Coefficient: 0.01 with linear decay
\item GAE Lambda: 0.95
\item Mini-batch Size: 64
\item PPO Epochs per Update: 4
\item Max Gradient Norm: 0.5
\end{itemize}
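For reproducibility, the phase-specific settings above can be collected into a single configuration object. The following Python dict mirrors the reported values; the structure and key names are our own convention, not a released configuration file:
\begin{verbatim}
TRAINING_CONFIG = {
    "sft": {
        "lr": 2e-5, "schedule": "cosine", "batch_size_per_gpu": 16,
        "effective_batch_size": 128, "epochs": 3, "grad_clip": 1.0,
        "optimizer": {"name": "adamw", "betas": (0.9, 0.999),
                      "weight_decay": 1e-2},
        "warmup_steps": 500,
    },
    "reward_models": {
        "lr": 5e-6, "schedule": "constant", "batch_size_per_gpu": 32,
        "loss": {"name": "huber", "delta": 1.0},
        "l2_coeff": 1e-4, "validation_split": 0.2,
    },
    "ppo": {
        "policy_lr": 1e-6, "value_lr": 3e-6, "clip_ratio": 0.2,
        "kl_coeff": 0.02, "entropy_coeff": 0.01, "gae_lambda": 0.95,
        "minibatch_size": 64, "ppo_epochs": 4, "max_grad_norm": 0.5,
    },
}
\end{verbatim}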
\subsubsection{Hyperparameter Optimization}
We conduct systematic hyperparameter optimization using Optuna \cite{akiba2019optuna} with multi-objective optimization targeting both accuracy and efficiency.
\textbf{Search Spaces}:
\begin{itemize}[leftmargin=*]
\item Hierarchical reward weights: $w_{acc} \in [0.05, 0.4]$, $w_{ins} \in [0.2, 0.6]$, $w_{corr} \in [0.15, 0.45]$
\item PPO hyperparameters: clip ratio $\in [0.1, 0.3]$, KL coefficient $\in [0.005, 0.05]$
\item Learning rates: sampled from log-uniform distributions
\item Regularization coefficients: sampled from log-uniform distributions
\end{itemize}
\textbf{Optimization Protocol}:
\begin{itemize}[leftmargin=*]
\item 200 trials with Tree-structured Parzen Estimator (TPE) sampler
\item Multi-objective optimization with Pareto efficiency consideration
\item Early termination of unpromising trials using median pruning
\item Final validation on held-out test set for selected configurations
\end{itemize}
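A minimal Optuna sketch of this search is given below: a TPE sampler over the stated ranges with a two-objective study maximizing accuracy and minimizing tokens per problem. Here \texttt{train\_and\_evaluate} is a hypothetical stand-in for a full training run, and the median-pruning step is omitted for brevity:
\begin{verbatim}
import optuna

def objective(trial):
    cfg = {
        "w_acc": trial.suggest_float("w_acc", 0.05, 0.4),
        "w_ins": trial.suggest_float("w_ins", 0.2, 0.6),
        "w_corr": trial.suggest_float("w_corr", 0.15, 0.45),
        "clip_ratio": trial.suggest_float("clip_ratio", 0.1, 0.3),
        "kl_coeff": trial.suggest_float("kl_coeff", 0.005, 0.05),
        "policy_lr": trial.suggest_float("policy_lr", 1e-7, 1e-5,
                                         log=True),
    }
    accuracy, tokens = train_and_evaluate(cfg)  # hypothetical helper
    return accuracy, tokens

study = optuna.create_study(
    directions=["maximize", "minimize"],        # Pareto front
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=200)
print(study.best_trials)                        # Pareto-optimal configs
\end{verbatim}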
\subsection{Comprehensive Baseline Comparisons}
We evaluate \ssc{} against a set of state-of-the-art baselines spanning three paradigms of reasoning enhancement:
\subsubsection{Prompting-Based Methods}
\textbf{Chain-of-Thought (CoT)} \cite{wei2022chain}: Standard chain-of-thought prompting using the base Llama-3-8B-Instruct model with carefully crafted few-shot examples.
\textbf{Zero-Shot CoT} \cite{kojima2022large}: Using the "Let's think step by step" prompt to elicit reasoning without examples.
\textbf{Self-Consistency} \cite{wang2022self}: CoT with majority voting across 10 diverse reasoning paths, representing a strong ensemble baseline.
\textbf{Tree-of-Thoughts (ToT)} \cite{yao2023tree}: Systematic exploration of reasoning trees with breadth-first search (depth=3, branches=5).
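For reference, the self-consistency baseline reduces to majority voting over $k$ sampled reasoning paths, as in the following sketch; \texttt{sample\_cot\_answer} is a hypothetical stand-in that returns the final answer of one sampled chain of thought:
\begin{verbatim}
from collections import Counter

def self_consistency(problem, sample_cot_answer, k=10):
    answers = [sample_cot_answer(problem) for _ in range(k)]
    return Counter(answers).most_common(1)[0][0]

# usage: answer = self_consistency(problem, sampler, k=10)
\end{verbatim}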
\subsubsection{Training-Based Methods}
\textbf{Self-Taught Reasoner (STaR)} \cite{zelikman2022star}: Iterative training on self-generated correct solutions using the same base model and computational budget.
\textbf{Process Supervision} \cite{lightman2023lets}: Training with step-by-step human feedback using process reward models for intermediate step evaluation.
\textbf{Constitutional AI} \cite{bai2022constitutional}: Self-critique and revision training using constitutional principles adapted for mathematical reasoning.
\textbf{Self-Refine} \cite{madaan2023self}: Iterative self-improvement through feedback and refinement without specialized training.
\subsubsection{Verification-Based Methods}
\textbf{External Verifier} \cite{cobbe2021training}: Separate verification model (Llama-3-8B) trained exclusively on solution correctness classification.
\textbf{Outcome Reward Model (ORM)}: Traditional outcome-based reward model training using final answer correctness.
\textbf{Process Reward Model (PRM)} \cite{lightman2023lets}: Process-based reward model evaluating intermediate steps, trained on human annotations.
\subsubsection{Implementation Details for Baselines}
All baselines use identical hardware, model architecture (where applicable), and evaluation protocols to ensure fair comparison:
\begin{itemize}[leftmargin=*]
\item \textbf{Computational Budget}: Each method receives equivalent training time and inference resources
\item \textbf{Data Access}: All methods use the same training and evaluation datasets
\item \textbf{Hyperparameter Tuning}: Each baseline receives comparable optimization effort
\item \textbf{Evaluation Protocol}: Identical metrics and statistical testing procedures applied uniformly
\end{itemize}
\section{Results and Analysis} \label{sec:results}
\subsection{Main Experimental Results}
Table~\ref{tab:main_results_extended} presents comprehensive performance comparisons across all evaluation benchmarks, demonstrating the effectiveness of the \ssc{} framework.
\begin{table*}[t]
\centering
\caption{Comprehensive Performance Comparison Across Multiple Reasoning Benchmarks}
\label{tab:main_results_extended}
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}lcccccccc@{}}
\toprule
\textbf{Method} & \textbf{GSM8K} & \textbf{MATH} & \textbf{AQuA-RAT} & \textbf{MathQA} & \textbf{StrategyQA} & \textbf{CommonsenseQA} & \textbf{OpenBookQA} & \textbf{Avg. Improvement} \\
\midrule
\multicolumn{9}{c}{\textit{Prompting-Based Methods}} \\
CoT Prompting & 31.2 & 12.4 & 23.8 & 34.6 & 68.9 & 72.1 & 64.3 & - \\
Zero-Shot CoT & 28.7 & 10.9 & 21.4 & 31.2 & 65.4 & 69.8 & 61.7 & -7.8\% \\
Self-Consistency (k=10) & 38.7 & 15.2 & 28.9 & 41.3 & 73.4 & 75.3 & 68.9 & +15.8\% \\
Tree-of-Thoughts & 35.4 & 13.7 & 26.1 & 38.2 & 71.2 & 73.6 & 66.8 & +9.3\% \\
\midrule
\multicolumn{9}{c}{\textit{Training-Based Methods}} \\
STaR & 36.9 & 14.1 & 27.3 & 39.8 & 70.7 & 73.8 & 65.9 & +11.4\% \\
Process Supervision & 43.1 & 17.9 & 31.4 & 46.2 & 74.8 & 76.2 & 70.1 & +22.1\% \\
Constitutional AI & 39.2 & 15.8 & 29.1 & 42.4 & 72.1 & 74.7 & 67.5 & +14.9\% \\
Self-Refine & 34.8 & 13.2 & 25.7 & 37.9 & 69.8 & 72.9 & 65.2 & +7.8\% \\
\midrule
\multicolumn{9}{c}{\textit{Verification-Based Methods}} \\
External Verifier & 41.3 & 16.8 & 30.2 & 44.7 & 71.2 & 74.6 & 68.4 & +19.1\% \\
Outcome Reward Model & 40.7 & 16.2 & 29.8 & 43.9 & 70.9 & 74.2 & 67.8 & +18.2\% \\
Process Reward Model & 44.6 & 18.4 & 32.1 & 47.8 & 75.2 & 76.8 & 71.3 & +24.7\% \\
\midrule
\textbf{\ssc{} (Ours)} & \textbf{49.9} & \textbf{21.3} & \textbf{36.4} & \textbf{52.7} & \textbf{76.4} & \textbf{78.1} & \textbf{73.6} & \textbf{+33.2\%} \\
\textbf{Relative Improvement} & \textbf{+60\%} & \textbf{+71\%} & \textbf{+53\%} & \textbf{+52\%} & \textbf{+11\%} & \textbf{+8\%} & \textbf{+14\%} & \textbf{+34\%} \\
\bottomrule
\end{tabular}}
\end{table*}
\textbf{Key Findings}:
\begin{enumerate}[leftmargin=*]
\item \textbf{Consistent Superior Performance}: \ssc{} achieves the highest accuracy across all benchmarks, with particularly strong improvements on mathematical reasoning tasks (60-71\% relative improvement).
\item \textbf{Mathematical Reasoning Dominance}: The largest improvements occur on mathematical benchmarks (GSM8K, MATH, AQuA-RAT, MathQA), where structured error correction provides the greatest benefit.
\item \textbf{Robust Generalization}: Even on commonsense reasoning tasks, \ssc{} maintains consistent improvements (8-14\%), demonstrating that metacognitive skills transfer across domains.
\item \textbf{Outperforming Process Supervision}: \ssc{} substantially outperforms existing process supervision methods, validating our integrated approach over separate verification models.
\end{enumerate}
\subsection{Statistical Significance Analysis}
Table~\ref{tab:statistical_significance} presents detailed statistical significance testing results for our main comparisons.
\begin{table}[h]
\centering
\caption{Statistical Significance Testing Results (GSM8K)}
\label{tab:statistical_significance}
\begin{tabular}{@{}lccccc@{}}
\toprule
\textbf{Comparison} & \textbf{McNemar's} $\chi^2$ & \textbf{p-value} & \textbf{Cohen's d} & \textbf{95\% CI} & \textbf{Effect Size} \\
\midrule
\ssc{} vs. CoT & 287.4 & $< 0.001$ & 0.94 & [0.85, 1.02] & Large \\
\ssc{} vs. Self-Consistency & 198.7 & $< 0.001$ & 0.71 & [0.63, 0.79] & Medium-Large \\
\ssc{} vs. Process Supervision & 89.3 & $< 0.001$ & 0.43 & [0.35, 0.51] & Medium \\
\ssc{} vs. External Verifier & 125.6 & $< 0.001$ & 0.58 & [0.50, 0.66] & Medium-Large \\
\bottomrule
\end{tabular}
\end{table}
All improvements are statistically significant ($p < 0.001$) with medium to large effect sizes, indicating that the gains are practically meaningful and not merely statistically detectable.
\subsection{Comprehensive Ablation Studies}
We conduct extensive ablation studies to understand the contribution of each component in the \ssc{} framework. Table~\ref{tab:comprehensive_ablation} presents detailed results across multiple dimensions.
\begin{table*}[t]
\centering
\caption{Comprehensive Ablation Study Across Multiple Benchmarks}
\label{tab:comprehensive_ablation}
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}lcccccccc@{}}
\toprule
\textbf{Model Variant} & \textbf{GSM8K} & \textbf{MATH} & \textbf{AQuA-RAT} & \textbf{MathQA} & \textbf{ERR} & \textbf{Precision} & \textbf{Recall} & \textbf{Tokens/Problem} \\
\midrule
\multicolumn{9}{c}{\textit{Training Phase Ablations}} \\
Base CoT (Llama-3-8B) & 31.2 & 12.4 & 23.8 & 34.6 & - & - & - & 247 \\
+ SFT Only & 37.8 & 15.1 & 27.4 & 39.2 & 18.4 & - & - & 312 \\
+ SFT + Outcome PPO & 42.4 & 16.9 & 30.1 & 43.7 & 28.7 & - & - & 339 \\
+ SFT + Process PPO (PRM) & 44.8 & 18.2 & 31.8 & 46.1 & 34.2 & 72.4 & 68.1 & 367 \\
+ SFT + Insight RM Only & 46.2 & 19.4 & 33.1 & 48.3 & 38.9 & 79.3 & 71.6 & 383 \\
+ SFT + Correction RM Only & 45.1 & 18.7 & 32.4 & 47.2 & 36.7 & 74.8 & 73.2 & 374 \\
\textbf{Full \ssc{} (Both RMs)} & \textbf{49.9} & \textbf{21.3} & \textbf{36.4} & \textbf{52.7} & \textbf{45.8} & \textbf{82.1} & \textbf{76.4} & \textbf{398} \\
\midrule
\multicolumn{9}{c}{\textit{Architectural Component Ablations}} \\
Without Critical Points & 44.7 & 18.9 & 32.7 & 48.1 & 38.2 & 76.3 & 69.7 & 365 \\
Two-Stage (Gen→Synth) & 39.3 & 16.2 & 28.9 & 42.4 & 24.1 & - & - & 328 \\
Single-Stage Refinement & 35.8 & 14.3 & 26.2 & 38.7 & 15.7 & - & - & 289 \\
No Stage Embeddings & 47.1 & 20.1 & 34.8 & 50.2 & 42.3 & 78.9 & 74.1 & 387 \\
Without Attention Mods & 46.8 & 19.6 & 34.2 & 49.6 & 41.1 & 77.6 & 72.8 & 381 \\
\midrule
\multicolumn{9}{c}{\textit{Reward Structure Ablations}} \\
Uniform Weights & 47.3 & 19.8 & 34.1 & 49.8 & 40.7 & 79.2 & 73.5 & 385 \\
No Coherence Reward & 48.2 & 20.4 & 35.2 & 51.1 & 43.1 & 80.7 & 74.9 & 392 \\