SoarGroup · kimjune01 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/Core/SoarKernel/src/decision_process/decide.cpp b/Core/SoarKernel/src/decision_process/decide.cpp
@@ -1739,7 +1739,35 @@ byte run_preference_semantics(agent* thisAgent,
     {
         if (!consistency)
         {
-            (*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
+            // RL convergence gate: if all RL rules on this slot have converged,
+            // select greedily (highest Q-value) instead of stochastically.
+            // This makes the decision deterministic, enabling chunking.
+            bool rl_converged = !predict && some_numeric && rl_slot_converged(thisAgent, s);
+
+            if (rl_converged)
+            {
+                // Greedy selection: pick candidate with highest numeric value
+                preference* best = candidates;
+                for (preference* cand = candidates->next_candidate; cand; cand = cand->next_candidate)
+                {
+                    if (cand->numeric_value > best->numeric_value)
+                    {
+                        best = cand;
+                    }
+                }
+                (*result_candidates) = best;
+
+                if (thisAgent->trace_settings[TRACE_RL_SYSPARAM])
+                {
+                    thisAgent->outputManager->printa_sf(thisAgent,
+                        "RL convergence gate: slot converged, selecting greedily\n");
+                }
+            }
+            else
+            {
+                (*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
+            }
+
             if (!predict && rl_enabled(thisAgent))
             {
                 build_rl_trace(thisAgent, candidates, *result_candidates);

diff --git a/Core/SoarKernel/src/decision_process/rete.cpp b/Core/SoarKernel/src/decision_process/rete.cpp
@@ -7821,6 +7821,7 @@ void reteload_node_and_children(agent* thisAgent, rete_node* parent, FILE* f)
             prod->rl_ecr = 0.0;
             prod->rl_efr = 0.0;
             prod->rl_gql = 0.0;
+            prod->rl_ema_delta_q = 1.0;     // start unconverged
             if ((prod->type != JUSTIFICATION_PRODUCTION_TYPE) && (prod->type != TEMPLATE_PRODUCTION_TYPE))
             {
                 prod->rl_rule = rl_valid_rule(prod);

diff --git a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp
@@ -155,6 +155,18 @@ rl_param_container::rl_param_container(agent* new_agent): soar_module::param_con
     chunk_stop = new soar_module::boolean_param("chunk-stop", on, new soar_module::f_predicate<boolean>());
     add(chunk_stop);
 
+    // chunk-gate — gate chunking on RL convergence
+    chunk_gate = new soar_module::boolean_param("chunk-gate", off, new soar_module::f_predicate<boolean>());
+    add(chunk_gate);
+
+    // chunk-gate-threshold — EMA of |delta_Q| below which an RL rule is considered converged
+    chunk_gate_threshold = new soar_module::decimal_param("chunk-gate-threshold", 0.01, new soar_module::gt_predicate<double>(0, false), new soar_module::f_predicate<double>());
+    add(chunk_gate_threshold);
+
+    // chunk-gate-ema-decay — EMA decay rate for convergence tracking (higher = smoother)
+    chunk_gate_ema_decay = new soar_module::decimal_param("chunk-gate-ema-decay", 0.95, new soar_module::btw_predicate<double>(0, 1, false), new soar_module::f_predicate<double>());
+    add(chunk_gate_ema_decay);
+
     // meta
     meta = new soar_module::boolean_param("meta", off, new soar_module::f_predicate<boolean>());
     add(meta);
@@ -609,6 +621,7 @@ Symbol* rl_build_template_instantiation(agent* thisAgent, instantiation* my_temp
             new_production->rl_ecr = 0.0;
             new_production->rl_efr = init_value;
             new_production->rl_gql = 0.0;
+            new_production->rl_ema_delta_q = 1.0;  // start unconverged
 
             // attempt to add to rete, remove if duplicate
             production* duplicate_rule = NULL;
@@ -994,6 +1007,13 @@ void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Symbol* go
                     prod->rl_ecr = new_ecr;
                     prod->rl_efr = new_efr;
                     prod->rl_gql = new_gql;
+
+                    // Update EMA of |delta_Q| for convergence gating
+                    {
+                        double abs_delta = fabs(delta_ecr + delta_efr);
+                        double ema_decay = thisAgent->RL->rl_params->chunk_gate_ema_decay->get_value();
+                        prod->rl_ema_delta_q = ema_decay * prod->rl_ema_delta_q + (1.0 - ema_decay) * abs_delta;
+                    }
                 }
 
                 if (thisAgent->RL->rl_params->learning_policy->get_value() & rl_param_container::gql)
@@ -1066,3 +1086,31 @@ void rl_watkins_clear(agent* /*thisAgent*/, Symbol* goal)
 {
     goal->id->rl_info->eligibility_traces->clear();
 }
+
+// Reports whether every RL rule contributing a numeric-indifferent preference
+// to |s| has rl_ema_delta_q strictly below chunk-gate-threshold. Returns false
+// when chunk-gate is off or no RL rule contributes. A NaN average counts as
+// unconverged, so a corrupted estimate can never open the gate.
+bool rl_slot_converged(agent* thisAgent, slot* s)
+{
+    if (thisAgent->RL->rl_params->chunk_gate->get_value() != on)
+    {
+        return false;  // gating disabled: no gating, preserve existing behavior
+    }
+
+    const double threshold = thisAgent->RL->rl_params->chunk_gate_threshold->get_value();
+    bool found_rl_rule = false;
+    for (preference* p = s->preferences[NUMERIC_INDIFFERENT_PREFERENCE_TYPE]; p != NIL; p = p->next)
+    {
+        if (!(p->inst && p->inst->prod && p->inst->prod->rl_rule))
+        {
+            continue;  // preference was not produced by an RL rule
+        }
+        found_rl_rule = true;
+        if (!(p->inst->prod->rl_ema_delta_q < threshold))  // negated '<' is also true for NaN
+        {
+            return false;  // at least one rule hasn't converged
+        }
+    }
+    return found_rl_rule;  // true only if there were RL rules and all converged
+}
diff --git a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h
@@ -142,6 +142,9 @@ class rl_param_container: public soar_module::param_container
         soar_module::boolean_param* temporal_discount;
 
         soar_module::boolean_param* chunk_stop;
+        soar_module::boolean_param* chunk_gate;              // gate chunking on RL convergence
+        soar_module::decimal_param* chunk_gate_threshold;    // EMA threshold for convergence
+        soar_module::decimal_param* chunk_gate_ema_decay;    // EMA decay rate (0,1)
         soar_module::boolean_param* meta; // Whether doc strings are used for storing metadata.
         soar_module::string_param* update_log_path; // If non-null and size > 0, log all RL updates to this file.
 
@@ -310,6 +313,10 @@ extern void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Sym
 // clears eligibility traces in accordance with watkins
 extern void rl_watkins_clear(agent* thisAgent, Symbol* goal);
 
+// Returns true only when chunk-gate is on, at least one RL rule contributes a numeric-indifferent
+// preference to slot s, and every such rule's EMA of |delta_Q| is below chunk-gate-threshold.
+extern bool rl_slot_converged(agent* thisAgent, struct slot_struct* s);
+
 class RL_Manager
 {
     public:

diff --git a/Core/SoarKernel/src/soar_representation/production.cpp b/Core/SoarKernel/src/soar_representation/production.cpp
@@ -436,6 +436,7 @@ production* make_production(agent*          thisAgent,
     p->rl_ecr = 0.0;
     p->rl_efr = 0.0;
     p->rl_gql = 0.0;
+    p->rl_ema_delta_q = 1.0;     // start unconverged
     if ((type != JUSTIFICATION_PRODUCTION_TYPE) && (type != TEMPLATE_PRODUCTION_TYPE))
     {
         p->rl_rule = rl_valid_rule(p);

diff --git a/Core/SoarKernel/src/soar_representation/production.h b/Core/SoarKernel/src/soar_representation/production.h
@@ -49,6 +49,8 @@ typedef struct production_struct
     double rl_efr;                // expected future reward (discounted next state)
     double rl_gql;                // second value for implementation of GQ(\lambda)
 
+    double rl_ema_delta_q;        // exponential moving average of |delta_Q| for convergence gating
+
     condition* rl_template_conds;
 
     int      duplicate_chunks_this_cycle;

diff --git a/UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar b/UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar
@@ -0,0 +1,81 @@
+# RL convergence gate test agent: op-a and op-b are proposed at every step;
+# a reward of 1.0 is posted when op-a is selected and -1.0 when op-b is.
+# RL params (chunk-gate, etc.) are set by the C++ test before sourcing.
+
+srand 42
+
+rl --set learning on
+
+# Top-state setup: name the task and start the step counter at 0.
+sp {init
+   (state <top> ^superstate nil)
+-->
+   (<top> ^name rl-gate-test ^step 0)}
+
+# Propose op-a for the current step; the operator carries that step value.
+sp {propose*op-a
+   (state <st> ^name rl-gate-test ^step <count>)
+-->
+   (<st> ^operator <op> +)
+   (<op> ^name op-a ^step <count>)}
+
+# Propose op-b for the current step; the operator carries that step value.
+sp {propose*op-b
+   (state <st> ^name rl-gate-test ^step <count>)
+-->
+   (<st> ^operator <op> +)
+   (<op> ^name op-b ^step <count>)}
+
+# RL rule: gives the proposed (acceptable) op-a a numeric-indifferent
+# preference written as 1.0 in this file.
+sp {rl*value*op-a
+   (state <st> ^name rl-gate-test
+               ^operator <op> +)
+   (<op> ^name op-a)
+-->
+   (<st> ^operator <op> = 1.0)}
+
+# RL rule: numeric-indifferent preference for the proposed op-b, written as 0.0.
+sp {rl*value*op-b
+   (state <st> ^name rl-gate-test
+               ^operator <op> +)
+   (<op> ^name op-b)
+-->
+   (<st> ^operator <op> = 0.0)}
+
+# Post a reward value of 1.0 on the reward-link when op-a is the selected operator.
+# NOTE(review): tests the selected operator, so the reward may get o-support and persist - confirm.
+sp {rl*reward*op-a
+   (state <st> ^name rl-gate-test
+               ^reward-link <rlink>
+               ^operator <op>)
+   (<op> ^name op-a)
+-->
+   (<rlink> ^reward.value 1.0)}
+
+# Post a reward value of -1.0 on the reward-link when op-b is the selected operator.
+# NOTE(review): tests the selected operator, so the reward may get o-support and persist - confirm.
+sp {rl*reward*op-b
+   (state <st> ^name rl-gate-test
+               ^reward-link <rlink>
+               ^operator <op>)
+   (<op> ^name op-b)
+-->
+   (<rlink> ^reward.value -1.0)}
+
+# Apply (whichever operator is selected): remove the current step value and
+# add step + 1, which is the value the proposal rules match on.
+sp {apply*op
+   (state <st> ^name rl-gate-test
+               ^operator <op>
+               ^step <count>)
+-->
+   (<st> ^step <count> - ^step (+ <count> 1))}
+
+# Halt once the step counter is 50 or more; init starts it at 0 and apply*op
+# raises it by one per application.
+sp {halt*done
+   (state <st> ^name rl-gate-test
+               ^step >= 50)
+-->
+   (halt)}
diff --git a/UnitTests/SoarUnitTests/FullTests.cpp b/UnitTests/SoarUnitTests/FullTests.cpp
@@ -1750,3 +1750,72 @@ void FullTests_Parent::testOutputLinkRemovalOrdering()
 
     SoarHelper::init_check_to_find_refcount_leaks(agent);
 }
+
+// RL convergence gate smoke test: turns chunk-gate on with a fast EMA
+// (decay 0.5, threshold 0.1) and checks that 50 decision cycles are reported.
+// Each setup command is asserted so a rejected parameter cannot pass silently.
+void FullTests_Parent::testRLConvergenceGate()
+{
+    agent->ExecuteCommandLine("rl --set chunk-gate on");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+    agent->ExecuteCommandLine("rl --set chunk-gate-ema-decay 0.5");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+    agent->ExecuteCommandLine("rl --set chunk-gate-threshold 0.1");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+
+    loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));
+    agent->RunSelf(50, sml::sml_DECIDE);
+
+    sml::ClientAnalyzedXML response;
+    agent->ExecuteCommandLineXML("stats", &response);
+    int decisions = response.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
+    no_agent_assertTrue(decisions == 50);
+}
+
+// Baseline run: same agent, chunk-gate never touched (it defaults to off), so
+// no gating is requested; expects 50 reported decision cycles.
+void FullTests_Parent::testRLConvergenceGateOff()
+{
+    loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));
+    agent->RunSelf(50, sml::sml_DECIDE);
+
+    // Pull the decision-cycle counter out of the XML form of "stats".
+    sml::ClientAnalyzedXML statsXml;
+    agent->ExecuteCommandLineXML("stats", &statsXml);
+    const int decisionCount = statsXml.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
+
+    // The run above asked for 50 decision cycles; the counter must match.
+    no_agent_assertTrue(decisionCount == 50);
+}
+
+// Exercises the three chunk-gate parameters through the "rl" command: each
+// --set below must be accepted, and --get must echo the last value stored.
+void FullTests_Parent::testRLConvergenceGateParams()
+{
+    // Order matters: the final value written for each parameter is the one
+    // the --get checks further down expect to read back.
+    const char* const acceptedSets[] =
+    {
+        "rl --set chunk-gate on",                  // boolean on/off
+        "rl --set chunk-gate off",
+        "rl --set chunk-gate-threshold 0.05",      // positive double
+        "rl --set chunk-gate-threshold 0.001",
+        "rl --set chunk-gate-ema-decay 0.9",       // double in (0,1)
+        "rl --set chunk-gate-ema-decay 0.5",
+    };
+    for (const char* setCommand : acceptedSets)
+    {
+        agent->ExecuteCommandLine(setCommand);
+        no_agent_assertTrue(agent->GetLastCommandLineResult());
+    }
+
+    // Verify current values via rl --get
+    const std::string gateText = agent->ExecuteCommandLine("rl --get chunk-gate");
+    no_agent_assertTrue(gateText.find("off") != std::string::npos);
+
+    const std::string thresholdText = agent->ExecuteCommandLine("rl --get chunk-gate-threshold");
+    no_agent_assertTrue(thresholdText.find("0.001") != std::string::npos);
+
+    const std::string decayText = agent->ExecuteCommandLine("rl --get chunk-gate-ema-decay");
+    no_agent_assertTrue(decayText.find("0.5") != std::string::npos);
+}
diff --git a/UnitTests/SoarUnitTests/FullTests.hpp b/UnitTests/SoarUnitTests/FullTests.hpp
@@ -90,6 +90,9 @@ class FullTests_Parent
 	void testCommandToFile();
 	void testConvertIdentifier();
 	void testOutputLinkRemovalOrdering();
+	void testRLConvergenceGate();
+	void testRLConvergenceGateOff();
+	void testRLConvergenceGateParams();
 
 	void before() { setUp(); }
 	void after(bool caught) { tearDown(caught); }
@@ -236,6 +239,15 @@ class FullTests : public FullTests_Parent, public TestCategory
 	TEST(testOutputLinkRemovalOrdering, -1);
 	void testOutputLinkRemovalOrdering() { this->FullTests_Parent::testOutputLinkRemovalOrdering(); }
 
+	TEST(testRLConvergenceGate, -1);
+	void testRLConvergenceGate() { this->FullTests_Parent::testRLConvergenceGate(); }
+
+	TEST(testRLConvergenceGateOff, -1);
+	void testRLConvergenceGateOff() { this->FullTests_Parent::testRLConvergenceGateOff(); }
+
+	TEST(testRLConvergenceGateParams, -1);
+	void testRLConvergenceGateParams() { this->FullTests_Parent::testRLConvergenceGateParams(); }
+
 	void before() { setUp(); }
 	void after(bool caught) { tearDown(caught); }