diff --git a/Core/SoarKernel/src/decision_process/decide.cpp b/Core/SoarKernel/src/decision_process/decide.cpp
index 2ff38b4a79..989c8b1013 100644
--- a/Core/SoarKernel/src/decision_process/decide.cpp
+++ b/Core/SoarKernel/src/decision_process/decide.cpp
@@ -1739,7 +1739,52 @@ byte run_preference_semantics(agent* thisAgent,
     {
         if (!consistency)
         {
-            (*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
+            // RL convergence gate: if all RL rules on this slot have converged,
+            // select greedily (highest Q-value) instead of stochastically.
+            // This makes the decision deterministic, enabling chunking.
+            bool rl_converged = !predict && some_numeric && rl_slot_converged(thisAgent, s);
+
+            if (rl_converged)
+            {
+                // numeric_value / rl_contribution are filled in per candidate by
+                // exploration_compute_value_of_candidate(); the bypassed policy call would
+                // have done this (NOTE(review): confirm against exploration.cpp).
+                for (preference* cand = candidates; cand; cand = cand->next_candidate)
+                {
+                    exploration_compute_value_of_candidate(thisAgent, cand, s);
+                }
+
+                // Greedy selection: pick candidate with highest numeric value
+                preference* best = candidates;
+                for (preference* cand = candidates->next_candidate; cand; cand = cand->next_candidate)
+                {
+                    if (cand->numeric_value > best->numeric_value)
+                    {
+                        best = cand;
+                    }
+                }
+                (*result_candidates) = best;
+
+                // Keep applying RL updates while gated so rl_ema_delta_q keeps tracking
+                // |delta_Q| and the gate can reopen if values drift. The greedy pick is
+                // also the maximum, so one update serves both SARSA and Q-learning.
+                if (rl_enabled(thisAgent))
+                {
+                    rl_tabulate_reward_values(thisAgent);
+                    rl_perform_update(thisAgent, best->numeric_value, best->rl_contribution, s->id);
+                }
+
+                if (thisAgent->trace_settings[TRACE_RL_SYSPARAM])
+                {
+                    thisAgent->outputManager->printa_sf(thisAgent,
+                        "RL convergence gate: slot converged, selecting greedily\n");
+                }
+            }
+            else
+            {
+                (*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
+            }
+
             if (!predict && rl_enabled(thisAgent))
             {
                 build_rl_trace(thisAgent, candidates, *result_candidates);
diff --git a/Core/SoarKernel/src/decision_process/rete.cpp b/Core/SoarKernel/src/decision_process/rete.cpp
index 9ac7738928..784112d6cd 100644
--- a/Core/SoarKernel/src/decision_process/rete.cpp
+++ b/Core/SoarKernel/src/decision_process/rete.cpp
@@ -7821,6 +7821,7 @@ void reteload_node_and_children(agent* thisAgent, rete_node* parent, FILE* f)
             prod->rl_ecr = 0.0;
             prod->rl_efr = 0.0;
             prod->rl_gql = 0.0;
+            prod->rl_ema_delta_q = 1.0; // start unconverged
             if ((prod->type != JUSTIFICATION_PRODUCTION_TYPE) && (prod->type != TEMPLATE_PRODUCTION_TYPE))
             {
                 prod->rl_rule = rl_valid_rule(prod);
diff --git a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp
index c52701b5f1..c15fb5d163 100644
--- a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp
+++ b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.cpp
@@ -155,6 +155,18 @@ rl_param_container::rl_param_container(agent* new_agent): soar_module::param_con
     chunk_stop = new soar_module::boolean_param("chunk-stop", on, new soar_module::f_predicate());
     add(chunk_stop);
 
+    // chunk-gate — gate chunking on RL convergence
+    chunk_gate = new soar_module::boolean_param("chunk-gate", off, new soar_module::f_predicate());
+    add(chunk_gate);
+
+    // chunk-gate-threshold — EMA of |delta_Q| below which an RL rule is considered converged
+    chunk_gate_threshold = new soar_module::decimal_param("chunk-gate-threshold", 0.01, new soar_module::gt_predicate(0, false), new soar_module::f_predicate());
+    add(chunk_gate_threshold);
+
+    // chunk-gate-ema-decay — EMA decay rate for convergence tracking (higher = smoother)
+    chunk_gate_ema_decay = new soar_module::decimal_param("chunk-gate-ema-decay", 0.95, new soar_module::btw_predicate(0, 1, false), new soar_module::f_predicate());
+    add(chunk_gate_ema_decay);
+
     // meta
     meta = new soar_module::boolean_param("meta", off, new soar_module::f_predicate());
     add(meta);
@@ -609,6 +621,7 @@ Symbol* rl_build_template_instantiation(agent* thisAgent, instantiation* my_temp
             new_production->rl_ecr = 0.0;
             new_production->rl_efr = init_value;
             new_production->rl_gql = 0.0;
+            new_production->rl_ema_delta_q = 1.0; // start unconverged
 
             // attempt to add to rete, remove if duplicate
             production* duplicate_rule = NULL;
@@ -994,6 +1007,13 @@ void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Symbol* go
                     prod->rl_ecr = new_ecr;
                     prod->rl_efr = new_efr;
                     prod->rl_gql = new_gql;
+
+                    // Update EMA of |delta_Q| for convergence gating
+                    {
+                        double abs_delta = fabs(delta_ecr + delta_efr);
+                        double ema_decay = thisAgent->RL->rl_params->chunk_gate_ema_decay->get_value();
+                        prod->rl_ema_delta_q = ema_decay * prod->rl_ema_delta_q + (1.0 - ema_decay) * abs_delta;
+                    }
                 }
 
                 if (thisAgent->RL->rl_params->learning_policy->get_value() & rl_param_container::gql)
@@ -1066,3 +1086,31 @@ void rl_watkins_clear(agent* /*thisAgent*/, Symbol* goal)
 {
     goal->id->rl_info->eligibility_traces->clear();
 }
+
+// Returns true when chunk-gate is enabled and every RL rule contributing
+// numeric-indifferent preferences to |s| has EMA(|delta_Q|) below threshold.
+// When chunk-gate is off, returns false (no gating, preserve existing behavior).
+bool rl_slot_converged(agent* thisAgent, slot* s)
+{
+    if (thisAgent->RL->rl_params->chunk_gate->get_value() != on)
+    {
+        return false;
+    }
+
+    double threshold = thisAgent->RL->rl_params->chunk_gate_threshold->get_value();
+    bool found_rl_rule = false;
+
+    for (preference* p = s->preferences[NUMERIC_INDIFFERENT_PREFERENCE_TYPE]; p != NIL; p = p->next)
+    {
+        if (p->inst && p->inst->prod && p->inst->prod->rl_rule)
+        {
+            found_rl_rule = true;
+            if (p->inst->prod->rl_ema_delta_q >= threshold)
+            {
+                return false; // at least one rule hasn't converged
+            }
+        }
+    }
+
+    return found_rl_rule; // true only if there were RL rules and all converged
+}
diff --git a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h
index 5a70268e42..a5ee2f0fbd 100644
--- a/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h
+++ b/Core/SoarKernel/src/reinforcement_learning/reinforcement_learning.h
@@ -142,6 +142,9 @@ class rl_param_container: public soar_module::param_container
         soar_module::boolean_param* temporal_discount;
 
         soar_module::boolean_param* chunk_stop;
+        soar_module::boolean_param* chunk_gate; // gate chunking on RL convergence
+        soar_module::decimal_param* chunk_gate_threshold; // EMA threshold for convergence
+        soar_module::decimal_param* chunk_gate_ema_decay; // EMA decay rate (0,1)
         soar_module::boolean_param* meta; // Whether doc strings are used for storing metadata.
         soar_module::string_param* update_log_path; // If non-null and size > 0, log all RL updates to this file.
 
@@ -310,6 +313,10 @@ extern void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Sym
 // clears eligibility traces in accordance with watkins
 extern void rl_watkins_clear(agent* thisAgent, Symbol* goal);
 
+// check whether all RL rules contributing to a slot's decision have converged
+// (EMA of |delta_Q| below threshold for every contributing rule)
+extern bool rl_slot_converged(agent* thisAgent, struct slot_struct* s);
+
 class RL_Manager
 {
     public:
diff --git a/Core/SoarKernel/src/soar_representation/production.cpp b/Core/SoarKernel/src/soar_representation/production.cpp
index b64cd767fd..3e560c6532 100644
--- a/Core/SoarKernel/src/soar_representation/production.cpp
+++ b/Core/SoarKernel/src/soar_representation/production.cpp
@@ -436,6 +436,7 @@ production* make_production(agent* thisAgent,
     p->rl_ecr = 0.0;
     p->rl_efr = 0.0;
     p->rl_gql = 0.0;
+    p->rl_ema_delta_q = 1.0; // start unconverged
     if ((type != JUSTIFICATION_PRODUCTION_TYPE) && (type != TEMPLATE_PRODUCTION_TYPE))
     {
         p->rl_rule = rl_valid_rule(p);
diff --git a/Core/SoarKernel/src/soar_representation/production.h b/Core/SoarKernel/src/soar_representation/production.h
index c154c6a9bc..6e3e083560 100644
--- a/Core/SoarKernel/src/soar_representation/production.h
+++ b/Core/SoarKernel/src/soar_representation/production.h
@@ -49,6 +49,8 @@ typedef struct production_struct
     double rl_efr; // expected future reward (discounted next state)
     double rl_gql; // second value for implementation of GQ(\lambda)
 
+    double rl_ema_delta_q; // exponential moving average of |delta_Q| for convergence gating
+
     condition* rl_template_conds;
 
     int duplicate_chunks_this_cycle;
diff --git a/UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar b/UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar
new file mode 100644
index 0000000000..05cd76ac2a
--- /dev/null
+++ b/UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar
@@ -0,0 +1,81 @@
+# RL convergence gate test agent.
+# Two operators, consistent reward favoring op-a.
+# RL params (chunk-gate, etc.) are set by the C++ test before sourcing.
+
+srand 42
+
+rl --set learning on
+
+sp {init
+    (state ^superstate nil)
+-->
+    ( ^name rl-gate-test ^step 0)
+}
+
+sp {propose*op-a
+    (state ^name rl-gate-test ^step )
+-->
+    ( ^operator +)
+    ( ^name op-a ^step )
+}
+
+sp {propose*op-b
+    (state ^name rl-gate-test ^step )
+-->
+    ( ^operator +)
+    ( ^name op-b ^step )
+}
+
+# RL rules: numeric indifferent preferences
+sp {rl*value*op-a
+    (state ^name rl-gate-test
+           ^operator +)
+    ( ^name op-a)
+-->
+    ( ^operator = 1.0)
+}
+
+sp {rl*value*op-b
+    (state ^name rl-gate-test
+           ^operator +)
+    ( ^name op-b)
+-->
+    ( ^operator = 0.0)
+}
+
+# Reward for choosing op-a
+sp {rl*reward*op-a
+    (state ^name rl-gate-test
+           ^reward-link
+           ^operator )
+    ( ^name op-a)
+-->
+    ( ^reward.value 1.0)
+}
+
+# Reward for choosing op-b
+sp {rl*reward*op-b
+    (state ^name rl-gate-test
+           ^reward-link
+           ^operator )
+    ( ^name op-b)
+-->
+    ( ^reward.value -1.0)
+}
+
+# Apply: advance step
+sp {apply*op
+    (state ^name rl-gate-test
+           ^operator
+           ^step )
+-->
+    ( ^step - ^step (+ 1))
+}
+
+# Halt after 50 decisions
+sp {halt*done
+    (state ^name rl-gate-test
+           ^step >= 50)
+-->
+    (halt)
+}
diff --git a/UnitTests/SoarUnitTests/FullTests.cpp b/UnitTests/SoarUnitTests/FullTests.cpp
index 40f2c4522f..d1ae62cbac 100644
--- a/UnitTests/SoarUnitTests/FullTests.cpp
+++ b/UnitTests/SoarUnitTests/FullTests.cpp
@@ -1750,3 +1750,72 @@ void FullTests_Parent::testOutputLinkRemovalOrdering()
 
     SoarHelper::init_check_to_find_refcount_leaks(agent);
 }
+
+// RL convergence gate: verify agent runs to completion with chunk-gate enabled.
+// After ~4 decisions with stable Q-values (EMA decay=0.5, threshold=0.1),
+// the gate fires, forcing greedy selection for the remainder of the run.
+void FullTests_Parent::testRLConvergenceGate()
+{
+    agent->ExecuteCommandLine("rl --set chunk-gate on");
+    agent->ExecuteCommandLine("rl --set chunk-gate-ema-decay 0.5");
+    agent->ExecuteCommandLine("rl --set chunk-gate-threshold 0.1");
+
+    loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));
+
+    agent->RunSelf(50, sml::sml_DECIDE);
+
+    {
+        sml::ClientAnalyzedXML response;
+        agent->ExecuteCommandLineXML("stats", &response);
+        int decisions = response.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
+        no_agent_assertTrue(decisions == 50);
+    }
+}
+
+// Same agent with chunk-gate off (default). Verify identical decision count,
+// confirming no regression in existing RL behavior.
+void FullTests_Parent::testRLConvergenceGateOff()
+{
+    loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));
+
+    agent->RunSelf(50, sml::sml_DECIDE);
+
+    {
+        sml::ClientAnalyzedXML response;
+        agent->ExecuteCommandLineXML("stats", &response);
+        int decisions = response.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
+        no_agent_assertTrue(decisions == 50);
+    }
+}
+
+// Verify that the three new RL parameters are accepted by the command parser.
+void FullTests_Parent::testRLConvergenceGateParams()
+{
+    // chunk-gate: boolean on/off
+    agent->ExecuteCommandLine("rl --set chunk-gate on");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+    agent->ExecuteCommandLine("rl --set chunk-gate off");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+
+    // chunk-gate-threshold: positive double
+    agent->ExecuteCommandLine("rl --set chunk-gate-threshold 0.05");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+    agent->ExecuteCommandLine("rl --set chunk-gate-threshold 0.001");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+
+    // chunk-gate-ema-decay: double in (0,1)
+    agent->ExecuteCommandLine("rl --set chunk-gate-ema-decay 0.9");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+    agent->ExecuteCommandLine("rl --set chunk-gate-ema-decay 0.5");
+    no_agent_assertTrue(agent->GetLastCommandLineResult());
+
+    // Verify current values via rl --get
+    std::string result = agent->ExecuteCommandLine("rl --get chunk-gate");
+    no_agent_assertTrue(result.find("off") != std::string::npos);
+
+    result = agent->ExecuteCommandLine("rl --get chunk-gate-threshold");
+    no_agent_assertTrue(result.find("0.001") != std::string::npos);
+
+    result = agent->ExecuteCommandLine("rl --get chunk-gate-ema-decay");
+    no_agent_assertTrue(result.find("0.5") != std::string::npos);
+}
diff --git a/UnitTests/SoarUnitTests/FullTests.hpp b/UnitTests/SoarUnitTests/FullTests.hpp
index 0f20a15569..ab651f1ab5 100644
--- a/UnitTests/SoarUnitTests/FullTests.hpp
+++ b/UnitTests/SoarUnitTests/FullTests.hpp
@@ -90,6 +90,9 @@ class FullTests_Parent
         void testCommandToFile();
         void testConvertIdentifier();
         void testOutputLinkRemovalOrdering();
+        void testRLConvergenceGate();
+        void testRLConvergenceGateOff();
+        void testRLConvergenceGateParams();
 
         void before() { setUp(); }
         void after(bool caught) { tearDown(caught); }
@@ -236,6 +239,15 @@ class FullTests : public FullTests_Parent, public TestCategory
         TEST(testOutputLinkRemovalOrdering, -1);
         void testOutputLinkRemovalOrdering() { this->FullTests_Parent::testOutputLinkRemovalOrdering(); }
 
+        TEST(testRLConvergenceGate, -1);
+        void testRLConvergenceGate() { this->FullTests_Parent::testRLConvergenceGate(); }
+
+        TEST(testRLConvergenceGateOff, -1);
+        void testRLConvergenceGateOff() { this->FullTests_Parent::testRLConvergenceGateOff(); }
+
+        TEST(testRLConvergenceGateParams, -1);
+        void testRLConvergenceGateParams() { this->FullTests_Parent::testRLConvergenceGateParams(); }
+
         void before() { setUp(); }
         void after(bool caught) { tearDown(caught); }
 