Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion Core/SoarKernel/src/decision_process/decide.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1739,7 +1739,35 @@ byte run_preference_semantics(agent* thisAgent,
{
if (!consistency)
{
(*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
// RL convergence gate: if all RL rules on this slot have converged,
// select greedily (highest Q-value) instead of stochastically.
// This makes the decision deterministic, enabling chunking.
// The gate is only consulted when this is not a prediction (!predict) and
// some_numeric is set; the && chain short-circuits, so rl_slot_converged()
// is not called otherwise.
bool rl_converged = !predict && some_numeric && rl_slot_converged(thisAgent, s);

if (rl_converged)
{
// Greedy selection: pick candidate with highest numeric value
// The strict '>' below keeps the earliest candidate in the list on ties.
// NOTE(review): numeric_value is read here without being computed in this
// branch; presumably the exploration path normally refreshes it for each
// candidate -- confirm the values are current when the gate fires.
// NOTE(review): assumes candidates is non-null at this point -- TODO confirm.
preference* best = candidates;
for (preference* cand = candidates->next_candidate; cand; cand = cand->next_candidate)
{
if (cand->numeric_value > best->numeric_value)
{
best = cand;
}
}
(*result_candidates) = best;

// Announce the gated choice only when RL tracing is switched on.
if (thisAgent->trace_settings[TRACE_RL_SYSPARAM])
{
thisAgent->outputManager->printa_sf(thisAgent,
"RL convergence gate: slot converged, selecting greedily\n");
}
}
else
{
// Gate did not fire: fall back to the configured exploration policy.
(*result_candidates) = exploration_choose_according_to_policy(thisAgent, s, candidates);
}

if (!predict && rl_enabled(thisAgent))
{
build_rl_trace(thisAgent, candidates, *result_candidates);
Expand Down
1 change: 1 addition & 0 deletions Core/SoarKernel/src/decision_process/rete.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7821,6 +7821,7 @@ void reteload_node_and_children(agent* thisAgent, rete_node* parent, FILE* f)
prod->rl_ecr = 0.0;
prod->rl_efr = 0.0;
prod->rl_gql = 0.0;
prod->rl_ema_delta_q = 1.0; // start unconverged
if ((prod->type != JUSTIFICATION_PRODUCTION_TYPE) && (prod->type != TEMPLATE_PRODUCTION_TYPE))
{
prod->rl_rule = rl_valid_rule(prod);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ rl_param_container::rl_param_container(agent* new_agent): soar_module::param_con
chunk_stop = new soar_module::boolean_param("chunk-stop", on, new soar_module::f_predicate<boolean>());
add(chunk_stop);

// chunk-gate — gate chunking on RL convergence
chunk_gate = new soar_module::boolean_param("chunk-gate", off, new soar_module::f_predicate<boolean>());
add(chunk_gate);

// chunk-gate-threshold — EMA of |delta_Q| below which an RL rule is considered converged
chunk_gate_threshold = new soar_module::decimal_param("chunk-gate-threshold", 0.01, new soar_module::gt_predicate<double>(0, false), new soar_module::f_predicate<double>());
add(chunk_gate_threshold);

// chunk-gate-ema-decay — EMA decay rate for convergence tracking (higher = smoother)
chunk_gate_ema_decay = new soar_module::decimal_param("chunk-gate-ema-decay", 0.95, new soar_module::btw_predicate<double>(0, 1, false), new soar_module::f_predicate<double>());
add(chunk_gate_ema_decay);

// meta
meta = new soar_module::boolean_param("meta", off, new soar_module::f_predicate<boolean>());
add(meta);
Expand Down Expand Up @@ -609,6 +621,7 @@ Symbol* rl_build_template_instantiation(agent* thisAgent, instantiation* my_temp
new_production->rl_ecr = 0.0;
new_production->rl_efr = init_value;
new_production->rl_gql = 0.0;
new_production->rl_ema_delta_q = 1.0; // start unconverged

// attempt to add to rete, remove if duplicate
production* duplicate_rule = NULL;
Expand Down Expand Up @@ -994,6 +1007,13 @@ void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Symbol* go
prod->rl_ecr = new_ecr;
prod->rl_efr = new_efr;
prod->rl_gql = new_gql;

// Update EMA of |delta_Q| for convergence gating
{
double abs_delta = fabs(delta_ecr + delta_efr);
double ema_decay = thisAgent->RL->rl_params->chunk_gate_ema_decay->get_value();
prod->rl_ema_delta_q = ema_decay * prod->rl_ema_delta_q + (1.0 - ema_decay) * abs_delta;
}
}

if (thisAgent->RL->rl_params->learning_policy->get_value() & rl_param_container::gql)
Expand Down Expand Up @@ -1066,3 +1086,31 @@ void rl_watkins_clear(agent* /*thisAgent*/, Symbol* goal)
{
goal->id->rl_info->eligibility_traces->clear();
}

// Reports whether the RL convergence gate should fire for slot |s|.
//
// Returns true only when all of the following hold:
//   - the chunk-gate parameter is on;
//   - at least one numeric-indifferent preference on |s| was created by an
//     instantiation of an RL rule;
//   - every such rule's EMA(|delta_Q|) is strictly below chunk-gate-threshold.
// Returns false in every other case. In particular, when chunk-gate is off the
// function always returns false, so callers keep their ungated behavior.
bool rl_slot_converged(agent* thisAgent, slot* s)
{
    if (thisAgent->RL->rl_params->chunk_gate->get_value() != on)
    {
        return false;
    }

    const double threshold = thisAgent->RL->rl_params->chunk_gate_threshold->get_value();
    bool found_rl_rule = false;

    for (preference* p = s->preferences[NUMERIC_INDIFFERENT_PREFERENCE_TYPE]; p != NIL; p = p->next)
    {
        // Preferences without an instantiation/production, or from non-RL
        // rules, do not take part in the convergence decision.
        if (!(p->inst && p->inst->prod && p->inst->prod->rl_rule))
        {
            continue;
        }

        found_rl_rule = true;

        // Deliberately written as !(ema < threshold) instead of
        // (ema >= threshold): every ordered comparison with NaN is false, so
        // the ">=" form would treat a rule whose average has become NaN as
        // converged. This form reports it as not converged.
        if (!(p->inst->prod->rl_ema_delta_q < threshold))
        {
            return false; // at least one rule hasn't converged
        }
    }

    // True only if there was at least one RL rule and all of them converged.
    return found_rl_rule;
}
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ class rl_param_container: public soar_module::param_container
soar_module::boolean_param* temporal_discount;

soar_module::boolean_param* chunk_stop;
soar_module::boolean_param* chunk_gate; // gate chunking on RL convergence
soar_module::decimal_param* chunk_gate_threshold; // EMA threshold for convergence
soar_module::decimal_param* chunk_gate_ema_decay; // EMA decay rate (0,1)
soar_module::boolean_param* meta; // Whether doc strings are used for storing metadata.
soar_module::string_param* update_log_path; // If non-null and size > 0, log all RL updates to this file.

Expand Down Expand Up @@ -310,6 +313,10 @@ extern void rl_perform_update(agent* thisAgent, double op_value, bool op_rl, Sym
// clears eligibility traces in accordance with watkins
extern void rl_watkins_clear(agent* thisAgent, Symbol* goal);

// check whether all RL rules contributing to a slot's decision have converged
// (EMA of |delta_Q| below threshold for every contributing rule)
extern bool rl_slot_converged(agent* thisAgent, struct slot_struct* s);

class RL_Manager
{
public:
Expand Down
1 change: 1 addition & 0 deletions Core/SoarKernel/src/soar_representation/production.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ production* make_production(agent* thisAgent,
p->rl_ecr = 0.0;
p->rl_efr = 0.0;
p->rl_gql = 0.0;
p->rl_ema_delta_q = 1.0; // start unconverged
if ((type != JUSTIFICATION_PRODUCTION_TYPE) && (type != TEMPLATE_PRODUCTION_TYPE))
{
p->rl_rule = rl_valid_rule(p);
Expand Down
2 changes: 2 additions & 0 deletions Core/SoarKernel/src/soar_representation/production.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ typedef struct production_struct
double rl_efr; // expected future reward (discounted next state)
double rl_gql; // second value for implementation of GQ(\lambda)

double rl_ema_delta_q; // exponential moving average of |delta_Q| for convergence gating

condition* rl_template_conds;

int duplicate_chunks_this_cycle;
Expand Down
81 changes: 81 additions & 0 deletions UnitTests/SoarTestAgents/FullTests/testRLConvergenceGate.soar
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# RL convergence gate test agent.
# Two operators, consistent reward favoring op-a.
# (op-a is rewarded with 1.0 and op-b with -1.0 by the reward rules below.)
# RL params (chunk-gate, etc.) are set by the C++ test before sourcing.

# Fixed seed -- presumably so that stochastic operator selection is
# repeatable from run to run (TODO confirm against the srand command docs).
srand 42

# Turn RL learning on for this agent.
rl --set learning on

# Top-state setup: names the state that has no superstate and starts its
# step counter at 0.
sp {init
   (state <top> ^superstate nil)
-->
   (<top> ^name rl-gate-test)
   (<top> ^step 0)
}

# Proposes op-a (acceptable preference) for the current step; the operator
# records the step value it was proposed for.
sp {propose*op-a
   (state <st> ^name rl-gate-test)
   (<st> ^step <count>)
-->
   (<st> ^operator <op> +)
   (<op> ^name op-a)
   (<op> ^step <count>)
}

# Proposes op-b (acceptable preference) for the current step; the operator
# records the step value it was proposed for.
sp {propose*op-b
   (state <st> ^name rl-gate-test)
   (<st> ^step <count>)
-->
   (<st> ^operator <op> +)
   (<op> ^name op-b)
   (<op> ^step <count>)
}

# RL rules: each one attaches a numeric-indifferent preference to a
# proposed operator.
# op-a gets the value 1.0 as written here (presumably this is the starting
# estimate that RL then adjusts -- confirm against the RL documentation).
sp {rl*value*op-a
   (state <st> ^name rl-gate-test
               ^operator <op> +)
   (<op> ^name op-a)
-->
   (<st> ^operator <op> = 1.0)
}

# Numeric-indifferent preference for a proposed op-b, written as 0.0
# (presumably the starting estimate that RL then adjusts -- confirm).
sp {rl*value*op-b
   (state <st> ^name rl-gate-test
               ^operator <op> +)
   (<op> ^name op-b)
-->
   (<st> ^operator <op> = 0.0)
}

# Reward for choosing op-a: while op-a is the selected operator, add a
# reward structure with value 1.0 to the state's reward-link.
# NOTE(review): no rule in this file removes reward structures; if these
# WMEs are o-supported they would persist and add up across steps --
# confirm this is the intended reward signal.
sp {rl*reward*op-a
   (state <st> ^name rl-gate-test
               ^reward-link <rlink>
               ^operator <op>)
   (<op> ^name op-a)
-->
   (<rlink> ^reward <rew>)
   (<rew> ^value 1.0)
}

# Reward for choosing op-b: while op-b is the selected operator, add a
# reward structure with value -1.0 to the state's reward-link.
# NOTE(review): no rule in this file removes reward structures; if these
# WMEs are o-supported they would persist and add up across steps --
# confirm this is the intended reward signal.
sp {rl*reward*op-b
   (state <st> ^name rl-gate-test
               ^reward-link <rlink>
               ^operator <op>)
   (<op> ^name op-b)
-->
   (<rlink> ^reward <rew>)
   (<rew> ^value -1.0)
}

# Apply (shared by both operators): replace the current step value with
# step + 1 once any operator is selected.
sp {apply*op
   (state <st> ^name rl-gate-test
               ^operator <op>
               ^step <count>)
-->
   (<st> ^step <count> -)
   (<st> ^step (+ <count> 1))
}

# Stop the agent once the step counter is 50 or more (the counter grows by
# one for each applied operator).
sp {halt*done
   (state <st> ^name rl-gate-test
               ^step >= 50)
-->
   (halt)
}
69 changes: 69 additions & 0 deletions UnitTests/SoarUnitTests/FullTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1750,3 +1750,72 @@ void FullTests_Parent::testOutputLinkRemovalOrdering()

SoarHelper::init_check_to_find_refcount_leaks(agent);
}

// RL convergence gate: smoke test that the agent runs to completion with
// chunk-gate enabled (EMA decay = 0.5, threshold = 0.1).
//
// What is asserted: the three gate settings are accepted, and the stats
// report exactly 50 decisions after the run.
// What is NOT asserted: that the gate actually fired or that selection became
// greedy. With decay 0.5 an average that starts at 1.0 needs at least four
// near-zero updates to fall below 0.1 (1 -> 0.5 -> 0.25 -> 0.125 -> 0.0625),
// so the gate cannot fire before that.
void FullTests_Parent::testRLConvergenceGate()
{
    // Check every setting: if one were rejected, the run below would silently
    // proceed without the intended gate configuration and the decision-count
    // check alone could not tell the difference.
    agent->ExecuteCommandLine("rl --set chunk-gate on");
    no_agent_assertTrue(agent->GetLastCommandLineResult());
    agent->ExecuteCommandLine("rl --set chunk-gate-ema-decay 0.5");
    no_agent_assertTrue(agent->GetLastCommandLineResult());
    agent->ExecuteCommandLine("rl --set chunk-gate-threshold 0.1");
    no_agent_assertTrue(agent->GetLastCommandLineResult());

    loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));

    agent->RunSelf(50, sml::sml_DECIDE);

    {
        sml::ClientAnalyzedXML response;
        agent->ExecuteCommandLineXML("stats", &response);
        // -1 is the fallback when the stats reply carries no decision count,
        // which also fails the check below.
        int decisions = response.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
        no_agent_assertTrue(decisions == 50);
    }
}

// Same agent with chunk-gate off (default). Verify identical decision count,
// confirming no regression in existing RL behavior.
void FullTests_Parent::testRLConvergenceGateOff()
{
loadProductions(SoarHelper::GetResource("testRLConvergenceGate.soar"));

agent->RunSelf(50, sml::sml_DECIDE);

{
sml::ClientAnalyzedXML response;
agent->ExecuteCommandLineXML("stats", &response);
int decisions = response.GetArgInt(sml::sml_Names::kParamStatsCycleCountDecision, -1);
no_agent_assertTrue(decisions == 50);
}
}

// Checks that the command line accepts the three convergence-gate RL
// parameters, and that "rl --get" afterwards reports the values set last.
void FullTests_Parent::testRLConvergenceGateParams()
{
    // Issued in order; each command must succeed. The last value written for
    // each parameter is the one the --get checks below expect.
    static const char* const setCommands[] =
    {
        // chunk-gate: boolean on/off
        "rl --set chunk-gate on",
        "rl --set chunk-gate off",
        // chunk-gate-threshold: positive double
        "rl --set chunk-gate-threshold 0.05",
        "rl --set chunk-gate-threshold 0.001",
        // chunk-gate-ema-decay: double in (0,1)
        "rl --set chunk-gate-ema-decay 0.9",
        "rl --set chunk-gate-ema-decay 0.5"
    };
    const size_t commandCount = sizeof(setCommands) / sizeof(setCommands[0]);

    for (size_t i = 0; i < commandCount; ++i)
    {
        agent->ExecuteCommandLine(setCommands[i]);
        no_agent_assertTrue(agent->GetLastCommandLineResult());
    }

    // Read each parameter back and look for the expected text in the reply.
    const std::string gateReply = agent->ExecuteCommandLine("rl --get chunk-gate");
    no_agent_assertTrue(gateReply.find("off") != std::string::npos);

    const std::string thresholdReply = agent->ExecuteCommandLine("rl --get chunk-gate-threshold");
    no_agent_assertTrue(thresholdReply.find("0.001") != std::string::npos);

    const std::string decayReply = agent->ExecuteCommandLine("rl --get chunk-gate-ema-decay");
    no_agent_assertTrue(decayReply.find("0.5") != std::string::npos);
}
12 changes: 12 additions & 0 deletions UnitTests/SoarUnitTests/FullTests.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ class FullTests_Parent
void testCommandToFile();
void testConvertIdentifier();
void testOutputLinkRemovalOrdering();
void testRLConvergenceGate();
void testRLConvergenceGateOff();
void testRLConvergenceGateParams();

void before() { setUp(); }
void after(bool caught) { tearDown(caught); }
Expand Down Expand Up @@ -236,6 +239,15 @@ class FullTests : public FullTests_Parent, public TestCategory
TEST(testOutputLinkRemovalOrdering, -1);
void testOutputLinkRemovalOrdering() { this->FullTests_Parent::testOutputLinkRemovalOrdering(); }

TEST(testRLConvergenceGate, -1);
void testRLConvergenceGate() { this->FullTests_Parent::testRLConvergenceGate(); }

TEST(testRLConvergenceGateOff, -1);
void testRLConvergenceGateOff() { this->FullTests_Parent::testRLConvergenceGateOff(); }

TEST(testRLConvergenceGateParams, -1);
void testRLConvergenceGateParams() { this->FullTests_Parent::testRLConvergenceGateParams(); }

void before() { setUp(); }
void after(bool caught) { tearDown(caught); }

Expand Down