Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
-- Fix gradual slowdown in update:runner_instances_state.
--
-- Root causes:
--
-- 1. GC seq-scan: The garbage-collection query filters runner_instances by
-- created_at with no index. As completed rows accumulate the scan grows
-- linearly, slowing GC and allowing the table to bloat, which in turn
-- makes every subsequent write (and TOAST churn) more expensive.
--
-- 2. TOAST dead-tuple accumulation: runner_instances.state is a large BYTEA
-- updated on every save_graphs_once call. PostgreSQL's default
-- autovacuum_vacuum_scale_factor (0.2, i.e. 20 % of the table's rows) lets
-- dead TOAST chunks pile up before clean-up, increasing write amplification
-- over time.
--
-- 3. Missing lock_uuid index: update:runner_instances_state JOINs
-- queued_instances filtering by lock_uuid with no supporting index.

-- (1) GC-candidate index. A partial B-tree index on created_at that only
-- contains rows where result or error is set (i.e. completed/failed
-- instances), so it stays small while unfinished rows dominate the table.
-- The predicate is intended to mirror the WHERE clause of
-- select:runner_instances_gc_candidates, letting the planner pick an index
-- scan over a seq-scan.
-- NOTE(review): that query is not part of this migration; confirm the two
-- predicates still agree whenever either one changes, otherwise the
-- planner cannot use this index.
-- IF NOT EXISTS keeps the migration safe to re-run.
CREATE INDEX IF NOT EXISTS idx_runner_instances_gc_candidates
    ON runner_instances USING btree (created_at)
    WHERE (result IS NOT NULL) OR (error IS NOT NULL);

-- (2) Per-table autovacuum tuning for runner_instances. Trigger a vacuum once
-- roughly 1 % of the rows are dead instead of waiting for the default of
-- 20 %, and pin the cost-based vacuum delay to 2 ms, so dead TOAST chunks
-- left behind by frequent state updates are reclaimed promptly.
-- No toast.* overrides are set, so per the PostgreSQL storage-parameter
-- rules the table's TOAST relation uses these same values.
-- NOTE(review): 2 ms is already the default cost delay on PostgreSQL 12 and
-- later; this only lowers the delay on older servers or where the global
-- setting has been raised.
ALTER TABLE runner_instances
    SET (
        autovacuum_vacuum_cost_delay = 2,
        autovacuum_vacuum_scale_factor = 0.01
    );

-- (3) Lock-ownership index. A partial B-tree index on
-- queued_instances(lock_uuid) that only contains rows currently holding a
-- lock_uuid; unlocked rows (lock_uuid IS NULL) are left out entirely.
-- It is meant to serve the JOIN in update:runner_instances_state, which
-- checks lock ownership by lock_uuid, without scanning unrelated rows.
-- IF NOT EXISTS keeps the migration safe to re-run.
CREATE INDEX IF NOT EXISTS idx_queued_instances_lock_uuid
    ON queued_instances USING btree (lock_uuid)
    WHERE (lock_uuid IS NOT NULL);
12 changes: 9 additions & 3 deletions crates/lib/backend-postgres/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,8 @@ impl PostgresBackend {
payloads.len(),
);
let now = Utc::now();
let mut tx = self.pool.begin().await?;

let mut schedule_builder: QueryBuilder<Postgres> = QueryBuilder::new(
"UPDATE queued_instances AS qi SET scheduled_at = v.scheduled_at, lock_expires_at = CASE WHEN qi.lock_expires_at IS NULL OR qi.lock_expires_at < v.lock_expires_at THEN v.lock_expires_at ELSE qi.lock_expires_at END FROM (",
);
Expand All @@ -567,7 +569,7 @@ impl PostgresBackend {
schedule_builder.push(")");
schedule_builder
.build()
.execute(&self.pool)
.execute(&mut *tx)
.timed(crate::query_timing_histogram!(
"update:queued_instances_scheduled_at"
))
Expand Down Expand Up @@ -599,7 +601,7 @@ impl PostgresBackend {
runner_builder.push(")");
runner_builder
.build()
.execute(&self.pool)
.execute(&mut *tx)
.timed(crate::query_timing_histogram!(
"update:runner_instances_state"
))
Expand All @@ -610,10 +612,14 @@ impl PostgresBackend {
"SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = ANY($1)",
)
.bind(&ids)
.fetch_all(&self.pool)
.fetch_all(&mut *tx)
.timed(crate::query_timing_histogram!("select:queued_instances_lock_status_after_save_graphs"))
.await?;

tx.commit()
.timed(crate::query_timing_histogram!("commit:save_graphs_once"))
.await?;

let mut lock_map: HashMap<InstanceId, InstanceLockStatus> = HashMap::new();
for row in lock_rows {
let instance_id: InstanceId = row.get(0);
Expand Down
Loading