From 38cf336d5a8af7d7c794ac27c2a6a7c9b8328404 Mon Sep 17 00:00:00 2001 From: Marcel Menk Date: Sun, 31 May 2026 17:07:36 +0200 Subject: [PATCH] fix: job history retention --- docs/agenstra/deployment/background-jobs.md | 2 ++ .../shared/backend/util-queue/src/index.ts | 1 + .../src/lib/enqueue-unit-job.spec.ts | 7 ++++++- .../util-queue/src/lib/enqueue-unit-job.ts | 5 +++-- .../util-queue/src/lib/job-retention.spec.ts | 18 ++++++++++++++++++ .../util-queue/src/lib/job-retention.ts | 15 +++++++++++++++ .../backend/util-queue/src/lib/queue.module.ts | 5 +++-- ...register-repeatable-coordinator-job.spec.ts | 12 +++++++++++- .../lib/register-repeatable-coordinator-job.ts | 6 ++++-- 9 files changed, 63 insertions(+), 8 deletions(-) create mode 100644 libs/domains/shared/backend/util-queue/src/lib/job-retention.spec.ts create mode 100644 libs/domains/shared/backend/util-queue/src/lib/job-retention.ts diff --git a/docs/agenstra/deployment/background-jobs.md b/docs/agenstra/deployment/background-jobs.md index 9c6a94e0..98fca26e 100644 --- a/docs/agenstra/deployment/background-jobs.md +++ b/docs/agenstra/deployment/background-jobs.md @@ -63,6 +63,8 @@ When enabled on the API container (`QUEUE_BULL_BOARD_ENABLED=true`, default in c Bull Board uses **HTTP Basic authentication** (`QUEUE_BULL_BOARD_USERNAME` / `QUEUE_BULL_BOARD_PASSWORD`). Local compose defaults to `admin` / `bullmq`; override in production. Startup fails in production if the board is enabled without a password. +Completed and failed jobs are **not auto-removed** (`removeOnComplete: false`, `removeOnFail: false`) so run history stays in Bull Board. Because nothing is trimmed automatically, job history keeps growing until it is cleaned up manually (via Bull Board or ops); any such cleanup should keep at least the **last three runs** and all jobs from the past **48 hours**.
+ Bull Board routes bypass the API **origin allowlist**, **HybridAuthGuard**, and **Keycloak guards** (when `AUTHENTICATION_METHOD=keycloak`) so dashboard actions (retry, delete, clean) are not blocked with `403 Forbidden` when the UI sends browser `Origin` headers or `Authorization: Basic` instead of the API key or OIDC token. Worker and scheduler containers set `QUEUE_BULL_BOARD_ENABLED=false` so they do not start an HTTP server solely for Bull Board. diff --git a/libs/domains/shared/backend/util-queue/src/index.ts b/libs/domains/shared/backend/util-queue/src/index.ts index 0817e54f..d03a3415 100644 --- a/libs/domains/shared/backend/util-queue/src/index.ts +++ b/libs/domains/shared/backend/util-queue/src/index.ts @@ -4,6 +4,7 @@ export * from './lib/bull-board-global-prefix'; export * from './lib/enqueue-unit-job'; export * from './lib/is-duplicate-job-enqueue-error'; export * from './lib/job-id.util'; +export * from './lib/job-retention'; export * from './lib/register-repeatable-coordinator-job'; export * from './lib/queue-connection.config'; export * from './lib/queue-role'; diff --git a/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.spec.ts b/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.spec.ts index b19ae6a2..f9186ee6 100644 --- a/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.spec.ts +++ b/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.spec.ts @@ -1,5 +1,6 @@ import type { Queue } from 'bullmq'; +import { defaultRemoveOnComplete, defaultRemoveOnFail } from './job-retention'; import { enqueueUnitJob } from './enqueue-unit-job'; describe('enqueueUnitJob', () => { @@ -18,7 +19,11 @@ describe('enqueueUnitJob', () => { expect(add).toHaveBeenCalledWith( 'billing.subscription.unit', { subscriptionId: 'abc' }, - expect.objectContaining({ jobId: 'billing.subscription.abc' }), + expect.objectContaining({ + jobId: 'billing.subscription.abc', + removeOnComplete: defaultRemoveOnComplete, + 
removeOnFail: defaultRemoveOnFail, + }), ); }); diff --git a/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.ts b/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.ts index 7a5a6083..40afecb2 100644 --- a/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.ts +++ b/libs/domains/shared/backend/util-queue/src/lib/enqueue-unit-job.ts @@ -1,6 +1,7 @@ import type { JobsOptions, Queue } from 'bullmq'; import { isDuplicateJobEnqueueError } from './is-duplicate-job-enqueue-error'; +import { defaultRemoveOnComplete, defaultRemoveOnFail } from './job-retention'; import { buildJobId } from './job-id.util'; export interface EnqueueUnitJobOptions { @@ -19,8 +20,8 @@ export async function enqueueUnitJob(options: EnqueueUnitJobOptions): Prom try { await options.queue.add(options.jobName, options.payload, { jobId, - removeOnComplete: { age: 3600, count: 1000 }, - removeOnFail: { age: 86400, count: 5000 }, + removeOnComplete: defaultRemoveOnComplete, + removeOnFail: defaultRemoveOnFail, attempts: 3, backoff: { type: 'exponential', delay: 5000 }, ...options.opts, diff --git a/libs/domains/shared/backend/util-queue/src/lib/job-retention.spec.ts b/libs/domains/shared/backend/util-queue/src/lib/job-retention.spec.ts new file mode 100644 index 00000000..22dab38d --- /dev/null +++ b/libs/domains/shared/backend/util-queue/src/lib/job-retention.spec.ts @@ -0,0 +1,18 @@ +import { + BULL_BOARD_JOB_RETENTION_AGE_SECONDS, + BULL_BOARD_JOB_RETENTION_COUNT, + defaultRemoveOnComplete, + defaultRemoveOnFail, +} from './job-retention'; + +describe('job retention defaults', () => { + it('disables automatic removal of completed and failed jobs', () => { + expect(defaultRemoveOnComplete).toBe(false); + expect(defaultRemoveOnFail).toBe(false); + }); + + it('documents the minimum Bull Board visibility policy', () => { + expect(BULL_BOARD_JOB_RETENTION_COUNT).toBe(3); + expect(BULL_BOARD_JOB_RETENTION_AGE_SECONDS).toBe(48 * 60 * 60); + }); +}); diff --git 
a/libs/domains/shared/backend/util-queue/src/lib/job-retention.ts b/libs/domains/shared/backend/util-queue/src/lib/job-retention.ts new file mode 100644 index 00000000..a2936c9f --- /dev/null +++ b/libs/domains/shared/backend/util-queue/src/lib/job-retention.ts @@ -0,0 +1,15 @@ +/** Documented minimum Bull Board visibility before any manual cleanup. */ +export const BULL_BOARD_JOB_RETENTION_COUNT = 3; + +/** Documented minimum Bull Board visibility in seconds (48 hours). */ +export const BULL_BOARD_JOB_RETENTION_AGE_SECONDS = 48 * 60 * 60; + +/** + * Do not auto-remove completed jobs so Bull Board keeps run history. + * Jobs should remain visible for at least the last three runs and 48 hours; + * automatic trimming is disabled and cleanup is manual via Bull Board or ops. + */ +export const defaultRemoveOnComplete = false; + +/** Do not auto-remove failed jobs so Bull Board keeps error history. */ +export const defaultRemoveOnFail = false; diff --git a/libs/domains/shared/backend/util-queue/src/lib/queue.module.ts b/libs/domains/shared/backend/util-queue/src/lib/queue.module.ts index edfa6c29..0849c449 100644 --- a/libs/domains/shared/backend/util-queue/src/lib/queue.module.ts +++ b/libs/domains/shared/backend/util-queue/src/lib/queue.module.ts @@ -6,6 +6,7 @@ import { DynamicModule, Module } from '@nestjs/common'; import type { QueueOptions } from 'bullmq'; import { createBullBoardAuthMiddlewareFromEnv } from './bull-board-auth'; +import { defaultRemoveOnComplete, defaultRemoveOnFail } from './job-retention'; import { shouldEnableBullBoard, shouldRegisterRepeatableJobs, shouldRunQueueWorkers } from './queue-role'; import { readBullBoardPath, @@ -32,8 +33,8 @@ export class SharedQueueModule { const concurrency = options.workerConcurrency ?? 
readQueueWorkerConcurrency(); const defaultJobOptions: QueueOptions['defaultJobOptions'] = { - removeOnComplete: { age: 3600, count: 1000 }, - removeOnFail: { age: 86400, count: 5000 }, + removeOnComplete: defaultRemoveOnComplete, + removeOnFail: defaultRemoveOnFail, attempts: 3, backoff: { type: 'exponential', delay: 5000 }, }; diff --git a/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.spec.ts b/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.spec.ts index b9b38232..28aaed58 100644 --- a/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.spec.ts +++ b/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.spec.ts @@ -1,5 +1,6 @@ import type { Queue } from 'bullmq'; +import { defaultRemoveOnComplete, defaultRemoveOnFail } from './job-retention'; import { registerRepeatableCoordinatorJob } from './register-repeatable-coordinator-job'; describe('registerRepeatableCoordinatorJob', () => { @@ -30,6 +31,8 @@ describe('registerRepeatableCoordinatorJob', () => { expect.objectContaining({ jobId: 'coordinator.filter-rules-sync', repeat: { every: 30_000 }, + removeOnComplete: defaultRemoveOnComplete, + removeOnFail: defaultRemoveOnFail, }), ); }); @@ -50,6 +53,13 @@ describe('registerRepeatableCoordinatorJob', () => { }); expect(queue.removeRepeatableByKey).not.toHaveBeenCalled(); - expect(add).toHaveBeenCalledTimes(1); + expect(add).toHaveBeenCalledWith( + 'billing.coordinator', + {}, + expect.objectContaining({ + removeOnComplete: defaultRemoveOnComplete, + removeOnFail: defaultRemoveOnFail, + }), + ); }); }); diff --git a/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.ts b/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.ts index 07c68135..186b7a75 100644 --- a/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.ts +++ 
b/libs/domains/shared/backend/util-queue/src/lib/register-repeatable-coordinator-job.ts @@ -1,5 +1,7 @@ import type { JobsOptions, Queue } from 'bullmq'; +import { defaultRemoveOnComplete, defaultRemoveOnFail } from './job-retention'; + export interface RegisterRepeatableCoordinatorJobOptions { queue: Queue; name: string; @@ -31,8 +33,8 @@ export async function registerRepeatableCoordinatorJob( { jobId: options.coordinatorJobId, repeat: { every: options.everyMs }, - removeOnComplete: options.removeOnComplete ?? true, - removeOnFail: options.removeOnFail ?? 100, + removeOnComplete: options.removeOnComplete ?? defaultRemoveOnComplete, + removeOnFail: options.removeOnFail ?? defaultRemoveOnFail, }, ); }