diff --git a/src/noise-filter.ts b/src/noise-filter.ts index 50b5d8e..c7469ac 100644 --- a/src/noise-filter.ts +++ b/src/noise-filter.ts @@ -1,7 +1,7 @@ /** * Noise Filter * Filters out low-quality memories (meta-questions, agent denials, session boilerplate) - * Inspired by openclaw-plugin-continuity's noise filtering approach. + * and strips untrusted metadata wrappers from text before storage/retrieval. */ // Agent-side denial patterns @@ -33,6 +33,23 @@ const BOILERPLATE_PATTERNS = [ /^HEARTBEAT/i, ]; +// Known noisy wrappers injected by chat transport / system envelopes +const METADATA_BLOCK_PATTERNS = [ + /Conversation info \(untrusted metadata\):\s*```json[\s\S]*?```/gi, + /Sender \(untrusted metadata\):\s*```json[\s\S]*?```/gi, + /\[Queued messages while agent was busy\]/gi, + /^\s*---\s*Queued\s*#\d+\s*$/gmi, + /^\s*Queued\s*#\d+\s*$/gmi, + /^\s*---\s*$/gmi, +]; + +const METADATA_MARKERS = [ + /Conversation info \(untrusted metadata\)/i, + /Sender \(untrusted metadata\)/i, + /\[Queued messages while agent was busy\]/i, + /Queued\s*#\d+/i, +]; + export interface NoiseFilterOptions { /** Filter agent denial responses (default: true) */ filterDenials?: boolean; @@ -48,19 +65,48 @@ const DEFAULT_OPTIONS: Required = { filterBoilerplate: true, }; +/** + * Remove transport/system wrappers while preserving human-readable content. + */ +export function sanitizeMemoryText(text: string): string { + let cleaned = (text || "").trim(); + if (!cleaned) return ""; + + for (const pattern of METADATA_BLOCK_PATTERNS) { + cleaned = cleaned.replace(pattern, " "); + } + + cleaned = cleaned + .replace(/\n{3,}/g, "\n\n") + .replace(/[ \t]{2,}/g, " ") + .trim(); + + return cleaned; +} + /** * Check if a memory text is noise that should be filtered out. * Returns true if the text is noise. */ export function isNoise(text: string, options: NoiseFilterOptions = {}): boolean { const opts = { ...DEFAULT_OPTIONS, ...options }; - const trimmed = text.trim(); + const trimmed = (text || "").trim(); if (trimmed.length < 5) return true; - if (opts.filterDenials && DENIAL_PATTERNS.some(p => p.test(trimmed))) return true; - if (opts.filterMetaQuestions && META_QUESTION_PATTERNS.some(p => p.test(trimmed))) return true; - if (opts.filterBoilerplate && BOILERPLATE_PATTERNS.some(p => p.test(trimmed))) return true; + const sanitized = sanitizeMemoryText(trimmed); + if (sanitized.length < 5) return true; + + // If text is mostly wrappers/metadata after sanitization, treat as noise. + const hasMetadataMarker = METADATA_MARKERS.some(p => p.test(trimmed)); + if (hasMetadataMarker) { + const keepRatio = sanitized.length / Math.max(1, trimmed.length); + if (keepRatio < 0.35) return true; + } + + if (opts.filterDenials && DENIAL_PATTERNS.some(p => p.test(sanitized))) return true; + if (opts.filterMetaQuestions && META_QUESTION_PATTERNS.some(p => p.test(sanitized))) return true; + if (opts.filterBoilerplate && BOILERPLATE_PATTERNS.some(p => p.test(sanitized))) return true; return false; } diff --git a/src/tools.ts b/src/tools.ts index 620845a..b9fb60c 100644 --- a/src/tools.ts +++ b/src/tools.ts @@ -11,7 +11,7 @@ import { homedir } from "node:os"; import { join } from "node:path"; import type { MemoryRetriever, RetrievalResult } from "./retriever.js"; import type { MemoryStore } from "./store.js"; -import { isNoise } from "./noise-filter.js"; +import { isNoise, sanitizeMemoryText } from "./noise-filter.js"; import type { MemoryScopeManager } from "./scopes.js"; import type { Embedder } from "./embedder.js"; import { appendSelfImprovementEntry, ensureSelfImprovementLearningFiles } from "./self-improvement-files.js"; @@ -68,7 +68,7 @@ function clamp01(value: number, fallback = 0.7): number { function sanitizeMemoryForSerialization(results: RetrievalResult[]) { return results.map((r) => ({ id: r.entry.id, - text: r.entry.text, + text: sanitizeMemoryText(r.entry.text), category: getDisplayCategoryTag(r.entry), rawCategory: r.entry.category, scope: r.entry.scope, @@ -414,7 +414,8 @@ export function registerMemoryRecallTool( if (r.sources.reranked) sources.push("reranked"); const categoryTag = getDisplayCategoryTag(r.entry); - return `${i + 1}. [${r.entry.id}] [${categoryTag}] ${r.entry.text} (${(r.score * 100).toFixed(0)}%${sources.length > 0 ? `, ${sources.join("+")}` : ""})`; + const cleanText = sanitizeMemoryText(r.entry.text) || r.entry.text; + return `${i + 1}. [${r.entry.id}] [${categoryTag}] ${cleanText} (${(r.score * 100).toFixed(0)}%${sources.length > 0 ? `, ${sources.join("+")}` : ""})`; }) .join("\n"); @@ -508,21 +509,22 @@ export function registerMemoryStoreTool( }; } - // Reject noise before wasting an embedding API call - if (isNoise(text)) { + // Strip transport/system wrappers and reject noise before embedding + const cleanedText = sanitizeMemoryText(text); + if (isNoise(cleanedText)) { return { content: [ { type: "text", - text: `Skipped: text detected as noise (greeting, boilerplate, or meta-question)`, + text: `Skipped: text detected as noise (greeting, boilerplate, metadata, or meta-question)`, }, ], - details: { action: "noise_filtered", text: text.slice(0, 60) }, + details: { action: "noise_filtered", text: cleanedText.slice(0, 60) }, }; } const safeImportance = clamp01(importance, 0.7); - const vector = await context.embedder.embedPassage(text); + const vector = await context.embedder.embedPassage(cleanedText); // Check for duplicates using raw vector similarity (bypasses importance/recency weighting) // Fail-open by design: dedup must never block a legitimate memory write. @@ -859,25 +861,27 @@ export function registerMemoryUpdateTool( } } - // If text changed, re-embed; reject noise + // If text changed, sanitize first, then re-embed let newVector: number[] | undefined; + let cleanedUpdateText: string | undefined; if (text) { - if (isNoise(text)) { + cleanedUpdateText = sanitizeMemoryText(text); + if (isNoise(cleanedUpdateText)) { return { content: [ { type: "text", - text: "Skipped: updated text detected as noise", + text: "Skipped: updated text detected as noise or metadata wrapper", }, ], details: { action: "noise_filtered" }, }; } - newVector = await context.embedder.embedPassage(text); + newVector = await context.embedder.embedPassage(cleanedUpdateText); } const updates: Record = {}; - if (text) updates.text = text; + if (cleanedUpdateText) updates.text = cleanedUpdateText; if (newVector) updates.vector = newVector; if (importance !== undefined) updates.importance = clamp01(importance, 0.7);