diff --git a/ROADMAP.md b/ROADMAP.md
index 78cb668..5fea36e 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -82,6 +82,7 @@
   - [x] Boyer–Moore fast substring search
   - [x] Suffix array construction utilities
   - [x] Longest common subsequence (LCS) enhancements and diff helpers
+  - [x] Aho–Corasick multi-pattern automaton
 **Data pipelines & utilities**
   - [x] Flatten/unflatten helpers for nested structures
   - [x] Pagination utilities for client-side paging
diff --git a/docs/index.d.ts b/docs/index.d.ts
index 1b8f2b0..7934744 100644
--- a/docs/index.d.ts
+++ b/docs/index.d.ts
@@ -79,6 +79,7 @@ export const examples: {
     readonly buildSuffixArray: 'examples/search.ts';
     readonly longestCommonSubsequence: 'examples/search.ts';
     readonly diffStrings: 'examples/search.ts';
+    readonly createAhoCorasick: 'examples/search.ts';
   };
   readonly data: {
     readonly diff: 'examples/jsonDiff.ts';
@@ -2664,6 +2665,21 @@ export interface DiffOp {
 export function longestCommonSubsequence(options: LCSOptions): LCSResult;
 export function diffStrings(options: LCSOptions): DiffOp[];
 
+/**
+ * Aho–Corasick multi-pattern automaton.
+ * Use for: scanning texts for many patterns efficiently with overlaps.
+ * Performance: O(n + m + z) where n=text length, m=total pattern length, z=matches.
+ * Import: search/ahoCorasick.ts
+ */
+export interface AhoBuildOptions {
+  patterns: ReadonlyArray<string>;
+  caseSensitive?: boolean;
+}
+export interface AhoAutomaton {
+  search(text: string): Record<string, number[]>;
+}
+export function createAhoCorasick(options: AhoBuildOptions): AhoAutomaton;
+
 // ============================================================================
 // 📊 DATA TOOLS
 // ============================================================================
diff --git a/docs/list.md b/docs/list.md
index e15c396..7d72cd0 100644
--- a/docs/list.md
+++ b/docs/list.md
@@ -129,6 +129,7 @@ Maximum Flow (Ford-Fulkerson) - Network flow
 
 Rabin-Karp - Multiple pattern matching
 Boyer-Moore - Fast single pattern search
+Aho–Corasick - Multi-pattern automaton
 Longest Common Subsequence - Diff algorithms
 Suffix Array - Advanced pattern matching
 
diff --git a/examples/search.ts b/examples/search.ts
index 34e92ec..fc38fc0 100644
--- a/examples/search.ts
+++ b/examples/search.ts
@@ -10,6 +10,7 @@ import {
   buildSuffixArray,
   longestCommonSubsequence,
   diffStrings,
+  createAhoCorasick,
 } from '../src/index.js';
 
 const items = ['alpha', 'beta', 'delta', 'epsilon', 'gamma'];
@@ -43,3 +44,6 @@ console.log('LCS of dynamic/programming:', lcs);
 
 const diff = diffStrings({ a: 'kitten', b: 'sitting' });
 console.log('Diff between kitten and sitting:', diff);
+
+const automaton = createAhoCorasick({ patterns: ['abra', 'cad'] });
+console.log('Aho–Corasick matches in abracadabra:', automaton.search('abracadabra'));
diff --git a/package.json b/package.json
index 91d8a1d..7bf057b 100644
--- a/package.json
+++ b/package.json
@@ -62,7 +62,7 @@
     {
       "name": "bundle",
       "path": "dist/index.js",
-      "limit": "40 KB"
+      "limit": "41 KB"
     }
   ]
 }
diff --git a/src/index.ts b/src/index.ts
index f4fe9d2..99dad99 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -75,6 +75,7 @@ export const examples = {
     buildSuffixArray: 'examples/search.ts',
     longestCommonSubsequence: 'examples/search.ts',
     diffStrings: 'examples/search.ts',
+    createAhoCorasick: 'examples/search.ts',
   },
   data: {
     diff: 'examples/jsonDiff.ts',
@@ -924,6 +925,12 @@ export type { LCSOptions, LCSResult, DiffOp } from './search/lcs.js';
  */
 export { levenshteinDistance } from './search/levenshtein.js';
 
+/**
+ * Aho–Corasick multi-pattern automaton.
+ */
+export { createAhoCorasick } from './search/ahoCorasick.js';
+export type { AhoBuildOptions, AhoAutomaton } from './search/ahoCorasick.js';
+
 // ============================================================================
 // 📊 DATA PROCESSING
 // ============================================================================
diff --git a/src/search/ahoCorasick.ts b/src/search/ahoCorasick.ts
new file mode 100644
index 0000000..0c4794f
--- /dev/null
+++ b/src/search/ahoCorasick.ts
@@ -0,0 +1,145 @@
+/**
+ * Options accepted by `createAhoCorasick`.
+ */
+export interface AhoBuildOptions {
+  /** Patterns to search for; at least one is required. */
+  patterns: ReadonlyArray<string>;
+  /** When `false`, patterns and text are lower-cased before matching. Defaults to `true`. */
+  caseSensitive?: boolean;
+}
+
+/**
+ * Compiled matcher returned by `createAhoCorasick`.
+ */
+export interface AhoAutomaton {
+  /**
+   * Scans `text` once and returns, keyed by each pattern exactly as supplied,
+   * the ascending start offsets (UTF-16 code units) of every occurrence.
+   */
+  search(text: string): Record<string, number[]>;
+}
+
+interface Node {
+  next: Map<string, number>; // outgoing edges keyed by one UTF-16 code unit
+  fail: number; // state reached by the longest proper suffix present in the trie (0 = root)
+  out: number[]; // indices into originalPatterns
+}
+
+/**
+ * Builds an Aho–Corasick automaton over `options.patterns`.
+ *
+ * Matching works on UTF-16 code units for both patterns and text, so reported
+ * offsets can be passed straight to `String.prototype.slice`. With
+ * `caseSensitive: false` both sides are lower-cased first; offsets then refer to
+ * the lower-cased text, which can differ from the input for the few characters
+ * whose lower-case form has a different length.
+ *
+ * @param options - Patterns and matching mode.
+ * @returns An automaton whose `search` reports overlapping matches for every pattern.
+ * @throws Error if `patterns` is not a non-empty array.
+ * @throws TypeError if any pattern is not a string.
+ */
+export function createAhoCorasick(options: AhoBuildOptions): AhoAutomaton {
+  validateOptions(options);
+  const caseSensitive = options.caseSensitive ?? true;
+  const originalPatterns = options.patterns.slice();
+  const normalizedPatterns = caseSensitive
+    ? originalPatterns
+    : originalPatterns.map((p) => p.toLowerCase());
+
+  const nodes: Node[] = [{ next: new Map(), fail: 0, out: [] }];
+
+  // Build trie. Edges are keyed by code unit (not code point) so they line up
+  // with the t[i] lookups in search(); an astral character becomes two edges.
+  const seen = new Set<string>();
+  normalizedPatterns.forEach((pattern, idx) => {
+    // A repeated pattern shares its result key; registering it twice would
+    // report every position twice.
+    if (seen.has(originalPatterns[idx])) return;
+    seen.add(originalPatterns[idx]);
+    if (pattern.length === 0) return;
+    let state = 0;
+    for (let i = 0; i < pattern.length; i += 1) {
+      const ch = pattern[i];
+      let to = nodes[state].next.get(ch);
+      if (to === undefined) {
+        to = nodes.length;
+        nodes[state].next.set(ch, to);
+        nodes.push({ next: new Map(), fail: 0, out: [] });
+      }
+      state = to;
+    }
+    nodes[state].out.push(idx);
+  });
+
+  // Build fail links via BFS. The queue is walked with a head index because
+  // Array.prototype.shift() is linear and would make construction quadratic.
+  const queue: number[] = [...nodes[0].next.values()];
+  for (let head = 0; head < queue.length; head += 1) {
+    const v = queue[head];
+    for (const [ch, to] of nodes[v].next) {
+      queue.push(to);
+      let f = nodes[v].fail;
+      while (f !== 0 && !nodes[f].next.has(ch)) {
+        f = nodes[f].fail;
+      }
+      f = nodes[f].next.get(ch) ?? f;
+      nodes[to].fail = f;
+      // The fail target is shallower, so its out list is already complete.
+      for (const patIdx of nodes[f].out) {
+        nodes[to].out.push(patIdx);
+      }
+    }
+  }
+
+  function search(text: string): Record<string, number[]> {
+    const t = caseSensitive ? text : text.toLowerCase();
+    const results: Record<string, number[]> = {};
+    // Give every pattern an own entry up front so names such as 'constructor'
+    // or '__proto__' never resolve to something inherited from Object.prototype.
+    // An empty pattern matches at every position, including the end of the text.
+    originalPatterns.forEach((pat, i) => {
+      if (Object.prototype.hasOwnProperty.call(results, pat)) return;
+      const positions =
+        normalizedPatterns[i].length === 0
+          ? Array.from({ length: text.length + 1 }, (_, p) => p)
+          : [];
+      Object.defineProperty(results, pat, {
+        value: positions,
+        writable: true,
+        enumerable: true,
+        configurable: true,
+      });
+    });
+
+    let state = 0;
+    for (let i = 0; i < t.length; i += 1) {
+      const ch = t[i];
+      while (state !== 0 && !nodes[state].next.has(ch)) {
+        state = nodes[state].fail;
+      }
+      state = nodes[state].next.get(ch) ?? state;
+      for (const patIdx of nodes[state].out) {
+        results[originalPatterns[patIdx]].push(i - normalizedPatterns[patIdx].length + 1);
+      }
+    }
+    return results;
+  }
+
+  return { search };
+}
+
+/**
+ * Rejects option objects that cannot produce a usable automaton.
+ *
+ * @throws Error if `patterns` is not a non-empty array.
+ * @throws TypeError if any pattern is not a string.
+ */
+function validateOptions(options: AhoBuildOptions): void {
+  if (!Array.isArray(options.patterns) || options.patterns.length === 0) {
+    throw new Error('patterns must contain at least one pattern.');
+  }
+  if (options.patterns.some((p) => typeof p !== 'string')) {
+    throw new TypeError('patterns must only contain strings.');
+  }
+}
diff --git a/tests/ahoCorasick.test.ts b/tests/ahoCorasick.test.ts
new file mode 100644
index 0000000..1254616
--- /dev/null
+++ b/tests/ahoCorasick.test.ts
@@ -0,0 +1,30 @@
+import { describe, expect, it } from 'vitest';
+
+import { createAhoCorasick } from '../src/index.js';
+
+describe('createAhoCorasick', () => {
+  it('finds overlapping multi-pattern matches', () => {
+    const ac = createAhoCorasick({ patterns: ['ab', 'bc', 'abc'] });
+    const res = ac.search('ababc');
+    expect(res['ab']).toEqual([0, 2]);
+    expect(res['bc']).toEqual([3]);
+    expect(res['abc']).toEqual([2]);
+  });
+
+  it('supports case-insensitive matching and empty patterns', () => {
+    const ac = createAhoCorasick({ patterns: ['He', 'eL', ''], caseSensitive: false });
+    const res = ac.search('HeLlo');
+    expect(res['He']).toEqual([0]);
+    expect(res['eL']).toEqual([1]);
+    expect(res['']).toEqual([0, 1, 2, 3, 4, 5]);
+  });
+
+  it('matches astral characters, inherited-name patterns, and duplicates', () => {
+    const ac = createAhoCorasick({ patterns: ['😀', 'toString', 'ab', 'ab'] });
+    const res = ac.search('ab😀toString');
+    expect(res['😀']).toEqual([2]);
+    expect(res['toString']).toEqual([4]);
+    expect(res['ab']).toEqual([0]);
+  });
+});
+
diff --git a/tests/index.test.ts b/tests/index.test.ts
index ec636a3..dcf120e 100644
--- a/tests/index.test.ts
+++ b/tests/index.test.ts
@@ -137,6 +137,7 @@ describe('package entry point', () => {
       | 'buildSuffixArray'
       | 'longestCommonSubsequence'
       | 'diffStrings'
+      | 'createAhoCorasick'
     >();
 
     expectTypeOf>().toEqualTypeOf<