diff --git a/CHANGELOG.md b/CHANGELOG.md index a0c60e5..7cfd005 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog +## [1.3.1] - 2026-01-05 + +### Fixed +- **Auto-Heal Semantic Search**: Detects LanceDB schema corruption (missing `vector` column), triggers re-indexing, and retries search instead of silently falling back to keyword-only results. + ## [1.3.0] - 2026-01-01 ### Added diff --git a/internal-docs b/internal-docs index 559dfa0..3ba26ec 160000 --- a/internal-docs +++ b/internal-docs @@ -1 +1 @@ -Subproject commit 559dfa0bd97fd37f28348c2fb4157f7bcb3428c2 +Subproject commit 3ba26ece92443377fd3eef596bf181cc2e835082 diff --git a/package.json b/package.json index c317383..f0af5f8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "codebase-context", - "version": "1.3.0", + "version": "1.3.1", "description": "MCP server that helps AI agents understand your codebase - patterns, libraries, architecture, monorepo support", "type": "module", "main": "./dist/lib.js", @@ -120,4 +120,4 @@ "sharp" ] } -} \ No newline at end of file +} diff --git a/src/core/indexer.ts b/src/core/indexer.ts index 35cb8cf..f7112af 100644 --- a/src/core/indexer.ts +++ b/src/core/indexer.ts @@ -86,7 +86,7 @@ export class CodebaseIndexer { }, embedding: { provider: 'transformers', - model: 'Xenova/bge-base-en-v1.5', + model: 'Xenova/bge-small-en-v1.5', batchSize: 100 }, skipEmbedding: false, diff --git a/src/core/search.ts b/src/core/search.ts index 0edb119..d42e304 100644 --- a/src/core/search.ts +++ b/src/core/search.ts @@ -9,6 +9,7 @@ import { CodeChunk, SearchResult, SearchFilters } from '../types/index.js'; import { EmbeddingProvider, getEmbeddingProvider } from '../embeddings/index.js'; import { VectorStorageProvider, getStorageProvider } from '../storage/index.js'; import { analyzerRegistry } from './analyzer-registry.js'; +import { IndexCorruptedError } from '../errors/index.js'; export interface SearchOptions { useSemanticSearch?: boolean; @@ -62,6 +63,9 @@ export class CodebaseSearcher { this.initialized = true; } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; // Propagate to handler for auto-heal + } console.warn('Partial initialization (keyword search only):', error); this.initialized = true; } @@ -217,6 +221,9 @@ export class CodebaseSearcher { } }); } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; // Propagate to handler for auto-heal + } console.warn('Semantic search failed:', error); } } diff --git a/src/errors/index.ts b/src/errors/index.ts new file mode 100644 index 0000000..96a9a30 --- /dev/null +++ b/src/errors/index.ts @@ -0,0 +1,10 @@ +/** + * Thrown when the LanceDB index is corrupted or has a schema mismatch. + * This error signals that re-indexing is required for semantic search to work. + */ +export class IndexCorruptedError extends Error { + constructor(message: string) { + super(message); + this.name = 'IndexCorruptedError'; + } +} diff --git a/src/index.ts b/src/index.ts index 189cfe7..d06e93e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -20,12 +20,13 @@ import { Resource } from '@modelcontextprotocol/sdk/types.js'; import { CodebaseIndexer } from './core/indexer.js'; -import { IndexingStats } from './types/index.js'; +import { IndexingStats, SearchResult } from './types/index.js'; import { CodebaseSearcher } from './core/search.js'; import { analyzerRegistry } from './core/analyzer-registry.js'; import { AngularAnalyzer } from './analyzers/angular/index.js'; import { GenericAnalyzer } from './analyzers/generic/index.js'; import { InternalFileGraph } from './utils/usage-tracker.js'; +import { IndexCorruptedError } from './errors/index.js'; analyzerRegistry.register(new AngularAnalyzer()); analyzerRegistry.register(new GenericAnalyzer()); @@ -62,11 +63,10 @@ const indexState: IndexState = { status: 'idle' }; - const server: Server = new Server( { name: 'codebase-context', - version: '1.3.0' + version: '1.3.1' }, { capabilities: { @@ -492,7 +492,62 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { } const searcher = new CodebaseSearcher(ROOT_PATH); - const results = await searcher.search(query, limit || 5, filters); + let results: SearchResult[]; + + try { + results = await searcher.search(query, limit || 5, filters); + } catch (error) { + if (error instanceof IndexCorruptedError) { + console.error('[Auto-Heal] Index corrupted. Triggering full re-index...'); + + await performIndexing(); + + if (indexState.status === 'ready') { + console.error('[Auto-Heal] Success. Retrying search...'); + const freshSearcher = new CodebaseSearcher(ROOT_PATH); + try { + results = await freshSearcher.search(query, limit || 5, filters); + } catch (retryError) { + return { + content: [ + { + type: 'text', + text: JSON.stringify( + { + status: 'error', + message: `Auto-heal retry failed: ${ + retryError instanceof Error ? retryError.message : String(retryError) + }` + }, + null, + 2 + ) + } + ] + }; + } + } else { + return { + content: [ + { + type: 'text', + text: JSON.stringify( + { + status: 'error', + message: `Auto-heal failed: Indexing ended with status '${indexState.status}'`, + error: indexState.error + }, + null, + 2 + ) + } + ] + }; + } + } else { + throw error; // Propagate unexpected errors + } + } return { content: [ @@ -538,19 +593,19 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { lastIndexed: indexState.lastIndexed?.toISOString(), stats: indexState.stats ? { - totalFiles: indexState.stats.totalFiles, - indexedFiles: indexState.stats.indexedFiles, - totalChunks: indexState.stats.totalChunks, - duration: `${(indexState.stats.duration / 1000).toFixed(2)}s` - } + totalFiles: indexState.stats.totalFiles, + indexedFiles: indexState.stats.indexedFiles, + totalChunks: indexState.stats.totalChunks, + duration: `${(indexState.stats.duration / 1000).toFixed(2)}s` + } : undefined, progress: progress ? { - phase: progress.phase, - percentage: progress.percentage, - filesProcessed: progress.filesProcessed, - totalFiles: progress.totalFiles - } + phase: progress.phase, + percentage: progress.percentage, + filesProcessed: progress.filesProcessed, + totalFiles: progress.totalFiles + } : undefined, error: indexState.error, hint: 'Use refresh_index to manually trigger re-indexing when needed.' diff --git a/src/storage/lancedb.ts b/src/storage/lancedb.ts index 468f988..8730076 100644 --- a/src/storage/lancedb.ts +++ b/src/storage/lancedb.ts @@ -6,6 +6,7 @@ import { promises as fs } from 'fs'; import { VectorStorageProvider, CodeChunkWithEmbedding, VectorSearchResult } from './types.js'; import { CodeChunk, SearchFilters } from '../types/index.js'; +import { IndexCorruptedError } from '../errors/index.js'; export class LanceDBStorageProvider implements VectorStorageProvider { readonly name = 'lancedb'; @@ -20,44 +21,33 @@ export class LanceDBStorageProvider implements VectorStorageProvider { try { this.storagePath = storagePath; - - // Ensure directory exists await fs.mkdir(storagePath, { recursive: true }); - // Dynamic import to avoid issues at require time const lancedb = await import('@lancedb/lancedb'); - - // Connect to database this.db = await lancedb.connect(storagePath); - // Check if table exists and has valid schema + // Check if table exists and validate schema const tableNames = await this.db.tableNames(); if (tableNames.includes('code_chunks')) { this.table = await this.db.openTable('code_chunks'); - // Validate schema has vector column (required for semantic search) - try { - const schema = await this.table.schema(); - const hasVectorColumn = schema.fields.some((f: any) => f.name === 'vector'); - - if (!hasVectorColumn) { - console.error('Stale index detected (missing vector column). Rebuilding...'); - await this.db.dropTable('code_chunks'); - this.table = null; - } else { - console.error('Opened existing LanceDB table'); - } - } catch (_schemaError) { - // If schema check fails, table is likely corrupted - drop and rebuild - console.error('Failed to validate table schema, rebuilding index...'); - await this.db.dropTable('code_chunks'); - this.table = null; + const schema = await this.table.schema(); + const hasVectorColumn = schema.fields.some((f: any) => f.name === 'vector'); + + if (!hasVectorColumn) { + throw new IndexCorruptedError('LanceDB index corrupted: missing vector column'); } + console.error('Opened existing LanceDB table'); + } else { + this.table = null; } this.initialized = true; console.error(`LanceDB initialized at: ${storagePath}`); } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; + } console.error('Failed to initialize LanceDB:', error); throw error; } @@ -115,7 +105,7 @@ export class LanceDBStorageProvider implements VectorStorageProvider { filters?: SearchFilters ): Promise { if (!this.initialized || !this.table) { - return []; + throw new IndexCorruptedError('LanceDB index corrupted: no table available for search'); } try { @@ -170,7 +160,13 @@ export class LanceDBStorageProvider implements VectorStorageProvider { distance: result._distance || 0 })); } catch (error) { - console.error('Failed to search:', error); + // Only trigger auto-heal for verified corruption patterns + if (error instanceof Error && error.message.toLowerCase().includes('no vector column')) { + throw new IndexCorruptedError(`LanceDB index corrupted: ${error.message}`); + } + + // Transient errors - log and gracefully degrade + console.error('[LanceDB] Search error:', error instanceof Error ? error.message : error); return []; } } @@ -199,8 +195,7 @@ export class LanceDBStorageProvider implements VectorStorageProvider { } try { - const result = await this.table.countRows(); - return result; + return await this.table.countRows(); } catch (error) { console.error('Failed to count rows:', error); return 0; diff --git a/tests/lancedb-corruption.test.ts b/tests/lancedb-corruption.test.ts new file mode 100644 index 0000000..2ec825b --- /dev/null +++ b/tests/lancedb-corruption.test.ts @@ -0,0 +1,106 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { IndexCorruptedError } from '../src/errors/index.js'; + +const lancedb = vi.hoisted(() => ({ + connect: vi.fn() +})); + +vi.mock('@lancedb/lancedb', () => ({ + connect: lancedb.connect +})); + +describe('LanceDBStorageProvider corruption detection', () => { + let tempDir: string; + let consoleErrorSpy: ReturnType; + + beforeEach(async () => { + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'lancedb-test-')); + lancedb.connect.mockReset(); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => { }); + }); + + afterEach(async () => { + consoleErrorSpy.mockRestore(); + await fs.rm(tempDir, { recursive: true, force: true }); + }); + + it('throws IndexCorruptedError when vector column missing during initialize()', async () => { + const db = { + tableNames: vi.fn(async () => ['code_chunks']), + openTable: vi.fn(async () => ({ + schema: vi.fn(async () => ({ fields: [{ name: 'id' }] })) + })) + }; + + lancedb.connect.mockResolvedValue(db); + + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider(); + + await expect(provider.initialize(tempDir)).rejects.toBeInstanceOf(IndexCorruptedError); + // dropTable is no longer called within initialize (senior mindset: separation of concerns) + }); + + it('throws IndexCorruptedError when schema() throws during initialize()', async () => { + const db = { + tableNames: vi.fn(async () => ['code_chunks']), + openTable: vi.fn(async () => ({ + schema: vi.fn(async () => { + throw new Error('schema error'); + }) + })) + }; + + lancedb.connect.mockResolvedValue(db); + + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider(); + + // This now throws the raw error (not IndexCorruptedError) since we don't wrap all errors + await expect(provider.initialize(tempDir)).rejects.toThrow('schema error'); + }); + + it('throws IndexCorruptedError when vector search fails with "No vector column"', async () => { + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider() as any; + + const query = { + limit: vi.fn(() => query), + where: vi.fn(() => query), + toArray: vi.fn(async () => { + throw new Error('Schema Error: No vector column found to create index'); + }) + }; + + provider.initialized = true; + provider.table = { + vectorSearch: vi.fn(() => query) + }; + + await expect(provider.search([0.1, 0.2], 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); + + it('returns empty array for transient search errors', async () => { + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider() as any; + + const query = { + limit: vi.fn(() => query), + where: vi.fn(() => query), + toArray: vi.fn(async () => { + throw new Error('Network timeout'); + }) + }; + + provider.initialized = true; + provider.table = { + vectorSearch: vi.fn(() => query) + }; + + const results = await provider.search([0.1, 0.2], 5); + expect(results).toEqual([]); + }); +}); diff --git a/tests/search-codebase-auto-heal.test.ts b/tests/search-codebase-auto-heal.test.ts new file mode 100644 index 0000000..09e638f --- /dev/null +++ b/tests/search-codebase-auto-heal.test.ts @@ -0,0 +1,116 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; + +const searchMocks = vi.hoisted(() => ({ + search: vi.fn() +})); + +const indexerMocks = vi.hoisted(() => ({ + index: vi.fn() +})); + +vi.mock('../src/core/search.js', async () => { + class CodebaseSearcher { + constructor(_rootPath: string) {} + + async search(query: string, limit: number, filters?: unknown) { + return searchMocks.search(query, limit, filters); + } + } + + return { CodebaseSearcher }; +}); + +vi.mock('../src/core/indexer.js', () => { + class CodebaseIndexer { + constructor(_options: unknown) {} + + getProgress() { + return { phase: 'complete', percentage: 100 }; + } + + async index() { + indexerMocks.index(); + return { + totalFiles: 0, + indexedFiles: 0, + skippedFiles: 0, + totalChunks: 0, + totalLines: 0, + duration: 0, + avgChunkSize: 0, + componentsByType: {}, + componentsByLayer: { + presentation: 0, + business: 0, + data: 0, + state: 0, + core: 0, + shared: 0, + feature: 0, + infrastructure: 0, + unknown: 0 + }, + errors: [], + startedAt: new Date(), + completedAt: new Date() + }; + } + } + + return { CodebaseIndexer }; +}); + +describe('search_codebase auto-heal', () => { + let consoleErrorSpy: ReturnType; + + beforeEach(() => { + searchMocks.search.mockReset(); + indexerMocks.index.mockReset(); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + }); + + afterEach(() => { + consoleErrorSpy.mockRestore(); + }); + + it('triggers indexing and retries when IndexCorruptedError is thrown', async () => { + const { IndexCorruptedError } = await import('../src/errors/index.js'); + + searchMocks.search + .mockRejectedValueOnce(new IndexCorruptedError('LanceDB index corrupted: missing vector column')) + .mockResolvedValueOnce([ + { + summary: 'Test summary', + snippet: 'Test snippet', + filePath: '/tmp/file.ts', + startLine: 1, + endLine: 2, + score: 0.9, + language: 'ts', + metadata: {} + } + ]); + + const { server } = await import('../src/index.js'); + const handler = (server as any)._requestHandlers.get('tools/call'); + + const response = await handler({ + jsonrpc: '2.0', + id: 1, + method: 'tools/call', + params: { + name: 'search_codebase', + arguments: { + query: 'test' + } + } + }); + + const payload = JSON.parse(response.content[0].text); + expect(payload.status).toBe('success'); + expect(payload.results).toHaveLength(1); + expect(searchMocks.search).toHaveBeenCalledTimes(2); + expect(indexerMocks.index).toHaveBeenCalledTimes(1); + }); +}); + diff --git a/tests/searcher-corruption-propagation.test.ts b/tests/searcher-corruption-propagation.test.ts new file mode 100644 index 0000000..9834ab3 --- /dev/null +++ b/tests/searcher-corruption-propagation.test.ts @@ -0,0 +1,80 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { IndexCorruptedError } from '../src/errors/index.js'; + +const deps = vi.hoisted(() => ({ + getEmbeddingProvider: vi.fn(), + getStorageProvider: vi.fn() +})); + +vi.mock('../src/embeddings/index.js', () => ({ + getEmbeddingProvider: deps.getEmbeddingProvider +})); + +vi.mock('../src/storage/index.js', () => ({ + getStorageProvider: deps.getStorageProvider +})); + +describe('CodebaseSearcher IndexCorruptedError propagation', () => { + let tempDir: string; + let consoleWarnSpy: ReturnType; + let consoleErrorSpy: ReturnType; + + beforeEach(async () => { + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'searcher-test-')); + deps.getEmbeddingProvider.mockReset(); + deps.getStorageProvider.mockReset(); + consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + + await fs.writeFile(path.join(tempDir, '.codebase-index.json'), JSON.stringify([])); + await fs.writeFile(path.join(tempDir, '.codebase-intelligence.json'), JSON.stringify({})); + }); + + afterEach(async () => { + consoleWarnSpy.mockRestore(); + consoleErrorSpy.mockRestore(); + await fs.rm(tempDir, { recursive: true, force: true }); + }); + + it('rethrows IndexCorruptedError from initialize()', async () => { + deps.getEmbeddingProvider.mockResolvedValue({ + embed: vi.fn(async () => [0.1, 0.2]) + }); + + deps.getStorageProvider.mockRejectedValue( + new IndexCorruptedError('LanceDB index corrupted: missing vector column') + ); + + const { CodebaseSearcher } = await import('../src/core/search.js'); + const searcher = new CodebaseSearcher(tempDir); + + await expect(searcher.search('test', 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); + + it('rethrows IndexCorruptedError from semantic search', async () => { + deps.getEmbeddingProvider.mockResolvedValue({ + embed: vi.fn(async () => [0.1, 0.2]) + }); + + deps.getStorageProvider.mockResolvedValue({ + name: 'mock', + initialize: vi.fn(async () => {}), + store: vi.fn(async () => {}), + clear: vi.fn(async () => {}), + count: vi.fn(async () => 0), + isInitialized: vi.fn(() => true), + search: vi.fn(async () => { + throw new IndexCorruptedError('LanceDB index corrupted: missing vector column'); + }) + }); + + const { CodebaseSearcher } = await import('../src/core/search.js'); + const searcher = new CodebaseSearcher(tempDir); + + await expect(searcher.search('test', 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); +}); +