diff --git a/CHANGELOG.md b/CHANGELOG.md index a0c60e5..7cfd005 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog +## [1.3.1] - 2026-01-05 + +### Fixed +- **Auto-Heal Semantic Search**: Detects LanceDB schema corruption (missing `vector` column), triggers re-indexing, and retries search instead of silently falling back to keyword-only results. + ## [1.3.0] - 2026-01-01 ### Added diff --git a/internal-docs b/internal-docs index 559dfa0..3ba26ec 160000 --- a/internal-docs +++ b/internal-docs @@ -1 +1 @@ -Subproject commit 559dfa0bd97fd37f28348c2fb4157f7bcb3428c2 +Subproject commit 3ba26ece92443377fd3eef596bf181cc2e835082 diff --git a/package.json b/package.json index c317383..f0af5f8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "codebase-context", - "version": "1.3.0", + "version": "1.3.1", "description": "MCP server that helps AI agents understand your codebase - patterns, libraries, architecture, monorepo support", "type": "module", "main": "./dist/lib.js", @@ -120,4 +120,4 @@ "sharp" ] } -} \ No newline at end of file +} diff --git a/src/core/indexer.ts b/src/core/indexer.ts index 35cb8cf..294a28f 100644 --- a/src/core/indexer.ts +++ b/src/core/indexer.ts @@ -86,7 +86,7 @@ export class CodebaseIndexer { }, embedding: { provider: 'transformers', - model: 'Xenova/bge-base-en-v1.5', + model: 'Xenova/bge-small-en-v1.5', batchSize: 100 }, skipEmbedding: false, @@ -376,8 +376,7 @@ export class CodebaseIndexer { if ((i + batchSize) % 100 === 0 || i + batchSize >= chunksToEmbed.length) { console.error( - `Embedded ${Math.min(i + batchSize, chunksToEmbed.length)}/${ - chunksToEmbed.length + `Embedded ${Math.min(i + batchSize, chunksToEmbed.length)}/${chunksToEmbed.length } chunks` ); } diff --git a/src/core/search.ts b/src/core/search.ts index 0edb119..228a05a 100644 --- a/src/core/search.ts +++ b/src/core/search.ts @@ -9,6 +9,7 @@ import { CodeChunk, SearchResult, SearchFilters } from '../types/index.js'; import { EmbeddingProvider, getEmbeddingProvider } from '../embeddings/index.js'; import { VectorStorageProvider, getStorageProvider } from '../storage/index.js'; import { analyzerRegistry } from './analyzer-registry.js'; +import { IndexCorruptedError } from '../errors/index.js'; export interface SearchOptions { useSemanticSearch?: boolean; @@ -62,6 +63,9 @@ export class CodebaseSearcher { this.initialized = true; } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; // Propagate to handler for auto-heal + } console.warn('Partial initialization (keyword search only):', error); this.initialized = true; } @@ -217,6 +221,9 @@ export class CodebaseSearcher { } }); } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; // Propagate to handler for auto-heal + } console.warn('Semantic search failed:', error); } } @@ -324,9 +331,8 @@ export class CodebaseSearcher { const name = componentName || (classMatch ? classMatch[1] : null); if (name && componentType) { - return `${ - componentType.charAt(0).toUpperCase() + componentType.slice(1) - } '${name}' in ${fileName}.`; + return `${componentType.charAt(0).toUpperCase() + componentType.slice(1) + } '${name}' in ${fileName}.`; } else if (name) { return `'${name}' defined in ${fileName}.`; } else if (componentType) { diff --git a/src/errors/index.ts b/src/errors/index.ts new file mode 100644 index 0000000..d2fd710 --- /dev/null +++ b/src/errors/index.ts @@ -0,0 +1,11 @@ +/** + * Thrown when the LanceDB index is corrupted or has a schema mismatch. + * This error signals that re-indexing is required for semantic search to work. + */ +export class IndexCorruptedError extends Error { + constructor(message: string) { + super(message); + this.name = 'IndexCorruptedError'; + } +} + diff --git a/src/index.ts b/src/index.ts index 189cfe7..2c09f7a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -20,12 +20,13 @@ import { Resource } from '@modelcontextprotocol/sdk/types.js'; import { CodebaseIndexer } from './core/indexer.js'; -import { IndexingStats } from './types/index.js'; +import { IndexingStats, SearchResult } from './types/index.js'; import { CodebaseSearcher } from './core/search.js'; import { analyzerRegistry } from './core/analyzer-registry.js'; import { AngularAnalyzer } from './analyzers/angular/index.js'; import { GenericAnalyzer } from './analyzers/generic/index.js'; import { InternalFileGraph } from './utils/usage-tracker.js'; +import { IndexCorruptedError } from './errors/index.js'; analyzerRegistry.register(new AngularAnalyzer()); analyzerRegistry.register(new GenericAnalyzer()); @@ -66,7 +67,7 @@ const indexState: IndexState = { const server: Server = new Server( { name: 'codebase-context', - version: '1.3.0' + version: '1.3.1' }, { capabilities: { @@ -492,7 +493,62 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { } const searcher = new CodebaseSearcher(ROOT_PATH); - const results = await searcher.search(query, limit || 5, filters); + let results: SearchResult[]; + + try { + results = await searcher.search(query, limit || 5, filters); + } catch (error) { + if (error instanceof IndexCorruptedError) { + console.error('[Auto-Heal] Index corrupted. Triggering full re-index...'); + + await performIndexing(); + + if (indexState.status === 'ready') { + console.error('[Auto-Heal] Success. Retrying search...'); + const freshSearcher = new CodebaseSearcher(ROOT_PATH); + try { + results = await freshSearcher.search(query, limit || 5, filters); + } catch (retryError) { + return { + content: [ + { + type: 'text', + text: JSON.stringify( + { + status: 'error', + message: `Auto-heal retry failed: ${ + retryError instanceof Error ? retryError.message : String(retryError) + }` + }, + null, + 2 + ) + } + ] + }; + } + } else { + return { + content: [ + { + type: 'text', + text: JSON.stringify( + { + status: 'error', + message: `Auto-heal failed: Indexing ended with status '${indexState.status}'`, + error: indexState.error + }, + null, + 2 + ) + } + ] + }; + } + } else { + throw error; // Propagate unexpected errors + } + } return { content: [ diff --git a/src/storage/lancedb.ts b/src/storage/lancedb.ts index 468f988..0df8ea6 100644 --- a/src/storage/lancedb.ts +++ b/src/storage/lancedb.ts @@ -6,6 +6,7 @@ import { promises as fs } from 'fs'; import { VectorStorageProvider, CodeChunkWithEmbedding, VectorSearchResult } from './types.js'; import { CodeChunk, SearchFilters } from '../types/index.js'; +import { IndexCorruptedError } from '../errors/index.js'; export class LanceDBStorageProvider implements VectorStorageProvider { readonly name = 'lancedb'; @@ -44,20 +45,31 @@ export class LanceDBStorageProvider implements VectorStorageProvider { console.error('Stale index detected (missing vector column). Rebuilding...'); await this.db.dropTable('code_chunks'); this.table = null; + throw new IndexCorruptedError('LanceDB index corrupted: missing vector column'); } else { console.error('Opened existing LanceDB table'); } - } catch (_schemaError) { + } catch (schemaError) { + if (schemaError instanceof IndexCorruptedError) { + throw schemaError; + } // If schema check fails, table is likely corrupted - drop and rebuild console.error('Failed to validate table schema, rebuilding index...'); await this.db.dropTable('code_chunks'); this.table = null; + throw new IndexCorruptedError('LanceDB index corrupted: schema validation failed'); } + } else { + // Table missing entirely - not necessarily an error during initialization + this.table = null; } this.initialized = true; console.error(`LanceDB initialized at: ${storagePath}`); } catch (error) { + if (error instanceof IndexCorruptedError) { + throw error; + } console.error('Failed to initialize LanceDB:', error); throw error; } @@ -115,7 +127,8 @@ export class LanceDBStorageProvider implements VectorStorageProvider { filters?: SearchFilters ): Promise { if (!this.initialized || !this.table) { - return []; + // If table is missing, throw so auto-heal can fix it + throw new IndexCorruptedError('LanceDB index corrupted: no table available for search'); } try { @@ -170,7 +183,15 @@ export class LanceDBStorageProvider implements VectorStorageProvider { distance: result._distance || 0 })); } catch (error) { + if (error instanceof Error && error.message.includes('No vector column')) { + throw new IndexCorruptedError('LanceDB index corrupted: missing vector column'); + } console.error('Failed to search:', error); + // For other errors, we throw IndexCorruptedError to be safe and trigger auto-heal + // if it looks like a database issue + if (error instanceof Error && (error.message.includes('LanceDB') || error.message.includes('Arrow'))) { + throw new IndexCorruptedError(`LanceDB runtime error: ${error.message}`); + } return []; } } diff --git a/tests/lancedb-corruption.test.ts b/tests/lancedb-corruption.test.ts new file mode 100644 index 0000000..ab6de69 --- /dev/null +++ b/tests/lancedb-corruption.test.ts @@ -0,0 +1,90 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { IndexCorruptedError } from '../src/errors/index.js'; + +const lancedb = vi.hoisted(() => ({ + connect: vi.fn() +})); + +vi.mock('@lancedb/lancedb', () => ({ + connect: lancedb.connect +})); + +describe('LanceDBStorageProvider corruption detection', () => { + let tempDir: string; + let consoleErrorSpy: ReturnType; + + beforeEach(async () => { + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'lancedb-test-')); + lancedb.connect.mockReset(); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + }); + + afterEach(async () => { + consoleErrorSpy.mockRestore(); + await fs.rm(tempDir, { recursive: true, force: true }); + }); + + it('throws IndexCorruptedError when vector column missing during initialize()', async () => { + const dropTable = vi.fn(async () => {}); + const db = { + tableNames: vi.fn(async () => ['code_chunks']), + openTable: vi.fn(async () => ({ + schema: vi.fn(async () => ({ fields: [{ name: 'id' }] })) + })), + dropTable + }; + + lancedb.connect.mockResolvedValue(db); + + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider(); + + await expect(provider.initialize(tempDir)).rejects.toBeInstanceOf(IndexCorruptedError); + expect(dropTable).toHaveBeenCalledWith('code_chunks'); + }); + + it('throws IndexCorruptedError when schema validation fails during initialize()', async () => { + const dropTable = vi.fn(async () => {}); + const db = { + tableNames: vi.fn(async () => ['code_chunks']), + openTable: vi.fn(async () => ({ + schema: vi.fn(async () => { + throw new Error('schema error'); + }) + })), + dropTable + }; + + lancedb.connect.mockResolvedValue(db); + + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider(); + + await expect(provider.initialize(tempDir)).rejects.toBeInstanceOf(IndexCorruptedError); + expect(dropTable).toHaveBeenCalledWith('code_chunks'); + }); + + it('throws IndexCorruptedError when vector search fails with "No vector column"', async () => { + const { LanceDBStorageProvider } = await import('../src/storage/lancedb.js'); + const provider = new LanceDBStorageProvider() as any; + + const query = { + limit: vi.fn(() => query), + where: vi.fn(() => query), + toArray: vi.fn(async () => { + throw new Error('Schema Error: No vector column found to create index'); + }) + }; + + provider.initialized = true; + provider.table = { + vectorSearch: vi.fn(() => query) + }; + + await expect(provider.search([0.1, 0.2], 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); +}); + diff --git a/tests/search-codebase-auto-heal.test.ts b/tests/search-codebase-auto-heal.test.ts new file mode 100644 index 0000000..09e638f --- /dev/null +++ b/tests/search-codebase-auto-heal.test.ts @@ -0,0 +1,116 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; + +const searchMocks = vi.hoisted(() => ({ + search: vi.fn() +})); + +const indexerMocks = vi.hoisted(() => ({ + index: vi.fn() +})); + +vi.mock('../src/core/search.js', async () => { + class CodebaseSearcher { + constructor(_rootPath: string) {} + + async search(query: string, limit: number, filters?: unknown) { + return searchMocks.search(query, limit, filters); + } + } + + return { CodebaseSearcher }; +}); + +vi.mock('../src/core/indexer.js', () => { + class CodebaseIndexer { + constructor(_options: unknown) {} + + getProgress() { + return { phase: 'complete', percentage: 100 }; + } + + async index() { + indexerMocks.index(); + return { + totalFiles: 0, + indexedFiles: 0, + skippedFiles: 0, + totalChunks: 0, + totalLines: 0, + duration: 0, + avgChunkSize: 0, + componentsByType: {}, + componentsByLayer: { + presentation: 0, + business: 0, + data: 0, + state: 0, + core: 0, + shared: 0, + feature: 0, + infrastructure: 0, + unknown: 0 + }, + errors: [], + startedAt: new Date(), + completedAt: new Date() + }; + } + } + + return { CodebaseIndexer }; +}); + +describe('search_codebase auto-heal', () => { + let consoleErrorSpy: ReturnType; + + beforeEach(() => { + searchMocks.search.mockReset(); + indexerMocks.index.mockReset(); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + }); + + afterEach(() => { + consoleErrorSpy.mockRestore(); + }); + + it('triggers indexing and retries when IndexCorruptedError is thrown', async () => { + const { IndexCorruptedError } = await import('../src/errors/index.js'); + + searchMocks.search + .mockRejectedValueOnce(new IndexCorruptedError('LanceDB index corrupted: missing vector column')) + .mockResolvedValueOnce([ + { + summary: 'Test summary', + snippet: 'Test snippet', + filePath: '/tmp/file.ts', + startLine: 1, + endLine: 2, + score: 0.9, + language: 'ts', + metadata: {} + } + ]); + + const { server } = await import('../src/index.js'); + const handler = (server as any)._requestHandlers.get('tools/call'); + + const response = await handler({ + jsonrpc: '2.0', + id: 1, + method: 'tools/call', + params: { + name: 'search_codebase', + arguments: { + query: 'test' + } + } + }); + + const payload = JSON.parse(response.content[0].text); + expect(payload.status).toBe('success'); + expect(payload.results).toHaveLength(1); + expect(searchMocks.search).toHaveBeenCalledTimes(2); + expect(indexerMocks.index).toHaveBeenCalledTimes(1); + }); +}); + diff --git a/tests/searcher-corruption-propagation.test.ts b/tests/searcher-corruption-propagation.test.ts new file mode 100644 index 0000000..9834ab3 --- /dev/null +++ b/tests/searcher-corruption-propagation.test.ts @@ -0,0 +1,80 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { IndexCorruptedError } from '../src/errors/index.js'; + +const deps = vi.hoisted(() => ({ + getEmbeddingProvider: vi.fn(), + getStorageProvider: vi.fn() +})); + +vi.mock('../src/embeddings/index.js', () => ({ + getEmbeddingProvider: deps.getEmbeddingProvider +})); + +vi.mock('../src/storage/index.js', () => ({ + getStorageProvider: deps.getStorageProvider +})); + +describe('CodebaseSearcher IndexCorruptedError propagation', () => { + let tempDir: string; + let consoleWarnSpy: ReturnType; + let consoleErrorSpy: ReturnType; + + beforeEach(async () => { + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'searcher-test-')); + deps.getEmbeddingProvider.mockReset(); + deps.getStorageProvider.mockReset(); + consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + + await fs.writeFile(path.join(tempDir, '.codebase-index.json'), JSON.stringify([])); + await fs.writeFile(path.join(tempDir, '.codebase-intelligence.json'), JSON.stringify({})); + }); + + afterEach(async () => { + consoleWarnSpy.mockRestore(); + consoleErrorSpy.mockRestore(); + await fs.rm(tempDir, { recursive: true, force: true }); + }); + + it('rethrows IndexCorruptedError from initialize()', async () => { + deps.getEmbeddingProvider.mockResolvedValue({ + embed: vi.fn(async () => [0.1, 0.2]) + }); + + deps.getStorageProvider.mockRejectedValue( + new IndexCorruptedError('LanceDB index corrupted: missing vector column') + ); + + const { CodebaseSearcher } = await import('../src/core/search.js'); + const searcher = new CodebaseSearcher(tempDir); + + await expect(searcher.search('test', 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); + + it('rethrows IndexCorruptedError from semantic search', async () => { + deps.getEmbeddingProvider.mockResolvedValue({ + embed: vi.fn(async () => [0.1, 0.2]) + }); + + deps.getStorageProvider.mockResolvedValue({ + name: 'mock', + initialize: vi.fn(async () => {}), + store: vi.fn(async () => {}), + clear: vi.fn(async () => {}), + count: vi.fn(async () => 0), + isInitialized: vi.fn(() => true), + search: vi.fn(async () => { + throw new IndexCorruptedError('LanceDB index corrupted: missing vector column'); + }) + }); + + const { CodebaseSearcher } = await import('../src/core/search.js'); + const searcher = new CodebaseSearcher(tempDir); + + await expect(searcher.search('test', 5)).rejects.toBeInstanceOf(IndexCorruptedError); + }); +}); +