diff --git a/package-lock.json b/package-lock.json index d1b3279..249a0ee 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1396,7 +1396,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1410,7 +1409,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1424,7 +1422,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1438,7 +1435,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1452,7 +1448,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1466,7 +1461,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1480,7 +1474,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1494,7 +1487,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1508,7 +1500,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1522,7 +1513,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1536,7 +1526,6 @@ "cpu": [ "loong64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1550,7 +1539,6 @@ "cpu": [ "loong64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1564,7 +1552,6 @@ "cpu": [ "ppc64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1578,7 +1565,6 @@ "cpu": [ "ppc64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1592,7 +1578,6 @@ "cpu": [ "riscv64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1606,7 +1591,6 @@ "cpu": [ "riscv64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1620,7 +1604,6 @@ "cpu": [ "s390x" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1634,7 +1617,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1648,7 +1630,6 @@ "cpu": [ "x64" ], - "dev": true, 
"license": "MIT", "optional": true, "os": [ @@ -1662,7 +1643,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1676,7 +1656,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1690,7 +1669,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1704,7 +1682,6 @@ "cpu": [ "ia32" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1718,7 +1695,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1732,7 +1708,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1879,7 +1854,7 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, + "devOptional": true, "license": "MIT" }, "node_modules/@types/json-schema": { @@ -1903,7 +1878,6 @@ "version": "25.5.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.0.tgz", "integrity": "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw==", - "devOptional": true, "license": "MIT", "dependencies": { "undici-types": "~7.18.0" @@ -3776,7 +3750,6 @@ "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, "hasInstallScript": true, "license": "MIT", "optional": true, @@ -4704,6 +4677,18 @@ "node": ">=8.6" } }, + "node_modules/micromatch/node_modules/picomatch": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": 
"https://github.com/sponsors/jonschlinkert" + } + }, "node_modules/mimic-response": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", @@ -18245,12 +18230,13 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", - "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", + "dev": true, "license": "MIT", "engines": { - "node": ">=8.6" + "node": ">=12" }, "funding": { "url": "https://github.com/sponsors/jonschlinkert" @@ -18673,7 +18659,7 @@ "version": "4.60.1", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.1.tgz", "integrity": "sha512-VmtB2rFU/GroZ4oL8+ZqXgSA38O6GR8KSIvWmEFv63pQ0G6KaBH9s07PO8XTXP4vI+3UJUEypOfjkGfmSBBR0w==", - "dev": true, + "devOptional": true, "license": "MIT", "dependencies": { "@types/estree": "1.0.8" @@ -19230,19 +19216,6 @@ "url": "https://github.com/sponsors/SuperchupuDev" } }, - "node_modules/tinyglobby/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/tinyrainbow": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", @@ -19490,7 +19463,7 @@ "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", "integrity": 
"sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", @@ -19544,7 +19517,6 @@ "version": "7.18.2", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", - "devOptional": true, "license": "MIT" }, "node_modules/uri-js": { @@ -19648,19 +19620,6 @@ } } }, - "node_modules/vite/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/vitest": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/vitest/-/vitest-4.1.2.tgz", @@ -19743,19 +19702,6 @@ } } }, - "node_modules/vitest/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/web-tree-sitter": { "version": "0.22.6", "resolved": "https://registry.npmjs.org/web-tree-sitter/-/web-tree-sitter-0.22.6.tgz", @@ -20011,6 +19957,7 @@ "license": "Apache-2.0", "dependencies": { "better-sqlite3": "^12.6.2", + "cheerio": "^1.0.0-rc.12", "fast-glob": "^3.3.3", "openai": "^6.18.0" }, @@ -20026,7 +19973,7 @@ } }, "packages/toolpack-sdk": { - "version": "1.3.0", + "version": "1.2.0", "license": "Apache-2.0", "dependencies": { "@anthropic-ai/sdk": "^0.73.0", diff --git 
a/packages/toolpack-knowledge/README.md b/packages/toolpack-knowledge/README.md index 8acfd9d..bbf92fe 100644 --- a/packages/toolpack-knowledge/README.md +++ b/packages/toolpack-knowledge/README.md @@ -1,6 +1,6 @@ # toolpack-knowledge -RAG (Retrieval-Augmented Generation) package for Toolpack SDK. +RAG (Retrieval-Augmented Generation) package for Toolpack SDK with advanced features for web crawling, API indexing, streaming ingestion, and hybrid search. ## Installation @@ -54,6 +54,38 @@ const results = await kb.query('authentication setup', { }); ``` +### Advanced Usage + +```typescript +import { Knowledge, WebUrlSource, ApiDataSource, PersistentKnowledgeProvider, OllamaEmbedder } from '@toolpack-sdk/knowledge'; + +// Web crawling + API indexing with hybrid search +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'advanced-docs' }), + sources: [ + new WebUrlSource(['https://docs.example.com'], { + maxDepth: 2, + delayMs: 1000, + }), + new ApiDataSource('https://api.example.com/docs', { + pagination: { param: 'page', start: 1, maxPages: 5 }, + contentExtractor: (doc) => `${doc.title}\n\n${doc.content}`, + }), + ], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + streamingBatchSize: 50, // Efficient processing of large datasets + description: 'Comprehensive documentation from web and API sources.', +}); + +// Hybrid search combining semantic and keyword matching +const results = await kb.query('authentication setup', { + searchType: 'hybrid', + semanticWeight: 0.6, // 60% semantic, 40% keyword + limit: 10, + threshold: 0.7, +}); +``` + ### Agent Integration ```typescript @@ -75,6 +107,126 @@ const toolpack = await Toolpack.init({ const response = await toolpack.chat('How do I configure authentication?'); ``` +## Advanced Features + +### Web URL Sources + +Crawl and index websites with automatic HTML parsing and link following. 
+ +```typescript +import { WebUrlSource } from '@toolpack-sdk/knowledge'; + +const webSource = new WebUrlSource(['https://docs.example.com'], { + maxDepth: 3, // Follow links up to 3 levels deep + delayMs: 1000, // Respectful crawling delay + userAgent: 'MyApp/1.0', // Custom user agent + maxChunkSize: 1500, // Chunk size for web content + timeoutMs: 30000, // Request timeout +}); + +const kb = await Knowledge.create({ + provider: new MemoryProvider(), + sources: [webSource], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + description: 'Web documentation and guides.', +}); +``` + +**Features:** +- Recursive website crawling with depth control +- Automatic HTML text extraction (removes scripts/styles) +- Link discovery and following +- Respectful crawling with configurable delays +- Metadata includes title, URL, and source type + +### API Data Sources + +Index data from REST APIs with pagination support. + +```typescript +import { ApiDataSource } from '@toolpack-sdk/knowledge'; + +const apiSource = new ApiDataSource('https://api.github.com/repos/toolpack-ai/toolpack-sdk/issues', { + headers: { + 'Authorization': `Bearer ${process.env.GITHUB_TOKEN}`, + 'Accept': 'application/vnd.github.v3+json', + }, + pagination: { + param: 'page', + start: 1, + maxPages: 5, + }, + dataPath: '', // Root level array + contentExtractor: (issue: any) => `${issue.title}\n\n${issue.body}`, + metadataExtractor: (issue: any) => ({ + id: issue.id, + state: issue.state, + labels: issue.labels?.map(l => l.name), + }), +}); + +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'github-issues' }), + sources: [apiSource], + embedder: new OpenAIEmbedder({ model: 'text-embedding-3-small' }), + description: 'GitHub issues and discussions.', +}); +``` + +**Features:** +- REST API data ingestion (GET/POST) +- Automatic pagination handling +- Custom content and metadata extractors +- JSON path support for nested data +- Flexible data 
transformation + +### Streaming Ingestion + +Process large datasets efficiently with batch processing. + +```typescript +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'large-dataset' }), + sources: [new ApiDataSource('https://api.example.com/large-dataset')], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + streamingBatchSize: 50, // Process 50 chunks at a time + description: 'Large dataset with streaming ingestion.', + onEmbeddingProgress: (event) => { + console.log(`Processed: ${event.current}/${event.total} chunks`); + }, +}); +``` + +### Hybrid Search + +Combine semantic and keyword search for better results. + +```typescript +// Semantic search (default) +const semanticResults = await kb.query('machine learning algorithms', { + searchType: 'semantic', + limit: 5, +}); + +// Keyword search +const keywordResults = await kb.query('machine learning algorithms', { + searchType: 'keyword', + limit: 5, +}); + +// Hybrid search (recommended) +const hybridResults = await kb.query('machine learning algorithms', { + searchType: 'hybrid', + semanticWeight: 0.7, // 70% semantic, 30% keyword + limit: 5, +}); +``` + +**Search Types:** +- `semantic` — Vector similarity search (default) +- `keyword` — Text matching search +- `hybrid` — Combined semantic + keyword search + ## Providers ### MemoryProvider @@ -121,6 +273,70 @@ new MarkdownSource('./docs/**/*.md', { - Code block detection (`hasCode` metadata) - Deterministic chunk IDs +### WebUrlSource + +Crawl and index web pages with HTML parsing. 
+ +```typescript +new WebUrlSource(['https://example.com', 'https://docs.example.com'], { + maxDepth: 2, // Crawl depth (default: 1) + delayMs: 1000, // Delay between requests (default: 1000ms) + userAgent: 'MyApp/1.0', // Custom user agent + maxChunkSize: 2000, // Max tokens per chunk + chunkOverlap: 200, // Overlap between chunks + timeoutMs: 30000, // Request timeout (default: 30000ms) + namespace: 'web', // Chunk ID prefix + metadata: { source: 'web' }, // Added to all chunks +}) +``` + +**Features:** +- Recursive website crawling +- Automatic HTML text extraction +- Link discovery and following +- Respectful crawling with delays +- Error handling for failed requests + +### ApiDataSource + +Index data from REST APIs with pagination. + +```typescript +new ApiDataSource('https://api.example.com/data', { + method: 'GET', // HTTP method (default: 'GET') + headers: { // Request headers + 'Authorization': 'Bearer token', + 'Content-Type': 'application/json', + }, + body: JSON.stringify({}), // Request body for POST + pagination: { // Pagination config + param: 'page', // Query param name + start: 1, // Starting page number + step: 1, // Page increment + maxPages: 10, // Max pages to fetch + }, + dataPath: 'data.items', // JSON path to data array + contentExtractor: (item) => // Custom content extraction + `${item.title}\n\n${item.description}`, + metadataExtractor: (item) => ({ // Custom metadata extraction + id: item.id, + category: item.category, + }), + maxChunkSize: 2000, // Max tokens per chunk + chunkOverlap: 200, // Overlap between chunks + timeoutMs: 30000, // Request timeout + namespace: 'api', // Chunk ID prefix + metadata: { source: 'api' }, // Added to all chunks +}) +``` + +**Features:** +- REST API data ingestion +- Automatic pagination handling +- Custom data extractors +- JSON path support +- Flexible content transformation + ## Embedders ### OllamaEmbedder @@ -159,6 +375,7 @@ interface KnowledgeOptions { embedder: Embedder; description: string; // 
Required: used as tool description reSync?: boolean; // default: true + streamingBatchSize?: number; // Process chunks in batches (default: 100) onError?: (error, context) => 'skip' | 'abort'; onSync?: (event: SyncEvent) => void; onEmbeddingProgress?: (event: EmbeddingProgressEvent) => void; @@ -171,6 +388,8 @@ interface KnowledgeOptions { await kb.query('search query', { limit: 10, // Max results threshold: 0.7, // Similarity threshold (0-1) + searchType: 'hybrid', // 'semantic' | 'keyword' | 'hybrid' (default: 'semantic') + semanticWeight: 0.7, // Weight for semantic vs keyword in hybrid search (0-1) filter: { // Metadata filters hasCode: true, category: { $in: ['api', 'guide'] }, @@ -180,6 +399,20 @@ await kb.query('search query', { }); ``` +### Utility Functions + +```typescript +import { keywordSearch, combineScores } from '@toolpack-sdk/knowledge'; + +// Manual keyword search +const score = keywordSearch('document content', 'search query'); +// Returns: number between 0-1 + +// Combine semantic and keyword scores +const combinedScore = combineScores(semanticScore, keywordScore, 0.7); +// Returns: weighted combination +``` + ### Metadata Filters ```typescript diff --git a/packages/toolpack-knowledge/examples/advanced-features.ts b/packages/toolpack-knowledge/examples/advanced-features.ts new file mode 100644 index 0000000..22c026a --- /dev/null +++ b/packages/toolpack-knowledge/examples/advanced-features.ts @@ -0,0 +1,97 @@ +import { + Knowledge, + MemoryProvider, + WebUrlSource, + ApiDataSource, + MarkdownSource, + OllamaEmbedder +} from '../src/index.js'; + +async function main() { + console.log('Creating advanced knowledge base...'); + + const kb = await Knowledge.create({ + provider: new MemoryProvider(), + sources: [ + // Web URL source - crawl websites + new WebUrlSource(['https://example.com', 'https://httpbin.org'], { + maxDepth: 2, + delayMs: 1000, // Be respectful to servers + maxChunkSize: 1500, + }), + + // API data source - index REST API data + 
new ApiDataSource('https://jsonplaceholder.typicode.com/posts', { + dataPath: '', // Root level array + contentExtractor: (item: any) => `${item.title}\n\n${item.body}`, + metadataExtractor: (item: any) => ({ + id: item.id, + userId: item.userId, + }), + }), + + // Traditional markdown source + new MarkdownSource('./docs/**/*.md'), + ], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + description: 'Advanced knowledge base with web crawling, API indexing, and hybrid search', + streamingBatchSize: 50, // Process in batches for large datasets + onSync: (event) => { + if (event.type === 'start') { + console.log('Starting sync...'); + } else if (event.type === 'complete') { + console.log(`Sync complete! Indexed ${event.chunksAffected} chunks`); + } + }, + onEmbeddingProgress: (event) => { + console.log(`Embedding progress: ${event.percent}% (${event.current}/${event.total})`); + }, + }); + + console.log('\n=== Semantic Search ==='); + const semanticResults = await kb.query('web development technologies', { + limit: 3, + searchType: 'semantic', + }); + + console.log(`Found ${semanticResults.length} semantic results:`); + for (const result of semanticResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + console.log('\n=== Keyword Search ==='); + const keywordResults = await kb.query('web development', { + limit: 3, + searchType: 'keyword', + }); + + console.log(`Found ${keywordResults.length} keyword results:`); + for (const result of keywordResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + console.log('\n=== Hybrid Search ==='); + const hybridResults = await kb.query('web development technologies', { + limit: 3, + 
searchType: 'hybrid', + semanticWeight: 0.6, // 60% semantic, 40% keyword + }); + + console.log(`Found ${hybridResults.length} hybrid results:`); + for (const result of hybridResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + await kb.stop(); +} + +main().catch(console.error); \ No newline at end of file diff --git a/packages/toolpack-knowledge/package.json b/packages/toolpack-knowledge/package.json index 1a497d9..aabcd10 100644 --- a/packages/toolpack-knowledge/package.json +++ b/packages/toolpack-knowledge/package.json @@ -31,7 +31,11 @@ "embeddings", "vector-search", "knowledge-base", - "sdk" + "sdk", + "web-crawling", + "api-indexing", + "hybrid-search", + "streaming-ingestion" ], "engines": { "node": ">=20" @@ -48,6 +52,7 @@ }, "dependencies": { "better-sqlite3": "^12.6.2", + "cheerio": "^1.0.0-rc.12", "fast-glob": "^3.3.3", "openai": "^6.18.0" }, diff --git a/packages/toolpack-knowledge/src/__tests__/api-source.test.ts b/packages/toolpack-knowledge/src/__tests__/api-source.test.ts new file mode 100644 index 0000000..5117a0b --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/api-source.test.ts @@ -0,0 +1,87 @@ +import { describe, it, expect, vi } from 'vitest'; +import { ApiDataSource } from '../sources/api.js'; + +// Mock fetch globally +global.fetch = vi.fn(); + +describe('ApiDataSource', () => { + it('should fetch and chunk API data', async () => { + const mockData = { + data: [ + { + id: 1, + title: 'First Item', + content: 'This is the content of the first item.', + author: 'Author 1', + }, + { + id: 2, + title: 'Second Item', + content: 'This is the content of the second item.', + author: 'Author 2', + }, + ], + }; + + (global.fetch as any).mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockData), + }); + + const source = new 
ApiDataSource('https://api.example.com/data', { + dataPath: 'data', + contentExtractor: (item: any) => `${item.title}\n\n${item.content}`, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBe(2); + expect(chunks[0].content).toContain('First Item'); + expect(chunks[0].content).toContain('content of the first item'); + expect(chunks[0].metadata.id).toBe(1); + expect(chunks[0].metadata.author).toBe('Author 1'); + expect(chunks[1].metadata.title).toBe('Second Item'); + }); + + it('should handle pagination', async () => { + const mockPage1 = { data: [{ id: 1, content: 'Page 1 content' }] }; + const mockPage2 = { data: [{ id: 2, content: 'Page 2 content' }] }; + const mockPage3 = { data: [] }; // Empty page to stop pagination + + (global.fetch as any) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage1), + }) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage2), + }) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage3), + }); + + const source = new ApiDataSource('https://api.example.com/data', { + dataPath: 'data', + pagination: { + param: 'page', + start: 1, + step: 1, + maxPages: 10, + }, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBe(2); + expect(chunks[0].metadata.id).toBe(1); + expect(chunks[1].metadata.id).toBe(2); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/__tests__/keyword.test.ts b/packages/toolpack-knowledge/src/__tests__/keyword.test.ts new file mode 100644 index 0000000..f0a115d --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/keyword.test.ts @@ -0,0 +1,46 @@ +import { describe, it, expect } from 'vitest'; +import { keywordSearch, combineScores } from '../../dist/index.js'; + +describe('keywordSearch', () => { + it('should return 1.0 for exact matches', () => { + 
const text = 'This is a test document with some content.'; + const query = 'test document'; + expect(keywordSearch(text, query)).toBe(1.0); + }); + + it('should return partial scores for word matches', () => { + const text = 'This is a test document with some content.'; + const query = 'test content extra'; + const score = keywordSearch(text, query); + expect(score).toBeGreaterThan(0); + expect(score).toBeLessThan(1.0); + }); + + it('should return 0 for no matches', () => { + const text = 'This is a test document.'; + const query = 'nonexistent'; + expect(keywordSearch(text, query)).toBe(0); + }); + + it('should handle case insensitive matching', () => { + const text = 'This is a TEST document.'; + const query = 'test'; + expect(keywordSearch(text, query)).toBe(1.0); + }); +}); + +describe('combineScores', () => { + it('should combine semantic and keyword scores', () => { + const semanticScore = 0.8; + const keywordScore = 0.6; + const combined = combineScores(semanticScore, keywordScore, 0.7); + expect(combined).toBe(0.8 * 0.7 + 0.6 * 0.3); + }); + + it('should handle equal weights', () => { + const semanticScore = 0.9; + const keywordScore = 0.5; + const combined = combineScores(semanticScore, keywordScore, 0.5); + expect(combined).toBe(0.7); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts b/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts new file mode 100644 index 0000000..5569921 --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect, vi } from 'vitest'; +import { WebUrlSource } from '../sources/web-url.js'; + +// Mock fetch globally +global.fetch = vi.fn(); + +describe('WebUrlSource', () => { + it('should crawl and chunk web pages', async () => { + const mockHtml = ` + <html> + <head><title>Test Page</title></head> + <body> + <h1>Main Title</h1> + <p>This is some content from a web page.</p> + <p>This is more content that should be extracted.</p> + Internal Link + External Link + </body> + </html> + `; + + (global.fetch as any).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockHtml), + }); + + const source = new WebUrlSource(['https://example.com'], { + maxDepth: 1, + delayMs: 0, // No delay for tests + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBeGreaterThan(0); + expect(chunks[0].content).toContain('Main Title'); + expect(chunks[0].content).toContain('content from a web page'); + expect(chunks[0].metadata.title).toBe('Test Page'); + expect(chunks[0].metadata.url).toBe('https://example.com'); + expect(chunks[0].metadata.source).toBe('web'); + }); + + it('should handle fetch errors gracefully', async () => { + (global.fetch as any).mockRejectedValueOnce(new Error('Network error')); + + const source = new WebUrlSource(['https://failing-url.com'], { + delayMs: 0, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + // Should not throw, just skip the failing URL + expect(chunks.length).toBe(0); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/index.ts b/packages/toolpack-knowledge/src/index.ts index 020414a..13a0ef3 100644 --- a/packages/toolpack-knowledge/src/index.ts +++ b/packages/toolpack-knowledge/src/index.ts @@ -11,8 +11,17 @@ export type { PersistentKnowledgeProviderOptions } from './providers/persistent.
export { MarkdownSource } from './sources/markdown.js'; export type { MarkdownSourceOptions } from './sources/markdown.js'; +export { WebUrlSource } from './sources/web-url.js'; +export type { WebUrlSourceOptions } from './sources/web-url.js'; + +export { ApiDataSource } from './sources/api.js'; +export type { ApiDataSourceOptions } from './sources/api.js'; + export { OllamaEmbedder } from './embedders/ollama.js'; export type { OllamaEmbedderOptions } from './embedders/ollama.js'; export { OpenAIEmbedder } from './embedders/openai.js'; export type { OpenAIEmbedderOptions } from './embedders/openai.js'; + +// Utility functions +export { keywordSearch, combineScores } from './utils/keyword.js'; diff --git a/packages/toolpack-knowledge/src/interfaces.ts b/packages/toolpack-knowledge/src/interfaces.ts index 5846f75..c7882c2 100644 --- a/packages/toolpack-knowledge/src/interfaces.ts +++ b/packages/toolpack-knowledge/src/interfaces.ts @@ -16,6 +16,8 @@ export interface QueryOptions { filter?: MetadataFilter; includeMetadata?: boolean; includeVectors?: boolean; + searchType?: 'semantic' | 'keyword' | 'hybrid'; + semanticWeight?: number; // For hybrid search, weight of semantic vs keyword (0-1) } export interface MetadataFilter { @@ -34,9 +36,11 @@ export interface QueryResult { export interface KnowledgeProvider { add(chunks: Chunk[]): Promise; query(queryVector: number[], options?: QueryOptions): Promise; + keywordQuery?(query: string, options?: QueryOptions): Promise; delete(ids: string[]): Promise; clear(): Promise; validateDimensions(dimensions: number): Promise; + getAllChunks?(): Promise; close?(): void; } diff --git a/packages/toolpack-knowledge/src/knowledge.ts b/packages/toolpack-knowledge/src/knowledge.ts index 51a105e..13790bd 100644 --- a/packages/toolpack-knowledge/src/knowledge.ts +++ b/packages/toolpack-knowledge/src/knowledge.ts @@ -1,4 +1,6 @@ import { KnowledgeProvider, KnowledgeSource, Embedder, QueryOptions, QueryResult, Chunk } from './interfaces.js'; 
+import { keywordSearch, combineScores } from './utils/keyword.js'; +import { matchesFilter } from './utils/cosine.js'; export interface KnowledgeOptions { provider: KnowledgeProvider; @@ -9,6 +11,7 @@ export interface KnowledgeOptions { onError?: ErrorHandler; onSync?: SyncEventHandler; onEmbeddingProgress?: EmbeddingProgressHandler; + streamingBatchSize?: number; } export type ErrorHandler = ( @@ -71,10 +74,134 @@ export class Knowledge { } async query(text: string, options?: QueryOptions): Promise { + const searchType = options?.searchType ?? 'semantic'; + const semanticWeight = options?.semanticWeight ?? 0.7; + + if (searchType === 'keyword') { + return this.keywordQuery(text, options); + } else if (searchType === 'hybrid') { + const [semanticResults, keywordResults] = await Promise.all([ + this.semanticQuery(text, options), + this.keywordQuery(text, options) + ]); + + return this.combineHybridResults(semanticResults, keywordResults, semanticWeight, options); + } else { + return this.semanticQuery(text, options); + } + } + + private async semanticQuery(text: string, options?: QueryOptions): Promise { const vector = await this.embedder.embed(text); return this.provider.query(vector, options); } + private async keywordQuery(text: string, options?: QueryOptions): Promise { + const { + limit = 10, + threshold = 0.1, + filter, + includeMetadata = true, + includeVectors = false, + } = options || {}; + + // Use provider's keywordQuery if available for better performance + if (typeof this.provider.keywordQuery === 'function') { + return this.provider.keywordQuery(text, options); + } + + // Fallback: get all chunks and score them in memory + const allChunks = await this.getAllChunks(); + + const results: QueryResult[] = []; + + for (const chunk of allChunks) { + if (filter && !matchesFilter(chunk.metadata, filter)) { + continue; + } + + const score = keywordSearch(chunk.content, text); + + if (score >= threshold) { + results.push({ + chunk: { + id: chunk.id, + content: 
chunk.content, + metadata: includeMetadata ? chunk.metadata : {}, + vector: includeVectors ? chunk.vector : undefined, + }, + score, + distance: 1 - score, + }); + } + } + + results.sort((a, b) => b.score - a.score); + return results.slice(0, limit); + } + + private combineHybridResults( + semanticResults: QueryResult[], + keywordResults: QueryResult[], + semanticWeight: number, + options?: QueryOptions + ): QueryResult[] { + const { + limit = 10, + threshold = 0.5, + includeMetadata = true, + includeVectors = false, + } = options || {}; + + // Create a map of chunk IDs to results for efficient lookup + const semanticMap = new Map(semanticResults.map(r => [r.chunk.id, r])); + const keywordMap = new Map(keywordResults.map(r => [r.chunk.id, r])); + + const combinedResults: QueryResult[] = []; + + // Combine results from both searches + const allIds = new Set([...semanticMap.keys(), ...keywordMap.keys()]); + + for (const id of allIds) { + const semanticResult = semanticMap.get(id); + const keywordResult = keywordMap.get(id); + + if (!semanticResult && !keywordResult) continue; + + const semanticScore = semanticResult?.score ?? 0; + const keywordScore = keywordResult?.score ?? 0; + const combinedScore = combineScores(semanticScore, keywordScore, semanticWeight); + + if (combinedScore >= threshold) { + combinedResults.push({ + chunk: { + id: id, + content: semanticResult?.chunk.content ?? keywordResult!.chunk.content, + metadata: includeMetadata ? (semanticResult?.chunk.metadata ?? keywordResult!.chunk.metadata) : {}, + vector: includeVectors ? (semanticResult?.chunk.vector ?? 
keywordResult!.chunk.vector) : undefined, + }, + score: combinedScore, + distance: 1 - combinedScore, + }); + } + } + + combinedResults.sort((a, b) => b.score - a.score); + return combinedResults.slice(0, limit); + } + + private async getAllChunks(): Promise { + if (typeof this.provider.getAllChunks === 'function') { + return this.provider.getAllChunks(); + } + + // Fallback: query with a dummy vector to get all chunks + // This won't work with all providers, but works with our current ones + const dummyVector = new Array(this.embedder.dimensions).fill(0); + return (await this.provider.query(dummyVector, { limit: 10000, threshold: 0 })) + .map(r => r.chunk); + } + async sync(): Promise { this.options.onSync?.({ type: 'start' }); @@ -83,21 +210,35 @@ export class Knowledge { await this.provider.clear(); await this.provider.validateDimensions(dimensions); - const allChunks: Chunk[] = []; - + const batchSize = this.options.streamingBatchSize ?? 100; + let totalChunks = 0; + const chunkBuffer: Chunk[] = []; + for (const source of this.sources) { for await (const chunk of source.load()) { - allChunks.push(chunk); + chunkBuffer.push(chunk); + + if (chunkBuffer.length >= batchSize) { + const embeddedBatch = await this.embedChunks(chunkBuffer); + if (embeddedBatch.length > 0) { + await this.provider.add(embeddedBatch); + totalChunks += embeddedBatch.length; + } + chunkBuffer.length = 0; // Clear buffer + } } } - const embeddedChunks = await this.embedChunks(allChunks); - - if (embeddedChunks.length > 0) { - await this.provider.add(embeddedChunks); + // Process remaining chunks + if (chunkBuffer.length > 0) { + const embeddedBatch = await this.embedChunks(chunkBuffer); + if (embeddedBatch.length > 0) { + await this.provider.add(embeddedBatch); + totalChunks += embeddedBatch.length; + } } - this.options.onSync?.({ type: 'complete', chunksAffected: embeddedChunks.length }); + this.options.onSync?.({ type: 'complete', chunksAffected: totalChunks }); } catch (error) { 
this.options.onSync?.({ type: 'error', error: error as Error }); throw error; diff --git a/packages/toolpack-knowledge/src/providers/memory.ts b/packages/toolpack-knowledge/src/providers/memory.ts index d9b03c0..3e34f22 100644 --- a/packages/toolpack-knowledge/src/providers/memory.ts +++ b/packages/toolpack-knowledge/src/providers/memory.ts @@ -1,6 +1,7 @@ import { KnowledgeProvider, Chunk, QueryOptions, QueryResult } from '../interfaces.js'; import { DimensionMismatchError, KnowledgeProviderError } from '../errors.js'; import { cosineSimilarity, matchesFilter } from '../utils/cosine.js'; +import { keywordSearch } from '../utils/keyword.js'; export interface MemoryProviderOptions { maxChunks?: number; @@ -77,6 +78,43 @@ export class MemoryProvider implements KnowledgeProvider { return results.slice(0, limit); } + async keywordQuery(query: string, options: QueryOptions = {}): Promise { + const { + limit = 10, + threshold = 0.1, + filter, + includeMetadata = true, + includeVectors = false, + } = options; + + const results: QueryResult[] = []; + + for (const { chunk, vector } of this.chunks.values()) { + if (filter && !matchesFilter(chunk.metadata, filter)) { + continue; + } + + const score = keywordSearch(chunk.content, query); + + if (score >= threshold) { + results.push({ + chunk: { + id: chunk.id, + content: chunk.content, + metadata: includeMetadata ? chunk.metadata : {}, + vector: includeVectors ? 
vector : undefined, + }, + score, + distance: 1 - score, + }); + } + } + + results.sort((a, b) => b.score - a.score); + + return results.slice(0, limit); + } + async delete(ids: string[]): Promise { for (const id of ids) { this.chunks.delete(id); @@ -87,4 +125,11 @@ export class MemoryProvider implements KnowledgeProvider { this.chunks.clear(); this.dimensions = undefined; } + + async getAllChunks(): Promise { + return Array.from(this.chunks.values()).map(({ chunk, vector }) => ({ + ...chunk, + vector, + })); + } } diff --git a/packages/toolpack-knowledge/src/providers/persistent.ts b/packages/toolpack-knowledge/src/providers/persistent.ts index e4127db..708a33a 100644 --- a/packages/toolpack-knowledge/src/providers/persistent.ts +++ b/packages/toolpack-knowledge/src/providers/persistent.ts @@ -5,6 +5,7 @@ import * as os from 'os'; import { KnowledgeProvider, Chunk, QueryOptions, QueryResult } from '../interfaces.js'; import { DimensionMismatchError, KnowledgeProviderError } from '../errors.js'; import { cosineSimilarity, matchesFilter } from '../utils/cosine.js'; +import { keywordSearch } from '../utils/keyword.js'; export interface PersistentKnowledgeProviderOptions { namespace: string; @@ -40,10 +41,29 @@ export class PersistentKnowledgeProvider implements KnowledgeProvider { synced_at INTEGER NOT NULL ); + CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5( + id, content, metadata + ); + CREATE TABLE IF NOT EXISTS provider_meta ( key TEXT PRIMARY KEY, value TEXT NOT NULL ); + + CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks + BEGIN + INSERT INTO chunks_fts (id, content, metadata) VALUES (new.id, new.content, new.metadata); + END; + + CREATE TRIGGER IF NOT EXISTS chunks_fts_delete AFTER DELETE ON chunks + BEGIN + DELETE FROM chunks_fts WHERE id = old.id; + END; + + CREATE TRIGGER IF NOT EXISTS chunks_fts_update AFTER UPDATE ON chunks + BEGIN + UPDATE chunks_fts SET content = new.content, metadata = new.metadata WHERE id = new.id; + 
END; `); } @@ -139,6 +159,65 @@ export class PersistentKnowledgeProvider implements KnowledgeProvider { return results.slice(0, limit); } + async keywordQuery(query: string, options: QueryOptions = {}): Promise { + const { + limit = 10, + threshold = 0.1, + filter, + includeMetadata = true, + includeVectors = false, + } = options; + + // Use FTS for efficient keyword search + const ftsQuery = query.split(/\s+/).map(term => `"${term}"`).join(' OR '); + const rows = this.db.prepare(` + SELECT c.id, c.content, c.metadata, c.vector, highlight(chunks_fts, 1, '', '') as highlighted + FROM chunks_fts fts + JOIN chunks c ON fts.id = c.id + WHERE chunks_fts MATCH ? + ORDER BY bm25(chunks_fts) DESC + LIMIT ? + `).all(ftsQuery, limit * 2) as Array<{ + id: string; + content: string; + metadata: string; + vector: Buffer; + highlighted: string; + }>; + + const results: QueryResult[] = []; + + for (const row of rows) { + const metadata = JSON.parse(row.metadata); + + if (filter && !matchesFilter(metadata, filter)) { + continue; + } + + // Use keywordSearch for scoring since FTS doesn't give scores directly + const score = keywordSearch(row.content, query); + + if (score >= threshold) { + const vector = new Float32Array(row.vector.buffer, row.vector.byteOffset, row.vector.byteLength / 4); + + results.push({ + chunk: { + id: row.id, + content: row.content, + metadata: includeMetadata ? metadata : {}, + vector: includeVectors ? 
Array.from(vector) : undefined, + }, + score, + distance: 1 - score, + }); + } + } + + results.sort((a, b) => b.score - a.score); + + return results.slice(0, limit); + } + async delete(ids: string[]): Promise { const del = this.db.prepare('DELETE FROM chunks WHERE id = ?'); const transaction = this.db.transaction((ids: string[]) => { @@ -155,6 +234,27 @@ export class PersistentKnowledgeProvider implements KnowledgeProvider { this.dimensions = undefined; } + async getAllChunks(): Promise { + const rows = this.db.prepare('SELECT id, content, metadata, vector FROM chunks').all() as Array<{ + id: string; + content: string; + metadata: string; + vector: Buffer; + }>; + + return rows.map(row => { + const metadata = JSON.parse(row.metadata); + const vector = new Float32Array(row.vector.buffer, row.vector.byteOffset, row.vector.byteLength / 4); + + return { + id: row.id, + content: row.content, + metadata, + vector: Array.from(vector), + }; + }); + } + shouldReSync(): boolean { if (this.options.reSync === false) { const count = this.db.prepare('SELECT COUNT(*) as count FROM chunks').get() as { count: number }; diff --git a/packages/toolpack-knowledge/src/sources/api.ts b/packages/toolpack-knowledge/src/sources/api.ts new file mode 100644 index 0000000..785be66 --- /dev/null +++ b/packages/toolpack-knowledge/src/sources/api.ts @@ -0,0 +1,231 @@ +import * as crypto from 'crypto'; +import { KnowledgeSource, Chunk } from '../interfaces.js'; +import { IngestionError } from '../errors.js'; +import { estimateTokens, splitLargeChunk, applyOverlap } from '../utils/chunking.js'; + +export interface ApiDataSourceOptions { + maxChunkSize?: number; + chunkOverlap?: number; + minChunkSize?: number; + namespace?: string; + metadata?: Record; + headers?: Record; + method?: 'GET' | 'POST'; + body?: unknown; + timeoutMs?: number; + pagination?: { + param: string; + start: number; + step: number; + maxPages?: number; + } | null; + dataPath?: string; // JSON path to extract data array (e.g., 
'data.items') + contentExtractor?: (item: unknown) => string; + metadataExtractor?: (item: unknown) => Record; +} + +export class ApiDataSource implements KnowledgeSource { + private options: ApiDataSourceOptions; + + constructor( + private url: string, + options: ApiDataSourceOptions = {} + ) { + this.options = { + maxChunkSize: options.maxChunkSize ?? 2000, + chunkOverlap: options.chunkOverlap ?? 200, + minChunkSize: options.minChunkSize ?? 100, + namespace: options.namespace ?? 'api', + metadata: options.metadata ?? {}, + headers: options.headers ?? {}, + method: options.method ?? 'GET', + timeoutMs: options.timeoutMs ?? 30000, + pagination: options.pagination, + dataPath: options.dataPath ?? '', + contentExtractor: options.contentExtractor ?? this.defaultContentExtractor, + metadataExtractor: options.metadataExtractor ?? this.defaultMetadataExtractor, + }; + } + + async *load(): AsyncIterable { + const items = await this.fetchData(); + + for (const item of items) { + try { + const chunks = this.chunkItem(item); + + for (const chunk of chunks) { + yield chunk; + } + } catch (error) { + throw new IngestionError(`Failed to process API item: ${(error as Error).message}`, this.url); + } + } + } + + private async fetchData(): Promise { + const allItems: unknown[] = []; + let page = this.options.pagination?.start ?? 0; + const maxPages = this.options.pagination?.maxPages ?? 
1; + + while (page < maxPages) { + const pageUrl = this.buildUrl(page); + const items = await this.fetchPage(pageUrl); + + if (items.length === 0) { + break; // No more data + } + + allItems.push(...items); + page++; + + if (!this.options.pagination) { + break; // No pagination configured + } + } + + return allItems; + } + + private buildUrl(page: number): string { + if (!this.options.pagination) { + return this.url; + } + + const url = new URL(this.url); + url.searchParams.set(this.options.pagination.param, page.toString()); + return url.href; + } + + private async fetchPage(url: string): Promise { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), this.options.timeoutMs); + + try { + const response = await fetch(url, { + method: this.options.method, + headers: { + 'Content-Type': 'application/json', + ...this.options.headers, + }, + body: this.options.body ? JSON.stringify(this.options.body) : undefined, + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const data = await response.json(); + return this.extractItems(data); + } finally { + clearTimeout(timeoutId); + } + } + + private extractItems(data: unknown): unknown[] { + if (!this.options.dataPath) { + return Array.isArray(data) ? data : [data]; + } + + const path = this.options.dataPath.split('.'); + let current: unknown = data; + + for (const key of path) { + if (current && typeof current === 'object' && key in current) { + current = (current as Record)[key]; + } else { + throw new Error(`Data path '${this.options.dataPath}' not found in response`); + } + } + + return Array.isArray(current) ? 
current : [current]; + } + + private chunkItem(item: unknown): Chunk[] { + const content = this.options.contentExtractor!(item); + const itemMetadata = this.options.metadataExtractor!(item); + + const tokens = estimateTokens(content); + + let itemChunks: string[]; + if (tokens > (this.options.maxChunkSize ?? 2000)) { + itemChunks = splitLargeChunk(content, this.options.maxChunkSize ?? 2000); + } else { + itemChunks = [content]; + } + + if ((this.options.chunkOverlap ?? 200) > 0 && itemChunks.length > 1) { + itemChunks = applyOverlap(itemChunks, this.options.chunkOverlap ?? 200); + } + + const chunks: Chunk[] = []; + + for (let i = 0; i < itemChunks.length; i++) { + const chunkContent = itemChunks[i]; + const chunkId = this.generateChunkId(item, chunkContent, i); + + chunks.push({ + id: chunkId, + content: chunkContent, + metadata: { + ...this.options.metadata, + ...itemMetadata, + source: 'api', + apiUrl: this.url, + chunkIndex: i, + totalChunks: itemChunks.length, + }, + }); + } + + return chunks; + } + + private defaultContentExtractor(item: unknown): string { + if (typeof item === 'string') { + return item; + } + + if (typeof item === 'object' && item !== null) { + // Try common content fields + const contentFields = ['content', 'text', 'description', 'body', 'message']; + + for (const field of contentFields) { + if (field in item && typeof (item as Record)[field] === 'string') { + return (item as Record)[field] as string; + } + } + + // Fallback to JSON string + return JSON.stringify(item); + } + + return String(item); + } + + private defaultMetadataExtractor(item: unknown): Record { + if (typeof item === 'object' && item !== null) { + const metadata: Record = {}; + + // Extract common metadata fields + const metadataFields = ['id', 'title', 'name', 'created_at', 'updated_at', 'author', 'tags']; + + for (const field of metadataFields) { + if (field in item) { + metadata[field] = (item as Record)[field]; + } + } + + return metadata; + } + + return {}; + } + + 
private generateChunkId(item: unknown, content: string, index: number): string { + const hash = crypto.createHash('md5').update(content).digest('hex').substring(0, 8); + const itemHash = crypto.createHash('md5').update(JSON.stringify(item)).digest('hex').substring(0, 8); + return `${this.options.namespace}:${itemHash}:${index}:${hash}`; + } +} \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/sources/web-url.ts b/packages/toolpack-knowledge/src/sources/web-url.ts new file mode 100644 index 0000000..1763820 --- /dev/null +++ b/packages/toolpack-knowledge/src/sources/web-url.ts @@ -0,0 +1,237 @@ +import * as crypto from 'crypto'; +import * as cheerio from 'cheerio'; +import { KnowledgeSource, Chunk } from '../interfaces.js'; +import { IngestionError } from '../errors.js'; +import { estimateTokens, splitLargeChunk, applyOverlap } from '../utils/chunking.js'; + +export interface WebUrlSourceOptions { + maxChunkSize?: number; + chunkOverlap?: number; + minChunkSize?: number; + namespace?: string; + metadata?: Record; + maxDepth?: number; + userAgent?: string; + delayMs?: number; + timeoutMs?: number; + sameDomainOnly?: boolean; + maxPagesPerDomain?: number; +} + +interface CrawledPage { + url: string; + title: string; + content: string; + links: string[]; +} + +export class WebUrlSource implements KnowledgeSource { + private options: Required; + private crawledUrls = new Set(); + private domainPageCount = new Map(); + private lastRequestTime = new Map(); + + constructor( + private urls: string[], + options: WebUrlSourceOptions = {} + ) { + this.options = { + maxChunkSize: options.maxChunkSize ?? 2000, + chunkOverlap: options.chunkOverlap ?? 200, + minChunkSize: options.minChunkSize ?? 100, + namespace: options.namespace ?? 'web', + metadata: options.metadata ?? {}, + maxDepth: options.maxDepth ?? 1, + userAgent: options.userAgent ?? 'Toolpack-Knowledge/1.0', + delayMs: options.delayMs ?? 1000, + timeoutMs: options.timeoutMs ?? 
30000, + sameDomainOnly: options.sameDomainOnly ?? true, + maxPagesPerDomain: options.maxPagesPerDomain ?? 10, + }; + } + + async *load(): AsyncIterable { + const pages = await this.crawlUrls(this.urls, 0); + + for (const page of pages) { + try { + const chunks = this.chunkPage(page); + + for (const chunk of chunks) { + yield chunk; + } + } catch (error) { + throw new IngestionError(`Failed to process URL ${page.url}: ${(error as Error).message}`, page.url); + } + } + } + + private async crawlUrls(urls: string[], depth: number): Promise { + if (depth >= this.options.maxDepth) { + return []; + } + + const pages: CrawledPage[] = []; + const newUrls: string[] = []; + const initialDomains = new Set(urls.map(url => new URL(url).hostname)); + + for (const url of urls) { + if (this.crawledUrls.has(url)) { + continue; + } + + const domain = new URL(url).hostname; + const pageCount = this.domainPageCount.get(domain) ?? 0; + + if (this.options.sameDomainOnly && !initialDomains.has(domain)) { + continue; // Skip external domains + } + + if (pageCount >= this.options.maxPagesPerDomain) { + continue; // Skip if too many pages from this domain + } + + this.crawledUrls.add(url); + this.domainPageCount.set(domain, pageCount + 1); + + try { + // Rate limiting per domain + const lastTime = this.lastRequestTime.get(domain) ?? 
0; + const timeSince = Date.now() - lastTime; + if (timeSince < this.options.delayMs) { + await new Promise(resolve => setTimeout(resolve, this.options.delayMs - timeSince)); + } + + const page = await this.fetchPage(url); + pages.push(page); + this.lastRequestTime.set(domain, Date.now()); + + if (depth < this.options.maxDepth - 1) { + newUrls.push(...page.links); + } + } catch (error) { + console.warn(`Failed to crawl ${url}: ${(error as Error).message}`); + } + } + + if (newUrls.length > 0) { + const subPages = await this.crawlUrls(newUrls, depth + 1); + pages.push(...subPages); + } + + return pages; + } + + private async fetchPage(url: string): Promise { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), this.options.timeoutMs); + + try { + const response = await fetch(url, { + signal: controller.signal, + headers: { + 'User-Agent': this.options.userAgent, + }, + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + const $ = cheerio.load(html); + + // Remove script and style elements + $('script, style, nav, header, footer, aside').remove(); + + // Extract title + const title = $('title').text().trim() || $('h1').first().text().trim() || 'Untitled'; + + // Extract main content + const contentSelectors = ['main', 'article', '.content', '#content', 'body']; + let content = ''; + + for (const selector of contentSelectors) { + const element = $(selector); + if (element.length > 0) { + content = element.text().trim(); + break; + } + } + + if (!content) { + content = $('body').text().trim(); + } + + // Clean up whitespace + content = content.replace(/\s+/g, ' ').trim(); + + // Extract links + const links: string[] = []; + $('a[href]').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + try { + const absoluteUrl = new URL(href, url).href; + if (absoluteUrl.startsWith('http') && 
!absoluteUrl.includes('#')) { + links.push(absoluteUrl); + } + } catch { + // Invalid URL, skip + } + } + }); + + return { + url, + title, + content, + links: [...new Set(links)], // Remove duplicates + }; + } finally { + clearTimeout(timeoutId); + } + } + + private chunkPage(page: CrawledPage): Chunk[] { + const chunks: Chunk[] = []; + const tokens = estimateTokens(page.content); + + let pageChunks: string[]; + if (tokens > this.options.maxChunkSize) { + pageChunks = splitLargeChunk(page.content, this.options.maxChunkSize); + } else { + pageChunks = [page.content]; + } + + if (this.options.chunkOverlap > 0 && pageChunks.length > 1) { + pageChunks = applyOverlap(pageChunks, this.options.chunkOverlap); + } + + for (let i = 0; i < pageChunks.length; i++) { + const chunkContent = pageChunks[i]; + const chunkId = this.generateChunkId(page.url, chunkContent, i); + + chunks.push({ + id: chunkId, + content: chunkContent, + metadata: { + ...this.options.metadata, + title: page.title, + url: page.url, + source: 'web', + chunkIndex: i, + totalChunks: pageChunks.length, + }, + }); + } + + return chunks; + } + + private generateChunkId(url: string, content: string, index: number): string { + const hash = crypto.createHash('md5').update(content).digest('hex').substring(0, 8); + const urlHash = crypto.createHash('md5').update(url).digest('hex').substring(0, 8); + return `${this.options.namespace}:${urlHash}:${index}:${hash}`; + } +} \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/utils/keyword.ts b/packages/toolpack-knowledge/src/utils/keyword.ts new file mode 100644 index 0000000..1ac7640 --- /dev/null +++ b/packages/toolpack-knowledge/src/utils/keyword.ts @@ -0,0 +1,29 @@ +export function keywordSearch(text: string, query: string): number { + const textLower = text.toLowerCase(); + const queryLower = query.toLowerCase(); + + // Exact match gets highest score + if (textLower.includes(queryLower)) { + return 1.0; + } + + // Word-level matching + const 
queryWords = queryLower.split(/\s+/).filter(word => word.length > 2); + if (queryWords.length === 0) { + return 0.0; + } + + let matchCount = 0; + for (const word of queryWords) { + if (textLower.includes(word)) { + matchCount++; + } + } + + return matchCount / queryWords.length; +} + +export function combineScores(semanticScore: number, keywordScore: number, semanticWeight: number = 0.7): number { + const keywordWeight = 1 - semanticWeight; + return semanticScore * semanticWeight + keywordScore * keywordWeight; +} \ No newline at end of file