diff --git a/package-lock.json b/package-lock.json index d1b3279..249a0ee 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1396,7 +1396,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1410,7 +1409,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1424,7 +1422,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1438,7 +1435,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1452,7 +1448,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1466,7 +1461,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1480,7 +1474,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1494,7 +1487,6 @@ "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1508,7 +1500,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1522,7 +1513,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1536,7 +1526,6 @@ "cpu": [ "loong64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1550,7 +1539,6 @@ "cpu": [ "loong64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1564,7 +1552,6 @@ "cpu": [ "ppc64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1578,7 +1565,6 @@ "cpu": [ "ppc64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1592,7 +1578,6 @@ "cpu": [ "riscv64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1606,7 +1591,6 @@ "cpu": [ "riscv64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1620,7 +1604,6 @@ "cpu": [ "s390x" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1634,7 +1617,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1648,7 +1630,6 @@ "cpu": [ "x64" ], - "dev": true, 
"license": "MIT", "optional": true, "os": [ @@ -1662,7 +1643,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1676,7 +1656,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1690,7 +1669,6 @@ "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1704,7 +1682,6 @@ "cpu": [ "ia32" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1718,7 +1695,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1732,7 +1708,6 @@ "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -1879,7 +1854,7 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, + "devOptional": true, "license": "MIT" }, "node_modules/@types/json-schema": { @@ -1903,7 +1878,6 @@ "version": "25.5.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.0.tgz", "integrity": "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw==", - "devOptional": true, "license": "MIT", "dependencies": { "undici-types": "~7.18.0" @@ -3776,7 +3750,6 @@ "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, "hasInstallScript": true, "license": "MIT", "optional": true, @@ -4704,6 +4677,18 @@ "node": ">=8.6" } }, + "node_modules/micromatch/node_modules/picomatch": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": 
"https://github.com/sponsors/jonschlinkert" + } + }, "node_modules/mimic-response": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", @@ -18245,12 +18230,13 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", - "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", + "dev": true, "license": "MIT", "engines": { - "node": ">=8.6" + "node": ">=12" }, "funding": { "url": "https://github.com/sponsors/jonschlinkert" @@ -18673,7 +18659,7 @@ "version": "4.60.1", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.1.tgz", "integrity": "sha512-VmtB2rFU/GroZ4oL8+ZqXgSA38O6GR8KSIvWmEFv63pQ0G6KaBH9s07PO8XTXP4vI+3UJUEypOfjkGfmSBBR0w==", - "dev": true, + "devOptional": true, "license": "MIT", "dependencies": { "@types/estree": "1.0.8" @@ -19230,19 +19216,6 @@ "url": "https://github.com/sponsors/SuperchupuDev" } }, - "node_modules/tinyglobby/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/tinyrainbow": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", @@ -19490,7 +19463,7 @@ "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", "integrity": 
"sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", @@ -19544,7 +19517,6 @@ "version": "7.18.2", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", - "devOptional": true, "license": "MIT" }, "node_modules/uri-js": { @@ -19648,19 +19620,6 @@ } } }, - "node_modules/vite/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/vitest": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/vitest/-/vitest-4.1.2.tgz", @@ -19743,19 +19702,6 @@ } } }, - "node_modules/vitest/node_modules/picomatch": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", - "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/web-tree-sitter": { "version": "0.22.6", "resolved": "https://registry.npmjs.org/web-tree-sitter/-/web-tree-sitter-0.22.6.tgz", @@ -20011,6 +19957,7 @@ "license": "Apache-2.0", "dependencies": { "better-sqlite3": "^12.6.2", + "cheerio": "^1.0.0-rc.12", "fast-glob": "^3.3.3", "openai": "^6.18.0" }, @@ -20026,7 +19973,7 @@ } }, "packages/toolpack-sdk": { - "version": "1.3.0", + "version": "1.2.0", "license": "Apache-2.0", "dependencies": { "@anthropic-ai/sdk": "^0.73.0", diff --git 
a/packages/toolpack-knowledge/README.md b/packages/toolpack-knowledge/README.md index 8acfd9d..bbf92fe 100644 --- a/packages/toolpack-knowledge/README.md +++ b/packages/toolpack-knowledge/README.md @@ -1,6 +1,6 @@ # toolpack-knowledge -RAG (Retrieval-Augmented Generation) package for Toolpack SDK. +RAG (Retrieval-Augmented Generation) package for Toolpack SDK with advanced features for web crawling, API indexing, streaming ingestion, and hybrid search. ## Installation @@ -54,6 +54,38 @@ const results = await kb.query('authentication setup', { }); ``` +### Advanced Usage + +```typescript +import { Knowledge, WebUrlSource, ApiDataSource, PersistentKnowledgeProvider, OllamaEmbedder } from '@toolpack-sdk/knowledge'; + +// Web crawling + API indexing with hybrid search +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'advanced-docs' }), + sources: [ + new WebUrlSource(['https://docs.example.com'], { + maxDepth: 2, + delayMs: 1000, + }), + new ApiDataSource('https://api.example.com/docs', { + pagination: { param: 'page', start: 1, maxPages: 5 }, + contentExtractor: (doc) => `${doc.title}\n\n${doc.content}`, + }), + ], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + streamingBatchSize: 50, // Efficient processing of large datasets + description: 'Comprehensive documentation from web and API sources.', +}); + +// Hybrid search combining semantic and keyword matching +const results = await kb.query('authentication setup', { + searchType: 'hybrid', + semanticWeight: 0.6, // 60% semantic, 40% keyword + limit: 10, + threshold: 0.7, +}); +``` + ### Agent Integration ```typescript @@ -75,6 +107,126 @@ const toolpack = await Toolpack.init({ const response = await toolpack.chat('How do I configure authentication?'); ``` +## Advanced Features + +### Web URL Sources + +Crawl and index websites with automatic HTML parsing and link following. 
+ +```typescript +import { WebUrlSource } from '@toolpack-sdk/knowledge'; + +const webSource = new WebUrlSource(['https://docs.example.com'], { + maxDepth: 3, // Follow links up to 3 levels deep + delayMs: 1000, // Respectful crawling delay + userAgent: 'MyApp/1.0', // Custom user agent + maxChunkSize: 1500, // Chunk size for web content + timeoutMs: 30000, // Request timeout +}); + +const kb = await Knowledge.create({ + provider: new MemoryProvider(), + sources: [webSource], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + description: 'Web documentation and guides.', +}); +``` + +**Features:** +- Recursive website crawling with depth control +- Automatic HTML text extraction (removes scripts/styles) +- Link discovery and following +- Respectful crawling with configurable delays +- Metadata includes title, URL, and source type + +### API Data Sources + +Index data from REST APIs with pagination support. + +```typescript +import { ApiDataSource } from '@toolpack-sdk/knowledge'; + +const apiSource = new ApiDataSource('https://api.github.com/repos/toolpack-ai/toolpack-sdk/issues', { + headers: { + 'Authorization': `Bearer ${process.env.GITHUB_TOKEN}`, + 'Accept': 'application/vnd.github.v3+json', + }, + pagination: { + param: 'page', + start: 1, + maxPages: 5, + }, + dataPath: '', // Root level array + contentExtractor: (issue: any) => `${issue.title}\n\n${issue.body}`, + metadataExtractor: (issue: any) => ({ + id: issue.id, + state: issue.state, + labels: issue.labels?.map(l => l.name), + }), +}); + +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'github-issues' }), + sources: [apiSource], + embedder: new OpenAIEmbedder({ model: 'text-embedding-3-small' }), + description: 'GitHub issues and discussions.', +}); +``` + +**Features:** +- REST API data ingestion (GET/POST) +- Automatic pagination handling +- Custom content and metadata extractors +- JSON path support for nested data +- Flexible data 
transformation + +### Streaming Ingestion + +Process large datasets efficiently with batch processing. + +```typescript +const kb = await Knowledge.create({ + provider: new PersistentKnowledgeProvider({ namespace: 'large-dataset' }), + sources: [new ApiDataSource('https://api.example.com/large-dataset')], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + streamingBatchSize: 50, // Process 50 chunks at a time + description: 'Large dataset with streaming ingestion.', + onEmbeddingProgress: (event) => { + console.log(`Processed: ${event.current}/${event.total} chunks`); + }, +}); +``` + +### Hybrid Search + +Combine semantic and keyword search for better results. + +```typescript +// Semantic search (default) +const semanticResults = await kb.query('machine learning algorithms', { + searchType: 'semantic', + limit: 5, +}); + +// Keyword search +const keywordResults = await kb.query('machine learning algorithms', { + searchType: 'keyword', + limit: 5, +}); + +// Hybrid search (recommended) +const hybridResults = await kb.query('machine learning algorithms', { + searchType: 'hybrid', + semanticWeight: 0.7, // 70% semantic, 30% keyword + limit: 5, +}); +``` + +**Search Types:** +- `semantic` — Vector similarity search (default) +- `keyword` — Text matching search +- `hybrid` — Combined semantic + keyword search + ## Providers ### MemoryProvider @@ -121,6 +273,70 @@ new MarkdownSource('./docs/**/*.md', { - Code block detection (`hasCode` metadata) - Deterministic chunk IDs +### WebUrlSource + +Crawl and index web pages with HTML parsing. 
+ +```typescript +new WebUrlSource(['https://example.com', 'https://docs.example.com'], { + maxDepth: 2, // Crawl depth (default: 1) + delayMs: 1000, // Delay between requests (default: 1000ms) + userAgent: 'MyApp/1.0', // Custom user agent + maxChunkSize: 2000, // Max tokens per chunk + chunkOverlap: 200, // Overlap between chunks + timeoutMs: 30000, // Request timeout (default: 30000ms) + namespace: 'web', // Chunk ID prefix + metadata: { source: 'web' }, // Added to all chunks +}) +``` + +**Features:** +- Recursive website crawling +- Automatic HTML text extraction +- Link discovery and following +- Respectful crawling with delays +- Error handling for failed requests + +### ApiDataSource + +Index data from REST APIs with pagination. + +```typescript +new ApiDataSource('https://api.example.com/data', { + method: 'GET', // HTTP method (default: 'GET') + headers: { // Request headers + 'Authorization': 'Bearer token', + 'Content-Type': 'application/json', + }, + body: JSON.stringify({}), // Request body for POST + pagination: { // Pagination config + param: 'page', // Query param name + start: 1, // Starting page number + step: 1, // Page increment + maxPages: 10, // Max pages to fetch + }, + dataPath: 'data.items', // JSON path to data array + contentExtractor: (item) => // Custom content extraction + `${item.title}\n\n${item.description}`, + metadataExtractor: (item) => ({ // Custom metadata extraction + id: item.id, + category: item.category, + }), + maxChunkSize: 2000, // Max tokens per chunk + chunkOverlap: 200, // Overlap between chunks + timeoutMs: 30000, // Request timeout + namespace: 'api', // Chunk ID prefix + metadata: { source: 'api' }, // Added to all chunks +}) +``` + +**Features:** +- REST API data ingestion +- Automatic pagination handling +- Custom data extractors +- JSON path support +- Flexible content transformation + ## Embedders ### OllamaEmbedder @@ -159,6 +375,7 @@ interface KnowledgeOptions { embedder: Embedder; description: string; // 
Required: used as tool description reSync?: boolean; // default: true + streamingBatchSize?: number; // Process chunks in batches (default: 100) onError?: (error, context) => 'skip' | 'abort'; onSync?: (event: SyncEvent) => void; onEmbeddingProgress?: (event: EmbeddingProgressEvent) => void; @@ -171,6 +388,8 @@ interface KnowledgeOptions { await kb.query('search query', { limit: 10, // Max results threshold: 0.7, // Similarity threshold (0-1) + searchType: 'hybrid', // 'semantic' | 'keyword' | 'hybrid' (default: 'semantic') + semanticWeight: 0.7, // Weight for semantic vs keyword in hybrid search (0-1) filter: { // Metadata filters hasCode: true, category: { $in: ['api', 'guide'] }, @@ -180,6 +399,20 @@ await kb.query('search query', { }); ``` +### Utility Functions + +```typescript +import { keywordSearch, combineScores } from '@toolpack-sdk/knowledge'; + +// Manual keyword search +const score = keywordSearch('document content', 'search query'); +// Returns: number between 0-1 + +// Combine semantic and keyword scores +const combinedScore = combineScores(semanticScore, keywordScore, 0.7); +// Returns: weighted combination +``` + ### Metadata Filters ```typescript diff --git a/packages/toolpack-knowledge/examples/advanced-features.ts b/packages/toolpack-knowledge/examples/advanced-features.ts new file mode 100644 index 0000000..22c026a --- /dev/null +++ b/packages/toolpack-knowledge/examples/advanced-features.ts @@ -0,0 +1,97 @@ +import { + Knowledge, + MemoryProvider, + WebUrlSource, + ApiDataSource, + MarkdownSource, + OllamaEmbedder +} from '../src/index.js'; + +async function main() { + console.log('Creating advanced knowledge base...'); + + const kb = await Knowledge.create({ + provider: new MemoryProvider(), + sources: [ + // Web URL source - crawl websites + new WebUrlSource(['https://example.com', 'https://httpbin.org'], { + maxDepth: 2, + delayMs: 1000, // Be respectful to servers + maxChunkSize: 1500, + }), + + // API data source - index REST API data + 
new ApiDataSource('https://jsonplaceholder.typicode.com/posts', { + dataPath: '', // Root level array + contentExtractor: (item: any) => `${item.title}\n\n${item.body}`, + metadataExtractor: (item: any) => ({ + id: item.id, + userId: item.userId, + }), + }), + + // Traditional markdown source + new MarkdownSource('./docs/**/*.md'), + ], + embedder: new OllamaEmbedder({ model: 'nomic-embed-text' }), + description: 'Advanced knowledge base with web crawling, API indexing, and hybrid search', + streamingBatchSize: 50, // Process in batches for large datasets + onSync: (event) => { + if (event.type === 'start') { + console.log('Starting sync...'); + } else if (event.type === 'complete') { + console.log(`Sync complete! Indexed ${event.chunksAffected} chunks`); + } + }, + onEmbeddingProgress: (event) => { + console.log(`Embedding progress: ${event.percent}% (${event.current}/${event.total})`); + }, + }); + + console.log('\n=== Semantic Search ==='); + const semanticResults = await kb.query('web development technologies', { + limit: 3, + searchType: 'semantic', + }); + + console.log(`Found ${semanticResults.length} semantic results:`); + for (const result of semanticResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + console.log('\n=== Keyword Search ==='); + const keywordResults = await kb.query('web development', { + limit: 3, + searchType: 'keyword', + }); + + console.log(`Found ${keywordResults.length} keyword results:`); + for (const result of keywordResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + console.log('\n=== Hybrid Search ==='); + const hybridResults = await kb.query('web development technologies', { + limit: 3, + 
searchType: 'hybrid', + semanticWeight: 0.6, // 60% semantic, 40% keyword + }); + + console.log(`Found ${hybridResults.length} hybrid results:`); + for (const result of hybridResults) { + console.log(`Score: ${result.score.toFixed(3)}`); + console.log(`Content: ${result.chunk.content.substring(0, 100)}...`); + console.log(`Source: ${result.chunk.metadata.source}`); + console.log('---\n'); + } + + await kb.stop(); +} + +main().catch(console.error); \ No newline at end of file diff --git a/packages/toolpack-knowledge/package.json b/packages/toolpack-knowledge/package.json index 1a497d9..aabcd10 100644 --- a/packages/toolpack-knowledge/package.json +++ b/packages/toolpack-knowledge/package.json @@ -31,7 +31,11 @@ "embeddings", "vector-search", "knowledge-base", - "sdk" + "sdk", + "web-crawling", + "api-indexing", + "hybrid-search", + "streaming-ingestion" ], "engines": { "node": ">=20" @@ -48,6 +52,7 @@ }, "dependencies": { "better-sqlite3": "^12.6.2", + "cheerio": "^1.0.0-rc.12", "fast-glob": "^3.3.3", "openai": "^6.18.0" }, diff --git a/packages/toolpack-knowledge/src/__tests__/api-source.test.ts b/packages/toolpack-knowledge/src/__tests__/api-source.test.ts new file mode 100644 index 0000000..5117a0b --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/api-source.test.ts @@ -0,0 +1,87 @@ +import { describe, it, expect, vi } from 'vitest'; +import { ApiDataSource } from '../sources/api.js'; + +// Mock fetch globally +global.fetch = vi.fn(); + +describe('ApiDataSource', () => { + it('should fetch and chunk API data', async () => { + const mockData = { + data: [ + { + id: 1, + title: 'First Item', + content: 'This is the content of the first item.', + author: 'Author 1', + }, + { + id: 2, + title: 'Second Item', + content: 'This is the content of the second item.', + author: 'Author 2', + }, + ], + }; + + (global.fetch as any).mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockData), + }); + + const source = new 
ApiDataSource('https://api.example.com/data', { + dataPath: 'data', + contentExtractor: (item: any) => `${item.title}\n\n${item.content}`, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBe(2); + expect(chunks[0].content).toContain('First Item'); + expect(chunks[0].content).toContain('content of the first item'); + expect(chunks[0].metadata.id).toBe(1); + expect(chunks[0].metadata.author).toBe('Author 1'); + expect(chunks[1].metadata.title).toBe('Second Item'); + }); + + it('should handle pagination', async () => { + const mockPage1 = { data: [{ id: 1, content: 'Page 1 content' }] }; + const mockPage2 = { data: [{ id: 2, content: 'Page 2 content' }] }; + const mockPage3 = { data: [] }; // Empty page to stop pagination + + (global.fetch as any) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage1), + }) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage2), + }) + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve(mockPage3), + }); + + const source = new ApiDataSource('https://api.example.com/data', { + dataPath: 'data', + pagination: { + param: 'page', + start: 1, + step: 1, + maxPages: 10, + }, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBe(2); + expect(chunks[0].metadata.id).toBe(1); + expect(chunks[1].metadata.id).toBe(2); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/__tests__/keyword.test.ts b/packages/toolpack-knowledge/src/__tests__/keyword.test.ts new file mode 100644 index 0000000..f0a115d --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/keyword.test.ts @@ -0,0 +1,46 @@ +import { describe, it, expect } from 'vitest'; +import { keywordSearch, combineScores } from '../utils/keyword.js'; // test the source module directly, not a possibly stale build in dist/ + +describe('keywordSearch', () => { + it('should return 1.0 for exact matches', () => { +
const text = 'This is a test document with some content.'; + const query = 'test document'; + expect(keywordSearch(text, query)).toBe(1.0); + }); + + it('should return partial scores for word matches', () => { + const text = 'This is a test document with some content.'; + const query = 'test content extra'; + const score = keywordSearch(text, query); + expect(score).toBeGreaterThan(0); + expect(score).toBeLessThan(1.0); + }); + + it('should return 0 for no matches', () => { + const text = 'This is a test document.'; + const query = 'nonexistent'; + expect(keywordSearch(text, query)).toBe(0); + }); + + it('should handle case insensitive matching', () => { + const text = 'This is a TEST document.'; + const query = 'test'; + expect(keywordSearch(text, query)).toBe(1.0); + }); +}); + +describe('combineScores', () => { + it('should combine semantic and keyword scores', () => { + const semanticScore = 0.8; + const keywordScore = 0.6; + const combined = combineScores(semanticScore, keywordScore, 0.7); + expect(combined).toBeCloseTo(0.8 * 0.7 + 0.6 * 0.3, 10); // tolerance-based: weights such as 0.7 are not exact in binary floating point + }); + + it('should handle equal weights', () => { + const semanticScore = 0.9; + const keywordScore = 0.5; + const combined = combineScores(semanticScore, keywordScore, 0.5); + expect(combined).toBeCloseTo(0.7, 10); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts b/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts new file mode 100644 index 0000000..5569921 --- /dev/null +++ b/packages/toolpack-knowledge/src/__tests__/web-url-source.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect, vi } from 'vitest'; +import { WebUrlSource } from '../sources/web-url.js'; + +// Mock fetch globally +global.fetch = vi.fn(); + +describe('WebUrlSource', () => { + it('should crawl and chunk web pages', async () => { + const mockHtml = ` + +
This is some content from a web page.
+This is more content that should be extracted.
+ Internal Link + External Link + + + `; + + (global.fetch as any).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockHtml), + }); + + const source = new WebUrlSource(['https://example.com'], { + maxDepth: 1, + delayMs: 0, // No delay for tests + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + expect(chunks.length).toBeGreaterThan(0); + expect(chunks[0].content).toContain('Main Title'); + expect(chunks[0].content).toContain('content from a web page'); + expect(chunks[0].metadata.title).toBe('Test Page'); + expect(chunks[0].metadata.url).toBe('https://example.com'); + expect(chunks[0].metadata.source).toBe('web'); + }); + + it('should handle fetch errors gracefully', async () => { + (global.fetch as any).mockRejectedValueOnce(new Error('Network error')); + + const source = new WebUrlSource(['https://failing-url.com'], { + delayMs: 0, + }); + + const chunks = []; + for await (const chunk of source.load()) { + chunks.push(chunk); + } + + // Should not throw, just skip the failing URL + expect(chunks.length).toBe(0); + }); +}); \ No newline at end of file diff --git a/packages/toolpack-knowledge/src/index.ts b/packages/toolpack-knowledge/src/index.ts index 020414a..13a0ef3 100644 --- a/packages/toolpack-knowledge/src/index.ts +++ b/packages/toolpack-knowledge/src/index.ts @@ -11,8 +11,17 @@ export type { PersistentKnowledgeProviderOptions } from './providers/persistent. 
export { MarkdownSource } from './sources/markdown.js'; export type { MarkdownSourceOptions } from './sources/markdown.js'; +export { WebUrlSource } from './sources/web-url.js'; +export type { WebUrlSourceOptions } from './sources/web-url.js'; + +export { ApiDataSource } from './sources/api.js'; +export type { ApiDataSourceOptions } from './sources/api.js'; + export { OllamaEmbedder } from './embedders/ollama.js'; export type { OllamaEmbedderOptions } from './embedders/ollama.js'; export { OpenAIEmbedder } from './embedders/openai.js'; export type { OpenAIEmbedderOptions } from './embedders/openai.js'; + +// Utility functions +export { keywordSearch, combineScores } from './utils/keyword.js'; diff --git a/packages/toolpack-knowledge/src/interfaces.ts b/packages/toolpack-knowledge/src/interfaces.ts index 5846f75..c7882c2 100644 --- a/packages/toolpack-knowledge/src/interfaces.ts +++ b/packages/toolpack-knowledge/src/interfaces.ts @@ -16,6 +16,8 @@ export interface QueryOptions { filter?: MetadataFilter; includeMetadata?: boolean; includeVectors?: boolean; + searchType?: 'semantic' | 'keyword' | 'hybrid'; + semanticWeight?: number; // For hybrid search, weight of semantic vs keyword (0-1) } export interface MetadataFilter { @@ -34,9 +36,11 @@ export interface QueryResult { export interface KnowledgeProvider { add(chunks: Chunk[]): Promise