diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 73f76f02..da66f7e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,50 +46,20 @@ jobs: chmod +x ci.sh ./ci.sh frontend - electron: - runs-on: ubuntu-latest - timeout-minutes: 10 - - steps: - - uses: actions/checkout@v4 - - - name: Use Node.js 20 - uses: actions/setup-node@v4 - with: - node-version: '20' - cache: 'npm' - cache-dependency-path: electron/package-lock.json - - - name: Install system dependencies for Electron - run: | - sudo apt-get update - sudo apt-get install -y xvfb libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 - - - name: Start Xvfb - run: | - Xvfb :99 -screen 0 1024x768x24 > /dev/null 2>&1 & - echo "DISPLAY=:99" >> $GITHUB_ENV - - - name: Run Electron CI - run: | - chmod +x ci.sh - ./ci.sh electron - # This job will only run if all jobs succeed # The workflow will fail if any job fails check-results: runs-on: ubuntu-latest - needs: [backend, frontend, electron] + needs: [backend, frontend] if: always() steps: - name: Check all jobs succeeded run: | - if [[ "${{ needs.backend.result }}" != "success" || "${{ needs.frontend.result }}" != "success" || "${{ needs.electron.result }}" != "success" ]]; then + if [[ "${{ needs.backend.result }}" != "success" || "${{ needs.frontend.result }}" != "success" ]]; then echo "One or more jobs failed:" echo "Backend: ${{ needs.backend.result }}" echo "Frontend: ${{ needs.frontend.result }}" - echo "Electron: ${{ needs.electron.result }}" exit 1 fi echo "All jobs succeeded!" 
diff --git a/backend/.env.example b/backend/.env.example index a4bfdafb..26544e5a 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -24,4 +24,7 @@ CHECKPOINT_MIN_CHARACTERS=500 # Provider timeout settings (in milliseconds) # PROVIDER_TIMEOUT_MS=10000 # PROVIDER_MODEL_FETCH_TIMEOUT_MS=3000 -# PROVIDER_STREAM_TIMEOUT_MS=300000 + +# Playwright settings (for browser-based tools) +PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 +# PLAYWRIGHT_EXECUTABLE_PATH=/usr/bin/chromium-browser diff --git a/backend/Dockerfile b/backend/Dockerfile index e86b2670..bebc0d23 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,25 +1,39 @@ -ARG NODE_IMAGE=node:20.18.0-alpine3.20 +ARG NODE_IMAGE=node:20.18.0-bookworm-slim FROM ${NODE_IMAGE} AS deps WORKDIR /app COPY package*.json ./ RUN chown -R node:node /app +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH && chown node:node $PLAYWRIGHT_BROWSERS_PATH USER node RUN npm ci +RUN npx playwright install chromium ARG NODE_IMAGE FROM ${NODE_IMAGE} AS prod-deps WORKDIR /app COPY package*.json ./ RUN chown -R node:node /app +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH && chown node:node $PLAYWRIGHT_BROWSERS_PATH USER node RUN npm ci --omit=dev +RUN npx playwright install chromium ARG NODE_IMAGE FROM ${NODE_IMAGE} AS dev WORKDIR /app ENV NODE_ENV=development COPY --chown=node:node package*.json ./ -RUN apk add --no-cache su-exec sqlite-libs +RUN apt-get update && apt-get install -y --no-install-recommends \ + gosu \ + libsqlite3-0 \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +COPY --from=deps --chown=node:node $PLAYWRIGHT_BROWSERS_PATH $PLAYWRIGHT_BROWSERS_PATH +RUN npx playwright install-deps chromium COPY --from=deps --chown=node:node /app/node_modules ./node_modules COPY --chown=node:node . . 
RUN chmod +x entrypoint.sh @@ -38,7 +52,15 @@ ENV PORT=3001 ENV INSTALL_ON_START=0 COPY --from=prod-deps --chown=node:node /app/node_modules ./node_modules COPY --chown=node:node . . -RUN apk add --no-cache su-exec +RUN apt-get update && apt-get install -y --no-install-recommends \ + gosu \ + libsqlite3-0 \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +COPY --from=prod-deps --chown=node:node $PLAYWRIGHT_BROWSERS_PATH $PLAYWRIGHT_BROWSERS_PATH +RUN npx playwright install-deps chromium RUN chmod +x entrypoint.sh RUN mkdir -p logs && chown -R node:node logs USER node diff --git a/backend/__tests__/web_fetch_enhanced.test.js b/backend/__tests__/web_fetch_enhanced.test.js index 761568e0..4b1c82bc 100644 --- a/backend/__tests__/web_fetch_enhanced.test.js +++ b/backend/__tests__/web_fetch_enhanced.test.js @@ -1,47 +1,56 @@ import { webFetchTool } from '../src/lib/tools/webFetch.js'; describe('web_fetch enhanced features', () => { - describe('heading_range parameter', () => { - it('should validate heading_range structure', () => { + describe('heading parameter as array', () => { + it('should validate heading as an array of strings', () => { expect( webFetchTool.validate({ url: 'https://example.com', - heading_range: { start: 1, end: 3 } + heading: ['Introduction', 'Usage'] }) ).toEqual({ url: 'https://example.com', maxChars: 10000, - targetHeading: null, - headingRange: { start: 1, end: 3 } + targetHeadings: ['Introduction', 'Usage'], + useBrowser: false }); }); - it('should reject invalid heading_range', () => { - expect(() => + it('should validate heading as an array of numbers', () => { + expect( webFetchTool.validate({ url: 'https://example.com', - heading_range: { start: 0, end: 3 } + heading: [1, 3] }) - ).toThrow('heading_range must have start >= 1'); + ).toEqual({ + url: 'https://example.com', + maxChars: 10000, + targetHeadings: [1, 3], + useBrowser: false + }); }); - it('should reject heading_range with end < 
start', () => { - expect(() => + it('should validate heading as a single string (backward compatibility)', () => { + expect( webFetchTool.validate({ url: 'https://example.com', - heading_range: { start: 5, end: 3 } + heading: 'Introduction' }) - ).toThrow('heading_range must have start >= 1 and end >= start'); + ).toEqual({ + url: 'https://example.com', + maxChars: 10000, + targetHeadings: ['Introduction'], + useBrowser: false + }); }); - it('should reject both heading and heading_range', () => { + it('should reject invalid heading types', () => { expect(() => webFetchTool.validate({ url: 'https://example.com', - heading: 'Introduction', - heading_range: { start: 1, end: 3 } + heading: { name: 'Intro' } }) - ).toThrow('Cannot use both "heading" and "heading_range" parameters'); + ).toThrow('heading must be a string, number, or an array of strings/numbers'); }); }); diff --git a/backend/__tests__/web_search_searxng_tool.test.js b/backend/__tests__/web_search_searxng_tool.test.js index 8ac2cc92..bf8ed2c7 100644 --- a/backend/__tests__/web_search_searxng_tool.test.js +++ b/backend/__tests__/web_search_searxng_tool.test.js @@ -95,7 +95,7 @@ describe('web_search_searxng tool', () => { test('validate rejects empty string for categories', () => { assert.throws( () => webSearchSearxngTool.validate({ query: 'test', categories: ' ' }), - /categories must be a non-empty string/ + /category must be a non-empty string/ ); }); diff --git a/backend/package-lock.json b/backend/package-lock.json index 0e31c0e0..dbe759d3 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -26,6 +26,9 @@ "p-limit": "^4.0.0", "pino": "^9.3.2", "pino-roll": "^3.1.0", + "playwright": "^1.57.0", + "playwright-extra": "^4.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "turndown": "^7.2.1", "uuid": "^9.0.1", "zod": "^3.23.8" @@ -1633,6 +1636,15 @@ "@babel/types": "^7.28.2" } }, + "node_modules/@types/debug": { + "version": "4.1.12", + "resolved": 
"https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", + "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", + "license": "MIT", + "dependencies": { + "@types/ms": "*" + } + }, "node_modules/@types/estree": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", @@ -1674,6 +1686,12 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/ms": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", + "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==", + "license": "MIT" + }, "node_modules/@types/node": { "version": "24.10.1", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.1.tgz", @@ -2130,6 +2148,15 @@ "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", "license": "Python-2.0" }, + "node_modules/arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/asap": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz", @@ -2255,7 +2282,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, "license": "MIT" }, "node_modules/base64-js": { @@ -2372,7 +2398,6 @@ "version": "1.1.12", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", - "dev": true, "license": "MIT", "dependencies": { "balanced-match": 
"^1.0.0", @@ -2734,6 +2759,22 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/clone-deep": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz", + "integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==", + "license": "MIT", + "dependencies": { + "for-own": "^0.1.3", + "is-plain-object": "^2.0.1", + "kind-of": "^3.0.2", + "lazy-cache": "^1.0.3", + "shallow-clone": "^0.1.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -2805,7 +2846,6 @@ "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, "license": "MIT" }, "node_modules/concat-stream": { @@ -3039,7 +3079,6 @@ "version": "4.3.1", "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", - "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -3851,6 +3890,27 @@ "dev": true, "license": "ISC" }, + "node_modules/for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==", + "license": "MIT", + "dependencies": { + "for-in": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/foreground-child": { "version": 
"3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -3959,11 +4019,33 @@ "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", "license": "MIT" }, + "node_modules/fs-extra": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz", + "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==", + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/fs-extra/node_modules/universalify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", + "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", + "license": "MIT", + "engines": { + "node": ">= 10.0.0" + } + }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", - "dev": true, "license": "ISC" }, "node_modules/fsevents": { @@ -4165,7 +4247,6 @@ "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", - "dev": true, "license": "ISC" }, "node_modules/has-flag": { @@ -4404,7 +4485,6 @@ "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", "deprecated": "This module is not supported, and leaks memory. Do not use it. 
Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", - "dev": true, "license": "ISC", "dependencies": { "once": "^1.3.0", @@ -4461,6 +4541,12 @@ "node": ">=8" } }, + "node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", + "license": "MIT" + }, "node_modules/is-core-module": { "version": "2.16.1", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", @@ -4477,6 +4563,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-extglob": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", @@ -4530,6 +4625,18 @@ "node": ">=0.12.0" } }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "license": "MIT", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-potential-custom-element-name": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", @@ -4562,6 +4669,15 @@ "dev": true, "license": "ISC" }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": 
"sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -5478,6 +5594,27 @@ "node": ">=6" } }, + "node_modules/jsonfile": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.0.tgz", + "integrity": "sha512-FGuPw30AdOIUTRMC2OMRtQV+jkVj2cfPqSeWXv1NEAJ1qZ5zb1X6z1mFhbfOB/iy3ssJCD+3KuZ8r8C3uVFlAg==", + "license": "MIT", + "dependencies": { + "universalify": "^2.0.0" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/jsonfile/node_modules/universalify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", + "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", + "license": "MIT", + "engines": { + "node": ">= 10.0.0" + } + }, "node_modules/jsonwebtoken": { "version": "9.0.2", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz", @@ -5549,6 +5686,27 @@ "json-buffer": "3.0.1" } }, + "node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "license": "MIT", + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/leven": { "version": "3.1.0", "resolved": 
"https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", @@ -5718,6 +5876,20 @@ "node": ">= 0.8" } }, + "node_modules/merge-deep": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz", + "integrity": "sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==", + "license": "MIT", + "dependencies": { + "arr-union": "^3.1.0", + "clone-deep": "^0.2.4", + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/merge-descriptors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", @@ -5825,7 +5997,6 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" @@ -5853,6 +6024,28 @@ "node": ">=16 || 14 >=14.17" } }, + "node_modules/mixin-object": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz", + "integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==", + "license": "MIT", + "dependencies": { + "for-in": "^0.1.3", + "is-extendable": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mixin-object/node_modules/for-in": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz", + "integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/mkdirp": { "version": "0.5.6", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", @@ -6400,7 +6593,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": 
"sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -6671,6 +6863,74 @@ "node": ">=8" } }, + "node_modules/playwright": { + "version": "1.57.0", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.57.0.tgz", + "integrity": "sha512-ilYQj1s8sr2ppEJ2YVadYBN0Mb3mdo9J0wQ+UuDhzYqURwSoW4n1Xs5vs7ORwgDGmyEh33tRMeS8KhdkMoLXQw==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.57.0" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.57.0", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.57.0.tgz", + "integrity": "sha512-agTcKlMw/mjBWOnD6kFZttAAGHgi/Nw0CZ2o6JqWSbMlI219lAFLZZCyqByTsvVAJq5XA5H8cA6PrvBRpBWEuQ==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/playwright-extra": { + "version": "4.3.6", + "resolved": "https://registry.npmjs.org/playwright-extra/-/playwright-extra-4.3.6.tgz", + "integrity": "sha512-q2rVtcE8V8K3vPVF1zny4pvwZveHLH8KBuVU2MoE3Jw4OKVoBWsHI9CH9zPydovHHOCDxjGN2Vg+2m644q3ijA==", + "license": "MIT", + "dependencies": { + "debug": "^4.3.4" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "playwright": "*", + "playwright-core": "*" + }, + "peerDependenciesMeta": { + "playwright": { + "optional": true + }, + "playwright-core": { + "optional": true + } + } + }, + "node_modules/playwright/node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": 
"^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/prebuild-install": { "version": "7.1.3", "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", @@ -6828,6 +7088,112 @@ "node": ">=6" } }, + "node_modules/puppeteer-extra-plugin": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz", + "integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==", + "license": "MIT", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "merge-deep": "^3.0.1" + }, + "engines": { + "node": ">=9.11.2" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-stealth": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", + "integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-preferences": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz", + "integrity": "sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "fs-extra": 
"^10.0.0", + "puppeteer-extra-plugin": "^3.2.3", + "rimraf": "^3.0.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-preferences": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz", + "integrity": "sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "deepmerge": "^4.2.2", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-data-dir": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, "node_modules/pure-rand": { "version": "7.0.1", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-7.0.1.tgz", @@ -7046,6 +7412,43 @@ "node": ">=4" } }, + "node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "deprecated": "Rimraf versions prior to v4 are no longer supported", + "license": "ISC", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/rimraf/node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Glob versions 
prior to v9 are no longer supported", + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/router": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz", @@ -7175,6 +7578,42 @@ "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", "license": "ISC" }, + "node_modules/shallow-clone": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", + "integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==", + "license": "MIT", + "dependencies": { + "is-extendable": "^0.1.1", + "kind-of": "^2.0.1", + "lazy-cache": "^0.2.3", + "mixin-object": "^2.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/kind-of": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz", + "integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==", + "license": "MIT", + "dependencies": { + "is-buffer": "^1.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/lazy-cache": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz", + "integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", diff --git a/backend/package.json b/backend/package.json index f9b20cca..f41c5fcb 100644 --- 
a/backend/package.json +++ b/backend/package.json @@ -29,12 +29,15 @@ "multer": "^2.0.2", "nanoid": "^5.1.6", "node-fetch": "^3.3.2", + "p-limit": "^4.0.0", "pino": "^9.3.2", "pino-roll": "^3.1.0", + "playwright": "^1.57.0", + "playwright-extra": "^4.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "turndown": "^7.2.1", "uuid": "^9.0.1", - "zod": "^3.23.8", - "p-limit": "^4.0.0" + "zod": "^3.23.8" }, "devDependencies": { "eslint": "^9.34.0", diff --git a/backend/src/lib/browser/BrowserService.js b/backend/src/lib/browser/BrowserService.js new file mode 100644 index 00000000..9dcfb6c5 --- /dev/null +++ b/backend/src/lib/browser/BrowserService.js @@ -0,0 +1,23 @@ +class BrowserService { + /** + * Fetches content from a URL using a browser engine. + * Logic splits based on environment: + * - Electron: Uses native BrowserWindow + * - Server/Docker: Uses Puppeteer/Chromium + * @param {string} url + * @returns {Promise} HTML content + */ + async fetchPageContent(url) { + if (process.env.IS_ELECTRON) { + // Use Electron's native browser capabilities (singleton) + const { electronProvider } = await import('./ElectronProvider.js'); + return electronProvider.fetchPageContent(url); + } else { + // Use Playwright (headless Chrome) + const { playwrightProvider } = await import('./PlaywrightProvider.js'); + return playwrightProvider.fetchPageContent(url); + } + } +} + +export const browserService = new BrowserService(); diff --git a/backend/src/lib/browser/ElectronProvider.js b/backend/src/lib/browser/ElectronProvider.js new file mode 100644 index 00000000..3f8bbb16 --- /dev/null +++ b/backend/src/lib/browser/ElectronProvider.js @@ -0,0 +1,64 @@ +export class ElectronProvider { + /** + * Load URL with timeout to prevent hanging on slow/broken pages + * @param {Electron.BrowserWindow} win + * @param {string} url + * @param {number} timeout - Timeout in milliseconds (default 30000) + * @returns {Promise} + */ + _loadWithTimeout(win, url, timeout = 30000) { + return new 
Promise((resolve, reject) => { + const timer = setTimeout(() => reject(new Error('Page load timeout')), timeout); + win.webContents.once('did-finish-load', () => { + clearTimeout(timer); + resolve(); + }); + win.webContents.once('did-fail-load', (_, code, desc) => { + clearTimeout(timer); + reject(new Error(`Load failed: ${desc}`)); + }); + win.loadURL(url); + }); + } + + async fetchPageContent(url) { + let BrowserWindow; + try { + const electron = await import('electron'); + BrowserWindow = electron.BrowserWindow; + } catch (error) { + throw new Error(`Failed to import electron: ${error.message}`); + } + + if (!BrowserWindow) { + throw new Error('BrowserWindow is not defined in electron module'); + } + + const win = new BrowserWindow({ + show: false, + webPreferences: { + offscreen: true, + nodeIntegration: false, + contextIsolation: true, + } + }); + + try { + await this._loadWithTimeout(win, url); + const content = await win.webContents.executeJavaScript('document.documentElement.outerHTML'); + return content; + } catch (error) { + // Add a comprehensive error message + console.error('[ElectronProvider] Error fetching page:', error); + throw error; + } finally { + // Ensure the window is destroyed to free memory + if (!win.isDestroyed()) { + win.destroy(); + } + } + } +} + +// Export singleton instance for consistency with PuppeteerProvider +export const electronProvider = new ElectronProvider(); diff --git a/backend/src/lib/browser/PlaywrightProvider.js b/backend/src/lib/browser/PlaywrightProvider.js new file mode 100644 index 00000000..072f2c90 --- /dev/null +++ b/backend/src/lib/browser/PlaywrightProvider.js @@ -0,0 +1,113 @@ +import { chromium } from 'playwright-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import pLimit from 'p-limit'; + +chromium.use(StealthPlugin()); + +class PlaywrightProvider { + constructor() { + this.browser = null; + this.browserPromise = null; // Promise-based lock for concurrent initialization + 
this.timeoutId = null; + this.limit = pLimit(5); // Max 5 concurrent pages + } + + async getBrowser() { + if (this.browser) { + this.rescheduleCleanup(); + return this.browser; + } + + // Return existing initialization if in progress (prevents race condition) + if (this.browserPromise) { + return this.browserPromise; + } + + this.browserPromise = this._initBrowser(); + try { + this.browser = await this.browserPromise; + this.rescheduleCleanup(); + return this.browser; + } finally { + this.browserPromise = null; + } + } + + /** + * Initialize browser instance (separated for race condition handling) + * @returns {Promise} + */ + async _initBrowser() { + // Determine executable path + // In Docker (Alpine), we set PUPPETEER_EXECUTABLE_PATH. + // We'll rename this to BROWSER_EXECUTABLE_PATH or keep it for compatibility or use PLAYWRIGHT_EXECUTABLE_PATH + let executablePath = process.env.PLAYWRIGHT_EXECUTABLE_PATH || process.env.PUPPETEER_EXECUTABLE_PATH; + let isAlpine = false; + + try { + const fs = await import('fs'); + if (fs.existsSync('/etc/alpine-release')) { + isAlpine = true; + } + } catch (e) { + // Ignore + } + + if (!executablePath && isAlpine) { + executablePath = '/usr/bin/chromium-browser'; + } + + if (executablePath) { + console.log(`[PlaywrightProvider] Launching system Chromium at ${executablePath}`); + } else { + console.log(`[PlaywrightProvider] Launching bundled Playwright Chromium`); + } + + const browser = await chromium.launch({ + executablePath: executablePath || undefined, + headless: true, + }); + + return browser; + } + + rescheduleCleanup() { + if (this.timeoutId) clearTimeout(this.timeoutId); + this.timeoutId = setTimeout(() => this.cleanup(), 5 * 60 * 1000); // 5 minutes + } + + async cleanup() { + if (this.browser) { + console.log('[PlaywrightProvider] Closing idle browser'); + await this.browser.close(); + this.browser = null; + } + } + + async fetchPageContent(url) { + return this.limit(async () => { + const browser = await 
this.getBrowser(); + // Use a fresh context for each request to avoid cookie/cache leaks + const context = await browser.newContext({ + userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', + ignoreHTTPSErrors: true, + }); + + const page = await context.newPage(); + try { + await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 }); + const content = await page.content(); + return content; + } catch (error) { + console.error('[PlaywrightProvider] Error fetching page:', error); + throw error; + } finally { + if (page) await page.close(); + if (context) await context.close(); + this.rescheduleCleanup(); + } + }); + } +} + +export const playwrightProvider = new PlaywrightProvider(); diff --git a/backend/src/lib/tools/webFetch.js b/backend/src/lib/tools/webFetch.js index bb02feb8..dc1b8a98 100644 --- a/backend/src/lib/tools/webFetch.js +++ b/backend/src/lib/tools/webFetch.js @@ -2,6 +2,7 @@ import { createTool } from './baseTool.js'; import TurndownService from 'turndown'; import { Readability } from '@mozilla/readability'; import { JSDOM } from 'jsdom'; +import { browserService } from '../browser/BrowserService.js'; const TOOL_NAME = 'web_fetch'; @@ -41,7 +42,7 @@ function validate(args) { throw new Error('web_fetch requires an arguments object'); } - const { url, max_chars, continuation_token, heading_range } = args; + const { url, max_chars, continuation_token, use_browser } = args; // If continuation_token is provided, we're fetching next chunk if (continuation_token) { @@ -88,44 +89,39 @@ function validate(args) { maxChars = max_chars; } - // Validate heading if provided + // Validate headings if provided + let targetHeadings = null; const { heading } = args; - let targetHeading = null; if (heading !== undefined && heading !== null) { - if (typeof heading === 'string') { - if (heading.trim().length === 0) { - // Treat empty/whitespace heading as not entered - targetHeading = 
null; - } else { - targetHeading = heading.trim(); - } + if (Array.isArray(heading)) { + targetHeadings = heading + .map(h => { + if (typeof h === 'string') return h.trim(); + if (typeof h === 'number') return h; + return null; + }) + .filter(h => h !== null && (typeof h !== 'string' || h.length > 0)); + if (targetHeadings.length === 0) targetHeadings = null; + } else if (typeof heading === 'string') { + const trimmed = heading.trim(); + targetHeadings = trimmed.length > 0 ? [trimmed] : null; + } else if (typeof heading === 'number') { + targetHeadings = [heading]; } else { - throw new Error('heading must be a string'); + throw new Error('heading must be a string, number, or an array of strings/numbers'); } } - // Validate heading_range if provided - let headingRange = null; - if (heading_range !== undefined) { - if (typeof heading_range !== 'object' || heading_range === null) { - throw new Error('heading_range must be an object with start and end properties'); - } - const { start, end } = heading_range; - if (typeof start !== 'number' || typeof end !== 'number') { - throw new Error('heading_range.start and heading_range.end must be numbers'); + // Validate use_browser if provided + let useBrowser = false; + if (use_browser !== undefined) { + if (typeof use_browser !== 'boolean') { + throw new Error('use_browser must be a boolean'); } - if (start < 1 || end < start) { - throw new Error('heading_range must have start >= 1 and end >= start'); - } - headingRange = { start, end }; + useBrowser = use_browser; } - // Can't use both heading and heading_range - if (targetHeading && headingRange) { - throw new Error('Cannot use both "heading" and "heading_range" parameters'); - } - - return { url, maxChars, targetHeading, headingRange }; + return { url, maxChars, targetHeadings, useBrowser }; } function generateCacheKey(url, filterType, filterValue) { @@ -170,136 +166,186 @@ function handleContinuation(token, maxChars) { }; } -async function handler({ url, maxChars, 
targetHeading, headingRange, continuation_token }) { - // Handle continuation token (fetch next chunk from cache) - if (continuation_token) { - return handleContinuation(continuation_token, maxChars); - } +// Helper: detect if a small binary buffer looks like text +function isProbablyText(buffer) { + if (!buffer || buffer.length === 0) return false; - try { - // Fetch the web page - const response = await fetch(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; ChatForge/1.0; +https://chatforge.app)', - }, - redirect: 'follow', - // 10 second timeout - signal: AbortSignal.timeout(10000), - }); + // Quick null-byte check (very likely binary) + const sampleLen = Math.min(buffer.length, 1024); + for (let i = 0; i < sampleLen; i++) { + if (buffer[i] === 0) return false; + } - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); + // Decode and examine printable vs control chars + const sample = new TextDecoder('utf-8', { fatal: false }).decode(buffer.slice(0, sampleLen)); + let nonPrintable = 0; + let total = 0; + for (let i = 0; i < sample.length; i++) { + const code = sample.charCodeAt(i); + // allow common whitespace: tab, line feed, carriage return + if (code === 9 || code === 10 || code === 13) { + total++; + continue; + } + if (code < 32) { + nonPrintable++; } + total++; + } + if (total === 0) return false; + // If less than 10% of the sample are non-printable control chars, + // treat it as text. + return (nonPrintable / total) < 0.10; +} - const contentType = response.headers.get('content-type') || ''; +async function basicFetch(url) { + // Fetch the web page + const response = await fetch(url, { + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; ChatForge/1.0; +https://chatforge.app)', + }, + redirect: 'follow', + // 10 second timeout + signal: AbortSignal.timeout(10000), + }); - // If the Content-Type clearly indicates text-like content, accept it. 
- // Otherwise we'll peek at the first chunk of the body and apply a - // lightweight binary-vs-text heuristic to decide if the response is - // text-parsable. This allows fetching resources that may not set - // Content-Type correctly but are still text (e.g., some servers). - const contentTypeLooksLikeText = /^(?:text\/)|(?:application\/(?:xml|xhtml\+xml|json))|html|xml|json/i.test(contentType); + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } - // Stream response body with size limit to prevent memory blowup - const reader = response.body && typeof response.body.getReader === 'function' - ? response.body.getReader() - : null; + const contentType = response.headers.get('content-type') || ''; - if (!reader) { - throw new Error('Response body is not readable'); - } + // If the Content-Type clearly indicates text-like content, accept it. + // Otherwise we'll peek at the first chunk of the body and apply a + // lightweight binary-vs-text heuristic to decide if the response is + // text-parsable. This allows fetching resources that may not set + // Content-Type correctly but are still text (e.g., some servers). + const contentTypeLooksLikeText = /^(?:text\/)|(?:application\/(?:xml|xhtml\+xml|json))|html|xml|json/i.test(contentType); - const decoder = new TextDecoder(); - let html = ''; - let bytesDownloaded = 0; + // Stream response body with size limit to prevent memory blowup + const reader = response.body && typeof response.body.getReader === 'function' + ? 
response.body.getReader() + : null; - // Helper: detect if a small binary buffer looks like text - function isProbablyText(buffer) { - if (!buffer || buffer.length === 0) return false; + if (!reader) { + throw new Error('Response body is not readable'); + } - // Quick null-byte check (very likely binary) - const sampleLen = Math.min(buffer.length, 1024); - for (let i = 0; i < sampleLen; i++) { - if (buffer[i] === 0) return false; - } + const decoder = new TextDecoder(); + let html = ''; + let bytesDownloaded = 0; - // Decode and examine printable vs control chars - const sample = new TextDecoder('utf-8', { fatal: false }).decode(buffer.slice(0, sampleLen)); - let nonPrintable = 0; - let total = 0; - for (let i = 0; i < sample.length; i++) { - const code = sample.charCodeAt(i); - // allow common whitespace: tab, line feed, carriage return - if (code === 9 || code === 10 || code === 13) { - total++; - continue; - } - if (code < 32) { - nonPrintable++; - } - total++; - } - if (total === 0) return false; - // If less than 10% of the sample are non-printable control chars, - // treat it as text. - return (nonPrintable / total) < 0.10; + try { + // Read the first chunk to allow content sniffing when needed + const first = await reader.read(); + if (first.done) { + reader.releaseLock(); + throw new Error('Empty response body'); } - try { - // Read the first chunk to allow content sniffing when needed - const first = await reader.read(); - if (first.done) { - reader.releaseLock(); - throw new Error('Empty response body'); + const firstChunk = first.value; + bytesDownloaded += firstChunk.length; + + if (bytesDownloaded > MAX_BODY_SIZE) { + reader.cancel(); + throw new Error(`Response body exceeds maximum size limit of ${MAX_BODY_SIZE / (1024 * 1024)} MB`); + } + + if (!contentTypeLooksLikeText) { + // If the header doesn't clearly say text, use the heuristic on the + // first chunk to avoid reading binary blobs. 
+ if (!isProbablyText(firstChunk)) { + reader.cancel(); + throw new Error(`URL does not return text-parsable content. Content-Type: ${contentType}`); } + } + + // Append first chunk and continue streaming the rest + html += decoder.decode(firstChunk, { stream: true }); + + while (true) { + const { done, value } = await reader.read(); + + if (done) break; - const firstChunk = first.value; - bytesDownloaded += firstChunk.length; + bytesDownloaded += value.length; if (bytesDownloaded > MAX_BODY_SIZE) { reader.cancel(); throw new Error(`Response body exceeds maximum size limit of ${MAX_BODY_SIZE / (1024 * 1024)} MB`); } - if (!contentTypeLooksLikeText) { - // If the header doesn't clearly say text, use the heuristic on the - // first chunk to avoid reading binary blobs. - if (!isProbablyText(firstChunk)) { - reader.cancel(); - throw new Error(`URL does not return text-parsable content. Content-Type: ${contentType}`); - } - } - - // Append first chunk and continue streaming the rest - html += decoder.decode(firstChunk, { stream: true }); - - while (true) { - const { done, value } = await reader.read(); + html += decoder.decode(value, { stream: true }); + } - if (done) break; + // Flush any remaining bytes in the decoder + html += decoder.decode(); + } finally { + try { + reader.releaseLock(); + } catch { + // ignore + } + } + return html; +} - bytesDownloaded += value.length; +async function handler({ url, maxChars, targetHeadings, continuation_token, useBrowser }) { + // Handle continuation token (fetch next chunk from cache) + if (continuation_token) { + return handleContinuation(continuation_token, maxChars); + } - if (bytesDownloaded > MAX_BODY_SIZE) { - reader.cancel(); - throw new Error(`Response body exceeds maximum size limit of ${MAX_BODY_SIZE / (1024 * 1024)} MB`); - } + let html = ''; + let errorMessages = []; - html += decoder.decode(value, { stream: true }); - } + if (useBrowser) { + try { + html = await browserService.fetchPageContent(url); + } catch 
(browserError) { + console.error('[webFetch] Forced browser fetch failed:', browserError); + throw new Error(`Forced browser fetch failed: ${browserError.message}`); + } + } else { + // 1. Try simple fetch + JSDOM first (fastest) + try { + html = await basicFetch(url); + } catch (error) { + errorMessages.push(`Basic fetch failed: ${error.message}`); + } - // Flush any remaining bytes in the decoder - html += decoder.decode(); - } finally { + // 2. Check for failure triggers (SPA detection) + // - No content (fetch failed) + // - Very short content (<300 chars usually means stub) + // - Specific "Enable JS" messages + // - noscript tag containing JavaScript requirement messages (not just any noscript tag) + const noscriptNeedsJs = /<noscript[^>]*>.*?(?:enable|require|need).*?javascript/is.test(html); + const isFailure = !html + || html.length < 300 + || html.includes("You need to enable JavaScript") + || noscriptNeedsJs; + + // 3. Fallback to Browser Engine if needed + if (isFailure) { try { - reader.releaseLock(); - } catch { - // ignore + // console.log(`Triggering browser fallback for ${url}`); + html = await browserService.fetchPageContent(url); + } catch (browserError) { + console.error('[webFetch] Browser fallback failed:', browserError); + errorMessages.push(`Browser fallback failed: ${browserError.message}`); + + // If we have some content from basic fetch, using it despite being "low quality" is better than crashing + // But if we have NO content, throw exception. + if (!html) { + throw new Error(`Failed to fetch URL. 
Errors: ${errorMessages.join('; ')}`); + } } } + } + try { const dom = new JSDOM(html, { url }); const document = dom.window.document; @@ -313,16 +359,17 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio const reader = new Readability(document.cloneNode(true)); const article = reader.parse(); - if (article && article.textContent && article.textContent.length > MIN_READABILITY_LENGTH) { + if (article && article.length > MIN_READABILITY_LENGTH) { extractedContent = { html: article.content, title: article.title, excerpt: article.excerpt, byline: article.byline, - length: article.length, // Word count estimate - siteName: article.siteName, // Site metadata - lang: article.lang, // Language detection - publishedTime: article.publishedTime, + contentLength: article.length, // Character count of plain text + siteName: article.siteName, + lang: article.lang, + dir: article.dir, + publishedTime: extractPublishedTime(document), }; method = 'readability'; } @@ -365,6 +412,7 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio extractedContent = { html: cleanedHtml, title: extractTitle(html), + publishedTime: extractPublishedTime(document), }; method = 'basic-clean'; } @@ -378,26 +426,15 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio let filterResult = { html: extractedContent.html, filtered: false }; let filterMetadata = {}; - if (headingRange) { - // Filter by heading range (e.g., headings 2-4) - filterResult = filterContentByHeadingRange(extractedContent.html, allHeadings, headingRange); - - if (filterResult.error) { - filterMetadata.headingError = filterResult.error; - } else if (filterResult.filtered) { - extractedContent.html = filterResult.html; - filterMetadata.filteredByHeadingRange = headingRange; - filterMetadata.matchedHeadings = filterResult.matchedHeadings; - } - } else if (targetHeading) { - // Filter by single heading name - filterResult = 
filterContentByHeading(extractedContent.html, allHeadings, targetHeading); + if (targetHeadings && targetHeadings.length > 0) { + // Filter by heading names or indices + filterResult = filterContentByHeadings(extractedContent.html, allHeadings, targetHeadings); if (filterResult.error) { filterMetadata.headingError = filterResult.error; } else if (filterResult.filtered) { extractedContent.html = filterResult.html; - filterMetadata.filteredByHeading = filterResult.matchedHeading; + filterMetadata.filteredByHeadings = filterResult.matchedHeadings; } } @@ -411,7 +448,7 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio let markdown = turndownService.turndown(extractedContent.html); // STRATEGY 2: If page has no headings, use continuation token approach - const usesContinuation = allHeadings.length === 0 && !targetHeading && !headingRange; + const usesContinuation = allHeadings.length === 0 && !targetHeadings; // Apply character limit const truncationResult = truncateMarkdown(markdown, maxChars, 0); @@ -437,7 +474,7 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio // Prepend TOC to markdown if available (TOC is from FULL content, even if markdown is truncated) let finalMarkdown = markdown; - if (fullToc && !filterMetadata.filteredByHeading && !filterMetadata.filteredByHeadingRange) { + if (fullToc && !filterMetadata.filteredByHeadings) { // Only include TOC if we're showing full content (not filtered to a specific heading) finalMarkdown = `## Table of Contents\n\n${fullToc}\n\n---\n\n${markdown}`; } @@ -449,6 +486,10 @@ async function handler({ url, maxChars, targetHeading, headingRange, continuatio length: finalMarkdown.length, excerpt: extractedContent.excerpt, byline: extractedContent.byline, + siteName: extractedContent.siteName, + lang: extractedContent.lang, + dir: extractedContent.dir, + publishedTime: extractedContent.publishedTime, extractionMethod: method, // For debugging truncated: 
truncationResult.hasMore, ...(truncationResult.hasMore && { originalLength: truncationResult.originalLength }), @@ -504,95 +545,57 @@ function buildTOC(headings) { return toc.join('\n'); } -function filterContentByHeading(htmlContent, headings, targetHeading) { - if (!targetHeading || !headings || headings.length === 0) { +function filterContentByHeadings(htmlContent, headings, targets) { + if (!targets || targets.length === 0 || !headings || headings.length === 0) { return { html: htmlContent, filtered: false }; } - // Find the target heading (case-insensitive partial match) - const targetLower = targetHeading.toLowerCase(); - const matchIndex = headings.findIndex(h => - h.text.toLowerCase().includes(targetLower) || - targetLower.includes(h.text.toLowerCase()) - ); + let combinedHtml = ''; + const matchedHeadings = []; - if (matchIndex === -1) { - return { - html: htmlContent, - filtered: false, - error: `Heading "${targetHeading}" not found. Available headings: ${headings.map(h => h.text).join(', ')}` - }; - } - - const targetHeadingObj = headings[matchIndex]; - const startPos = targetHeadingObj.position; - - // Find the end position (next heading of same or higher level, or end of content) - let endPos = htmlContent.length; - for (let i = matchIndex + 1; i < headings.length; i++) { - if (headings[i].level <= targetHeadingObj.level) { - endPos = headings[i].position; - break; + for (const target of targets) { + let matchIndex = -1; + if (typeof target === 'number') { + // 1-based index + if (target >= 1 && target <= headings.length) { + matchIndex = target - 1; + } + } else if (typeof target === 'string') { + const targetLower = target.toLowerCase(); + matchIndex = headings.findIndex(h => + h.text.toLowerCase().includes(targetLower) || + targetLower.includes(h.text.toLowerCase()) + ); } - } - const filteredHtml = htmlContent.substring(startPos, endPos); + if (matchIndex !== -1) { + const targetHeadingObj = headings[matchIndex]; + const startPos = 
targetHeadingObj.position; - return { - html: filteredHtml, - filtered: true, - matchedHeading: targetHeadingObj.text - }; -} + // Find the end position (next heading of same or higher level, or end of content) + let endPos = htmlContent.length; + for (let i = matchIndex + 1; i < headings.length; i++) { + if (headings[i].level <= targetHeadingObj.level) { + endPos = headings[i].position; + break; + } + } -function filterContentByHeadingRange(htmlContent, headings, range) { - if (!range || !headings || headings.length === 0) { - return { html: htmlContent, filtered: false }; + combinedHtml += htmlContent.substring(startPos, endPos) + '\n\n'; + matchedHeadings.push(targetHeadingObj.text); + } } - const { start, end } = range; - - // Validate range against available headings (1-indexed) - if (start > headings.length) { + if (matchedHeadings.length === 0) { return { html: htmlContent, filtered: false, - error: `Start index ${start} exceeds available headings (${headings.length} total). Available headings: ${headings.map((h, i) => `${i + 1}. ${h.text}`).join(', ')}` + error: `None of the requested headings found. Available headings: ${headings.map((h, i) => `${i + 1}. 
${h.text}`).join(', ')}` }; } - const actualEnd = Math.min(end, headings.length); - const startIndex = start - 1; // Convert to 0-indexed - const endIndex = actualEnd - 1; - - const startHeading = headings[startIndex]; - const startPos = startHeading.position; - - // Find end position (start of next heading after the range, or end of content) - let endPos = htmlContent.length; - if (endIndex + 1 < headings.length) { - // Check if next heading is at same or higher level as any in our range - const nextHeading = headings[endIndex + 1]; - const minLevelInRange = Math.min(...headings.slice(startIndex, endIndex + 1).map(h => h.level)); - - if (nextHeading.level <= minLevelInRange) { - endPos = nextHeading.position; - } else { - // Include subheadings, find next heading at same or higher level - for (let i = endIndex + 2; i < headings.length; i++) { - if (headings[i].level <= minLevelInRange) { - endPos = headings[i].position; - break; - } - } - } - } - - const filteredHtml = htmlContent.substring(startPos, endPos); - const matchedHeadings = headings.slice(startIndex, endIndex + 1).map(h => h.text); - return { - html: filteredHtml, + html: combinedHtml.trim(), filtered: true, matchedHeadings }; @@ -629,6 +632,40 @@ function extractTitle(html) { return titleMatch ? 
titleMatch[1].trim() : 'Untitled'; } +function extractPublishedTime(document) { + const selectors = [ + 'meta[property="article:published_time"]', + 'meta[property="og:published_time"]', + 'meta[name="pubdate"]', + 'meta[name="publish-date"]', + 'meta[name="dc.date"]', + 'meta[name="date"]', + 'time[datetime]', + ]; + + for (const selector of selectors) { + const el = document.querySelector(selector); + if (!el) continue; + + const value = el.getAttribute('content') || el.getAttribute('datetime'); + if (value) return value; + } + + // Try JSON-LD as a last resort (common for blogs) + try { + const jsonLdScripts = document.querySelectorAll('script[type="application/ld+json"]'); + for (const script of jsonLdScripts) { + const data = JSON.parse(script.textContent); + const date = data.datePublished || data.dateCreated || (Array.isArray(data['@graph']) ? data['@graph'].find(n => n.datePublished)?.datePublished : null); + if (date) return date; + } + } catch { + // Ignore JSON-LD errors + } + + return null; +} + function truncateMarkdown(markdown, maxChars, offset = 0) { const totalLength = markdown.length; const start = offset; @@ -698,7 +735,7 @@ export const webFetchTool = createTool({ function: { name: TOOL_NAME, description: - 'Fetch a web page and convert its HTML content to Markdown format. Returns the page title and content as markdown. Automatically detects headings (h1-h3) and includes a table of contents.\n\nNavigation strategies:\n1. For structured content with headings: Use heading or heading_range to get specific sections\n2. For unstructured content: Use continuation_token to fetch subsequent chunks\n\nThe tool automatically chooses the best strategy based on content structure.', + 'Fetch a web page and convert its HTML content to Markdown format. Returns the page title and content as markdown. Automatically detects headings (h1-h3) and includes a table of contents.\n\nNavigation strategies:\n1. 
For structured content with headings: Use heading to get specific sections\n2. For unstructured content: Use continuation_token to fetch subsequent chunks\n3. For JavaScript-heavy or SPA sites: Use use_browser: true to ensure content is fully rendered before extraction\n\nThe tool automatically chooses the best strategy based on content structure.', parameters: { type: 'object', properties: { @@ -711,28 +748,23 @@ export const webFetchTool = createTool({ description: `Maximum number of characters to return per chunk (default: ${DEFAULT_MAX_CHARS}). Content will be intelligently truncated at paragraph/sentence boundaries.`, }, heading: { - type: 'string', - description: 'Optional: Retrieve content under a specific heading (h1-h3). Performs case-insensitive partial matching. Returns error with available headings if not found.', - }, - heading_range: { - type: 'object', - description: 'Optional: Retrieve content from a range of headings by index (1-based). Example: {start: 2, end: 4} gets content from 2nd to 4th heading. Includes all subheadings within range.', - properties: { - start: { - type: 'number', - description: 'Starting heading index (1-based, inclusive)' - }, - end: { - type: 'number', - description: 'Ending heading index (1-based, inclusive)' - } + type: 'array', + items: { + anyOf: [ + { type: 'string' }, + { type: 'number' } + ] }, - required: ['start', 'end'] + description: 'Optional: Array of headings (h1-h3) to retrieve content from. Can be heading names (strings, partial match) or indices (numbers, 1st heading is 1). Content includes subheadings until a same or higher-level heading is reached.', }, continuation_token: { type: 'string', description: 'Optional: Token from previous response to fetch the next chunk of content. Use this for pages without headings that were truncated. Omit url when using this.', }, + use_browser: { + type: 'boolean', + description: 'Optional: Force the use of a real browser to fetch the page. 
Use this when the initial fetch fails, returns empty content, or when the page is a Single Page Application (SPA) that requires JavaScript to render correctly (e.g., React, Vue, Angular sites, or sites with complex anti-bot measures).', + }, }, required: [], }, diff --git a/backend/src/lib/tools/webSearchSearxng.js b/backend/src/lib/tools/webSearchSearxng.js index bb17a868..245f20ca 100644 --- a/backend/src/lib/tools/webSearchSearxng.js +++ b/backend/src/lib/tools/webSearchSearxng.js @@ -11,12 +11,13 @@ function validate(args) { const validated = { query: args.query.trim() }; - // Optional parameters with validation - if (args.categories !== undefined) { - if (typeof args.categories !== 'string' || args.categories.trim().length === 0) { - throw new Error('categories must be a non-empty string'); + // Simplified category parameter (mapped from the older 'categories') + const category = args.category || args.categories; + if (category !== undefined) { + if (typeof category !== 'string' || category.trim().length === 0) { + throw new Error('category must be a non-empty string'); } - validated.categories = args.categories.trim(); + validated.categories = category.trim(); } if (args.engines !== undefined) { @@ -288,7 +289,7 @@ async function handler( export const webSearchSearxngTool = createTool({ name: TOOL_NAME, description: - 'Privacy-focused metasearch engine aggregating results from multiple sources. Best for privacy-conscious searches, accessing diverse search engines, and avoiding tracking. Self-hosted and highly configurable.', + 'Search the web using SearXNG. Aggregates results from multiple engines. Returns snippets and URLs.', validate, handler, openAI: { @@ -296,51 +297,29 @@ export const webSearchSearxngTool = createTool({ function: { name: TOOL_NAME, description: - 'Privacy‑focused metasearch (SearXNG) that pulls results from multiple engines (Google, Bing, DuckDuckGo, Wikipedia, etc.) without tracking. 
It aggregates headlines/snippets (not full content)—pair with the web fetch tool for full pages', + 'Search the web for latest news, info, and sites. Returns snippets (not full content). Pair with web_fetch to read full pages.', parameters: { type: 'object', properties: { query: { type: 'string', - description: 'The search query to execute', + description: 'The search query or question.', }, - categories: { + category: { type: 'string', - description: - 'Comma-separated list of search categories (e.g., "general", "images", "videos", "news", "music", "files", "it", "science", "map"). Default is "general".', - }, - engines: { - type: 'string', - description: - 'Comma-separated list of specific search engines to use (e.g., "google,duckduckgo,wikipedia"). Restricts search to only these engines.', - }, - language: { - type: 'string', - description: 'Language code for search results (e.g., "en", "fr", "de", "es"). Default is "all".', - }, - pageno: { - type: 'integer', - description: - 'Page number for pagination (default: 1). Each page typically returns 10-20 results depending on engines.', - minimum: 1, + enum: ['general', 'news', 'science', 'it'], + description: 'Search category (default: "general"). Use "news" for recent events.', }, time_range: { type: 'string', enum: ['day', 'week', 'month', 'year'], - description: 'Filter results by time range. Useful for finding recent content.', - }, - safesearch: { - type: 'integer', - description: 'Safe search level: 0 (none), 1 (moderate), 2 (strict). Default is 0.', - minimum: 0, - maximum: 2, + description: 'Filter by recency (e.g. "day" for latest news).', }, max_results: { type: 'integer', - description: - 'Maximum number of search results to display (default: 10, max: 50). 
Note: SearXNG may return fewer results depending on available engines.', + description: 'Number of results to return (1-20, default: 10).', minimum: 1, - maximum: 50, + maximum: 20, }, }, required: ['query'], diff --git a/docs/webfetch-spa-support-plan.md b/docs/webfetch-spa-support-plan.md index 6a810db8..a52cac71 100644 --- a/docs/webfetch-spa-support-plan.md +++ b/docs/webfetch-spa-support-plan.md @@ -8,404 +8,114 @@ Upgrade the `webFetch` tool (`backend/src/lib/tools/webFetch.js`) with a browser **Goal**: Add browser fallback for SPAs while keeping JSDOM as the primary (lightweight) method. -## Recommended Approach +## Approved Strategy -**Hybrid Strategy**: Try JSDOM first (current behavior), fall back to headless browser only when needed. +**Hybrid Engine Approach**: +1. **Server/Docker**: Use `@sparticuz/chromium` + `puppeteer-core`. +2. **Electron App**: Use native `BrowserWindow` (no extra dependencies). +3. **Trigger**: Try JSDOM first. Auto-fallback to browser if content is missing or requires JS. -**Browser Engine**: chrome-aws-lambda with puppeteer-core (lightest option at ~80-120MB) +## Research Decisions (Tasks Resolved) -## Research Tasks (Do These First) - -### 1. Research chrome-aws-lambda current status and alternatives -**Why**: -- chrome-aws-lambda may be deprecated or have maintenance issues -- Newer alternatives like @sparticuz/chromium might be better -- Need to verify 2025 compatibility with Node.js versions -- Check if there are lighter-weight alternatives - -**What to research**: -- Current maintenance status of chrome-aws-lambda -- Alternative packages (@sparticuz/chromium, playwright-chromium-headless) -- Version compatibility with Node.js 20+ -- Known issues in Docker environments - -### 2. 
Research Chromium Docker optimization best practices -**Why**: -- Docker Chromium setup has specific requirements that evolve -- Security concerns (sandboxing, capabilities) -- Minimal dependency list changes over time -- Multi-stage build optimizations - -**What to research**: -- Minimal Chromium dependencies for 2025 Debian/Alpine images -- Chrome flags for resource optimization in containers -- Security best practices (--no-sandbox implications) -- Font and locale requirements - -### 3. Research browser pooling patterns and resource limits -**Why**: -- Browser pooling libraries may exist (generic-pool, etc.) -- Best practices for connection limits have evolved -- Memory leak prevention strategies -- Graceful shutdown patterns - -**What to research**: -- Existing browser pooling libraries -- Recommended pool sizes for different memory constraints -- Browser instance lifecycle management -- Memory leak detection and prevention - -### 4. Research modern SPA detection techniques -**Why**: -- New JavaScript frameworks emerge constantly -- Detection patterns need to catch Vue 3, React 18+, Svelte, etc. -- Meta tags and data attributes have evolved -- Better heuristics may exist - -**What to research**: -- Current SPA framework detection methods -- Meta tags used by modern frameworks (2024-2025) -- DOM patterns that indicate client-side rendering -- Reliable heuristics (script-to-content ratio thresholds) +- **Engine Selection**: `@sparticuz/chromium` chosen for server environments (Node 20+ compatible). +- **Docker Optimization**: Will use Alpine Linux packages (`apk add chromium`) instead of complex manual builds. +- **Pooling**: Will implement a simple LRU-style pool (max 2 instances) directly. +- **SPA Detection**: "Failure-driven" detection (fallback if JSDOM gets <300 chars or specific "enable JS" warnings) instead of complex heuristics. 
## Implementation Steps -### Step 1: Install Dependencies +### Step 1: Install Server Dependencies +For `backend/` only: ```bash -./dev.sh exec backend npm install chrome-aws-lambda puppeteer-core --save +npm install puppeteer-core @sparticuz/chromium ``` +*Note: These will be mostly unused in the Electron build, but required for the server build.* -**Note**: After research, this may change to a different package. - -### Step 2: Update Dockerfile -Update `backend/Dockerfile` to include Chromium and dependencies: +### Step 2: Update Dockerfile (Alpine) +Update `backend/Dockerfile` to install system Chromium for Alpine: ```dockerfile -# Install Chromium dependencies -RUN apt-get update && apt-get install -y \ +# Add to "prod-deps" and "runner" stages +RUN apk add --no-cache \ chromium \ - chromium-driver \ - fonts-liberation \ - libnss3 \ - libxss1 \ - && rm -rf /var/lib/apt/lists/* + nss \ + freetype \ + harfbuzz \ + ca-certificates \ + ttf-freefont -# Set Chromium environment variables -ENV CHROME_BIN=/usr/bin/chromium ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true -ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium -``` - -**Note**: Final dependency list should come from research task #2. 
- -### Step 3: Create Browser Utility Module -Create `backend/src/lib/browserFetcher.js` with: - -**Core functionality**: -- Browser initialization with optimized flags -- Browser instance pooling (reuse instances) -- Timeout handling (10s default) -- Resource cleanup -- Error handling - -**Example structure**: -```javascript -import chromium from 'chrome-aws-lambda'; -import puppeteer from 'puppeteer-core'; - -class BrowserPool { - constructor(maxInstances = 2) { - this.pool = []; - this.maxInstances = maxInstances; - this.idleTimeout = 5 * 60 * 1000; // 5 minutes - } - - async acquireBrowser() { - // Get or create browser instance - } - - async releaseBrowser(browser) { - // Return to pool or close if pool is full - } - - async fetchWithBrowser(url, options = {}) { - const browser = await this.acquireBrowser(); - try { - const page = await browser.newPage(); - await page.goto(url, { - waitUntil: 'networkidle0', - timeout: options.timeout || 10000 - }); - const html = await page.content(); - await page.close(); - return html; - } finally { - await this.releaseBrowser(browser); - } - } - - async cleanup() { - // Close all browser instances - } -} - -export const browserPool = new BrowserPool(); -``` - -**Optimized Chromium flags** (adjust based on research): -```javascript -const args = [ - '--disable-gpu', - '--disable-dev-shm-usage', - '--disable-setuid-sandbox', - '--no-sandbox', - '--no-zygote', - '--single-process', - '--disable-accelerated-2d-canvas', - '--disable-background-networking', - '--disable-default-apps', - '--disable-extensions', - '--disable-sync', - '--metrics-recording-only', - '--mute-audio', - '--no-first-run', -]; +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser ``` -### Step 4: Add SPA Detection Logic -In `webFetch.js`, create function to detect if a page needs JavaScript: +### Step 3: Create Abstract Browser Interface +Create `backend/src/lib/browser/BrowserService.js` to handle the environment split: ```javascript -function 
isSPA(html, extractedContent) { - // Check if extracted content is too short - if (extractedContent.textContent.length < 300) { - return true; - } - - // Detect common SPA frameworks - const spaIndicators = [ - /
<\/div>/, // React - /
<\/div>/, // Vue - /ng-app=/, // Angular - /__NEXT_DATA__/, // Next.js - /__nuxt/, // Nuxt - ]; - - for (const pattern of spaIndicators) { - if (pattern.test(html)) { - return true; +class BrowserService { + async fetchPageContent(url) { + if (process.env.IS_ELECTRON) { + return this.fetchWithElectron(url); + } else { + return this.fetchWithPuppeteer(url); } } - // Check script-to-content ratio - const scriptMatches = html.match(/]*>[\s\S]*?<\/script>/gi) || []; - const totalScriptLength = scriptMatches.join('').length; - const contentLength = extractedContent.textContent.length; - - if (totalScriptLength > contentLength * 3) { - // More script than content likely indicates SPA - return true; + async fetchWithElectron(url) { + // Dynamic import to avoid bundling electron in server build + const { BrowserWindow } = await import('electron'); + // Create invisible window, load URL, get content, destroy window } - return false; -} -``` - -**Note**: Detection logic should be refined based on research task #4. - -### Step 5: Implement Browser Fallback -Update the `handler` function in `webFetch.js`: - -```javascript -import { browserPool } from './browserFetcher.js'; - -async function handler({ url, maxChars, targetHeading, headingRange, continuation_token }) { - // ... existing continuation handling ... - - try { - // Fetch the web page (existing code) - const response = await fetch(url, { /* ... 
*/ }); - let html = await streamResponse(response); - - // Try JSDOM extraction first - const dom = new JSDOM(html, { url }); - const document = dom.window.document; - let extractedContent = tryExtractionStrategies(document); - - // Check if we need browser fallback - let usedBrowser = false; - let browserError = null; - - if (isSPA(html, extractedContent)) { - try { - // Re-fetch with browser - html = await browserPool.fetchWithBrowser(url); - - // Re-extract with browser-rendered HTML - const browserDom = new JSDOM(html, { url }); - extractedContent = tryExtractionStrategies(browserDom.window.document); - usedBrowser = true; - } catch (error) { - // Browser failed, use JSDOM result with warning - browserError = `Browser fallback failed: ${error.message}`; - } - } - - // ... rest of existing extraction logic ... - - return { - url, - title: extractedContent.title || 'Untitled', - markdown: finalMarkdown, - // ... existing fields ... - ...(usedBrowser && { extractionMethod: 'browser' }), - ...(browserError && { browserError }), - }; - } catch (error) { - // ... existing error handling ... + async fetchWithPuppeteer(url) { + // Check pool, acquire instance, new page, get content, release } } ``` -### Step 6: Browser Instance Pooling -Implement efficient resource management in `browserFetcher.js`: +### Step 4: Implement Puppeteer Provider (Server) +Create `backend/src/lib/browser/PuppeteerProvider.js`: +- Manages `@sparticuz/chromium` instance. +- Implements simple pooling (reuse browser instance). +- Handles resource cleanup (close browser after 5m idle). -**Features**: -- Create browser pool manager (max 2-3 instances) -- Reuse browser instances across requests -- Implement idle timeout (close after 5 min inactivity) -- Add graceful shutdown on process exit +### Step 5: Implement Electron Provider (Desktop) +Create `backend/src/lib/browser/ElectronProvider.js`: +- Uses `new BrowserWindow({ show: false, webPreferences: { offscreen: true } })`. 
+- Much faster as it shares the main process engine. +- Zero extra memory overhead compared to spawning a whole new Chromium. -**Pool configuration**: -```javascript -const POOL_CONFIG = { - maxInstances: 2, // Max concurrent browsers - idleTimeout: 5 * 60 * 1000, // 5 minutes - pageTimeout: 10000, // 10 seconds per page -}; -``` +### Step 6: Update WebFetch Logic +Update `webFetch.js` to use the fallback strategy: -**Graceful shutdown**: ```javascript -process.on('SIGTERM', async () => { - await browserPool.cleanup(); - process.exit(0); -}); - -process.on('SIGINT', async () => { - await browserPool.cleanup(); - process.exit(0); -}); +// 1. Try simple fetch + JSDOM +let content = await basicFetch(url); + +// 2. Check for failure triggers +const isFailure = content.length < 300 + || content.includes("You need to enable JavaScript") + || content.includes("