diff --git a/README.md b/README.md
index f6258848..099c6a2e 100644
--- a/README.md
+++ b/README.md
@@ -214,6 +214,37 @@ npm run dev
 - Open or clone a workspace, then click `Sync` to build the RAG index
 - Need OS-specific prerequisites or production setup? See `docs/getting-started/installation.md`
 
+### Optional: PDF Image Extraction (canvas)
+
+Paper Study's multimodal vision feature renders PDF pages as images for vision-capable LLMs.
+This requires the native `canvas` npm package, which depends on system-level libraries.
+**Without these, `npm install` will still succeed** — image extraction falls back to text-only mode automatically.
+
+To enable PDF image extraction, install the prerequisites first:
+
+**macOS**
+```bash
+brew install pkg-config cairo pango libpng jpeg giflib librsvg
+```
+
+**Ubuntu / Debian**
+```bash
+sudo apt-get install -y build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev
+```
+
+**Fedora / RHEL**
+```bash
+sudo dnf install gcc-c++ cairo-devel pango-devel libjpeg-turbo-devel giflib-devel librsvg2-devel
+```
+
+**Windows**
+See the [node-canvas wiki](https://github.com/Automattic/node-canvas/wiki/Installation:-Windows) for Windows-specific instructions.
+
+After installing the system dependencies, run:
+```bash
+npm install canvas
+```
+
 ## 🛠️ What You Can Do
 
 - Chat with local files and code using grounded citations
diff --git a/package.json b/package.json
index e84fa011..1169d84d 100644
--- a/package.json
+++ b/package.json
@@ -48,7 +48,6 @@
     "@xyflow/react": "^12.10.1",
     "ai": "^6.0.99",
     "better-sqlite3": "^12.6.2",
-    "canvas": "^3.2.1",
     "cheerio": "^1.2.0",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
@@ -84,6 +83,9 @@
     "unified": "^11.0.5",
     "zod": "^4.3.6"
   },
+  "optionalDependencies": {
+    "canvas": "^3.2.1"
+  },
   "devDependencies": {
     "@tailwindcss/postcss": "^4",
     "@types/better-sqlite3": "^7.6.13",
diff --git a/src/lib/files/pdf-image-extractor.ts b/src/lib/files/pdf-image-extractor.ts
index 3cde55b5..8d612dbc 100644
--- a/src/lib/files/pdf-image-extractor.ts
+++ b/src/lib/files/pdf-image-extractor.ts
@@ -1,4 +1,3 @@
-import { createCanvas, type Canvas } from "canvas";
 import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
 import type { TextItem } from "pdfjs-dist/types/src/display/api";
 
@@ -22,6 +21,29 @@ interface ExtractOptions {
   imageQuality?: number;
 }
 
+/* ---------- lazy canvas import ---------- */
+
+type CanvasModule = typeof import("canvas");
+
+// Cache the in-flight promise rather than a "load attempted" flag: a flag set
+// before the await would make concurrent callers see `null` while the import is pending.
+let canvasModulePromise: Promise<CanvasModule | null> | null = null;
+
+/** Loads the optional native `canvas` package once; resolves to null if the import fails. */
+function getCanvasModule(): Promise<CanvasModule | null> {
+  if (canvasModulePromise === null) {
+    canvasModulePromise = import("canvas").then((mod) => mod, () => null);
+  }
+  return canvasModulePromise;
+}
+
+/** Returns true if the native `canvas` package is available. */
+export async function isCanvasAvailable(): Promise<boolean> {
+  return (await getCanvasModule()) !== null;
+}
+
+/* ---------- CustomCanvasFactory ---------- */
+
 /**
  * Custom CanvasFactory for pdfjs-dist in Node.js.
  *
@@ -34,58 +56,75 @@ interface ExtractOptions {
  * We replicate the BaseCanvasFactory interface inline to avoid importing
  * pdfjs internals.
  */
-class CustomCanvasFactory {
-  constructor(_opts?: { enableHWA?: boolean; ownerDocument?: unknown }) {
-    // opts ignored — node-canvas doesn't use HWA or ownerDocument
-  }
+function buildCanvasFactory(mod: typeof import("canvas")) {
+  const { createCanvas } = mod;
 
-  create(width: number, height: number) {
-    if (width <= 0 || height <= 0) throw new Error("Invalid canvas size");
-    const canvas = createCanvas(width, height);
-    return { canvas, context: canvas.getContext("2d") };
-  }
+  return class CustomCanvasFactory {
+    constructor(_opts?: { enableHWA?: boolean; ownerDocument?: unknown }) {
+      // opts ignored — node-canvas doesn't use HWA or ownerDocument
+    }
 
-  reset(
-    canvasAndContext: { canvas: ReturnType<typeof createCanvas>; context: ReturnType<ReturnType<typeof createCanvas>["getContext"]> },
-    width: number,
-    height: number,
-  ) {
-    canvasAndContext.canvas.width = width;
-    canvasAndContext.canvas.height = height;
-  }
+    create(width: number, height: number) {
+      if (width <= 0 || height <= 0) throw new Error("Invalid canvas size");
+      const canvas = createCanvas(width, height);
+      return { canvas, context: canvas.getContext("2d") };
+    }
 
-  destroy(canvasAndContext: { canvas: ReturnType<typeof createCanvas> | null; context: unknown | null }) {
-    if (canvasAndContext.canvas) {
-      canvasAndContext.canvas.width = 0;
-      canvasAndContext.canvas.height = 0;
+    reset(
+      canvasAndContext: { canvas: ReturnType<typeof createCanvas>; context: ReturnType<ReturnType<typeof createCanvas>["getContext"]> },
+      width: number,
+      height: number,
+    ) {
+      canvasAndContext.canvas.width = width;
+      canvasAndContext.canvas.height = height;
     }
-    canvasAndContext.canvas = null;
-    canvasAndContext.context = null;
-  }
 
-  /* Called by BaseCanvasFactory.create — but since we override create() directly,
-     this is only here as a safety net if pdfjs calls it via prototype chain. */
-  _createCanvas(width: number, height: number) {
-    return createCanvas(width, height);
-  }
+    destroy(canvasAndContext: { canvas: ReturnType<typeof createCanvas> | null; context: unknown | null }) {
+      if (canvasAndContext.canvas) {
+        canvasAndContext.canvas.width = 0;
+        canvasAndContext.canvas.height = 0;
+      }
+      canvasAndContext.canvas = null;
+      canvasAndContext.context = null;
+    }
+
+    // Safety net: create() is overridden directly, so this only runs if pdfjs calls it via the prototype chain.
+    _createCanvas(width: number, height: number) {
+      return createCanvas(width, height);
+    }
+  };
 }
 
 /**
  * Extract text + page images from a PDF buffer using pdfjs-dist.
  * Returns interleaved parts: [text(p1), image(p1), text(p2), image(p2), ...]
+ *
+ * Falls back to text-only extraction if the native `canvas` package is not installed.
  */
 export async function extractPdfPagesWithImages(
   buffer: Buffer,
   opts: ExtractOptions = {},
 ): Promise {
+  const mod = await getCanvasModule();
+  if (!mod) {
+    console.warn(
+      "[pdf-image-extractor] Native `canvas` package not available — falling back to text-only extraction. " +
+        "Install system dependencies (Cairo, Pango, etc.) and run `npm install canvas` for image support.",
+    );
+    return extractPdfPagesTextOnly(buffer, opts.maxPages);
+  }
+
+  const { createCanvas } = mod;
+  type CanvasType = import("canvas").Canvas;
   const { maxPages = 20, imageScale = 1.5, imageQuality = 0.75 } = opts;
+  const CanvasFactory = buildCanvasFactory(mod);
 
   const data = new Uint8Array(buffer);
   const doc = await pdfjsLib.getDocument({
     data,
     useSystemFonts: true,
     disableFontFace: true,
-    CanvasFactory: CustomCanvasFactory,
+    CanvasFactory,
   } as Parameters<typeof pdfjsLib.getDocument>[0]).promise;
 
   const pageCount = Math.min(doc.numPages, maxPages);
@@ -122,7 +161,7 @@ export async function extractPdfPagesWithImages(
     } as Parameters[0]).promise;
 
     // Convert to JPEG base64
-    const jpegBuffer = (renderCanvas as unknown as Canvas).toBuffer("image/jpeg", {
+    const jpegBuffer = (renderCanvas as unknown as CanvasType).toBuffer("image/jpeg", {
       quality: imageQuality,
     });
     const base64 = jpegBuffer.toString("base64");