Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,37 @@ npm run dev
- Open or clone a workspace, then click `Sync` to build the RAG index
- Need OS-specific prerequisites or production setup? See `docs/getting-started/installation.md`

### Optional: PDF Image Extraction (canvas)

Paper Study's multimodal vision feature renders PDF pages as images for vision-capable LLMs.
This requires the native `canvas` npm package, which depends on system-level libraries.
**Without these libraries, `npm install` will still succeed** — `canvas` is declared as an optional dependency, so image extraction automatically falls back to text-only mode.

To enable PDF image extraction, install the prerequisites first:

**macOS**
```bash
brew install pkg-config cairo pango libpng jpeg giflib librsvg
```

**Ubuntu / Debian**
```bash
sudo apt-get install -y build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev
```

**Fedora / RHEL**
```bash
sudo dnf install gcc-c++ cairo-devel pango-devel libjpeg-turbo-devel giflib-devel librsvg2-devel
```

**Windows**
See the [node-canvas wiki](https://github.com/Automattic/node-canvas/wiki/Installation:-Windows) for Windows-specific instructions.

After installing the system dependencies, run:
```bash
npm install canvas
```

## 🛠️ What You Can Do

- Chat with local files and code using grounded citations
Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
"@xyflow/react": "^12.10.1",
"ai": "^6.0.99",
"better-sqlite3": "^12.6.2",
"canvas": "^3.2.1",
"cheerio": "^1.2.0",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
Expand Down Expand Up @@ -84,6 +83,9 @@
"unified": "^11.0.5",
"zod": "^4.3.6"
},
"optionalDependencies": {
"canvas": "^3.2.1"
},
"devDependencies": {
"@tailwindcss/postcss": "^4",
"@types/better-sqlite3": "^7.6.13",
Expand Down
102 changes: 70 additions & 32 deletions src/lib/files/pdf-image-extractor.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { createCanvas, type Canvas } from "canvas";
import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
import type { TextItem } from "pdfjs-dist/types/src/display/api";

Expand All @@ -22,6 +21,29 @@ interface ExtractOptions {
imageQuality?: number;
}

/* ---------- lazy canvas import ---------- */

/**
 * Cached promise for the one-time dynamic import of `canvas`.
 * `null` until the first call to `getCanvasModule()`.
 */
let canvasModulePromise: Promise<typeof import("canvas") | null> | null = null;

/**
 * Lazily loads the optional native `canvas` package.
 *
 * The import is attempted at most once per process; every caller — including
 * callers that arrive while the first import is still in flight — receives the
 * same promise. Caching only a "load attempted" flag instead would let a
 * concurrent caller observe `null` before the import has settled and wrongly
 * conclude that `canvas` is unavailable.
 *
 * @returns The `canvas` module namespace, or `null` if the import failed.
 */
function getCanvasModule(): Promise<typeof import("canvas") | null> {
  if (canvasModulePromise === null) {
    canvasModulePromise = import("canvas").then(
      (mod) => mod,
      // Best-effort: any import failure is reported to callers as `null`
      // rather than as a rejection.
      () => null,
    );
  }
  return canvasModulePromise;
}

/**
 * Reports whether the native `canvas` package could be loaded.
 *
 * @returns `true` when `getCanvasModule()` yields a module, `false` when it yields `null`.
 */
export async function isCanvasAvailable(): Promise<boolean> {
  const loaded = await getCanvasModule();
  return loaded !== null;
}

/* ---------- CustomCanvasFactory ---------- */

/**
* Custom CanvasFactory for pdfjs-dist in Node.js.
*
Expand All @@ -34,58 +56,74 @@ interface ExtractOptions {
* We replicate the BaseCanvasFactory interface inline to avoid importing
* pdfjs internals.
*/
class CustomCanvasFactory {
constructor(_opts?: { enableHWA?: boolean; ownerDocument?: unknown }) {
// opts ignored — node-canvas doesn't use HWA or ownerDocument
}
function buildCanvasFactory(mod: typeof import("canvas")) {
const { createCanvas } = mod;

create(width: number, height: number) {
if (width <= 0 || height <= 0) throw new Error("Invalid canvas size");
const canvas = createCanvas(width, height);
return { canvas, context: canvas.getContext("2d") };
}
return class CustomCanvasFactory {
constructor(_opts?: { enableHWA?: boolean; ownerDocument?: unknown }) {
// opts ignored — node-canvas doesn't use HWA or ownerDocument
}

reset(
canvasAndContext: { canvas: ReturnType<typeof createCanvas>; context: ReturnType<ReturnType<typeof createCanvas>["getContext"]> },
width: number,
height: number,
) {
canvasAndContext.canvas.width = width;
canvasAndContext.canvas.height = height;
}
create(width: number, height: number) {
if (width <= 0 || height <= 0) throw new Error("Invalid canvas size");
const canvas = createCanvas(width, height);
return { canvas, context: canvas.getContext("2d") };
}

destroy(canvasAndContext: { canvas: ReturnType<typeof createCanvas> | null; context: unknown | null }) {
if (canvasAndContext.canvas) {
canvasAndContext.canvas.width = 0;
canvasAndContext.canvas.height = 0;
reset(
canvasAndContext: { canvas: ReturnType<typeof createCanvas>; context: ReturnType<ReturnType<typeof createCanvas>["getContext"]> },
width: number,
height: number,
) {
canvasAndContext.canvas.width = width;
canvasAndContext.canvas.height = height;
}
canvasAndContext.canvas = null;
canvasAndContext.context = null;
}

/* Called by BaseCanvasFactory.create — but since we override create() directly,
this is only here as a safety net if pdfjs calls it via prototype chain. */
_createCanvas(width: number, height: number) {
return createCanvas(width, height);
}
destroy(canvasAndContext: { canvas: ReturnType<typeof createCanvas> | null; context: unknown | null }) {
if (canvasAndContext.canvas) {
canvasAndContext.canvas.width = 0;
canvasAndContext.canvas.height = 0;
}
canvasAndContext.canvas = null;
canvasAndContext.context = null;
}

_createCanvas(width: number, height: number) {
return createCanvas(width, height);
}
};
}

/**
* Extract text + page images from a PDF buffer using pdfjs-dist.
* Returns interleaved parts: [text(p1), image(p1), text(p2), image(p2), ...]
*
* Falls back to text-only extraction if the native `canvas` package is not installed.
*/
export async function extractPdfPagesWithImages(
buffer: Buffer,
opts: ExtractOptions = {},
): Promise<PaperContentPart[]> {
const mod = await getCanvasModule();
if (!mod) {
console.warn(
"[pdf-image-extractor] Native `canvas` package not available — falling back to text-only extraction. " +
"Install system dependencies (Cairo, Pango, etc.) and run `npm install canvas` for image support.",
);
return extractPdfPagesTextOnly(buffer, opts.maxPages);
}

const { createCanvas } = mod;
type CanvasType = import("canvas").Canvas;
const { maxPages = 20, imageScale = 1.5, imageQuality = 0.75 } = opts;
const CanvasFactory = buildCanvasFactory(mod);

const data = new Uint8Array(buffer);
const doc = await pdfjsLib.getDocument({
data,
useSystemFonts: true,
disableFontFace: true,
CanvasFactory: CustomCanvasFactory,
CanvasFactory,
} as Parameters<typeof pdfjsLib.getDocument>[0]).promise;

const pageCount = Math.min(doc.numPages, maxPages);
Expand Down Expand Up @@ -122,7 +160,7 @@ export async function extractPdfPagesWithImages(
} as Parameters<typeof page.render>[0]).promise;

// Convert to JPEG base64
const jpegBuffer = (renderCanvas as unknown as Canvas).toBuffer("image/jpeg", {
const jpegBuffer = (renderCanvas as unknown as CanvasType).toBuffer("image/jpeg", {
quality: imageQuality,
});
const base64 = jpegBuffer.toString("base64");
Expand Down
Loading