diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 0749371..6e1f082 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -4,7 +4,7 @@ env: APP_NAME: pdfdown MACOSX_DEPLOYMENT_TARGET: '10.13' CARGO_INCREMENTAL: '1' -'on': +on: push: branches: - main @@ -23,6 +23,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: + lint: name: Lint runs-on: ubuntu-latest @@ -203,6 +204,32 @@ jobs: brew update brew install tesseract tesseract --version + - name: Download PDFium (Linux) + if: runner.os == 'Linux' + run: | + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + PDFIUM_ARCH="linux-x64" + elif [ "$ARCH" = "aarch64" ]; then + PDFIUM_ARCH="linux-arm64" + fi + curl -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz + mkdir -p /tmp/pdfium + tar xzf pdfium.tgz -C /tmp/pdfium + echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.so" >> $GITHUB_ENV + - name: Download PDFium (macOS) + if: runner.os == 'macOS' + run: | + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + PDFIUM_ARCH="mac-x64" + elif [ "$ARCH" = "arm64" ]; then + PDFIUM_ARCH="mac-arm64" + fi + curl -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz + mkdir -p /tmp/pdfium + tar xzf pdfium.tgz -C /tmp/pdfium + echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.dylib" >> $GITHUB_ENV - name: Install dependencies run: yarn install - name: Build OCR @@ -447,6 +474,32 @@ jobs: TESS_PREFIX=$(tesseract --list-langs 2>&1 | head -1 | sed 's/.*"\(.*\)".*/\1/') echo "TESSDATA_PREFIX=${TESS_PREFIX}" >> $GITHUB_ENV echo "Tesseract data path: ${TESS_PREFIX}" + - name: Download PDFium (Linux) + if: runner.os == 'Linux' + run: | + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + PDFIUM_ARCH="linux-x64" + elif [ "$ARCH" = "aarch64" ]; then + PDFIUM_ARCH="linux-arm64" + fi + curl -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz + mkdir -p /tmp/pdfium + tar xzf pdfium.tgz -C /tmp/pdfium + echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.so" >> $GITHUB_ENV + - name: Download PDFium (macOS) + if: runner.os == 'macOS' + run: | + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + PDFIUM_ARCH="mac-x64" + elif [ "$ARCH" = "arm64" ]; then + PDFIUM_ARCH="mac-arm64" + fi + curl -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz + mkdir -p /tmp/pdfium + tar xzf pdfium.tgz -C /tmp/pdfium + echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.dylib" >> $GITHUB_ENV - name: Install dependencies run: yarn install - name: Download OCR artifact diff --git a/Cargo.toml b/Cargo.toml index 9b344df..e6dd893 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ edition = "2024" name = "pdfdown" -version = "0.9.7" +version = "0.9.8" [lib] crate-type = ["cdylib"] @@ -20,10 +20,17 @@ napi = { version = "3.0.0", features = ["napi6"] } napi-derive = "3.0.0" rayon = "1.11" tesseract-rs = { version = "0.1", optional = true } +pdfium-render = { version = "0.8", optional = true, default-features = false, features = [ + "pdfium_latest", + "image_025", + "thread_safe", + "sync", +] } [features] default = [] ocr = ["tesseract-rs"] +render = ["pdfium-render"] [build-dependencies] napi-build = "2" diff --git a/README.md b/README.md index 4c5a20f..9f1e4f0 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ for (const img of images) { console.log(`Page ${img.page} image ${img.imageIndex}: ${img.width}x${img.height} ${img.colorSpace}`) } ``` - + #### Extract images as PNG (async) ```typescript diff --git a/index.d.ts b/index.d.ts index 07637b9..dcc15ae 100644 --- a/index.d.ts +++ b/index.d.ts @@ -2,30 +2,19 @@ /* eslint-disable */ export declare class PdfDown { constructor(buffer: Buffer) - /** Sync: extract text per page (reuses the already-parsed document) */ textPerPage(): Array - /** Sync: extract images per page (reuses the already-parsed document) */ imagesPerPage(): Array - /** Sync: extract annotations per page (reuses the already-parsed document) */ annotationsPerPage(): Array - /** Sync: get PDF metadata (reuses the already-parsed document) */ metadata(): PdfMeta - /** Async: extract text per page on the libuv thread pool (shares parsed document via Arc) */ textPerPageAsync(): Promise> - /** Async: extract images per page on the libuv thread pool (shares parsed document via Arc) */ imagesPerPageAsync(): Promise> - /** Async: extract annotations per page on the libuv thread pool (shares parsed document via Arc) */ annotationsPerPageAsync(): Promise> - /** Async: get PDF metadata on the libuv thread pool (shares parsed document via Arc) */ metadataAsync(): Promise - /** Sync: extract everything from the PDF in one call (reuses the already-parsed document) */ document(): PdfDocument - /** Async: extract everything from the PDF on the libuv thread pool (shares parsed document via Arc) */ documentAsync(): Promise - /** Sync: extract structured text with header/footer detection */ structuredText(): Array - /** Async: extract structured text with header/footer detection */ structuredTextAsync(): Promise> + renderPagesAsync(opts?: RenderOptions | undefined | null): Promise> } export declare const enum BoxType { @@ -34,6 +23,13 @@ export declare const enum BoxType { Unknown = 'Unknown' } +export declare function capabilities(): Capabilities + +export interface Capabilities { + ocr: boolean + render: boolean +} + export declare function extractAnnotationsPerPage(buffer: Buffer): Array export declare function extractAnnotationsPerPageAsync(buffer: Buffer): Promise> @@ -133,6 +129,27 @@ export declare function pdfMetadata(buffer: Buffer): PdfMeta export declare function pdfMetadataAsync(buffer: Buffer): Promise +export interface RenderedPage { + page: number + width: number + height: number + dpi: number + data: Buffer +} + +export declare const enum RenderMode { + Auto = 'Auto', + Never = 'Never', + Always = 'Always' +} + +export interface RenderOptions { + dpi?: number + mode?: RenderMode +} + +export declare function renderPagesAsync(buffer: Buffer, opts?: RenderOptions | undefined | null): Promise> + export interface StructuredPageText { page: number header: string diff --git a/index.js b/index.js index 0080109..2813605 100644 --- a/index.js +++ b/index.js @@ -80,8 +80,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-android-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-android-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -96,8 +96,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-android-arm-eabi') const bindingPackageVersion = require('@d0paminedriven/pdfdown-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -116,8 +116,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-x64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -132,8 +132,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-ia32-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -148,8 +148,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-arm64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -167,8 +167,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-universal') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -183,8 +183,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -199,8 +199,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -219,8 +219,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-freebsd-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -235,8 +235,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-freebsd-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -256,8 +256,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-x64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -272,8 +272,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-x64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -290,8 +290,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -306,8 +306,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -324,8 +324,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm-musleabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -340,8 +340,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm-gnueabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -358,8 +358,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-loong64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -374,8 +374,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-loong64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -392,8 +392,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-riscv64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -408,8 +408,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-riscv64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -425,8 +425,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-ppc64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -441,8 +441,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-s390x-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -461,8 +461,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -477,8 +477,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -493,8 +493,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-arm') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -559,6 +559,7 @@ if (!nativeBinding) { module.exports = nativeBinding module.exports.PdfDown = nativeBinding.PdfDown module.exports.BoxType = nativeBinding.BoxType +module.exports.capabilities = nativeBinding.capabilities module.exports.extractAnnotationsPerPage = nativeBinding.extractAnnotationsPerPage module.exports.extractAnnotationsPerPageAsync = nativeBinding.extractAnnotationsPerPageAsync module.exports.extractImagesPerPage = nativeBinding.extractImagesPerPage @@ -571,3 +572,5 @@ module.exports.pdfDocument = nativeBinding.pdfDocument module.exports.pdfDocumentAsync = nativeBinding.pdfDocumentAsync module.exports.pdfMetadata = nativeBinding.pdfMetadata module.exports.pdfMetadataAsync = nativeBinding.pdfMetadataAsync +module.exports.RenderMode = nativeBinding.RenderMode +module.exports.renderPagesAsync = nativeBinding.renderPagesAsync diff --git a/ocr/__test__/fixtures/advanced_organic_chem_presentation_autumn_2014.pdf b/ocr/__test__/fixtures/advanced_organic_chem_presentation_autumn_2014.pdf new file mode 100644 index 0000000..6a61262 Binary files /dev/null and b/ocr/__test__/fixtures/advanced_organic_chem_presentation_autumn_2014.pdf differ diff --git a/ocr/__test__/index.spec.ts b/ocr/__test__/index.spec.ts index 5bca3e7..d9fef33 100644 --- a/ocr/__test__/index.spec.ts +++ b/ocr/__test__/index.spec.ts @@ -10,6 +10,8 @@ import { extractImagesPerPage, PdfDown, TextSource, + capabilities, + RenderMode, } from '../index' // ── Fixtures ───────────────────────────────────────────────────────────────── @@ -80,12 +82,12 @@ test('extractTextWithOcrPerPage — native text PDF uses Native source', (t) => t.true(pages.length > 0, 'should have pages') - const nativePages = pages.filter((p) => p.source === 'Native') + const nativePages = pages.filter((p) => p.source === TextSource.Native) t.true(nativePages.length > 0, 'at least some pages should use native extraction') t.log(`Total pages: ${pages.length}`) t.log(`Native: ${nativePages.length}`) - t.log(`OCR: ${pages.filter((p) => p.source === 'Ocr').length}`) + t.log(`OCR: ${pages.filter((p) => p.source === TextSource.Ocr).length}`) }) test('extractTextWithOcrPerPageAsync — native text PDF matches sync', async (t) => { @@ -104,21 +106,22 @@ test('extractTextWithOcrPerPageAsync — native text PDF matches sync', async (t test('extractTextWithOcrPerPage — high minTextLength forces OCR on native pages', (t) => { // Set absurdly high threshold so even pages with native text fall back to OCR - const pages = extractTextWithOcrPerPage(nativePdf, { minTextLength: 999999 }) + const pages = extractTextWithOcrPerPage(nativePdf, {render: RenderMode.Auto, minTextLength: 999999 }) // Pages with images should now be OCR'd - const ocrPages = pages.filter((p) => p.source === 'Ocr') - const nativePages = pages.filter((p) => p.source === 'Native') + const ocrPages = pages.filter((p) => p.source === TextSource.Ocr) + const nativePages = pages.filter((p) => p.source === TextSource.Native) - t.log(`With minTextLength=999999: OCR=${ocrPages.length}, Native=${nativePages.length}`) + const renderedPages = pages.filter((p) => p.source === TextSource.Rendered) + t.log(`With minTextLength=999999: OCR=${ocrPages.length}, Native=${nativePages.length}, Rendered=${renderedPages.length}`) - // Pages without images will still be "Native" (nothing to OCR), but pages with - // images should fall back to OCR since native text won't meet the threshold - // Just verify we get valid results either way for (const p of pages) { t.is(typeof p.page, 'number', 'page should be a number') t.is(typeof p.text, 'string', 'text should be a string') - t.true(p.source === 'Native' || p.source === 'Ocr', 'source should be Native or Ocr') + t.true( + p.source === TextSource.Native || p.source === TextSource.Ocr || p.source === TextSource.Rendered, + `source should be a valid TextSource, got: ${p.source}`, + ) } }) @@ -194,3 +197,70 @@ test('base API: PdfDown class methods work through OCR package', (t) => { const text = pdf.textPerPage() t.is(text.length, meta.pageCount) }) + +// ── Render + Three-tier cascade tests ──────────────────────────────────────── + +const chemPdf = readFileSync(join(FIXTURES, 'advanced_organic_chem_presentation_autumn_2014.pdf')) + +// ── capabilities ───────────────────────────────────────────────────────────── + +test('capabilities — returns ocr and render booleans', (t) => { + const caps = capabilities() + t.is(typeof caps.ocr, 'boolean', 'ocr should be a boolean') + t.is(typeof caps.render, 'boolean', 'render should be a boolean') + t.true(caps.ocr, 'ocr should be true in the OCR package') + t.log(`capabilities: ocr=${caps.ocr}, render=${caps.render}`) +}) + +// ── PdfDown async render + cascade (only runs if PDFium is available) ──────── + +const caps = capabilities() + +if (caps.render) { + test('PdfDown — render + cascade on vector-heavy PDF', async (t) => { + const pdf = new PdfDown(chemPdf) + + const [allPages, autoOnly, lowDpi, highDpi, autoOcr, neverOcr, alwaysOcr] = await Promise.all([ + pdf.renderPagesAsync({ mode: RenderMode.Always }), + pdf.renderPagesAsync({ mode: RenderMode.Auto }), + pdf.renderPagesAsync({ mode: RenderMode.Auto, dpi: 72 }), + pdf.renderPagesAsync({ mode: RenderMode.Auto, dpi: 300 }), + pdf.textWithOcrPerPageAsync({ render: RenderMode.Auto }), + pdf.textWithOcrPerPageAsync({ render: RenderMode.Never }), + pdf.textWithOcrPerPageAsync({ render: RenderMode.Always }), + ]) + + // renderPagesAsync: all pages (Always mode) + t.is(allPages.length, 21) + for (const p of allPages) { + t.true(p.width > 0) + t.true(p.height > 0) + t.deepEqual([...p.data.subarray(0, 4)], [0x89, 0x50, 0x4e, 0x47]) + } + + // renderPagesAsync: Auto mode (only empty pages) + t.true(autoOnly.length > 0) + t.true(autoOnly.length < 21) + + // DPI comparison + t.is(lowDpi.length, highDpi.length) + t.true(highDpi[0].width > lowDpi[0].width) + t.true(highDpi[0].height > lowDpi[0].height) + + // OCR cascade: Auto + t.is(autoOcr.length, 21) + t.true(autoOcr.filter((p) => p.source === TextSource.Rendered).length > 0) + t.true(autoOcr.filter((p) => p.source === TextSource.Native).length > 0) + + // OCR cascade: Never + t.is(neverOcr.filter((p) => p.source === TextSource.Rendered).length, 0) + + // OCR cascade: Always + t.is(alwaysOcr.filter((p) => p.source === TextSource.Rendered).length, alwaysOcr.length) + }) +} else { + test('render tests skipped — PDFium not available', (t) => { + t.log('PDFium library not found; render tests skipped. Set PDFIUM_LIBRARY_PATH to enable.') + t.pass() + }) +} diff --git a/ocr/index.d.ts b/ocr/index.d.ts index d081d0e..51f8a16 100644 --- a/ocr/index.d.ts +++ b/ocr/index.d.ts @@ -2,38 +2,23 @@ /* eslint-disable */ export declare class PdfDown { constructor(buffer: Buffer) - /** Sync: extract text per page (reuses the already-parsed document) */ textPerPage(): Array - /** Sync: extract images per page (reuses the already-parsed document) */ imagesPerPage(): Array - /** Sync: extract annotations per page (reuses the already-parsed document) */ annotationsPerPage(): Array - /** Sync: get PDF metadata (reuses the already-parsed document) */ metadata(): PdfMeta - /** Async: extract text per page on the libuv thread pool (shares parsed document via Arc) */ textPerPageAsync(): Promise> - /** Async: extract images per page on the libuv thread pool (shares parsed document via Arc) */ imagesPerPageAsync(): Promise> - /** Async: extract annotations per page on the libuv thread pool (shares parsed document via Arc) */ annotationsPerPageAsync(): Promise> - /** Async: get PDF metadata on the libuv thread pool (shares parsed document via Arc) */ metadataAsync(): Promise - /** Sync: extract everything from the PDF in one call (reuses the already-parsed document) */ document(): PdfDocument - /** Async: extract everything from the PDF on the libuv thread pool (shares parsed document via Arc) */ documentAsync(): Promise - /** Sync: extract structured text with header/footer detection */ structuredText(): Array - /** Async: extract structured text with header/footer detection */ structuredTextAsync(): Promise> - /** Sync: extract text with OCR fallback for image-only pages */ textWithOcrPerPage(opts?: OcrOptions | undefined | null): Array - /** Async: extract text with OCR fallback for image-only pages */ textWithOcrPerPageAsync(opts?: OcrOptions | undefined | null): Promise> - /** Sync: extract everything from the PDF with OCR text fallback */ documentOcr(opts?: OcrOptions | undefined | null): PdfDocumentOcr - /** Async: extract everything from the PDF with OCR text fallback */ documentOcrAsync(opts?: OcrOptions | undefined | null): Promise + renderPagesAsync(opts?: RenderOptions | undefined | null): Promise> } export declare const enum BoxType { @@ -42,6 +27,13 @@ export declare const enum BoxType { Unknown = 'Unknown' } +export declare function capabilities(): Capabilities + +export interface Capabilities { + ocr: boolean + render: boolean +} + export declare function extractAnnotationsPerPage(buffer: Buffer): Array export declare function extractAnnotationsPerPageAsync(buffer: Buffer): Promise> @@ -66,6 +58,9 @@ export interface OcrOptions { lang?: string minTextLength?: number maxThreads?: number + render?: RenderMode + renderDpi?: number + pdfiumPath?: string } export interface OcrPageText { @@ -188,6 +183,27 @@ export declare function pdfMetadata(buffer: Buffer): PdfMeta export declare function pdfMetadataAsync(buffer: Buffer): Promise +export interface RenderedPage { + page: number + width: number + height: number + dpi: number + data: Buffer +} + +export declare const enum RenderMode { + Auto = 'Auto', + Never = 'Never', + Always = 'Always' +} + +export interface RenderOptions { + dpi?: number + mode?: RenderMode +} + +export declare function renderPagesAsync(buffer: Buffer, opts?: RenderOptions | undefined | null): Promise> + export interface StructuredPageText { page: number header: string @@ -197,5 +213,6 @@ export interface StructuredPageText { export declare const enum TextSource { Native = 'Native', - Ocr = 'Ocr' + Ocr = 'Ocr', + Rendered = 'Rendered' } diff --git a/ocr/index.js b/ocr/index.js index 2c7edf2..e77235f 100644 --- a/ocr/index.js +++ b/ocr/index.js @@ -80,8 +80,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-android-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -96,8 +96,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -116,8 +116,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -132,8 +132,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -148,8 +148,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -167,8 +167,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-universal') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -183,8 +183,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -199,8 +199,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -219,8 +219,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -235,8 +235,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -256,8 +256,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -272,8 +272,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -290,8 +290,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -306,8 +306,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -324,8 +324,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -340,8 +340,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -358,8 +358,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -374,8 +374,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -392,8 +392,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -408,8 +408,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -425,8 +425,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -441,8 +441,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -461,8 +461,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -477,8 +477,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -493,8 +493,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.8' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.8 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -559,6 +559,7 @@ if (!nativeBinding) { module.exports = nativeBinding module.exports.PdfDown = nativeBinding.PdfDown module.exports.BoxType = nativeBinding.BoxType +module.exports.capabilities = nativeBinding.capabilities module.exports.extractAnnotationsPerPage = nativeBinding.extractAnnotationsPerPage module.exports.extractAnnotationsPerPageAsync = nativeBinding.extractAnnotationsPerPageAsync module.exports.extractImagesPerPage = nativeBinding.extractImagesPerPage @@ -575,4 +576,6 @@ module.exports.pdfDocumentOcr = nativeBinding.pdfDocumentOcr module.exports.pdfDocumentOcrAsync = nativeBinding.pdfDocumentOcrAsync module.exports.pdfMetadata = nativeBinding.pdfMetadata module.exports.pdfMetadataAsync = nativeBinding.pdfMetadataAsync +module.exports.RenderMode = nativeBinding.RenderMode +module.exports.renderPagesAsync = nativeBinding.renderPagesAsync module.exports.TextSource = nativeBinding.TextSource diff --git a/ocr/package.json b/ocr/package.json index 452b14e..bdc7002 100644 --- a/ocr/package.json +++ b/ocr/package.json @@ -1,6 +1,6 @@ { "name": "@d0paminedriven/pdfdown-ocr", - "version": "0.9.7", + "version": "0.9.8", "description": "Rust powered PDF extraction for Node with OCR fallback (requires system tesseract).", "main": "index.js", "repository": { diff --git a/package.json b/package.json index 7ab9383..02826af 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@d0paminedriven/pdfdown", - "version": "0.9.7", + "version": "0.9.8", "description": "Rust powered PDF extraction for Node", "main": "index.js", "repository": { @@ -52,9 +52,9 @@ "artifacts": "napi artifacts", "artifacts:ocr": "napi artifacts --package-json-path ocr/package.json --npm-dir ocr/npm", "bench": "node --import @oxc-node/core/register benchmark/bench.ts", - "build": "napi build --platform --release", - "build:debug": "napi build --platform", - "build:ocr": "napi build --platform --release --features ocr --package-json-path ocr/package.json --output-dir ocr", + "build": "napi build --platform --release --features render", + "build:debug": "napi build --platform --features render", + "build:ocr": "napi build --platform --release --features ocr,render --package-json-path ocr/package.json --output-dir ocr", "create-npm-dirs:ocr": "napi create-npm-dirs --package-json-path ocr/package.json --npm-dir ocr/npm", "format": "run-p format:prettier format:rs format:toml", "format:prettier": "prettier . -w", diff --git a/pdfdown.wasi-browser.js b/pdfdown.wasi-browser.js index d06520d..a33eadf 100644 --- a/pdfdown.wasi-browser.js +++ b/pdfdown.wasi-browser.js @@ -58,6 +58,7 @@ const { export default __napiModule.exports export const PdfDown = __napiModule.exports.PdfDown export const BoxType = __napiModule.exports.BoxType +export const capabilities = __napiModule.exports.capabilities export const extractAnnotationsPerPage = __napiModule.exports.extractAnnotationsPerPage export const extractAnnotationsPerPageAsync = __napiModule.exports.extractAnnotationsPerPageAsync export const extractImagesPerPage = __napiModule.exports.extractImagesPerPage @@ -70,3 +71,5 @@ export const pdfDocument = __napiModule.exports.pdfDocument export const pdfDocumentAsync = __napiModule.exports.pdfDocumentAsync export const pdfMetadata = __napiModule.exports.pdfMetadata export const pdfMetadataAsync = __napiModule.exports.pdfMetadataAsync +export const RenderMode = __napiModule.exports.RenderMode +export const renderPagesAsync = __napiModule.exports.renderPagesAsync diff --git a/pdfdown.wasi.cjs b/pdfdown.wasi.cjs index e0fe3cc..89b1eb8 100644 --- a/pdfdown.wasi.cjs +++ b/pdfdown.wasi.cjs @@ -110,6 +110,7 @@ const { instance: __napiInstance, module: __wasiModule, napiModule: __napiModule module.exports = __napiModule.exports module.exports.PdfDown = __napiModule.exports.PdfDown module.exports.BoxType = __napiModule.exports.BoxType +module.exports.capabilities = __napiModule.exports.capabilities module.exports.extractAnnotationsPerPage = __napiModule.exports.extractAnnotationsPerPage module.exports.extractAnnotationsPerPageAsync = __napiModule.exports.extractAnnotationsPerPageAsync module.exports.extractImagesPerPage = __napiModule.exports.extractImagesPerPage @@ -122,3 +123,5 @@ module.exports.pdfDocument = __napiModule.exports.pdfDocument module.exports.pdfDocumentAsync = __napiModule.exports.pdfDocumentAsync module.exports.pdfMetadata = __napiModule.exports.pdfMetadata module.exports.pdfMetadataAsync = __napiModule.exports.pdfMetadataAsync +module.exports.RenderMode = __napiModule.exports.RenderMode +module.exports.renderPagesAsync = __napiModule.exports.renderPagesAsync diff --git a/src/core/document.rs b/src/core/document.rs index 9bf2c56..076f120 100644 --- a/src/core/document.rs +++ b/src/core/document.rs @@ -147,13 +147,26 @@ pub(crate) fn extract_all(doc: &Document) -> Result { #[cfg(feature = "ocr")] pub(crate) fn extract_all_with_ocr( doc: &Document, + pdf_bytes: &[u8], lang: &str, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, ) -> Result { let meta = extract_metadata(doc); let (text, (images, annotations)) = rayon::join( - || extract_text_with_ocr(doc, lang, min_len, max_threads), + || { + extract_text_with_ocr( + doc, + pdf_bytes, + lang, + min_len, + max_threads, + render_dpi, + render_mode, + ) + }, || rayon::join(|| extract_images_raw(doc), || extract_annotations(doc)), ); let text = text?; diff --git a/src/core/images.rs b/src/core/images.rs index 12ff87c..8c281f7 100644 --- a/src/core/images.rs +++ b/src/core/images.rs @@ -17,8 +17,8 @@ pub(crate) fn extract_images_raw(doc: &Document) -> Vec { } /// Decode all image XObjects on a page to DynamicImages (no PNG encoding). -/// Used by OCR to avoid the PNG encode→decode roundtrip. -#[cfg(feature = "ocr")] +/// Used by OCR to avoid the PNG encode→decode roundtrip and by render for empty page detection. +#[cfg(any(feature = "ocr", feature = "render"))] pub(crate) fn collect_page_decoded_images(doc: &Document, page_id: ObjectId) -> Vec { let mut decoded = Vec::new(); @@ -217,19 +217,16 @@ fn get_inherited_resources(doc: &Document, page_id: ObjectId) -> Option HashSet> { - let mut names = HashSet::new(); - +/// Extract raw content bytes from a page's content stream(s). +fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> Vec { let page_dict = match doc.get_dictionary(page_id) { Ok(d) => d, - Err(_) => return names, + Err(_) => return Vec::new(), }; let contents = match page_dict.get(b"Contents") { Ok(c) => c, - Err(_) => return names, + Err(_) => return Vec::new(), }; let stream_ids: Vec = match contents { @@ -244,7 +241,7 @@ fn get_referenced_xobject_names(doc: &Document, page_id: ObjectId) -> HashSet return names, + _ => return Vec::new(), }; let mut all_bytes = Vec::new(); @@ -255,8 +252,12 @@ fn get_referenced_xobject_names(doc: &Document, page_id: ObjectId) -> HashSet>) { + if let Ok(content) = lopdf::content::Content::decode(content_bytes) { for op in &content.operations { if op.operator == "Do" && let Some(Object::Name(name)) = op.operands.first() @@ -265,10 +266,129 @@ fn get_referenced_xobject_names(doc: &Document, page_id: ObjectId) -> HashSet HashSet> { + let mut names = HashSet::new(); + + let content_bytes = get_page_content_bytes(doc, page_id); + if content_bytes.is_empty() { + return names; + } + + collect_do_names(&content_bytes, &mut names); + + // One-level recursion: check if any referenced XObjects are Form XObjects + // and scan their content streams for additional Do operators + if names.is_empty() { + return names; + } + + let xobjects = match get_page_xobjects(doc, page_id) { + Some(x) => x, + None => return names, + }; + + let initial_names: Vec> = names.iter().cloned().collect(); + for name in &initial_names { + let obj_ref = match xobjects.get(name.as_slice()) { + Ok(r) => r, + Err(_) => continue, + }; + let obj_id = match obj_ref { + Object::Reference(id) => *id, + _ => continue, + }; + let stream = match doc.get_object(obj_id) { + Ok(Object::Stream(s)) => s, + _ => continue, + }; + let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { + if let Object::Name(n) = v { + Some(n.as_slice()) + } else { + None + } + }); + if subtype != Some(b"Form") { + continue; + } + // Decompress and scan the Form XObject's content stream + let mut form_stream = stream.clone(); + let _ = form_stream.decompress(); + collect_do_names(&form_stream.content, &mut names); + } names } +/// Check if any referenced XObject on the page has `/Subtype /Form`. +#[cfg(all(feature = "ocr", feature = "render"))] +pub(crate) fn page_has_form_xobjects(doc: &Document, page_id: ObjectId) -> bool { + let xobjects = match get_page_xobjects(doc, page_id) { + Some(x) => x, + None => return false, + }; + + let referenced_names = get_referenced_xobject_names(doc, page_id); + + for (name, obj_ref) in xobjects.iter() { + if !referenced_names.is_empty() && !referenced_names.contains(name) { + continue; + } + let obj_id = match obj_ref { + Object::Reference(id) => *id, + _ => continue, + }; + let stream = match doc.get_object(obj_id) { + Ok(Object::Stream(s)) => s, + _ => continue, + }; + let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { + if let Object::Name(n) = v { + Some(n.as_slice()) + } else { + None + } + }); + if subtype == Some(b"Form") { + return true; + } + } + false +} + +/// Check if the page content stream contains vector drawing operators. +/// Returns true if the count of path operators exceeds a threshold (> 10). +#[cfg(all(feature = "ocr", feature = "render"))] +pub(crate) fn page_has_vector_content(doc: &Document, page_id: ObjectId) -> bool { + let content_bytes = get_page_content_bytes(doc, page_id); + if content_bytes.is_empty() { + return false; + } + + let content = match lopdf::content::Content::decode(&content_bytes) { + Ok(c) => c, + Err(_) => return false, + }; + + let path_ops: &[&str] = &[ + "m", "l", "c", "v", "y", "h", "re", "S", "s", "f", "F", "f*", "B", "B*", "b", "b*", "n", "W", + "W*", + ]; + + let count = content + .operations + .iter() + .filter(|op| path_ops.contains(&op.operator.as_str())) + .count(); + + count > 10 +} + /// Resolve /DecodeParms from a stream dictionary, following indirect references. fn resolve_decode_parms(doc: &Document, dict: &lopdf::Dictionary) -> Option { let dp = dict.get(b"DecodeParms").ok()?; diff --git a/src/core/mod.rs b/src/core/mod.rs index bcfe5f3..dd9f0b6 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -5,3 +5,6 @@ pub(crate) mod text; #[cfg(feature = "ocr")] pub(crate) mod ocr; + +#[cfg(feature = "render")] +pub(crate) mod render; diff --git a/src/core/ocr.rs b/src/core/ocr.rs index 934f105..69ee44c 100644 --- a/src/core/ocr.rs +++ b/src/core/ocr.rs @@ -7,6 +7,11 @@ use napi::Result; use rayon::prelude::*; use std::sync::Arc; +#[cfg(all(feature = "ocr", feature = "render"))] +use crate::core::images::{page_has_form_xobjects, page_has_vector_content}; +#[cfg(all(feature = "ocr", feature = "render"))] +use crate::core::render; + pub(crate) fn normalize_max_threads(v: Option) -> u32 { let default = 4u32; let max = std::thread::available_parallelism() @@ -33,29 +38,21 @@ fn get_ocr_pool(threads: usize) -> Arc { } /// Auto-detect tessdata path, cached for the lifetime of the process. -/// Checks `TESSDATA_PREFIX` env var first (user override), then falls back to -/// parsing the output of `tesseract --list-langs` (e.g. -/// `List of available languages in "/usr/share/tesseract-ocr/5/tessdata/" (161):`). -/// Returns `None` if neither source yields a path, letting tesseract use its -/// compiled-in default. fn get_tessdata_prefix() -> Option<&'static str> { use std::sync::OnceLock; static TESSDATA_PATH: OnceLock> = OnceLock::new(); TESSDATA_PATH .get_or_init(|| { - // User-provided override takes priority if let Ok(path) = std::env::var("TESSDATA_PREFIX") { return Some(path); } - // Auto-detect from tesseract --list-langs let output = std::process::Command::new("tesseract") .arg("--list-langs") .output() .ok()?; - // tesseract writes the path header to stderr let stderr = String::from_utf8_lossy(&output.stderr); let text = if stderr.contains('"') { stderr @@ -63,7 +60,6 @@ fn get_tessdata_prefix() -> Option<&'static str> { String::from_utf8_lossy(&output.stdout) }; - // Parse: `List of available languages in "/path/to/tessdata/" (N):` let start = text.find('"')?; let end = text[start + 1..].find('"')?; Some(text[start + 1..start + 1 + end].to_string()) @@ -71,42 +67,76 @@ fn get_tessdata_prefix() -> Option<&'static str> { .as_deref() } -fn ocr_page_images(doc: &Document, page_id: ObjectId, lang: &str) -> String { +/// OCR a single DynamicImage and return extracted text. +fn ocr_dynamic_image(img: &DynamicImage, lang: &str) -> String { let datapath = get_tessdata_prefix().unwrap_or(""); + let rgb = img.to_rgb8(); + let (w, h) = rgb.dimensions(); + let pixels = rgb.as_raw(); + + let tess = tesseract_rs::TesseractAPI::new(); + if tess.init(datapath, lang).is_err() { + return String::new(); + } + if tess + .set_image(pixels, w as i32, h as i32, 3, (w * 3) as i32) + .is_err() + { + return String::new(); + } + match tess.get_utf8_text() { + Ok(text) => { + let trimmed = text.trim(); + if trimmed.is_empty() { + String::new() + } else { + trimmed.to_string() + } + } + Err(_) => String::new(), + } +} + +fn ocr_page_images(doc: &Document, page_id: ObjectId, lang: &str) -> String { let images: Vec = collect_page_decoded_images(doc, page_id); let mut texts = Vec::new(); for dyn_img in &images { - let rgb = dyn_img.to_rgb8(); - let (w, h) = rgb.dimensions(); - let pixels = rgb.as_raw(); - - let tess = tesseract_rs::TesseractAPI::new(); - if tess.init(datapath, lang).is_err() { - continue; - } - if tess - .set_image(pixels, w as i32, h as i32, 3, (w * 3) as i32) - .is_err() - { - continue; - } - if let Ok(text) = tess.get_utf8_text() { - let trimmed = text.trim(); - if !trimmed.is_empty() { - texts.push(trimmed.to_string()); - } + let text = ocr_dynamic_image(dyn_img, lang); + if !text.is_empty() { + texts.push(text); } } texts.join("\n") } +// Re-export render-mode constants so existing `use crate::core::ocr::RENDER_MODE_*` +// paths keep compiling. +#[cfg(feature = "render")] +pub(crate) use crate::core::render::{RENDER_MODE_ALWAYS, RENDER_MODE_NEVER}; + +// When the render feature is off, define the constants locally (they're still +// accepted as parameters but silently ignored). +#[cfg(not(feature = "render"))] +pub(crate) const RENDER_MODE_AUTO: u8 = 0; +#[cfg(not(feature = "render"))] +pub(crate) const RENDER_MODE_NEVER: u8 = 1; +#[cfg(not(feature = "render"))] +pub(crate) const RENDER_MODE_ALWAYS: u8 = 2; + +/// Extract text with OCR fallback, with optional render tier. +/// +/// `render_dpi` and `render_mode` are always accepted as params. +/// When the `render` feature is disabled they are silently ignored. pub(crate) fn extract_text_with_ocr( doc: &Document, + pdf_bytes: &[u8], lang: &str, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, ) -> Result> { let pages = doc.get_pages(); let page_count_str = pages.len().to_string(); @@ -114,27 +144,82 @@ pub(crate) fn extract_text_with_ocr( let pool = get_ocr_pool(max_threads as usize); + #[cfg(feature = "render")] + let pdf_arc: Arc> = Arc::new(pdf_bytes.to_vec()); + #[cfg(not(feature = "render"))] + let _ = (pdf_bytes, render_dpi, render_mode); + let mut results: Vec = pool.install(|| { page_entries .par_iter() .map(|&(page_num, page_id)| { + // Tier 1: Native text extraction let raw = doc.extract_text(&[page_num]).unwrap_or_default(); let native = strip_footer_artifacts(&raw, &page_count_str); let non_ws: usize = native.chars().filter(|c| !c.is_whitespace()).count(); if non_ws >= min_len as usize { - OcrPageText { + #[cfg(feature = "render")] + { + // In Always mode, render every page regardless + if render_mode == RENDER_MODE_ALWAYS { + if let Some(rendered_text) = try_render_ocr_page(&pdf_arc, page_num, render_dpi, lang) + { + if !rendered_text.is_empty() { + return OcrPageText { + page: page_num, + text: rendered_text, + source: TextSource::Rendered, + }; + } + } + } + } + return OcrPageText { page: page_num, text: native, source: TextSource::Native, - } - } else { - let ocr_text = ocr_page_images(doc, page_id, lang); - OcrPageText { + }; + } + + // Tier 2: OCR extracted raster images + let ocr_text = ocr_page_images(doc, page_id, lang); + if !ocr_text.is_empty() { + return OcrPageText { page: page_num, text: ocr_text, source: TextSource::Ocr, + }; + } + + // Tier 3: Render page + OCR (render feature only) + #[cfg(feature = "render")] + { + if render_mode != RENDER_MODE_NEVER { + let should_render = render_mode == RENDER_MODE_ALWAYS + || page_has_form_xobjects(doc, page_id) + || page_has_vector_content(doc, page_id); + + if should_render { + if let Some(rendered_text) = try_render_ocr_page(&pdf_arc, page_num, render_dpi, lang) + { + if !rendered_text.is_empty() { + return OcrPageText { + page: page_num, + text: rendered_text, + source: TextSource::Rendered, + }; + } + } + } } } + + // Fallback: empty + OcrPageText { + page: page_num, + text: String::new(), + source: TextSource::Ocr, + } }) .collect() }); @@ -142,8 +227,20 @@ pub(crate) fn extract_text_with_ocr( Ok(results) } +/// Render a page to an image and OCR the result. +#[cfg(feature = "render")] +fn try_render_ocr_page(pdf_bytes: &[u8], page_num: u32, dpi: u32, lang: &str) -> Option { + if !render::is_pdfium_available() { + return None; + } + // page_num is 1-based, PDFium page index is 0-based + let page_index = (page_num - 1) as u16; + let img = render::render_page_to_image_from_bytes(pdf_bytes, page_index, dpi)?; + let text = ocr_dynamic_image(&img, lang); + Some(text) +} + pub(crate) fn detect_headers_footers_ocr(pages: &[OcrPageText]) -> Vec { - // Convert to PageText for header/footer detection let as_page_text: Vec = pages .iter() .map(|p| PageText { @@ -152,7 +249,6 @@ pub(crate) fn detect_headers_footers_ocr(pages: &[OcrPageText]) -> Vec Vec TextSource::Native, TextSource::Ocr => TextSource::Ocr, + #[cfg(feature = "render")] + TextSource::Rendered => TextSource::Rendered, }, }) .collect() diff --git a/src/core/render.rs b/src/core/render.rs new file mode 100644 index 0000000..3643c72 --- /dev/null +++ b/src/core/render.rs @@ -0,0 +1,159 @@ +use image::{DynamicImage, ImageFormat}; +use pdfium_render::prelude::*; +use std::io::Cursor; +use std::sync::{Mutex, OnceLock}; + +static PDFIUM: OnceLock> = OnceLock::new(); + +/// Serializes all PDFium FFI operations across libuv worker threads. +static PDFIUM_LOCK: Mutex<()> = Mutex::new(()); + +/// Render-mode constants shared by both the render and OCR subsystems. +pub(crate) const RENDER_MODE_AUTO: u8 = 0; +pub(crate) const RENDER_MODE_NEVER: u8 = 1; +pub(crate) const RENDER_MODE_ALWAYS: u8 = 2; + +fn try_bind_pdfium_at(path: &str) -> Option { + Pdfium::bind_to_library(path).ok().map(Pdfium::new) +} + +fn init_pdfium() -> Result { + // 1. PDFIUM_LIBRARY_PATH env var + if let Ok(path) = std::env::var("PDFIUM_LIBRARY_PATH") { + if let Some(p) = try_bind_pdfium_at(&path) { + return Ok(p); + } + } + + // 2. System library paths (dlopen search) + if let Ok(bindings) = Pdfium::bind_to_system_library() { + return Ok(Pdfium::new(bindings)); + } + + Err("PDFium library not found. Install PDFium or set PDFIUM_LIBRARY_PATH.".to_string()) +} + +fn get_pdfium() -> Result<&'static Pdfium, &'static str> { + PDFIUM + .get_or_init(init_pdfium) + .as_ref() + .map_err(|e| e.as_str()) +} + +/// Attempt to initialize PDFium with a custom path (called before the lazy singleton). +fn get_pdfium_with_path(custom_path: Option<&str>) -> Result<&'static Pdfium, &'static str> { + if let Some(path) = custom_path { + if PDFIUM.get().is_none() { + let result = try_bind_pdfium_at(path).map(Ok).unwrap_or_else(init_pdfium); + let _ = PDFIUM.set(result); + } + } + get_pdfium() +} + +pub(crate) fn is_pdfium_available() -> bool { + get_pdfium().is_ok() +} + +pub(crate) fn ensure_pdfium_with_path(path: Option<&str>) -> Result<(), &'static str> { + get_pdfium_with_path(path).map(|_| ()) +} + +pub(crate) fn normalize_dpi(dpi: Option) -> u32 { + dpi.unwrap_or(300).clamp(72, 600) +} + +// ── Internal (caller must hold PDFIUM_LOCK) ───────────────────── + +fn render_page_to_image_inner(pdf_bytes: &[u8], page_index: u16, dpi: u32) -> Option { + let pdfium = get_pdfium().ok()?; + let doc = pdfium.load_pdf_from_byte_slice(pdf_bytes, None).ok()?; + let page = doc.pages().get(page_index).ok()?; + + let config = PdfRenderConfig::new() + .set_target_width(((page.width().value * dpi as f32) / 72.0) as Pixels) + .set_target_height(((page.height().value * dpi as f32) / 72.0) as Pixels) + .render_form_data(true); + + page + .render_with_config(&config) + .ok() + .map(|bmp| bmp.as_image()) +} + +// ── Public lock-acquiring wrappers ────────────────────────────── + +pub(crate) fn render_page_to_image( + pdf_bytes: &[u8], + page_index: u16, + dpi: u32, +) -> Option { + let _guard = PDFIUM_LOCK.lock().unwrap(); + render_page_to_image_inner(pdf_bytes, page_index, dpi) +} + +#[allow(dead_code)] +pub(crate) fn render_page_to_png( + pdf_bytes: &[u8], + page_index: u16, + dpi: u32, +) -> Option<(u32, u32, Vec)> { + let _guard = PDFIUM_LOCK.lock().unwrap(); + let img = render_page_to_image_inner(pdf_bytes, page_index, dpi)?; + let (w, h) = (img.width(), img.height()); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, ImageFormat::Png).ok()?; + Some((w, h, buf.into_inner())) +} + +pub(crate) fn render_pages_to_png( + pdf_bytes: &[u8], + page_indices: &[u16], + dpi: u32, +) -> Vec<(u16, u32, u32, Vec)> { + let _guard = PDFIUM_LOCK.lock().unwrap(); + + let pdfium = match get_pdfium() { + Ok(p) => p, + Err(_) => return Vec::new(), + }; + let doc = match pdfium.load_pdf_from_byte_slice(pdf_bytes, None) { + Ok(d) => d, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::with_capacity(page_indices.len()); + + for &idx in page_indices { + let page = match doc.pages().get(idx) { + Ok(p) => p, + Err(_) => continue, + }; + + let config = PdfRenderConfig::new() + .set_target_width(((page.width().value * dpi as f32) / 72.0) as Pixels) + .set_target_height(((page.height().value * dpi as f32) / 72.0) as Pixels) + .render_form_data(true); + + if let Ok(bmp) = page.render_with_config(&config) { + let img = bmp.as_image(); + let (w, h) = (img.width(), img.height()); + let mut buf = Cursor::new(Vec::new()); + if img.write_to(&mut buf, ImageFormat::Png).is_ok() { + results.push((idx, w, h, buf.into_inner())); + } + } + } + + results +} + +/// Render a single page to a DynamicImage. Used by the OCR cascade. +#[cfg_attr(not(feature = "ocr"), allow(dead_code))] +pub(crate) fn render_page_to_image_from_bytes( + pdf_bytes: &[u8], + page_index: u16, + dpi: u32, +) -> Option { + render_page_to_image(pdf_bytes, page_index, dpi) +} diff --git a/src/lib.rs b/src/lib.rs index c8153a3..e651ff3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,12 +12,16 @@ mod types; // Public API types (appear in generated .d.ts) pub use types::{ - BoxType, PageAnnotation, PageBox, PageImage, PageText, PdfDocument, PdfMeta, StructuredPageText, + BoxType, Capabilities, PageAnnotation, PageBox, PageImage, PageText, PdfDocument, PdfMeta, + StructuredPageText, }; #[cfg(feature = "ocr")] pub use types::{OcrOptions, OcrPageText, OcrStructuredPageText, PdfDocumentOcr, TextSource}; +#[cfg(feature = "render")] +pub use types::{RawRenderedPage, RenderMode, RenderOptions, RenderedPage}; + // Internal plumbing (used by Task impls in this file — must be `pub` for napi Task trait) pub use types::{RawPageImage, RawPdfDocument}; @@ -48,6 +52,49 @@ fn load_doc(buf: &[u8]) -> Result { Document::load_mem(buf).map_err(|e| Error::from_reason(format!("Failed to load PDF: {e}"))) } +/// Extract render mode (u8) from OcrOptions when render feature is enabled. +#[cfg(all(feature = "ocr", feature = "render"))] +fn extract_render_mode(opts: &Option) -> u8 { + use crate::core::render::{RENDER_MODE_ALWAYS, RENDER_MODE_AUTO, RENDER_MODE_NEVER}; + opts + .as_ref() + .and_then(|o| o.render.as_ref()) + .map(|m| match m { + RenderMode::Auto => RENDER_MODE_AUTO, + RenderMode::Never => RENDER_MODE_NEVER, + RenderMode::Always => RENDER_MODE_ALWAYS, + }) + .unwrap_or(RENDER_MODE_AUTO) +} + +/// Extract render DPI from OcrOptions when render feature is enabled. +#[cfg(all(feature = "ocr", feature = "render"))] +fn extract_render_dpi(opts: &Option) -> u32 { + crate::core::render::normalize_dpi(opts.as_ref().and_then(|o| o.render_dpi)) +} + +/// Extract render mode as u8 — when render feature is off, always return Auto (0). +#[cfg(all(feature = "ocr", not(feature = "render")))] +fn extract_render_mode(_opts: &Option) -> u8 { + crate::core::ocr::RENDER_MODE_AUTO +} + +/// Extract render DPI — when render feature is off, return default 300. +#[cfg(all(feature = "ocr", not(feature = "render")))] +fn extract_render_dpi(_opts: &Option) -> u32 { + 300 +} + +/// Initialize PDFium from pdfium_path in OcrOptions (if present). +#[cfg(all(feature = "ocr", feature = "render"))] +fn maybe_init_pdfium(opts: &Option) { + let path = opts.as_ref().and_then(|o| o.pdfium_path.as_deref()); + let _ = crate::core::render::ensure_pdfium_with_path(path); +} + +#[cfg(all(feature = "ocr", not(feature = "render")))] +fn maybe_init_pdfium(_opts: &Option) {} + // ── Standalone sync functions ─────────────────────────────────── #[napi] @@ -97,6 +144,7 @@ pub fn extract_text_with_ocr_per_page( buffer: Buffer, opts: Option, ) -> Result> { + maybe_init_pdfium(&opts); let doc = load_doc(buffer.as_ref())?; let lang = opts .as_ref() @@ -104,12 +152,23 @@ pub fn extract_text_with_ocr_per_page( .unwrap_or("eng"); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); - extract_text_with_ocr(&doc, lang, min_len, max_threads) + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); + extract_text_with_ocr( + &doc, + buffer.as_ref(), + lang, + min_len, + max_threads, + render_dpi, + render_mode, + ) } #[cfg(feature = "ocr")] #[napi] pub fn pdf_document_ocr(buffer: Buffer, opts: Option) -> Result { + maybe_init_pdfium(&opts); let doc = load_doc(buffer.as_ref())?; let lang = opts .as_ref() @@ -117,14 +176,130 @@ pub fn pdf_document_ocr(buffer: Buffer, opts: Option) -> Result, + dpi: u32, + mode: u8, +} + +#[cfg(feature = "render")] +#[napi] +impl Task for RenderPagesTask { + type Output = Vec; + type JsValue = Vec; + + fn compute(&mut self) -> Result { + use crate::core::render::{RENDER_MODE_AUTO, RENDER_MODE_NEVER}; + + let dpi = self.dpi; + let mode = self.mode; + let pdf_bytes = &self.data; + + if mode == RENDER_MODE_NEVER { + return Ok(Vec::new()); + } + + crate::core::render::ensure_pdfium_with_path(None) + .map_err(|e| Error::from_reason(e.to_string()))?; + + let doc = load_doc(pdf_bytes)?; + let pages = doc.get_pages(); + let page_count = pages.len() as u16; + + let indices: Vec = if mode == RENDER_MODE_AUTO { + pages + .iter() + .filter_map(|(&page_num, &page_id)| { + let raw = doc.extract_text(&[page_num]).unwrap_or_default(); + let non_ws: usize = raw.chars().filter(|c| !c.is_whitespace()).count(); + if non_ws > 0 { + return None; + } + let has_images = + !crate::core::images::collect_page_decoded_images(&doc, page_id).is_empty(); + if has_images { + return None; + } + Some((page_num - 1) as u16) + }) + .collect() + } else { + (0..page_count).collect() + }; + + let rendered = crate::core::render::render_pages_to_png(pdf_bytes, &indices, dpi); + Ok( + rendered + .into_iter() + .map(|(idx, w, h, data)| RawRenderedPage { + page: (idx as u32) + 1, + width: w, + height: h, + dpi, + data, + }) + .collect(), + ) + } + + fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { + Ok(output.into_iter().map(RenderedPage::from).collect()) + } +} + +#[cfg(feature = "render")] +#[napi] +pub fn render_pages_async( + buffer: Buffer, + opts: Option, +) -> AsyncTask { + use crate::core::render::{RENDER_MODE_ALWAYS, RENDER_MODE_AUTO, RENDER_MODE_NEVER}; + let dpi = crate::core::render::normalize_dpi(opts.as_ref().and_then(|o| o.dpi)); + let mode = opts + .as_ref() + .and_then(|o| o.mode.as_ref()) + .map(|m| match m { + RenderMode::Auto => RENDER_MODE_AUTO, + RenderMode::Never => RENDER_MODE_NEVER, + RenderMode::Always => RENDER_MODE_ALWAYS, + }) + .unwrap_or(RENDER_MODE_ALWAYS); + AsyncTask::new(RenderPagesTask { + data: buffer.to_vec(), + dpi, + mode, + }) +} + +// ── Capabilities check ────────────────────────────────────────── + +#[napi] +pub fn capabilities() -> Capabilities { + Capabilities { + ocr: cfg!(feature = "ocr"), + #[cfg(feature = "render")] + render: crate::core::render::is_pdfium_available(), + #[cfg(not(feature = "render"))] + render: false, + } +} + // ── Standalone async functions (libuv thread pool via AsyncTask) ─ pub struct ExtractTextTask(Vec); @@ -267,6 +442,8 @@ pub struct ExtractTextOcrTask { lang: String, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, } #[cfg(feature = "ocr")] @@ -277,7 +454,15 @@ impl Task for ExtractTextOcrTask { fn compute(&mut self) -> Result { let doc = load_doc(&self.data)?; - extract_text_with_ocr(&doc, &self.lang, self.min_len, self.max_threads) + extract_text_with_ocr( + &doc, + &self.data, + &self.lang, + self.min_len, + self.max_threads, + self.render_dpi, + self.render_mode, + ) } fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { @@ -291,17 +476,22 @@ pub fn extract_text_with_ocr_per_page_async( buffer: Buffer, opts: Option, ) -> AsyncTask { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.clone()) .unwrap_or_else(|| "eng".to_string()); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); AsyncTask::new(ExtractTextOcrTask { data: buffer.to_vec(), lang, min_len, max_threads, + render_dpi, + render_mode, }) } @@ -311,6 +501,8 @@ pub struct PdfDocumentOcrTask { lang: String, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, } #[cfg(feature = "ocr")] @@ -321,7 +513,15 @@ impl Task for PdfDocumentOcrTask { fn compute(&mut self) -> Result { let doc = load_doc(&self.data)?; - extract_all_with_ocr(&doc, &self.lang, self.min_len, self.max_threads) + extract_all_with_ocr( + &doc, + &self.data, + &self.lang, + self.min_len, + self.max_threads, + self.render_dpi, + self.render_mode, + ) } fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { @@ -335,17 +535,22 @@ pub fn pdf_document_ocr_async( buffer: Buffer, opts: Option, ) -> AsyncTask { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.clone()) .unwrap_or_else(|| "eng".to_string()); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); AsyncTask::new(PdfDocumentOcrTask { data: buffer.to_vec(), lang, min_len, max_threads, + render_dpi, + render_mode, }) } @@ -452,9 +657,12 @@ impl Task for SharedStructuredTextTask { #[cfg(feature = "ocr")] pub struct SharedExtractTextOcrTask { doc: Arc, + raw: Arc>, lang: String, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, } #[cfg(feature = "ocr")] @@ -464,7 +672,15 @@ impl Task for SharedExtractTextOcrTask { type JsValue = Vec; fn compute(&mut self) -> Result { - extract_text_with_ocr(&self.doc, &self.lang, self.min_len, self.max_threads) + extract_text_with_ocr( + &self.doc, + &self.raw, + &self.lang, + self.min_len, + self.max_threads, + self.render_dpi, + self.render_mode, + ) } fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { @@ -475,9 +691,12 @@ impl Task for SharedExtractTextOcrTask { #[cfg(feature = "ocr")] pub struct SharedPdfDocumentOcrTask { doc: Arc, + raw: Arc>, lang: String, min_len: u32, max_threads: u32, + render_dpi: u32, + render_mode: u8, } #[cfg(feature = "ocr")] @@ -487,7 +706,15 @@ impl Task for SharedPdfDocumentOcrTask { type JsValue = PdfDocumentOcr; fn compute(&mut self) -> Result { - extract_all_with_ocr(&self.doc, &self.lang, self.min_len, self.max_threads) + extract_all_with_ocr( + &self.doc, + &self.raw, + &self.lang, + self.min_len, + self.max_threads, + self.render_dpi, + self.render_mode, + ) } fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { @@ -495,27 +722,103 @@ impl Task for SharedPdfDocumentOcrTask { } } +#[cfg(feature = "render")] +pub struct SharedRenderPagesTask { + raw: Arc>, + dpi: u32, + mode: u8, +} + +#[cfg(feature = "render")] +#[napi] +impl Task for SharedRenderPagesTask { + type Output = Vec; + type JsValue = Vec; + + fn compute(&mut self) -> Result { + use crate::core::render::{RENDER_MODE_AUTO, RENDER_MODE_NEVER}; + + let dpi = self.dpi; + let mode = self.mode; + let pdf_bytes = &self.raw; + + if mode == RENDER_MODE_NEVER { + return Ok(Vec::new()); + } + + crate::core::render::ensure_pdfium_with_path(None) + .map_err(|e| Error::from_reason(e.to_string()))?; + + let doc = load_doc(pdf_bytes)?; + let pages = doc.get_pages(); + let page_count = pages.len() as u16; + + let indices: Vec = if mode == RENDER_MODE_AUTO { + pages + .iter() + .filter_map(|(&page_num, &page_id)| { + let raw = doc.extract_text(&[page_num]).unwrap_or_default(); + let non_ws: usize = raw.chars().filter(|c| !c.is_whitespace()).count(); + if non_ws > 0 { + return None; + } + let has_images = + !crate::core::images::collect_page_decoded_images(&doc, page_id).is_empty(); + if has_images { + return None; + } + Some((page_num - 1) as u16) + }) + .collect() + } else { + (0..page_count).collect() + }; + + let rendered = crate::core::render::render_pages_to_png(pdf_bytes, &indices, dpi); + Ok( + rendered + .into_iter() + .map(|(idx, w, h, data)| RawRenderedPage { + page: (idx as u32) + 1, + width: w, + height: h, + dpi, + data, + }) + .collect(), + ) + } + + fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { + Ok(output.into_iter().map(RenderedPage::from).collect()) + } +} + #[napi] pub struct PdfDown { doc: Arc, + #[allow(dead_code)] // used when ocr or render features are enabled + raw: Arc>, } #[napi] impl PdfDown { #[napi(constructor)] pub fn new(buffer: Buffer) -> Result { - let doc = Document::load_mem(buffer.as_ref()) + let bytes = buffer.to_vec(); + let doc = Document::load_mem(&bytes) .map_err(|e| Error::from_reason(format!("Failed to load PDF: {e}")))?; - Ok(PdfDown { doc: Arc::new(doc) }) + Ok(PdfDown { + doc: Arc::new(doc), + raw: Arc::new(bytes), + }) } - /// Sync: extract text per page (reuses the already-parsed document) #[napi] pub fn text_per_page(&self) -> Result> { extract_text(&self.doc) } - /// Sync: extract images per page (reuses the already-parsed document) #[napi] pub fn images_per_page(&self) -> Result> { Ok( @@ -526,61 +829,51 @@ impl PdfDown { ) } - /// Sync: extract annotations per page (reuses the already-parsed document) #[napi] pub fn annotations_per_page(&self) -> Vec { extract_annotations(&self.doc) } - /// Sync: get PDF metadata (reuses the already-parsed document) #[napi] pub fn metadata(&self) -> PdfMeta { extract_metadata(&self.doc) } - /// Async: extract text per page on the libuv thread pool (shares parsed document via Arc) #[napi] pub fn text_per_page_async(&self) -> AsyncTask { AsyncTask::new(SharedExtractTextTask(Arc::clone(&self.doc))) } - /// Async: extract images per page on the libuv thread pool (shares parsed document via Arc) #[napi] pub fn images_per_page_async(&self) -> AsyncTask { AsyncTask::new(SharedExtractImagesTask(Arc::clone(&self.doc))) } - /// Async: extract annotations per page on the libuv thread pool (shares parsed document via Arc) #[napi] pub fn annotations_per_page_async(&self) -> AsyncTask { AsyncTask::new(SharedExtractAnnotationsTask(Arc::clone(&self.doc))) } - /// Async: get PDF metadata on the libuv thread pool (shares parsed document via Arc) #[napi] pub fn metadata_async(&self) -> AsyncTask { AsyncTask::new(SharedPdfMetaTask(Arc::clone(&self.doc))) } - /// Sync: extract everything from the PDF in one call (reuses the already-parsed document) #[napi] pub fn document(&self) -> Result { Ok(PdfDocument::from(extract_all(&self.doc)?)) } - /// Async: extract everything from the PDF on the libuv thread pool (shares parsed document via Arc) #[napi] pub fn document_async(&self) -> AsyncTask { AsyncTask::new(SharedPdfDocumentTask(Arc::clone(&self.doc))) } - /// Sync: extract structured text with header/footer detection #[napi] pub fn structured_text(&self) -> Result> { extract_structured_text(&self.doc) } - /// Async: extract structured text with header/footer detection #[napi] pub fn structured_text_async(&self) -> AsyncTask { AsyncTask::new(SharedStructuredTextTask(Arc::clone(&self.doc))) @@ -590,72 +883,124 @@ impl PdfDown { #[cfg(feature = "ocr")] #[napi] impl PdfDown { - /// Sync: extract text with OCR fallback for image-only pages #[napi] pub fn text_with_ocr_per_page(&self, opts: Option) -> Result> { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.as_deref()) .unwrap_or("eng"); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); - extract_text_with_ocr(&self.doc, lang, min_len, max_threads) + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); + extract_text_with_ocr( + &self.doc, + &self.raw, + lang, + min_len, + max_threads, + render_dpi, + render_mode, + ) } - /// Async: extract text with OCR fallback for image-only pages #[napi] pub fn text_with_ocr_per_page_async( &self, opts: Option, ) -> AsyncTask { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.clone()) .unwrap_or_else(|| "eng".to_string()); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); AsyncTask::new(SharedExtractTextOcrTask { doc: Arc::clone(&self.doc), + raw: Arc::clone(&self.raw), lang, min_len, max_threads, + render_dpi, + render_mode, }) } - /// Sync: extract everything from the PDF with OCR text fallback #[napi] pub fn document_ocr(&self, opts: Option) -> Result { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.as_deref()) .unwrap_or("eng"); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); Ok(PdfDocumentOcr::from(extract_all_with_ocr( &self.doc, + &self.raw, lang, min_len, max_threads, + render_dpi, + render_mode, )?)) } - /// Async: extract everything from the PDF with OCR text fallback #[napi] pub fn document_ocr_async( &self, opts: Option, ) -> AsyncTask { + maybe_init_pdfium(&opts); let lang = opts .as_ref() .and_then(|o| o.lang.clone()) .unwrap_or_else(|| "eng".to_string()); let min_len = opts.as_ref().and_then(|o| o.min_text_length).unwrap_or(1); let max_threads = normalize_max_threads(opts.as_ref().and_then(|o| o.max_threads)); + let render_dpi = extract_render_dpi(&opts); + let render_mode = extract_render_mode(&opts); AsyncTask::new(SharedPdfDocumentOcrTask { doc: Arc::clone(&self.doc), + raw: Arc::clone(&self.raw), lang, min_len, max_threads, + render_dpi, + render_mode, + }) + } +} + +#[cfg(feature = "render")] +#[napi] +impl PdfDown { + #[napi] + pub fn render_pages_async( + &self, + opts: Option, + ) -> AsyncTask { + use crate::core::render::{RENDER_MODE_ALWAYS, RENDER_MODE_AUTO, RENDER_MODE_NEVER}; + let dpi = crate::core::render::normalize_dpi(opts.as_ref().and_then(|o| o.dpi)); + let mode = opts + .as_ref() + .and_then(|o| o.mode.as_ref()) + .map(|m| match m { + RenderMode::Auto => RENDER_MODE_AUTO, + RenderMode::Never => RENDER_MODE_NEVER, + RenderMode::Always => RENDER_MODE_ALWAYS, + }) + .unwrap_or(RENDER_MODE_ALWAYS); + AsyncTask::new(SharedRenderPagesTask { + raw: Arc::clone(&self.raw), + dpi, + mode, }) } } diff --git a/src/types.rs b/src/types.rs index e84fe63..df9a1ff 100644 --- a/src/types.rs +++ b/src/types.rs @@ -17,11 +17,19 @@ pub struct StructuredPageText { pub footer: String, } -#[cfg(feature = "ocr")] +#[cfg(all(feature = "ocr", not(feature = "render")))] +#[napi(string_enum)] +pub enum TextSource { + Native, + Ocr, +} + +#[cfg(all(feature = "ocr", feature = "render"))] #[napi(string_enum)] pub enum TextSource { Native, Ocr, + Rendered, } #[cfg(feature = "ocr")] @@ -32,7 +40,7 @@ pub struct OcrPageText { pub source: TextSource, } -#[cfg(feature = "ocr")] +#[cfg(all(feature = "ocr", not(feature = "render")))] #[napi(object)] pub struct OcrOptions { pub lang: Option, @@ -40,6 +48,70 @@ pub struct OcrOptions { pub max_threads: Option, } +#[cfg(all(feature = "ocr", feature = "render"))] +#[napi(object)] +pub struct OcrOptions { + pub lang: Option, + pub min_text_length: Option, + pub max_threads: Option, + pub render: Option, + pub render_dpi: Option, + pub pdfium_path: Option, +} + +#[cfg(feature = "render")] +#[napi(string_enum)] +pub enum RenderMode { + Auto, + Never, + Always, +} + +#[cfg(feature = "render")] +#[napi(object)] +pub struct RenderedPage { + pub page: u32, + pub width: u32, + pub height: u32, + pub dpi: u32, + pub data: Buffer, +} + +#[cfg(feature = "render")] +pub struct RawRenderedPage { + pub page: u32, + pub width: u32, + pub height: u32, + pub dpi: u32, + pub data: Vec, +} + +#[cfg(feature = "render")] +impl From for RenderedPage { + fn from(r: RawRenderedPage) -> Self { + RenderedPage { + page: r.page, + width: r.width, + height: r.height, + dpi: r.dpi, + data: r.data.into(), + } + } +} + +#[cfg(feature = "render")] +#[napi(object)] +pub struct RenderOptions { + pub dpi: Option, + pub mode: Option, +} + +#[napi(object)] +pub struct Capabilities { + pub ocr: bool, + pub render: bool, +} + #[napi(object)] pub struct PdfMeta { pub page_count: u32,