Skip to content
Merged

Dev #17

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ env:
APP_NAME: pdfdown
MACOSX_DEPLOYMENT_TARGET: '10.13'
CARGO_INCREMENTAL: '1'
'on':
on:
push:
branches:
- main
Expand All @@ -23,6 +23,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:

lint:
name: Lint
runs-on: ubuntu-latest
Expand Down Expand Up @@ -203,6 +204,32 @@ jobs:
brew update
brew install tesseract
tesseract --version
- name: Download PDFium (Linux)
  if: runner.os == 'Linux'
  run: |
    # Fetch the prebuilt PDFium archive matching the runner's CPU and publish
    # the shared-library path to later steps through GITHUB_ENV.
    # Abort on the first failing command or unset variable.
    set -euo pipefail
    ARCH=$(uname -m)
    if [ "$ARCH" = "x86_64" ]; then
      PDFIUM_ARCH="linux-x64"
    elif [ "$ARCH" = "aarch64" ]; then
      PDFIUM_ARCH="linux-arm64"
    else
      # Fail here instead of requesting "pdfium-.tgz" with an empty suffix.
      echo "Unsupported architecture for PDFium download: $ARCH" >&2
      exit 1
    fi
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as pdfium.tgz, which would only surface later as a tar error.
    # NOTE(review): "latest" is unpinned; consider pinning a release tag so CI is reproducible.
    curl --fail -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz
    mkdir -p /tmp/pdfium
    tar xzf pdfium.tgz -C /tmp/pdfium
    echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.so" >> "$GITHUB_ENV"
- name: Download PDFium (macOS)
  if: runner.os == 'macOS'
  run: |
    # Fetch the prebuilt PDFium archive matching the runner's CPU and publish
    # the dynamic-library path to later steps through GITHUB_ENV.
    # Abort on the first failing command or unset variable.
    set -euo pipefail
    ARCH=$(uname -m)
    if [ "$ARCH" = "x86_64" ]; then
      PDFIUM_ARCH="mac-x64"
    elif [ "$ARCH" = "arm64" ]; then
      PDFIUM_ARCH="mac-arm64"
    else
      # Fail here instead of requesting "pdfium-.tgz" with an empty suffix.
      echo "Unsupported architecture for PDFium download: $ARCH" >&2
      exit 1
    fi
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as pdfium.tgz, which would only surface later as a tar error.
    # NOTE(review): "latest" is unpinned; consider pinning a release tag so CI is reproducible.
    curl --fail -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz
    mkdir -p /tmp/pdfium
    tar xzf pdfium.tgz -C /tmp/pdfium
    echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.dylib" >> "$GITHUB_ENV"
- name: Install dependencies
run: yarn install
- name: Build OCR
Expand Down Expand Up @@ -447,6 +474,32 @@ jobs:
TESS_PREFIX=$(tesseract --list-langs 2>&1 | head -1 | sed 's/.*"\(.*\)".*/\1/')
echo "TESSDATA_PREFIX=${TESS_PREFIX}" >> $GITHUB_ENV
echo "Tesseract data path: ${TESS_PREFIX}"
- name: Download PDFium (Linux)
  if: runner.os == 'Linux'
  run: |
    # Fetch the prebuilt PDFium archive matching the runner's CPU and publish
    # the shared-library path to later steps through GITHUB_ENV.
    # Abort on the first failing command or unset variable.
    set -euo pipefail
    ARCH=$(uname -m)
    if [ "$ARCH" = "x86_64" ]; then
      PDFIUM_ARCH="linux-x64"
    elif [ "$ARCH" = "aarch64" ]; then
      PDFIUM_ARCH="linux-arm64"
    else
      # Fail here instead of requesting "pdfium-.tgz" with an empty suffix.
      echo "Unsupported architecture for PDFium download: $ARCH" >&2
      exit 1
    fi
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as pdfium.tgz, which would only surface later as a tar error.
    # NOTE(review): "latest" is unpinned; consider pinning a release tag so CI is reproducible.
    curl --fail -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz
    mkdir -p /tmp/pdfium
    tar xzf pdfium.tgz -C /tmp/pdfium
    echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.so" >> "$GITHUB_ENV"
- name: Download PDFium (macOS)
  if: runner.os == 'macOS'
  run: |
    # Fetch the prebuilt PDFium archive matching the runner's CPU and publish
    # the dynamic-library path to later steps through GITHUB_ENV.
    # Abort on the first failing command or unset variable.
    set -euo pipefail
    ARCH=$(uname -m)
    if [ "$ARCH" = "x86_64" ]; then
      PDFIUM_ARCH="mac-x64"
    elif [ "$ARCH" = "arm64" ]; then
      PDFIUM_ARCH="mac-arm64"
    else
      # Fail here instead of requesting "pdfium-.tgz" with an empty suffix.
      echo "Unsupported architecture for PDFium download: $ARCH" >&2
      exit 1
    fi
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as pdfium.tgz, which would only surface later as a tar error.
    # NOTE(review): "latest" is unpinned; consider pinning a release tag so CI is reproducible.
    curl --fail -L "https://github.com/nicksrandall/pdfium-lib/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -o pdfium.tgz
    mkdir -p /tmp/pdfium
    tar xzf pdfium.tgz -C /tmp/pdfium
    echo "PDFIUM_LIBRARY_PATH=/tmp/pdfium/lib/libpdfium.dylib" >> "$GITHUB_ENV"
- name: Install dependencies
run: yarn install
- name: Download OCR artifact
Expand Down
9 changes: 8 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
edition = "2024"

name = "pdfdown"
version = "0.9.7"
version = "0.9.8"

[lib]
crate-type = ["cdylib"]
Expand All @@ -20,10 +20,17 @@ napi = { version = "3.0.0", features = ["napi6"] }
napi-derive = "3.0.0"
rayon = "1.11"
tesseract-rs = { version = "0.1", optional = true }
pdfium-render = { version = "0.8", optional = true, default-features = false, features = [
"pdfium_latest",
"image_025",
"thread_safe",
"sync",
] }

[features]
default = []
ocr = ["tesseract-rs"]
render = ["pdfium-render"]

[build-dependencies]
napi-build = "2"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ for (const img of images) {
console.log(`Page ${img.page} image ${img.imageIndex}: ${img.width}x${img.height} ${img.colorSpace}`)
}
```

#### Extract images as PNG (async)

```typescript
Expand Down
41 changes: 29 additions & 12 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,19 @@
/* eslint-disable */
/**
 * PDF handle constructed from an in-memory `Buffer`.
 *
 * Extraction methods come in pairs: a synchronous form and an `Async`-suffixed
 * form that returns a `Promise`. `renderPagesAsync` is declared in async form only.
 */
export declare class PdfDown {
constructor(buffer: Buffer)
/** Sync: extract text per page (reuses the already-parsed document) */
textPerPage(): Array<PageText>
/** Sync: extract images per page (reuses the already-parsed document) */
imagesPerPage(): Array<PageImage>
/** Sync: extract annotations per page (reuses the already-parsed document) */
annotationsPerPage(): Array<PageAnnotation>
/** Sync: get PDF metadata (reuses the already-parsed document) */
metadata(): PdfMeta
/** Async: extract text per page on the libuv thread pool (shares parsed document via Arc) */
textPerPageAsync(): Promise<Array<PageText>>
/** Async: extract images per page on the libuv thread pool (shares parsed document via Arc) */
imagesPerPageAsync(): Promise<Array<PageImage>>
/** Async: extract annotations per page on the libuv thread pool (shares parsed document via Arc) */
annotationsPerPageAsync(): Promise<Array<PageAnnotation>>
/** Async: get PDF metadata on the libuv thread pool (shares parsed document via Arc) */
metadataAsync(): Promise<PdfMeta>
/** Sync: extract everything from the PDF in one call (reuses the already-parsed document) */
document(): PdfDocument
/** Async: extract everything from the PDF on the libuv thread pool (shares parsed document via Arc) */
documentAsync(): Promise<PdfDocument>
/** Sync: extract structured text with header/footer detection */
structuredText(): Array<StructuredPageText>
/** Async: extract structured text with header/footer detection */
structuredTextAsync(): Promise<Array<StructuredPageText>>
/**
 * Async: render pages, resolving to an array of `RenderedPage`.
 * `opts` may be omitted, `undefined`, or `null`.
 * NOTE(review): behavior when the native build lacks render support is not visible here — confirm.
 */
renderPagesAsync(opts?: RenderOptions | undefined | null): Promise<Array<RenderedPage>>
}

export declare const enum BoxType {
Expand All @@ -34,6 +23,13 @@ export declare const enum BoxType {
Unknown = 'Unknown'
}

/**
 * Returns a `Capabilities` record with boolean `ocr` and `render` flags.
 * NOTE(review): presumably reports which optional native features were compiled in — confirm.
 */
export declare function capabilities(): Capabilities

/**
 * Result shape of `capabilities()`.
 * NOTE(review): the flags presumably correspond to the `ocr` and `render` build features — confirm.
 */
export interface Capabilities {
ocr: boolean
render: boolean
}

/** Sync: extract annotations per page from a PDF supplied as a `Buffer`. */
export declare function extractAnnotationsPerPage(buffer: Buffer): Array<PageAnnotation>

/** Async: extract annotations per page from a PDF supplied as a `Buffer`; resolves to the same array shape as the sync form. */
export declare function extractAnnotationsPerPageAsync(buffer: Buffer): Promise<Array<PageAnnotation>>
Expand Down Expand Up @@ -133,6 +129,27 @@ export declare function pdfMetadata(buffer: Buffer): PdfMeta

/** Async: get PDF metadata from a PDF supplied as a `Buffer`; resolves to `PdfMeta`. */
export declare function pdfMetadataAsync(buffer: Buffer): Promise<PdfMeta>

/**
 * Element type of the array resolved by `renderPagesAsync`.
 * NOTE(review): units and encoding are not visible in this declaration — confirm
 * whether `page` is 0- or 1-based, whether `width`/`height` are pixels, and which
 * image format `data` holds.
 */
export interface RenderedPage {
page: number
width: number
height: number
dpi: number
data: Buffer
}

/**
 * String-valued mode accepted by `RenderOptions.mode`.
 * NOTE(review): the meaning of each member is not visible here — confirm against
 * the native implementation.
 * NOTE(review): as an ambient `const enum`, this may not be usable by consumers
 * compiling with `isolatedModules` — verify.
 */
export declare const enum RenderMode {
Auto = 'Auto',
Never = 'Never',
Always = 'Always'
}

/**
 * Optional settings for `renderPagesAsync`; both fields may be omitted.
 * NOTE(review): the defaults applied when `dpi` or `mode` is omitted are not
 * visible in this declaration — confirm.
 */
export interface RenderOptions {
dpi?: number
mode?: RenderMode
}

/**
 * Async: render pages of the PDF supplied as a `Buffer`, resolving to an array of `RenderedPage`.
 * `opts` may be omitted, `undefined`, or `null`.
 */
export declare function renderPagesAsync(buffer: Buffer, opts?: RenderOptions | undefined | null): Promise<Array<RenderedPage>>

export interface StructuredPageText {
page: number
header: string
Expand Down
Loading
Loading