Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 59 additions & 4 deletions .github/workflows/node.js.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ jobs:

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v3
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
Expand All @@ -37,6 +37,61 @@ jobs:
- name: Build (if present)
run: npm run build --if-present

- name: Run tests
run: npm test
- name: Run tests with coverage
run: npm run test:coverage
continue-on-error: ${{ matrix.experimental == true }}

- name: Generate coverage reports
if: matrix.node-version == '20.x'
run: |
npm run test:coverage:text
npm run test:coverage:lcov
echo "### Test Coverage Report" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
npx nyc report --reporter=text-summary >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY

- name: Upload coverage artifacts
if: matrix.node-version == '20.x'
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: coverage/
retention-days: 7

- name: Comment PR with coverage
if: matrix.node-version == '20.x' && github.event_name == 'pull_request'
uses: romeovs/lcov-reporter-action@v0.3.1
with:
lcov-file: ./coverage/lcov.info
github-token: ${{ secrets.GITHUB_TOKEN }}

# Pull-request gate: after the `build` job, re-runs the coverage suite on
# Node 20.x and fails when any coverage metric is below 80%.
coverage-check:
needs: build
runs-on: ubuntu-latest
# Skipped for every event other than pull_request.
if: github.event_name == 'pull_request'

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20.x'
cache: 'npm'

# NOTE(review): `npm ci` and the npm cache both rely on a committed lockfile —
# confirm package-lock.json is actually tracked in the repository.
- name: Install dependencies
run: npm ci

# Runs the coverage suite, then appends nyc's text report to the job summary
# wrapped in a fenced code block.
- name: Check coverage thresholds
run: |
npm run test:coverage
echo "### Coverage Threshold Check" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
npx nyc report --reporter=text >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY

# Fails the job when line, function, branch, or statement coverage is under 80%.
- name: Enforce coverage thresholds
run: |
npx nyc check-coverage --lines 80 --functions 80 --branches 80 --statements 80
58 changes: 56 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,59 @@
.DS_STORE
# Node.js dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
package-lock.json
yarn.lock
pnpm-lock.yaml

# Environment variables
.env
.env.*.local

# Logs
logs/
*.log
log.txt

node_modules/
# Runtime data
pids/
*.pid
*.seed
*.pid.lock

# Coverage directories
coverage/
.nyc_output/

# Optional npm cache directory
.npm/

# IDEs and editors
.idea/
.vscode/
*.sublime-workspace
*.sublime-project

# OS-specific
.DS_Store
Thumbs.db

# Build directories
dist/
build/
tmp/
temp/

# TypeScript
*.tsbuildinfo

# Optional ESLint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Optional REPL history
.node_repl_history
56 changes: 38 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
- **Page-by-page processing** - Process PDFs page by page
- **Metadata extraction** - Extract author, title, creation date, and more
- **Thumbnail generation** - Generate preview images from PDF pages
- **Buffer support** - Process PDFs from memory buffers or file paths
- **TypeScript support** - Full type definitions included
- **Async/Promise based** - Modern async API
- **Configurable** - Extensive options for customization
Expand Down Expand Up @@ -54,44 +55,59 @@ The installation process will automatically download the required Apache Tika an

```javascript
const pdf2html = require('pdf2html');
const fs = require('fs');

// Simple conversion
// From file path
const html = await pdf2html.html('path/to/document.pdf');
console.log(html);

// From buffer
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
const html = await pdf2html.html(pdfBuffer);
console.log(html);

// With options
const html = await pdf2html.html('path/to/document.pdf', {
const html = await pdf2html.html(pdfBuffer, {
maxBuffer: 1024 * 1024 * 10, // 10MB buffer
});
```

### Extract Text

```javascript
// Extract all text from PDF
// From file path
const text = await pdf2html.text('path/to/document.pdf');

// From buffer
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
const text = await pdf2html.text(pdfBuffer);
console.log(text);
```

### Process Pages Individually

```javascript
// Get HTML for each page
// From file path
const htmlPages = await pdf2html.pages('path/to/document.pdf');

// From buffer
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
const htmlPages = await pdf2html.pages(pdfBuffer);
htmlPages.forEach((page, index) => {
console.log(`Page ${index + 1}:`, page);
});

// Get text for each page
const textPages = await pdf2html.pages('path/to/document.pdf', {
const textPages = await pdf2html.pages(pdfBuffer, {
text: true,
});
```

### Extract Metadata

```javascript
const metadata = await pdf2html.meta('path/to/document.pdf');
// From file path or buffer
const metadata = await pdf2html.meta(pdfBuffer);
console.log(metadata);
// Output: {
// title: 'Document Title',
Expand All @@ -109,12 +125,16 @@ console.log(metadata);
### Generate Thumbnails

```javascript
// Generate thumbnail with default settings
// From file path
const thumbnailPath = await pdf2html.thumbnail('path/to/document.pdf');

// From buffer
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
const thumbnailPath = await pdf2html.thumbnail(pdfBuffer);
console.log('Thumbnail saved to:', thumbnailPath);

// Custom thumbnail options
const thumbnailPath = await pdf2html.thumbnail('path/to/document.pdf', {
const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, {
page: 1, // Page number (default: 1)
imageType: 'png', // 'png' or 'jpg' (default: 'png')
width: 300, // Width in pixels (default: 160)
Expand Down Expand Up @@ -162,48 +182,48 @@ try {

## 🏗️ API Reference

### `pdf2html.html(filepath, [options])`
### `pdf2html.html(input, [options])`

Converts PDF to HTML format.

- **filepath** `string` - Path to the PDF file
- **input** `string | Buffer` - Path to the PDF file or PDF buffer
- **options** `object` (optional)
- `maxBuffer` `number` - Maximum buffer size in bytes (default: 2MB)
- **Returns:** `Promise<string>` - HTML content

### `pdf2html.text(filepath, [options])`
### `pdf2html.text(input, [options])`

Extracts text from PDF.

- **filepath** `string` - Path to the PDF file
- **input** `string | Buffer` - Path to the PDF file or PDF buffer
- **options** `object` (optional)
- `maxBuffer` `number` - Maximum buffer size in bytes
- **Returns:** `Promise<string>` - Extracted text

### `pdf2html.pages(filepath, [options])`
### `pdf2html.pages(input, [options])`

Processes PDF page by page.

- **filepath** `string` - Path to the PDF file
- **input** `string | Buffer` - Path to the PDF file or PDF buffer
- **options** `object` (optional)
- `text` `boolean` - Extract text instead of HTML (default: false)
- `maxBuffer` `number` - Maximum buffer size in bytes
- **Returns:** `Promise<string[]>` - Array of HTML or text strings

### `pdf2html.meta(filepath, [options])`
### `pdf2html.meta(input, [options])`

Extracts PDF metadata.

- **filepath** `string` - Path to the PDF file
- **input** `string | Buffer` - Path to the PDF file or PDF buffer
- **options** `object` (optional)
- `maxBuffer` `number` - Maximum buffer size in bytes
- **Returns:** `Promise<object>` - Metadata object

### `pdf2html.thumbnail(filepath, [options])`
### `pdf2html.thumbnail(input, [options])`

Generates a thumbnail image from PDF.

- **filepath** `string` - Path to the PDF file
- **input** `string | Buffer` - Path to the PDF file or PDF buffer
- **options** `object` (optional)
- `page` `number` - Page to thumbnail (default: 1)
- `imageType` `string` - 'png' or 'jpg' (default: 'png')
Expand Down
64 changes: 64 additions & 0 deletions lib/FileManager.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@ const debug = require('debug')('pdf2html');
const fse = require('fs-extra');
const path = require('path');
const URI = require('urijs');
const crypto = require('crypto');
const constants = require('../constants');

/**
* File management utilities
*/
class FileManager {
static async withTempFile(sourceFile, tempDir, operation) {
// If a source file is already in the temp directory, don't copy it
if (sourceFile.includes(tempDir)) {
const uri = new URI(sourceFile);
return operation(sourceFile, uri);
}

const uri = new URI(sourceFile);
const tempFilePath = path.join(tempDir, uri.filename());

Expand All @@ -25,6 +32,63 @@ class FileManager {
const dirs = Object.values(constants.DIRECTORY);
await Promise.all(dirs.map((dir) => fse.ensureDir(dir)));
}

/**
* Creates a temporary file from a buffer
* @param {Buffer} buffer - The buffer to write
* @param {string} extension - File extension (e.g., '.pdf')
* @returns {Promise<string>} - Path to the temporary file
*/
static async createTempFileFromBuffer(buffer, extension = '.pdf') {
await this.ensureDirectories();

// Generate unique filename using hash of buffer content
const timestamp = Date.now();
const randomBytes = crypto.randomBytes(8).toString('hex');
const tempFileName = `temp_${timestamp}_${randomBytes}${extension}`;
const tempFilePath = path.join(constants.DIRECTORY.PDF, tempFileName);

await fse.writeFile(tempFilePath, buffer);
return tempFilePath;
}

/**
* Processes input that can be either a file path or buffer
* @param {string|Buffer} input - File path or buffer
* @param {Function} processor - Function to process the file path
* @returns {Promise<*>} - Result from processor
*/
static async processInput(input, processor) {
// Validate input
if (input === null || input === undefined) {
throw new Error('Input cannot be null or undefined');
}

if (typeof input === 'string') {
if (input.trim() === '') {
throw new Error('File path cannot be empty');
}
} else if (!Buffer.isBuffer(input)) {
throw new Error('Input must be a file path (string) or Buffer');
}

const isBuffer = Buffer.isBuffer(input);
let filePath = input;
let tempFilePath = null;

try {
if (isBuffer) {
tempFilePath = await this.createTempFileFromBuffer(input, '.pdf');
filePath = tempFilePath;
}

return await processor(filePath, isBuffer, tempFilePath);
} finally {
if (tempFilePath) {
await fse.remove(tempFilePath).catch((err) => debug(`Failed to remove temp file ${tempFilePath}: ${err.message}`));
}
}
}
}

module.exports = FileManager;
Loading