Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
[![License](https://img.shields.io/npm/l/pdf2html.svg)](https://www.npmjs.org/package/pdf2html)
[![Node.js Version](https://img.shields.io/node/v/pdf2html.svg)](https://nodejs.org)

> Convert PDF files to HTML, extract text, generate thumbnails, and extract metadata using Apache Tika and PDFBox
> Convert PDF files to HTML, extract text, generate thumbnails, extract images, and extract metadata using Apache Tika and PDFBox

## 🚀 Features

Expand All @@ -15,6 +15,7 @@
- **Page-by-page processing** - Process PDFs page by page
- **Metadata extraction** - Extract author, title, creation date, and more
- **Thumbnail generation** - Generate preview images from PDF pages
- **Image extraction** - Extract all embedded images from PDFs
- **Buffer support** - Process PDFs from memory buffers or file paths
- **TypeScript support** - Full type definitions included
- **Async/Promise based** - Modern async API
Expand Down Expand Up @@ -142,6 +143,30 @@ const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, {
});
```

### Extract Images

```javascript
// From file path
const imagePaths = await pdf2html.extractImages('path/to/document.pdf');
console.log('Extracted images:', imagePaths);
// Output: ['/absolute/path/to/files/image/document1.jpg', '/absolute/path/to/files/image/document2.png', ...]

// From buffer
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
const imagePaths = await pdf2html.extractImages(pdfBuffer);

// With custom output directory
const imagePaths = await pdf2html.extractImages(pdfBuffer, {
outputDirectory: './extracted-images', // Custom output directory
});

// With custom buffer size for large PDFs
const imagePaths = await pdf2html.extractImages('large-document.pdf', {
outputDirectory: './output',
maxBuffer: 1024 * 1024 * 10, // 10MB buffer
});
```

## 💻 TypeScript Support

This package includes TypeScript type definitions out of the box. No need to install `@types/pdf2html`.
Expand All @@ -151,7 +176,7 @@ This package includes TypeScript type definitions out of the box. No need to ins
```typescript
import * as pdf2html from 'pdf2html';
// or
import { html, text, pages, meta, thumbnail, PDFMetadata, PDFProcessingError } from 'pdf2html';
import { html, text, pages, meta, thumbnail, extractImages, PDFMetadata, PDFProcessingError } from 'pdf2html';

async function convertPDF() {
try {
Expand Down
9 changes: 9 additions & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,15 @@ declare module 'pdf2html' {
*/
export function thumbnail(input: PDFInput, options?: ThumbnailOptions): Promise<string>;

/**
* Extract embedded images from a PDF and write them to disk
* @param input - Path to PDF file or PDF buffer
* @param options - Image extraction options
* @returns Promise resolving to an array of paths to the extracted image files
* @throws Error if the file is not found or processing fails
*/
export function extractImages(input: PDFInput, options?: ProcessingOptions): Promise<string[]>;

/**
* PDF processing error class
*/
Expand Down
1 change: 1 addition & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module.exports = {
text: PDFProcessor.toText.bind(PDFProcessor),
meta: PDFProcessor.extractMetadata.bind(PDFProcessor),
thumbnail: PDFProcessor.generateThumbnail.bind(PDFProcessor),
extractImages: PDFProcessor.extractImages.bind(PDFProcessor),

// Export classes for advanced usage
PDFProcessor,
Expand Down
27 changes: 27 additions & 0 deletions lib/PDFBoxWrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,33 @@ class PDFBoxWrapper {
await fse.remove(sourcePath).catch((err) => debug(`Failed to remove PDFBox image: ${err.message}`));
}
}

static async extractAllImages(filepath, options = {}) {
const outputDirectory = options.outputDirectory || constants.DIRECTORY.IMAGE;
await fse.ensureDir(outputDirectory);

const pdfFileName = path.basename(filepath, path.extname(filepath));
const prefix = path.join(outputDirectory, pdfFileName);

const args = [
'-jar',
path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_PDF_BOX_JAR),
'ExtractImages',
'-prefix',
prefix,
filepath,
];

await CommandExecutor.execute('java', args, {
maxBuffer: options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer,
});

const extractedImages = await fse.readdir(outputDirectory);

return extractedImages
.filter((file) => file.startsWith(pdfFileName) && (file.endsWith('.jpg') || file.endsWith('.png') || file.endsWith('.gif') || file.endsWith('.bmp') || file.endsWith('.jpeg')))
.map((file) => path.join(outputDirectory, file));
}
}

module.exports = PDFBoxWrapper;
14 changes: 14 additions & 0 deletions lib/PDFProcessor.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,20 @@ class PDFProcessor {
});
}

/**
* Extract images from PDF
* @param {string|Buffer} input - Path to PDF file or PDF buffer
* @param {Object} options - Processing options, including output directory
* @returns {Promise<Array<string>>} Array of paths to extracted images
*/
static async extractImages(input, options = {}) {
return FileManager.processInput(input, async (filePath) => {
await this.validateFile(filePath);
await FileManager.ensureDirectories();
return PDFBoxWrapper.extractAllImages(filePath, options);
});
}

/**
* Validate file existence
* @private
Expand Down
78 changes: 78 additions & 0 deletions test/image_extraction.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
const path = require('path');
const fs = require('fs');
const chai = require('chai');
const fse = require('fs-extra');

const { expect } = chai;
const should = chai.should();

const pdf2html = require('../index');

const pdfImageFilepath = path.join(__dirname, './sample-images.pdf');
const pdfImageBuffer = fs.readFileSync(pdfImageFilepath);

// Number of images the sample-images.pdf fixture is expected to yield.
const EXPECTED_IMAGE_COUNT = 3;

/**
 * Assert that an extraction result is an array holding the expected number of paths.
 * @param {*} extractedImagePaths - Value resolved by pdf2html.extractImages
 */
function assertExtractedImages(extractedImagePaths) {
    should.exist(extractedImagePaths);
    expect(extractedImagePaths).to.be.an('array');
    expect(extractedImagePaths).to.have.lengthOf(EXPECTED_IMAGE_COUNT);
}

/**
 * Run an async function and hand back the error it rejects with, or null if it resolves.
 *
 * Calling expect.fail() inside a try block does not work: the AssertionError it throws
 * lands in the test's own catch block and can be mistaken for the error under test.
 * Capturing the error first keeps the "should have thrown" assertion outside any
 * try/catch.
 * @param {Function} action - Async function expected to reject
 * @returns {Promise<*>} The rejection value, or null when nothing was thrown
 */
async function captureError(action) {
    try {
        await action();
        return null;
    } catch (error) {
        return error;
    }
}

describe('PDF to Images with images', () => {
    const outputDir = path.join(__dirname, '../files/temp_extracted_images');

    beforeEach(async () => {
        await fse.remove(outputDir);
        await fse.ensureDir(outputDir);
    });

    afterEach(async () => {
        await fse.remove(outputDir);
    });

    describe('File path input', () => {
        it('should extract images to the specified directory', async () => {
            const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath, { outputDirectory: outputDir });
            assertExtractedImages(extractedImagePaths);
        });
    });

    describe('Buffer input', () => {
        it('should extract images from buffer to the specified directory', async () => {
            const extractedImagePaths = await pdf2html.extractImages(pdfImageBuffer, { outputDirectory: outputDir });
            assertExtractedImages(extractedImagePaths);
        });
    });

    describe('Default options', () => {
        it('should extract images with default options when options not provided', async () => {
            const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath);
            try {
                assertExtractedImages(extractedImagePaths);
                // Check that images are saved to default directory. Built from path.sep so
                // the check also holds on platforms that do not use '/' as the separator.
                const defaultDirectorySegment = `${path.sep}${path.join('files', 'image')}${path.sep}`;
                extractedImagePaths.forEach((imagePath) => {
                    expect(imagePath).to.include(defaultDirectorySegment);
                });
            } finally {
                // The default directory is not covered by the afterEach hook, so remove
                // the files this test produced to avoid leaving artifacts behind.
                const producedFiles = Array.isArray(extractedImagePaths) ? extractedImagePaths : [];
                await Promise.all(producedFiles.map((imagePath) => fse.remove(imagePath)));
            }
        });
    });

    describe('Error handling', () => {
        it('should handle non-existent PDF file', async () => {
            const error = await captureError(() => pdf2html.extractImages('/path/to/non-existent.pdf'));
            should.exist(error, 'Should have thrown an error');
            expect(error.message).to.include('not found');
        });

        it('should handle invalid PDF buffer', async () => {
            const invalidBuffer = Buffer.from('This is not a PDF');
            const error = await captureError(() => pdf2html.extractImages(invalidBuffer, { outputDirectory: outputDir }));
            should.exist(error, 'Should have thrown an error');
        });
    });
});
Binary file added test/sample-images.pdf
Binary file not shown.
File renamed without changes.
Loading