diff --git a/README.md b/README.md index 32a6d4a..7bc4cbf 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![License](https://img.shields.io/npm/l/pdf2html.svg)](https://www.npmjs.org/package/pdf2html) [![Node.js Version](https://img.shields.io/node/v/pdf2html.svg)](https://nodejs.org) -> Convert PDF files to HTML, extract text, generate thumbnails, and extract metadata using Apache Tika and PDFBox +> Convert PDF files to HTML, extract text, generate thumbnails, extract images, and extract metadata using Apache Tika and PDFBox ## 🚀 Features @@ -15,6 +15,7 @@ - **Page-by-page processing** - Process PDFs page by page - **Metadata extraction** - Extract author, title, creation date, and more - **Thumbnail generation** - Generate preview images from PDF pages +- **Image extraction** - Extract all embedded images from PDFs - **Buffer support** - Process PDFs from memory buffers or file paths - **TypeScript support** - Full type definitions included - **Async/Promise based** - Modern async API @@ -142,6 +143,30 @@ const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, { }); ``` +### Extract Images + +```javascript +// From file path +const imagePaths = await pdf2html.extractImages('path/to/document.pdf'); +console.log('Extracted images:', imagePaths); +// Output: ['/absolute/path/to/files/image/document1.jpg', '/absolute/path/to/files/image/document2.png', ...] + +// From buffer +const pdfBuffer = fs.readFileSync('path/to/document.pdf'); +const imagePaths = await pdf2html.extractImages(pdfBuffer); + +// With custom output directory +const imagePaths = await pdf2html.extractImages(pdfBuffer, { + outputDirectory: './extracted-images', // Custom output directory +}); + +// With custom buffer size for large PDFs +const imagePaths = await pdf2html.extractImages('large-document.pdf', { + outputDirectory: './output', + maxBuffer: 1024 * 1024 * 10, // 10MB buffer +}); +``` + ## 💻 TypeScript Support This package includes TypeScript type definitions out of the box. 
No need to install `@types/pdf2html`. @@ -151,7 +176,7 @@ This package includes TypeScript type definitions out of the box. No need to ins ```typescript import * as pdf2html from 'pdf2html'; // or -import { html, text, pages, meta, thumbnail, PDFMetadata, PDFProcessingError } from 'pdf2html'; +import { html, text, pages, meta, thumbnail, extractImages, PDFMetadata, PDFProcessingError } from 'pdf2html'; async function convertPDF() { try { diff --git a/index.d.ts b/index.d.ts index 81d1207..1a6cef9 100644 --- a/index.d.ts +++ b/index.d.ts @@ -129,6 +129,15 @@ declare module 'pdf2html' { */ export function thumbnail(input: PDFInput, options?: ThumbnailOptions): Promise<string>; + /** + * Extract images from PDF + * @param input - Path to PDF file or PDF buffer + * @param options - Image extraction options + * @returns Promise resolving to an array of paths to extracted images + * @throws Error if the file is not found or processing fails + */ + export function extractImages(input: PDFInput, options?: ProcessingOptions): Promise<string[]>; + /** * PDF processing error class */ diff --git a/index.js b/index.js index acd4dd1..ad7024d 100644 --- a/index.js +++ b/index.js @@ -14,6 +14,7 @@ module.exports = { text: PDFProcessor.toText.bind(PDFProcessor), meta: PDFProcessor.extractMetadata.bind(PDFProcessor), thumbnail: PDFProcessor.generateThumbnail.bind(PDFProcessor), + extractImages: PDFProcessor.extractImages.bind(PDFProcessor), // Export classes for advanced usage PDFProcessor, diff --git a/lib/PDFBoxWrapper.js b/lib/PDFBoxWrapper.js index e0f16be..27907fa 100644 --- a/lib/PDFBoxWrapper.js +++ b/lib/PDFBoxWrapper.js @@ -56,6 +56,33 @@ class PDFBoxWrapper { await fse.remove(sourcePath).catch((err) => debug(`Failed to remove PDFBox image: ${err.message}`)); } } + + static async extractAllImages(filepath, options = {}) { + const outputDirectory = options.outputDirectory || constants.DIRECTORY.IMAGE; + await fse.ensureDir(outputDirectory); + + const pdfFileName = path.basename(filepath, 
path.extname(filepath)); + const prefix = path.join(outputDirectory, pdfFileName); + + const args = [ + '-jar', + path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_PDF_BOX_JAR), + 'ExtractImages', + '-prefix', + prefix, + filepath, + ]; + + await CommandExecutor.execute('java', args, { + maxBuffer: options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer, + }); + + const extractedImages = await fse.readdir(outputDirectory); + + return extractedImages + .filter((file) => file.startsWith(pdfFileName) && (file.endsWith('.jpg') || file.endsWith('.png') || file.endsWith('.gif') || file.endsWith('.bmp') || file.endsWith('.jpeg'))) + .map((file) => path.join(outputDirectory, file)); + } } module.exports = PDFBoxWrapper; diff --git a/lib/PDFProcessor.js b/lib/PDFProcessor.js index d507e95..feb283d 100644 --- a/lib/PDFProcessor.js +++ b/lib/PDFProcessor.js @@ -102,6 +102,20 @@ class PDFProcessor { }); } + /** + * Extract images from PDF + * @param {string|Buffer} input - Path to PDF file or PDF buffer + * @param {Object} options - Processing options, including output directory + * @returns {Promise<Array<string>>} Array of paths to extracted images + */ + static async extractImages(input, options = {}) { + return FileManager.processInput(input, async (filePath) => { + await this.validateFile(filePath); + await FileManager.ensureDirectories(); + return PDFBoxWrapper.extractAllImages(filePath, options); + }); + } + /** * Validate file existence * @private diff --git a/test/image_extraction.test.js b/test/image_extraction.test.js new file mode 100644 index 0000000..b76d45a --- /dev/null +++ b/test/image_extraction.test.js @@ -0,0 +1,78 @@ +const path = require('path'); +const fs = require('fs'); +const chai = require('chai'); +const fse = require('fs-extra'); + +const { expect } = chai; +const should = chai.should(); + +const pdf2html = require('../index'); + +const pdfImageFilepath = path.join(__dirname, './sample-images.pdf'); +const pdfImageBuffer = fs.readFileSync(pdfImageFilepath); 
+ +describe('PDF to Images with images', () => { + const outputDir = path.join(__dirname, '../files/temp_extracted_images'); + + beforeEach(async () => { + await fse.remove(outputDir); + await fse.ensureDir(outputDir); + }); + + afterEach(async () => { + await fse.remove(outputDir); + }); + + describe('File path input', () => { + it('should extract images to the specified directory', async () => { + const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath, { outputDirectory: outputDir }); + should.exist(extractedImagePaths); + expect(extractedImagePaths).to.be.an('array'); + expect(extractedImagePaths).to.have.lengthOf(3); + }); + }); + + describe('Buffer input', () => { + it('should extract images from buffer to the specified directory', async () => { + const extractedImagePaths = await pdf2html.extractImages(pdfImageBuffer, { outputDirectory: outputDir }); + should.exist(extractedImagePaths); + expect(extractedImagePaths).to.be.an('array'); + expect(extractedImagePaths).to.have.lengthOf(3); + }); + }); + + describe('Default options', () => { + it('should extract images with default options when options not provided', async () => { + const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath); + should.exist(extractedImagePaths); + expect(extractedImagePaths).to.be.an('array'); + expect(extractedImagePaths).to.have.lengthOf(3); + // Check that images are saved to default directory + extractedImagePaths.forEach(imagePath => { + expect(imagePath).to.include('/files/image/'); + }); + }); + }); + + describe('Error handling', () => { + it('should handle non-existent PDF file', async () => { + try { + await pdf2html.extractImages('/path/to/non-existent.pdf'); + expect.fail('Should have thrown an error'); + } catch (error) { + should.exist(error); + expect(error.message).to.include('not found'); + } + }); + + it('should handle invalid PDF buffer', async () => { + const invalidBuffer = Buffer.from('This is not a PDF'); + try { + await 
pdf2html.extractImages(invalidBuffer, { outputDirectory: outputDir }); + expect.fail('Should have thrown an error'); + } catch (error) { + should.exist(error); + } + }); + }); +}); diff --git a/test/sample-images.pdf b/test/sample-images.pdf new file mode 100644 index 0000000..733667e Binary files /dev/null and b/test/sample-images.pdf differ diff --git a/sample.pdf b/test/sample.pdf similarity index 100% rename from sample.pdf rename to test/sample.pdf diff --git a/test/test.js b/test/test.js index e537813..c05adc9 100644 --- a/test/test.js +++ b/test/test.js @@ -1,15 +1,16 @@ const path = require('path'); const fs = require('fs'); const chai = require('chai'); +const fse = require('fs-extra'); const { expect } = chai; const should = chai.should(); const pdf2html = require('../index'); -const pdfFilepath = path.join(__dirname, '/../sample.pdf'); +const pdfFilepath = path.join(__dirname, './sample.pdf'); const pdfThumbnailFilepath = path.join(__dirname, '/../files/image/sample.png'); -const pdfInvalidFilepath = path.join(__dirname, '/../sample2.pdf'); +const pdfInvalidFilepath = path.join(__dirname, './sample2.pdf'); // Load PDF buffer for buffer-based tests const pdfBuffer = fs.readFileSync(pdfFilepath); @@ -226,10 +227,22 @@ describe('PDF to Meta', () => { }); describe('PDF to Thumbnail', () => { + const tempDir = path.join(__dirname, '../files/temp_thumbnails'); + + beforeEach(async () => { + await fse.ensureDir(tempDir); + }); + + afterEach(async () => { + await fse.remove(tempDir); + }); + describe('File path input', () => { it('should return thumbnail for the pdf file', async () => { - const thumbnailPath = await pdf2html.thumbnail(pdfFilepath); - expect(thumbnailPath).to.equal(pdfThumbnailFilepath); + const thumbnailPath = await pdf2html.thumbnail(pdfFilepath, { outputDirectory: tempDir }); + expect(thumbnailPath).to.be.a('string'); + expect(thumbnailPath).to.include('.png'); + expect(await fse.pathExists(thumbnailPath)).to.be.true; }); it('should return 
thumbnail with custom options', async () => { @@ -238,22 +251,24 @@ describe('PDF to Thumbnail', () => { imageType: 'png', width: 200, height: 300, + outputDirectory: tempDir, }); expect(thumbnailPath).to.be.a('string'); expect(thumbnailPath).to.include('.png'); + expect(await fse.pathExists(thumbnailPath)).to.be.true; }); it('should return error for the pdf file that does not exist', async () => { - await expectReject(pdf2html.thumbnail(pdfInvalidFilepath)); + await expectReject(pdf2html.thumbnail(pdfInvalidFilepath, { outputDirectory: tempDir })); }); }); describe('Buffer input', () => { it('should return thumbnail for the pdf buffer', async () => { - const thumbnailPath = await pdf2html.thumbnail(pdfBuffer); + const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, { outputDirectory: tempDir }); expect(thumbnailPath).to.be.a('string'); expect(thumbnailPath).to.include('.png'); - // Note: Buffer input creates temp files, so path won't match exactly + expect(await fse.pathExists(thumbnailPath)).to.be.true; }); it('should return thumbnail with custom options for buffer', async () => { @@ -262,13 +277,15 @@ describe('PDF to Thumbnail', () => { imageType: 'jpg', width: 320, height: 480, + outputDirectory: tempDir, }); expect(thumbnailPath).to.be.a('string'); expect(thumbnailPath).to.include('.jpg'); + expect(await fse.pathExists(thumbnailPath)).to.be.true; }); it('should return error for invalid pdf buffer', async () => { - await expectReject(pdf2html.thumbnail(invalidBuffer)); + await expectReject(pdf2html.thumbnail(invalidBuffer, { outputDirectory: tempDir })); }); }); }); @@ -331,3 +348,120 @@ describe('Buffer vs File Path Consistency', () => { expect(metaFromFile).to.deep.equal(metaFromBuffer); }); }); + +// Internal module tests +const CommandExecutor = require('../lib/CommandExecutor'); +const { PDFProcessingError } = require('../lib/errors'); + +describe('CommandExecutor', () => { + describe('execute', () => { + it('should execute a simple command 
successfully', async () => { + const result = await CommandExecutor.execute('echo', ['hello']); + expect(result.trim()).to.equal('hello'); + }); + + it('should handle commands with multiple arguments', async () => { + const result = await CommandExecutor.execute('echo', ['hello', 'world']); + expect(result.trim()).to.equal('hello world'); + }); + + it('should reject when command exits with non-zero code', async () => { + try { + await CommandExecutor.execute('sh', ['-c', 'exit 1']); + expect.fail('Should have thrown an error'); + } catch (error) { + expect(error).to.be.instanceOf(PDFProcessingError); + expect(error.command).to.include('sh -c exit 1'); + expect(error.exitCode).to.equal(1); + } + }); + + it('should include stderr in error message when command fails', async () => { + try { + await CommandExecutor.execute('sh', ['-c', 'echo "error message" >&2; exit 1']); + expect.fail('Should have thrown an error'); + } catch (error) { + expect(error).to.be.instanceOf(PDFProcessingError); + expect(error.message).to.include('error message'); + } + }); + + it('should handle spawn errors for non-existent commands', async () => { + try { + await CommandExecutor.execute('nonexistentcommand123456', ['arg']); + expect.fail('Should have thrown an error'); + } catch (error) { + expect(error).to.be.instanceOf(PDFProcessingError); + expect(error.message).to.include('Failed to spawn process'); + expect(error.command).to.include('nonexistentcommand123456'); + } + }); + + + it('should capture stdout correctly', async () => { + const result = await CommandExecutor.execute('sh', ['-c', 'echo "line1"; echo "line2"']); + expect(result).to.include('line1'); + expect(result).to.include('line2'); + }); + + it('should pass options to spawn', async () => { + const result = await CommandExecutor.execute('pwd', [], { cwd: '/tmp' }); + // On macOS, /tmp is a symlink to /private/tmp + expect(result.trim()).to.match(/\/tmp$|\/private\/tmp$/); + }); + }); +}); + +// Additional internal module 
tests for coverage +const FileManager = require('../lib/FileManager'); +const HTMLParser = require('../lib/HTMLParser'); +const ImageProcessor = require('../lib/ImageProcessor'); + +describe('Internal Modules - Coverage Tests', () => { + describe('FileManager', () => { + it('should use default .pdf extension when not provided', async () => { + const buffer = Buffer.from('test content'); + const filePath = await FileManager.createTempFileFromBuffer(buffer); + + expect(filePath).to.include('.pdf'); + + // Clean up + if (fs.existsSync(filePath)) { + fs.unlinkSync(filePath); + } + }); + }); + + describe('HTMLParser', () => { + it('should use default empty options when not provided', () => { + const html = '<div class="page">Page 1</div><div class="page">Page 2</div>'; + const pages = HTMLParser.extractPages(html); + + expect(pages).to.be.an('array'); + expect(pages).to.have.length(2); + expect(pages[0]).to.include('Page 1'); + expect(pages[1]).to.include('Page 2'); + }); + }); + + describe('ImageProcessor', () => { + it('should throw error when sharp fails to process image', async () => { + // Create a file that's not a valid image + const invalidImagePath = path.join(__dirname, 'test-invalid.txt'); + fs.writeFileSync(invalidImagePath, 'This is not an image'); + + try { + await ImageProcessor.resize(invalidImagePath, 100, 100); + expect.fail('Should have thrown an error'); + } catch (error) { + expect(error).to.exist; + expect(error.message).to.include('Missing output file'); + } finally { + // Clean up + if (fs.existsSync(invalidImagePath)) { + fs.unlinkSync(invalidImagePath); + } + } + }); + }); +});