diff --git a/Cargo.toml b/Cargo.toml index 2fadf19..9b344df 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ edition = "2024" name = "pdfdown" -version = "0.9.6" +version = "0.9.7" [lib] crate-type = ["cdylib"] diff --git a/index.js b/index.js index 3393b08..0080109 100644 --- a/index.js +++ b/index.js @@ -80,8 +80,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-android-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-android-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -96,8 +96,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-android-arm-eabi') const bindingPackageVersion = require('@d0paminedriven/pdfdown-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -116,8 +116,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-x64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -132,8 +132,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-ia32-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -148,8 +148,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-win32-arm64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -167,8 +167,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-universal') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -183,8 +183,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -199,8 +199,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-darwin-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -219,8 +219,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-freebsd-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -235,8 +235,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-freebsd-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -256,8 +256,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-x64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -272,8 +272,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-x64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -290,8 +290,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -306,8 +306,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -324,8 +324,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm-musleabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -340,8 +340,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-arm-gnueabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -358,8 +358,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-loong64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -374,8 +374,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-loong64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -392,8 +392,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-riscv64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -408,8 +408,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-riscv64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -425,8 +425,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-ppc64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -441,8 +441,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-linux-s390x-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -461,8 +461,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -477,8 +477,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -493,8 +493,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-openharmony-arm') const bindingPackageVersion = require('@d0paminedriven/pdfdown-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { diff --git a/ocr/index.js b/ocr/index.js index f858b21..2c7edf2 100644 --- a/ocr/index.js +++ b/ocr/index.js @@ -80,8 +80,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-android-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -96,8 +96,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -116,8 +116,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -132,8 +132,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -148,8 +148,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -167,8 +167,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-universal') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -183,8 +183,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -199,8 +199,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-darwin-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -219,8 +219,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -235,8 +235,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -256,8 +256,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -272,8 +272,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -290,8 +290,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -306,8 +306,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -324,8 +324,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -340,8 +340,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -358,8 +358,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -374,8 +374,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -392,8 +392,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -408,8 +408,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -425,8 +425,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -441,8 +441,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -461,8 +461,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -477,8 +477,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-x64') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -493,8 +493,8 @@ function requireNative() { try { const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm') const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.9.6' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.9.6 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.7' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.7 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { diff --git a/ocr/package.json b/ocr/package.json index 9c46db1..452b14e 100644 --- a/ocr/package.json +++ b/ocr/package.json @@ -1,6 +1,6 @@ { "name": "@d0paminedriven/pdfdown-ocr", - "version": "0.9.6", + "version": "0.9.7", "description": "Rust powered PDF extraction for Node with OCR fallback (requires system tesseract).", "main": "index.js", "repository": { diff --git a/package.json b/package.json index 795a23e..7ab9383 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@d0paminedriven/pdfdown", - "version": "0.9.6", + "version": "0.9.7", "description": "Rust powered PDF extraction for Node", "main": "index.js", "repository": { diff --git a/src/core/document.rs b/src/core/document.rs new file mode 100644 index 0000000..9bf2c56 --- /dev/null +++ b/src/core/document.rs @@ -0,0 +1,187 @@ +use crate::core::images::extract_images_raw; +use crate::core::meta::extract_metadata; +use crate::core::text::{detect_headers_footers, extract_text}; +use crate::types::{PageAnnotation, RawPdfDocument}; +use lopdf::{Document, Object, ObjectId}; +use napi::Result; +use rayon::prelude::*; +use std::collections::HashSet; + +#[cfg(feature = "ocr")] +use crate::core::ocr::{detect_headers_footers_ocr, extract_text_with_ocr}; +#[cfg(feature = "ocr")] +use crate::types::RawPdfDocumentOcr; + +fn collect_page_annotations( + doc: &Document, + page_id: ObjectId, + page_num: u32, +) -> Vec { + let annots = match doc.get_page_annotations(page_id) { + Ok(a) => a, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + for annot in annots { + let subtype = annot + .get(b"Subtype") + .ok() + .and_then(|v| { + if let Object::Name(n) = v { + Some(String::from_utf8_lossy(n).to_string()) + } else { + None + } + }) + .unwrap_or_default(); + + let rect = annot + .get(b"Rect") + .ok() + .and_then(|v| { + if let Object::Array(arr) = v { + Some( + arr + .iter() + .filter_map(|o| match o { + Object::Real(f) => Some(*f as f64), + Object::Integer(i) => Some(*i as f64), + _ => None, + }) + .collect::>(), + ) + } else { + None + } + }) + .unwrap_or_default(); + + // Extract URI from /A action dictionary + let uri = annot.get(b"A").ok().and_then(|action| { + let action_dict = match action { + Object::Dictionary(d) => Some(d), + Object::Reference(id) => doc.get_dictionary(*id).ok(), + _ => None, + }?; + let uri_obj = action_dict.get(b"URI").ok()?; + match uri_obj { + Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), + _ => None, + } + }); + + // Extract /Dest (named or direct destination) + let dest = annot.get(b"Dest").ok().and_then(|d| match d { + Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), + Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()), + _ => None, + }); + + // Extract /Contents (tooltip / alt text) + let content = annot.get(b"Contents").ok().and_then(|c| match c { + Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), + _ => None, + }); + + results.push(PageAnnotation { + page: page_num, + subtype, + rect, + uri, + dest, + content, + }); + } + + results +} + +pub(crate) fn extract_annotations(doc: &Document) -> Vec { + let pages = doc.get_pages(); + let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); + let mut results: Vec = page_entries + .par_iter() + .flat_map(|&(page_num, page_id)| collect_page_annotations(doc, page_id, page_num)) + .collect(); + results.sort_unstable_by_key(|a| a.page); + results +} + +pub(crate) fn extract_all(doc: &Document) -> Result { + let meta = extract_metadata(doc); + let ((text, images), annotations) = rayon::join( + || rayon::join(|| extract_text(doc), || extract_images_raw(doc)), + || extract_annotations(doc), + ); + let text = text?; + let structured_text = detect_headers_footers(&text); + + let mut image_pages: Vec = images + .iter() + .map(|i| i.page) + .collect::>() + .into_iter() + .collect(); + image_pages.sort_unstable(); + + let mut annotation_pages: Vec = annotations + .iter() + .map(|a| a.page) + .collect::>() + .into_iter() + .collect(); + annotation_pages.sort_unstable(); + + Ok(RawPdfDocument { + meta, + text, + structured_text, + images, + annotations, + image_pages, + annotation_pages, + }) +} + +#[cfg(feature = "ocr")] +pub(crate) fn extract_all_with_ocr( + doc: &Document, + lang: &str, + min_len: u32, + max_threads: u32, +) -> Result { + let meta = extract_metadata(doc); + let (text, (images, annotations)) = rayon::join( + || extract_text_with_ocr(doc, lang, min_len, max_threads), + || rayon::join(|| extract_images_raw(doc), || extract_annotations(doc)), + ); + let text = text?; + let structured_text = detect_headers_footers_ocr(&text); + + let mut image_pages: Vec = images + .iter() + .map(|i| i.page) + .collect::>() + .into_iter() + .collect(); + image_pages.sort_unstable(); + + let mut annotation_pages: Vec = annotations + .iter() + .map(|a| a.page) + .collect::>() + .into_iter() + .collect(); + annotation_pages.sort_unstable(); + + Ok(RawPdfDocumentOcr { + meta, + text, + structured_text, + images, + annotations, + image_pages, + annotation_pages, + }) +} diff --git a/src/core/images.rs b/src/core/images.rs new file mode 100644 index 0000000..12ff87c --- /dev/null +++ b/src/core/images.rs @@ -0,0 +1,755 @@ +use crate::types::RawPageImage; +use image::{DynamicImage, ImageBuffer, ImageFormat}; +use lopdf::{Document, Object, ObjectId}; +use rayon::prelude::*; +use std::collections::HashSet; +use std::io::Cursor; + +pub(crate) fn extract_images_raw(doc: &Document) -> Vec { + let pages = doc.get_pages(); + let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); + let mut results: Vec = page_entries + .par_iter() + .flat_map(|&(page_num, page_id)| collect_page_images_raw(doc, page_id, page_num)) + .collect(); + results.sort_unstable_by_key(|r| (r.page, r.image_index)); + results +} + +/// Decode all image XObjects on a page to DynamicImages (no PNG encoding). +/// Used by OCR to avoid the PNG encode→decode roundtrip. +#[cfg(feature = "ocr")] +pub(crate) fn collect_page_decoded_images(doc: &Document, page_id: ObjectId) -> Vec { + let mut decoded = Vec::new(); + + let xobjects = match get_page_xobjects(doc, page_id) { + Some(x) => x, + None => return decoded, + }; + + let referenced_names = get_referenced_xobject_names(doc, page_id); + + for (name, obj_ref) in xobjects.iter() { + if !referenced_names.is_empty() && !referenced_names.contains(name) { + continue; + } + + let obj_id = match obj_ref { + Object::Reference(id) => *id, + _ => continue, + }; + + let stream = match doc.get_object(obj_id) { + Ok(Object::Stream(s)) => s, + _ => continue, + }; + + let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { + if let Object::Name(n) = v { + Some(n.as_slice()) + } else { + None + } + }); + if subtype != Some(b"Image") { + continue; + } + + let width = get_dict_int(&stream.dict, b"Width").unwrap_or(0) as u32; + let height = get_dict_int(&stream.dict, b"Height").unwrap_or(0) as u32; + let bpc = get_dict_int(&stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; + if width == 0 || height == 0 { + continue; + } + + let color_space = resolve_color_space(&stream.dict, doc); + let filter = resolve_filter(&stream.dict); + + let content = match filter.as_str() { + "DCTDecode" | "JPXDecode" => stream.content.clone(), + _ => { + let mut s = stream.clone(); + if s.decompress().is_ok() { + s.content + } else { + stream.content.clone() + } + } + }; + + // Skip SMask for OCR — to_rgb8() drops alpha anyway + if let Some(img) = + decode_xobject_to_dynamic_image(&content, width, height, bpc, &color_space, &filter, None) + { + decoded.push(img); + } + } + + decoded +} + +fn collect_page_images_raw(doc: &Document, page_id: ObjectId, page_num: u32) -> Vec { + let mut images = Vec::new(); + + // Get XObjects from page resources (with parent inheritance) + let xobjects = match get_page_xobjects(doc, page_id) { + Some(x) => x, + None => return images, + }; + + // Get the set of XObject names actually referenced by Do operators in the content stream + let referenced_names = get_referenced_xobject_names(doc, page_id); + + let mut img_index = 0u32; + + for (name, obj_ref) in xobjects.iter() { + // Only process XObjects actually painted on the page via Do operators + if !referenced_names.is_empty() && !referenced_names.contains(name) { + continue; + } + + let obj_id = match obj_ref { + Object::Reference(id) => *id, + _ => continue, + }; + + let stream = match doc.get_object(obj_id) { + Ok(Object::Stream(s)) => s, + _ => continue, + }; + + // Only process Image XObjects + let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { + if let Object::Name(n) = v { + Some(n.as_slice()) + } else { + None + } + }); + + if subtype != Some(b"Image") { + continue; + } + + let width = get_dict_int(&stream.dict, b"Width").unwrap_or(0) as u32; + let height = get_dict_int(&stream.dict, b"Height").unwrap_or(0) as u32; + let bpc = get_dict_int(&stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; + + if width == 0 || height == 0 { + continue; + } + + let color_space = resolve_color_space(&stream.dict, doc); + let filter = resolve_filter(&stream.dict); + + let channels: u32 = match color_space.as_str() { + "DeviceRGB" | "ICCBased3" | "CalRGB" => 3, + "DeviceGray" | "ICCBased1" | "CalGray" => 1, + "DeviceCMYK" | "ICCBased4" => 4, + _ => 3, + }; + + // Step 4: Skip the full stream clone for DCT/JPX — they're already in their + // target encoded format and don't need lopdf decompression. + let content = match filter.as_str() { + "DCTDecode" | "JPXDecode" => stream.content.clone(), + _ => decompress_stream_content(doc, stream, width, height, channels, bpc), + }; + + // Check for SMask (alpha channel) + let smask_data = get_smask_data(doc, &stream.dict); + + let png_data = match encode_to_png( + &content, + width, + height, + bpc, + &color_space, + &filter, + smask_data.as_deref(), + ) { + Some(data) => data, + None => continue, + }; + + let xobject_name = String::from_utf8_lossy(name).to_string(); + let object_id_str = format!("{} {} obj", obj_id.0, obj_id.1); + + images.push(RawPageImage { + page: page_num, + image_index: img_index, + width, + height, + data: png_data, + color_space, + bits_per_component: bpc, + filter, + xobject_name, + object_id: object_id_str, + }); + + img_index += 1; + } + + images +} + +/// Walk the page tree to find /Resources (handles inheritance from /Parent) +fn get_page_xobjects(doc: &Document, page_id: ObjectId) -> Option { + let resources = get_inherited_resources(doc, page_id)?; + let xobject_obj = resources.get(b"XObject").ok()?; + resolve_to_dict(doc, xobject_obj) +} + +fn get_inherited_resources(doc: &Document, page_id: ObjectId) -> Option { + let mut current_id = Some(page_id); + while let Some(id) = current_id { + let dict = doc.get_dictionary(id).ok()?; + if let Ok(resources_obj) = dict.get(b"Resources") { + return resolve_to_dict(doc, resources_obj); + } + // Walk up to /Parent + current_id = dict.get(b"Parent").ok().and_then(|p| match p { + Object::Reference(ref_id) => Some(*ref_id), + _ => None, + }); + } + None +} + +/// Parse the page content stream to find XObject names referenced by `Do` operators. +/// This filters out XObjects that are defined in Resources but never actually painted. +fn get_referenced_xobject_names(doc: &Document, page_id: ObjectId) -> HashSet> { + let mut names = HashSet::new(); + + let page_dict = match doc.get_dictionary(page_id) { + Ok(d) => d, + Err(_) => return names, + }; + + let contents = match page_dict.get(b"Contents") { + Ok(c) => c, + Err(_) => return names, + }; + + let stream_ids: Vec = match contents { + Object::Reference(id) => vec![*id], + Object::Array(arr) => arr + .iter() + .filter_map(|o| { + if let Object::Reference(id) = o { + Some(*id) + } else { + None + } + }) + .collect(), + _ => return names, + }; + + let mut all_bytes = Vec::new(); + for stream_id in stream_ids { + if let Ok(Object::Stream(s)) = doc.get_object(stream_id) { + let mut s = s.clone(); + let _ = s.decompress(); + all_bytes.extend_from_slice(&s.content); + } + } + + if let Ok(content) = lopdf::content::Content::decode(&all_bytes) { + for op in &content.operations { + if op.operator == "Do" + && let Some(Object::Name(name)) = op.operands.first() + { + names.insert(name.clone()); + } + } + } + + names +} + +/// Resolve /DecodeParms from a stream dictionary, following indirect references. +fn resolve_decode_parms(doc: &Document, dict: &lopdf::Dictionary) -> Option { + let dp = dict.get(b"DecodeParms").ok()?; + match dp { + Object::Dictionary(d) => Some(d.clone()), + Object::Reference(id) => match doc.get_object(*id) { + Ok(Object::Dictionary(d)) => Some(d.clone()), + _ => None, + }, + Object::Array(arr) => { + // Filter chain: DecodeParms is an array parallel to Filter array. + // Use the first dictionary entry found. + for item in arr { + match item { + Object::Dictionary(d) => return Some(d.clone()), + Object::Reference(id) => { + if let Ok(Object::Dictionary(d)) = doc.get_object(*id) { + return Some(d.clone()); + } + } + _ => {} + } + } + None + } + _ => None, + } +} + +/// Apply PNG predictor unfiltering to raw decompressed data. +/// Each row has a 1-byte filter type prefix followed by `row_bytes` of filtered data. +/// `bytes_per_pixel` is the number of bytes per pixel (channels * ceil(bpc/8)). +fn apply_png_predictor(data: &[u8], bytes_per_pixel: usize, row_bytes: usize) -> Option> { + let src_row_len = row_bytes + 1; // +1 for filter type byte + if !data.len().is_multiple_of(src_row_len) { + return None; + } + let num_rows = data.len() / src_row_len; + let mut output = Vec::with_capacity(num_rows * row_bytes); + let mut prev_row = vec![0u8; row_bytes]; + + for row_idx in 0..num_rows { + let row_start = row_idx * src_row_len; + let filter_byte = data[row_start]; + let mut current_row = data[row_start + 1..row_start + src_row_len].to_vec(); + + match filter_byte { + 0 => { /* None */ } + 1 => { + // Sub + for i in bytes_per_pixel..row_bytes { + current_row[i] = current_row[i].wrapping_add(current_row[i - bytes_per_pixel]); + } + } + 2 => { + // Up + for i in 0..row_bytes { + current_row[i] = current_row[i].wrapping_add(prev_row[i]); + } + } + 3 => { + // Average + for i in 0..bytes_per_pixel { + current_row[i] = current_row[i].wrapping_add(prev_row[i] / 2); + } + for i in bytes_per_pixel..row_bytes { + current_row[i] = current_row[i].wrapping_add( + ((current_row[i - bytes_per_pixel] as u16 + prev_row[i] as u16) / 2) as u8, + ); + } + } + 4 => { + // Paeth + for i in 0..bytes_per_pixel { + current_row[i] = current_row[i].wrapping_add(paeth_predictor(0, prev_row[i], 0)); + } + for i in bytes_per_pixel..row_bytes { + current_row[i] = current_row[i].wrapping_add(paeth_predictor( + current_row[i - bytes_per_pixel], + prev_row[i], + prev_row[i - bytes_per_pixel], + )); + } + } + _ => return None, // Unknown filter type + } + + output.extend_from_slice(¤t_row); + prev_row = current_row; + } + + Some(output) +} + +fn paeth_predictor(a: u8, b: u8, c: u8) -> u8 { + let pa = (b as i16 - c as i16).abs(); + let pb = (a as i16 - c as i16).abs(); + let pc = (a as i16 + b as i16 - 2 * c as i16).abs(); + if pa <= pb && pa <= pc { + a + } else if pb <= pc { + b + } else { + c + } +} + +/// Decompress a stream's content with correct predictor handling. +/// +/// lopdf's built-in `decompress()` attempts PNG predictor unfiltering internally +/// but produces corrupted output for some streams (e.g. xdvipdfmx/pandoc images). +/// We bypass it entirely: raw zlib inflate via `flate2`, then apply our own +/// predictor reversal. +fn decompress_stream_content( + doc: &Document, + stream: &lopdf::Stream, + width: u32, + height: u32, + channels: u32, + bpc: u32, +) -> Vec { + let bytes_per_sample = if bpc > 8 { 2u32 } else { 1u32 }; + let row_bytes = (width * channels * bpc / 8) as usize; + let expected = (width * height * channels * bytes_per_sample) as usize; + let predicted_len = height as usize * (row_bytes + 1); + + // Check if the stream uses FlateDecode + let uses_flate = match stream.dict.get(b"Filter") { + Ok(Object::Name(n)) => n == b"FlateDecode", + Ok(Object::Array(arr)) => arr + .iter() + .any(|o| matches!(o, Object::Name(n) if n == b"FlateDecode")), + _ => false, + }; + + // Step 1: Raw inflate — bypass lopdf's decompress to avoid its buggy predictor handling + let content = if uses_flate { + raw_inflate(&stream.content).unwrap_or_else(|| { + // Fallback: let lopdf try (handles edge cases like chained filters) + let mut s = stream.clone(); + if s.decompress().is_ok() { + s.content + } else { + stream.content.clone() + } + }) + } else { + stream.content.clone() + }; + + // Step 2: Apply predictor reversal if DecodeParms specifies one + if let Some(dp) = resolve_decode_parms(doc, &stream.dict) { + let predictor = get_dict_int(&dp, b"Predictor").unwrap_or(1); + + // TIFF Predictor 2: horizontal differencing (same size as raw pixels) + if predictor == 2 && content.len() == expected { + let bpp = (channels * bpc / 8).max(1) as usize; + let mut data = content; + apply_tiff_predictor2(&mut data, bpp, row_bytes); + return data; + } + + // PNG Predictors 10-15: each row has a leading filter type byte + if (10..=15).contains(&predictor) && content.len() == predicted_len { + let bpp = (channels * bpc / 8).max(1) as usize; + if let Some(unfiltered) = apply_png_predictor(&content, bpp, row_bytes) { + return unfiltered; + } + } + } + + content +} + +/// Raw zlib inflate without any predictor handling. +fn raw_inflate(data: &[u8]) -> Option> { + use std::io::Read; + // Try zlib wrapper first (most common in PDF) + let mut output = Vec::new(); + if flate2::read::ZlibDecoder::new(data) + .read_to_end(&mut output) + .is_ok() + { + return Some(output); + } + // Fallback to raw deflate (no zlib header) + output.clear(); + if flate2::read::DeflateDecoder::new(data) + .read_to_end(&mut output) + .is_ok() + { + return Some(output); + } + None +} + +/// Reverse TIFF Predictor 2 (horizontal differencing) in-place. +/// Each byte after the first `bpp` bytes in each row is a delta from the previous byte. +fn apply_tiff_predictor2(data: &mut [u8], bpp: usize, row_bytes: usize) { + if row_bytes == 0 { + return; + } + let num_rows = data.len() / row_bytes; + for row in 0..num_rows { + let start = row * row_bytes; + for i in (start + bpp)..(start + row_bytes) { + data[i] = data[i].wrapping_add(data[i - bpp]); + } + } +} + +/// Retrieve and decompress the SMask (soft mask / alpha channel) image data if present +fn get_smask_data(doc: &Document, dict: &lopdf::Dictionary) -> Option> { + let smask_ref = dict.get(b"SMask").ok()?; + let smask_id = match smask_ref { + Object::Reference(id) => *id, + _ => return None, + }; + + let smask_stream = match doc.get_object(smask_id) { + Ok(Object::Stream(s)) => s, + _ => return None, + }; + + // Verify it's an Image subtype + let subtype = smask_stream.dict.get(b"Subtype").ok().and_then(|v| { + if let Object::Name(n) = v { + Some(n.as_slice()) + } else { + None + } + }); + if subtype != Some(b"Image") { + return None; + } + + // SMask is always DeviceGray with 1 channel + let smask_width = get_dict_int(&smask_stream.dict, b"Width").unwrap_or(0) as u32; + let smask_height = get_dict_int(&smask_stream.dict, b"Height").unwrap_or(0) as u32; + let smask_bpc = get_dict_int(&smask_stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; + Some(decompress_stream_content( + doc, + smask_stream, + smask_width, + smask_height, + 1, + smask_bpc, + )) +} + +fn resolve_to_dict(doc: &Document, obj: &Object) -> Option { + match obj { + Object::Dictionary(d) => Some(d.clone()), + Object::Reference(id) => match doc.get_object(*id).ok()? { + Object::Dictionary(d) => Some(d.clone()), + _ => None, + }, + _ => None, + } +} + +fn get_dict_int(dict: &lopdf::Dictionary, key: &[u8]) -> Option { + match dict.get(key).ok()? { + Object::Integer(i) => Some(*i), + _ => None, + } +} + +fn resolve_color_space(dict: &lopdf::Dictionary, doc: &Document) -> String { + let cs = match dict.get(b"ColorSpace") { + Ok(obj) => obj, + Err(_) => return "DeviceRGB".to_string(), + }; + + match cs { + Object::Name(name) => String::from_utf8_lossy(name).to_string(), + Object::Reference(id) => match doc.get_object(*id) { + Ok(Object::Name(name)) => String::from_utf8_lossy(name).to_string(), + // ICCBased is typically [/ICCBased ] + Ok(Object::Array(arr)) => parse_color_space_array(arr, doc), + _ => "DeviceRGB".to_string(), + }, + Object::Array(arr) => parse_color_space_array(arr, doc), + _ => "DeviceRGB".to_string(), + } +} + +fn parse_color_space_array(arr: &[Object], doc: &Document) -> String { + if arr.is_empty() { + return "DeviceRGB".to_string(); + } + + let cs_name = match &arr[0] { + Object::Name(n) => String::from_utf8_lossy(n).to_string(), + _ => return "DeviceRGB".to_string(), + }; + + if cs_name == "ICCBased" && arr.len() > 1 { + // Get /N from the ICCBased stream to determine channel count + let stream_id = match &arr[1] { + Object::Reference(id) => *id, + _ => return "ICCBased".to_string(), + }; + + if let Ok(Object::Stream(s)) = doc.get_object(stream_id) { + let n = get_dict_int(&s.dict, b"N").unwrap_or(3); + return format!("ICCBased{n}"); + } + } + + cs_name +} + +fn resolve_filter(dict: &lopdf::Dictionary) -> String { + match dict.get(b"Filter") { + Ok(Object::Name(name)) => String::from_utf8_lossy(name).to_string(), + Ok(Object::Array(arr)) => { + // Filter chain — return the last (innermost) filter for image type detection + if let Some(Object::Name(name)) = arr.last() { + String::from_utf8_lossy(name).to_string() + } else { + "None".to_string() + } + } + _ => "None".to_string(), + } +} + +/// Decode an XObject stream into a DynamicImage (shared by PNG export and OCR). +fn decode_xobject_to_dynamic_image( + content: &[u8], + width: u32, + height: u32, + bpc: u32, + color_space: &str, + filter: &str, + smask: Option<&[u8]>, +) -> Option { + let dynamic_img = if filter == "DCTDecode" { + image::load_from_memory_with_format(content, ImageFormat::Jpeg).ok()? + } else if filter == "JPXDecode" { + decode_jpx(content)? + } else { + decode_raw_pixels(content, width, height, bpc, color_space)? + }; + + Some(if let Some(mask_data) = smask { + apply_smask(dynamic_img, mask_data, width, height) + } else { + dynamic_img + }) +} + +fn encode_to_png( + content: &[u8], + width: u32, + height: u32, + bpc: u32, + color_space: &str, + filter: &str, + smask: Option<&[u8]>, +) -> Option> { + let final_img = + decode_xobject_to_dynamic_image(content, width, height, bpc, color_space, filter, smask)?; + let mut png_buf = Cursor::new(Vec::new()); + final_img.write_to(&mut png_buf, ImageFormat::Png).ok()?; + Some(png_buf.into_inner()) +} + +/// Decode a JPEG 2000 (JPXDecode) stream using hayro-jpeg2000 (pure Rust) +fn decode_jpx(content: &[u8]) -> Option { + let jp2_img = + hayro_jpeg2000::Image::new(content, &hayro_jpeg2000::DecodeSettings::default()).ok()?; + DynamicImage::from_decoder(jp2_img).ok() +} + +/// Decode raw pixel data (FlateDecode / uncompressed) into a DynamicImage +fn decode_raw_pixels( + content: &[u8], + width: u32, + height: u32, + bpc: u32, + color_space: &str, +) -> Option { + let channels: u32 = match color_space { + "DeviceRGB" | "ICCBased3" | "CalRGB" => 3, + "DeviceGray" | "ICCBased1" | "CalGray" => 1, + "DeviceCMYK" | "ICCBased4" => 4, + _ => 3, + }; + let bytes_per_sample = if bpc > 8 { 2u32 } else { 1u32 }; + let expected = (width * height * channels * bytes_per_sample) as usize; + + // Validate buffer size before constructing image + if content.len() < expected { + return None; + } + // Use exactly the expected number of bytes + let pixel_data = &content[..expected]; + + // Downscale 16-bit to 8-bit if needed + let pixel_data_8bit: Vec = if bytes_per_sample == 2 { + pixel_data + .chunks_exact(2) + .map(|pair| (u16::from_be_bytes([pair[0], pair[1]]) >> 8) as u8) + .collect() + } else { + pixel_data.to_vec() + }; + + match color_space { + "DeviceRGB" | "ICCBased3" | "CalRGB" => { + let img: ImageBuffer, Vec> = + ImageBuffer::from_raw(width, height, pixel_data_8bit)?; + Some(DynamicImage::ImageRgb8(img)) + } + "DeviceGray" | "ICCBased1" | "CalGray" => { + let img: ImageBuffer, Vec> = + ImageBuffer::from_raw(width, height, pixel_data_8bit)?; + Some(DynamicImage::ImageLuma8(img)) + } + "DeviceCMYK" | "ICCBased4" => { + let rgb_bytes = cmyk_to_rgb(&pixel_data_8bit); + let img: ImageBuffer, Vec> = + ImageBuffer::from_raw(width, height, rgb_bytes)?; + Some(DynamicImage::ImageRgb8(img)) + } + _ => { + let expected_rgb = (width * height * 3) as usize; + if pixel_data_8bit.len() == expected_rgb { + let img: ImageBuffer, Vec> = + ImageBuffer::from_raw(width, height, pixel_data_8bit)?; + Some(DynamicImage::ImageRgb8(img)) + } else { + None + } + } + } +} + +/// Combine a base RGB image with a grayscale SMask to produce an RGBA PNG +fn apply_smask(base: DynamicImage, mask_data: &[u8], width: u32, height: u32) -> DynamicImage { + let rgb = base.to_rgb8(); + let expected_mask_len = (width * height) as usize; + + if mask_data.len() < expected_mask_len { + return DynamicImage::ImageRgb8(rgb); + } + + let mut rgba_pixels = Vec::with_capacity((width * height * 4) as usize); + for (rgb_pixel, &alpha) in rgb.pixels().zip(mask_data.iter()) { + rgba_pixels.push(rgb_pixel[0]); + rgba_pixels.push(rgb_pixel[1]); + rgba_pixels.push(rgb_pixel[2]); + rgba_pixels.push(alpha); + } + + match ImageBuffer::from_raw(width, height, rgba_pixels) { + Some(img) => DynamicImage::ImageRgba8(img), + None => DynamicImage::ImageRgb8(rgb), + } +} + +fn cmyk_to_rgb(cmyk: &[u8]) -> Vec { + let pixel_count = cmyk.len() / 4; + let mut rgb = Vec::with_capacity(pixel_count * 3); + + for i in 0..pixel_count { + let c = cmyk[i * 4] as f32 / 255.0; + let m = cmyk[i * 4 + 1] as f32 / 255.0; + let y = cmyk[i * 4 + 2] as f32 / 255.0; + let k = cmyk[i * 4 + 3] as f32 / 255.0; + + let r = 255.0 * (1.0 - c) * (1.0 - k); + let g = 255.0 * (1.0 - m) * (1.0 - k); + let b = 255.0 * (1.0 - y) * (1.0 - k); + + rgb.push(r as u8); + rgb.push(g as u8); + rgb.push(b as u8); + } + + rgb +} diff --git a/src/core/meta.rs b/src/core/meta.rs new file mode 100644 index 0000000..7fcd627 --- /dev/null +++ b/src/core/meta.rs @@ -0,0 +1,332 @@ +use crate::types::{BoxType, PageBox, PdfMeta}; +use lopdf::{Document, Object, ObjectId}; +use std::collections::{BTreeMap, HashMap}; + +fn extract_info_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option { + match dict.get(key).ok()? { + Object::String(bytes, _) => { + if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { + // UTF-16 BOM + let utf16: Vec = bytes[2..] + .chunks_exact(2) + .map(|c| u16::from_be_bytes([c[0], c[1]])) + .collect(); + Some(String::from_utf16_lossy(&utf16)) + } else { + Some(String::from_utf8_lossy(bytes).to_string()) + } + } + _ => None, + } +} + +/// Convert a PDF date string to ISO 8601. +/// Input format: `D:YYYYMMDDHHmmSS+HH'mm'` (D: prefix optional, timezone optional) +/// Output format: `YYYY-MM-DDTHH:mm:SS+HH:mm` or `…Z` +fn pdf_date_to_iso8601(raw: &str) -> String { + let s = raw.strip_prefix("D:").unwrap_or(raw); + + // Need at least YYYY (4 chars) + if s.len() < 4 { + return raw.to_string(); + } + + let yyyy = &s[..4]; + let mm = s.get(4..6).unwrap_or("01"); + let dd = s.get(6..8).unwrap_or("01"); + let hh = s.get(8..10).unwrap_or("00"); + let min = s.get(10..12).unwrap_or("00"); + let sec = s.get(12..14).unwrap_or("00"); + + let tz_part = &s[14.min(s.len())..]; + let tz = if tz_part.is_empty() { + String::new() + } else if tz_part.starts_with('Z') { + "Z".to_string() + } else { + // e.g. +05'30' or -06'00' → +05:30 or -06:00 + let cleaned = tz_part.replace('\'', ""); + if cleaned.len() >= 3 { + let sign = &cleaned[..1]; + let tzh = &cleaned[1..3]; + let tzm = if cleaned.len() >= 5 { + &cleaned[3..5] + } else { + "00" + }; + format!("{sign}{tzh}:{tzm}") + } else { + String::new() + } + }; + + format!("{yyyy}-{mm}-{dd}T{hh}:{min}:{sec}{tz}") +} + +fn parse_page_box(obj: &Object) -> Option<[f64; 4]> { + let arr = match obj { + Object::Array(a) => a, + _ => return None, + }; + if arr.len() < 4 { + return None; + } + let mut out = [0.0f64; 4]; + for (idx, slot) in out.iter_mut().enumerate().take(4) { + *slot = match arr[idx] { + Object::Integer(v) => v as f64, + Object::Real(v) => v as f64, + _ => return None, + }; + } + Some(out) +} + +/// Walk the page tree to find an inheritable page box (e.g., /MediaBox, /CropBox). +/// Resolves indirect references — some PDFs store the box array via `Object::Reference`. +fn get_inherited_page_box(doc: &Document, page_id: ObjectId, key: &[u8]) -> Option<[f64; 4]> { + let mut current_id = Some(page_id); + while let Some(id) = current_id { + let dict = doc.get_dictionary(id).ok()?; + if let Ok(obj) = dict.get(key) { + // Resolve indirect reference if the box value is stored as one + let resolved = match obj { + Object::Reference(ref_id) => doc.get_object(*ref_id).ok().cloned(), + other => Some(other.clone()), + }; + if let Some(ref val) = resolved + && let Some(rect) = parse_page_box(val) + { + return Some(rect); + } + } + // Walk up to /Parent + current_id = dict.get(b"Parent").ok().and_then(|p| match p { + Object::Reference(ref_id) => Some(*ref_id), + _ => None, + }); + } + None +} + +/// Key type for grouping page boxes by geometry. +/// Uses `to_bits()` so NaN/negative-zero edge cases hash correctly. +#[derive(Clone, PartialEq, Eq, Hash)] +struct PageBoxKey { + left: u64, + bottom: u64, + right: u64, + top: u64, + box_type: u8, // 0=CropBox, 1=MediaBox, 2=Unknown +} + +/// Intermediate representation before we decide which group is dominant. +struct PageBoxGroup { + left: f64, + bottom: f64, + right: f64, + top: f64, + box_type: BoxType, + page_nums: Vec, +} + +pub(crate) fn extract_page_boxes(doc: &Document, pages: &BTreeMap) -> Vec { + let mut page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); + page_entries.sort_unstable_by_key(|(page, _)| *page); + + // Maintain insertion order via Vec + HashMap index + let mut groups: Vec = Vec::new(); + let mut key_to_idx: HashMap = HashMap::new(); + + for (page_num, page_id) in page_entries { + let (box_type, rect) = if let Some(rect) = get_inherited_page_box(doc, page_id, b"CropBox") { + (BoxType::CropBox, rect) + } else if let Some(rect) = get_inherited_page_box(doc, page_id, b"MediaBox") { + (BoxType::MediaBox, rect) + } else { + (BoxType::Unknown, [0.0, 0.0, 0.0, 0.0]) + }; + + let (left, right) = if rect[0] <= rect[2] { + (rect[0], rect[2]) + } else { + (rect[2], rect[0]) + }; + let (bottom, top) = if rect[1] <= rect[3] { + (rect[1], rect[3]) + } else { + (rect[3], rect[1]) + }; + + let key = PageBoxKey { + left: left.to_bits(), + bottom: bottom.to_bits(), + right: right.to_bits(), + top: top.to_bits(), + box_type: match box_type { + BoxType::CropBox => 0, + BoxType::MediaBox => 1, + BoxType::Unknown => 2, + }, + }; + + if let Some(&idx) = key_to_idx.get(&key) { + groups[idx].page_nums.push(page_num); + } else { + let idx = groups.len(); + key_to_idx.insert(key, idx); + groups.push(PageBoxGroup { + left, + bottom, + right, + top, + box_type, + page_nums: vec![page_num], + }); + } + } + + // Find the dominant group (most pages) + let dominant_idx = groups + .iter() + .enumerate() + .max_by_key(|(_, g)| g.page_nums.len()) + .map(|(i, _)| i) + .unwrap_or(0); + + groups + .into_iter() + .enumerate() + .map(|(i, g)| { + let count = g.page_nums.len() as u32; + let pages = if i == dominant_idx { + None + } else { + Some(g.page_nums) + }; + PageBox { + page_count: count, + left: g.left, + bottom: g.bottom, + right: g.right, + top: g.top, + width: g.right - g.left, + height: g.top - g.bottom, + box_type: g.box_type, + pages, + } + }) + .collect() +} + +pub(crate) fn extract_metadata(doc: &Document) -> PdfMeta { + let pages = doc.get_pages(); + let page_count = pages.len() as u32; + let version = doc.version.clone(); + let is_linearized = doc.trailer.get(b"Linearized").is_ok(); + + let info_dict = doc.trailer.get(b"Info").ok().and_then(|obj| match obj { + Object::Reference(id) => doc.get_dictionary(*id).ok(), + _ => None, + }); + + let (creator, producer, creation_date, modification_date) = match info_dict { + Some(d) => ( + extract_info_string(d, b"Creator"), + extract_info_string(d, b"Producer"), + extract_info_string(d, b"CreationDate").map(|s| pdf_date_to_iso8601(&s)), + extract_info_string(d, b"ModDate").map(|s| pdf_date_to_iso8601(&s)), + ), + None => (None, None, None, None), + }; + + let page_boxes = extract_page_boxes(doc, &pages); + + PdfMeta { + page_count, + version, + is_linearized, + creator, + producer, + creation_date, + modification_date, + page_boxes, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_page_box_valid_integers() { + let obj = Object::Array(vec![ + Object::Integer(0), + Object::Integer(0), + Object::Integer(612), + Object::Integer(792), + ]); + let result = parse_page_box(&obj); + assert_eq!(result, Some([0.0, 0.0, 612.0, 792.0])); + } + + #[test] + fn parse_page_box_valid_reals() { + let obj = Object::Array(vec![ + Object::Real(0.0), + Object::Real(0.0), + Object::Real(595.0), + Object::Real(842.0), + ]); + let result = parse_page_box(&obj); + assert_eq!(result, Some([0.0, 0.0, 595.0, 842.0])); + } + + #[test] + fn parse_page_box_mixed_types() { + let obj = Object::Array(vec![ + Object::Integer(0), + Object::Real(0.5), + Object::Integer(612), + Object::Real(792.0), + ]); + let result = parse_page_box(&obj); + assert_eq!(result, Some([0.0, 0.5, 612.0, 792.0])); + } + + #[test] + fn parse_page_box_too_short() { + let obj = Object::Array(vec![Object::Integer(0), Object::Integer(0)]); + assert_eq!(parse_page_box(&obj), None); + } + + #[test] + fn parse_page_box_non_numeric() { + let obj = Object::Array(vec![ + Object::Integer(0), + Object::Integer(0), + Object::Name(b"bad".to_vec()), + Object::Integer(792), + ]); + assert_eq!(parse_page_box(&obj), None); + } + + #[test] + fn parse_page_box_not_array() { + let obj = Object::Integer(42); + assert_eq!(parse_page_box(&obj), None); + } + + #[test] + fn parse_page_box_extra_elements_ignored() { + let obj = Object::Array(vec![ + Object::Integer(0), + Object::Integer(0), + Object::Integer(612), + Object::Integer(792), + Object::Integer(999), + ]); + // Only first 4 used + assert_eq!(parse_page_box(&obj), Some([0.0, 0.0, 612.0, 792.0])); + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs new file mode 100644 index 0000000..bcfe5f3 --- /dev/null +++ b/src/core/mod.rs @@ -0,0 +1,7 @@ +pub(crate) mod document; +pub(crate) mod images; +pub(crate) mod meta; +pub(crate) mod text; + +#[cfg(feature = "ocr")] +pub(crate) mod ocr; diff --git a/src/core/ocr.rs b/src/core/ocr.rs new file mode 100644 index 0000000..934f105 --- /dev/null +++ b/src/core/ocr.rs @@ -0,0 +1,170 @@ +use crate::core::images::collect_page_decoded_images; +use crate::core::text::{detect_headers_footers, strip_footer_artifacts}; +use crate::types::{OcrPageText, OcrStructuredPageText, PageText, TextSource}; +use image::DynamicImage; +use lopdf::{Document, ObjectId}; +use napi::Result; +use rayon::prelude::*; +use std::sync::Arc; + +pub(crate) fn normalize_max_threads(v: Option) -> u32 { + let default = 4u32; + let max = std::thread::available_parallelism() + .map(|n| n.get() as u32) + .unwrap_or(default); + v.unwrap_or(default).clamp(1, max) +} + +fn get_ocr_pool(threads: usize) -> Arc { + use std::collections::HashMap; + use std::sync::{Mutex, OnceLock}; + + static POOLS: OnceLock>>> = OnceLock::new(); + let map = POOLS.get_or_init(|| Mutex::new(HashMap::new())); + let mut guard = map.lock().unwrap(); + Arc::clone(guard.entry(threads).or_insert_with(|| { + Arc::new( + rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .build() + .expect("failed to build OCR thread pool"), + ) + })) +} + +/// Auto-detect tessdata path, cached for the lifetime of the process. +/// Checks `TESSDATA_PREFIX` env var first (user override), then falls back to +/// parsing the output of `tesseract --list-langs` (e.g. +/// `List of available languages in "/usr/share/tesseract-ocr/5/tessdata/" (161):`). +/// Returns `None` if neither source yields a path, letting tesseract use its +/// compiled-in default. +fn get_tessdata_prefix() -> Option<&'static str> { + use std::sync::OnceLock; + static TESSDATA_PATH: OnceLock> = OnceLock::new(); + + TESSDATA_PATH + .get_or_init(|| { + // User-provided override takes priority + if let Ok(path) = std::env::var("TESSDATA_PREFIX") { + return Some(path); + } + + // Auto-detect from tesseract --list-langs + let output = std::process::Command::new("tesseract") + .arg("--list-langs") + .output() + .ok()?; + + // tesseract writes the path header to stderr + let stderr = String::from_utf8_lossy(&output.stderr); + let text = if stderr.contains('"') { + stderr + } else { + String::from_utf8_lossy(&output.stdout) + }; + + // Parse: `List of available languages in "/path/to/tessdata/" (N):` + let start = text.find('"')?; + let end = text[start + 1..].find('"')?; + Some(text[start + 1..start + 1 + end].to_string()) + }) + .as_deref() +} + +fn ocr_page_images(doc: &Document, page_id: ObjectId, lang: &str) -> String { + let datapath = get_tessdata_prefix().unwrap_or(""); + let images: Vec = collect_page_decoded_images(doc, page_id); + let mut texts = Vec::new(); + + for dyn_img in &images { + let rgb = dyn_img.to_rgb8(); + let (w, h) = rgb.dimensions(); + let pixels = rgb.as_raw(); + + let tess = tesseract_rs::TesseractAPI::new(); + if tess.init(datapath, lang).is_err() { + continue; + } + if tess + .set_image(pixels, w as i32, h as i32, 3, (w * 3) as i32) + .is_err() + { + continue; + } + if let Ok(text) = tess.get_utf8_text() { + let trimmed = text.trim(); + if !trimmed.is_empty() { + texts.push(trimmed.to_string()); + } + } + } + + texts.join("\n") +} + +pub(crate) fn extract_text_with_ocr( + doc: &Document, + lang: &str, + min_len: u32, + max_threads: u32, +) -> Result> { + let pages = doc.get_pages(); + let page_count_str = pages.len().to_string(); + let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); + + let pool = get_ocr_pool(max_threads as usize); + + let mut results: Vec = pool.install(|| { + page_entries + .par_iter() + .map(|&(page_num, page_id)| { + let raw = doc.extract_text(&[page_num]).unwrap_or_default(); + let native = strip_footer_artifacts(&raw, &page_count_str); + let non_ws: usize = native.chars().filter(|c| !c.is_whitespace()).count(); + if non_ws >= min_len as usize { + OcrPageText { + page: page_num, + text: native, + source: TextSource::Native, + } + } else { + let ocr_text = ocr_page_images(doc, page_id, lang); + OcrPageText { + page: page_num, + text: ocr_text, + source: TextSource::Ocr, + } + } + }) + .collect() + }); + results.sort_unstable_by_key(|r| r.page); + Ok(results) +} + +pub(crate) fn detect_headers_footers_ocr(pages: &[OcrPageText]) -> Vec { + // Convert to PageText for header/footer detection + let as_page_text: Vec = pages + .iter() + .map(|p| PageText { + page: p.page, + text: p.text.clone(), + }) + .collect(); + let structured = detect_headers_footers(&as_page_text); + // Zip back with source info + structured + .into_iter() + .zip(pages.iter()) + .map(|(s, ocr)| OcrStructuredPageText { + page: s.page, + header: s.header, + body: s.body, + footer: s.footer, + source: match ocr.source { + TextSource::Native => TextSource::Native, + TextSource::Ocr => TextSource::Ocr, + }, + }) + .collect() +} diff --git a/src/core/text.rs b/src/core/text.rs new file mode 100644 index 0000000..4d98266 --- /dev/null +++ b/src/core/text.rs @@ -0,0 +1,258 @@ +use crate::types::{PageText, StructuredPageText}; +use lopdf::Document; +use napi::Result; +use rayon::prelude::*; + +pub(crate) fn extract_text(doc: &Document) -> Result> { + let pages = doc.get_pages(); + let page_count_str = pages.len().to_string(); + let page_nums: Vec = pages.keys().copied().collect(); + let mut results: Vec = page_nums + .par_iter() + .map(|&page_num| { + let raw = doc.extract_text(&[page_num]).unwrap_or_default(); + let text = strip_footer_artifacts(&raw, &page_count_str); + PageText { + page: page_num, + text, + } + }) + .collect(); + results.sort_unstable_by_key(|p| p.page); + Ok(results) +} + +/// Normalize a line for header/footer comparison: trim whitespace and replace +/// contiguous digit sequences with `` so "Page 1" matches "Page 42". +fn normalize_header_footer_line(line: &str) -> String { + let trimmed = line.trim(); + let mut out = String::with_capacity(trimmed.len()); + let mut in_digits = false; + for ch in trimmed.chars() { + if ch.is_ascii_digit() { + if !in_digits { + out.push_str(""); + in_digits = true; + } + } else { + in_digits = false; + out.push(ch); + } + } + out +} + +/// Strip Chromium footer artifacts from extracted text. +/// +/// Chromium's Skia PDF renderer writes page footers (e.g., `1 / 38`) as 2-3 +/// separate text operations. `lopdf::extract_text` concatenates these in +/// content-stream order, causing orphaned fragments like ` / \n38\n` to appear +/// mid-text on every page. This function removes the known pattern: a line +/// containing just `/` followed by a line containing just the total page count. +pub(crate) fn strip_footer_artifacts(text: &str, page_count_str: &str) -> String { + let lines: Vec<&str> = text.lines().collect(); + if lines.len() < 2 { + return text.to_string(); + } + let mut skip = vec![false; lines.len()]; + for i in 0..lines.len() - 1 { + if lines[i].trim() == "/" && lines[i + 1].trim() == page_count_str { + skip[i] = true; + skip[i + 1] = true; + } + } + if !skip.iter().any(|&s| s) { + return text.to_string(); + } + lines + .iter() + .zip(skip.iter()) + .filter(|&(_, &s)| !s) + .map(|(&line, _)| line) + .collect::>() + .join("\n") +} + +/// Detect repeated header/footer lines across pages and split each page's text +/// into header, body, and footer sections. +pub(crate) fn detect_headers_footers(pages: &[PageText]) -> Vec { + // For fewer than 3 pages, no meaningful detection — return everything as body + if pages.len() < 3 { + return pages + .iter() + .map(|p| StructuredPageText { + page: p.page, + header: String::new(), + body: p.text.clone(), + footer: String::new(), + }) + .collect(); + } + + let threshold = (pages.len() as f64 * 0.6).ceil() as usize; + let max_check = 3usize; // check up to 3 lines from top/bottom + + // Split each page into lines + let page_lines: Vec> = pages.iter().map(|p| p.text.lines().collect()).collect(); + + // Detect header line count: for each position 0..max_check, check if the + // normalized line at that position appears on >= threshold pages + let mut header_count = 0usize; + for pos in 0..max_check { + let mut freq = std::collections::HashMap::::new(); + for lines in &page_lines { + if let Some(&line) = lines.get(pos) { + let norm = normalize_header_footer_line(line); + if !norm.is_empty() { + *freq.entry(norm).or_insert(0) += 1; + } + } + } + if freq.values().any(|&c| c >= threshold) { + header_count = pos + 1; + } else { + break; + } + } + + // Detect footer line count (from the bottom) + let mut footer_count = 0usize; + for pos in 0..max_check { + let mut freq = std::collections::HashMap::::new(); + for lines in &page_lines { + if lines.len() > pos { + let idx = lines.len() - 1 - pos; + // Don't overlap with headers + if idx >= header_count { + let norm = normalize_header_footer_line(lines[idx]); + if !norm.is_empty() { + *freq.entry(norm).or_insert(0) += 1; + } + } + } + } + if freq.values().any(|&c| c >= threshold) { + footer_count = pos + 1; + } else { + break; + } + } + + pages + .iter() + .zip(page_lines.iter()) + .map(|(p, lines)| { + let total = lines.len(); + let h_end = header_count.min(total); + let f_start = if footer_count > 0 { + total.saturating_sub(footer_count).max(h_end) + } else { + total + }; + + let header = lines[..h_end].join("\n"); + let body = lines[h_end..f_start].join("\n"); + let footer = lines[f_start..].join("\n"); + + StructuredPageText { + page: p.page, + header, + body, + footer, + } + }) + .collect() +} + +pub(crate) fn extract_structured_text(doc: &Document) -> Result> { + let pages = extract_text(doc)?; + Ok(detect_headers_footers(&pages)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strip_basic_footer_artifact() { + let text = "Some content\n/\n38\nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "Some content\nMore content"); + } + + #[test] + fn strip_footer_artifact_with_whitespace() { + let text = "Some content\n / \n 38 \nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "Some content\nMore content"); + } + + #[test] + fn no_match_passthrough() { + let text = "Some content\nNo footer here\nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, text); + } + + #[test] + fn multiple_occurrences() { + let text = "Page one\n/\n38\nPage two\n/\n38\nPage three"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "Page one\nPage two\nPage three"); + } + + #[test] + fn at_start_of_text() { + let text = "/\n38\nContent after"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "Content after"); + } + + #[test] + fn at_end_of_text() { + let text = "Content before\n/\n38"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "Content before"); + } + + #[test] + fn empty_input() { + let result = strip_footer_artifacts("", "38"); + assert_eq!(result, ""); + } + + #[test] + fn single_line_input() { + let result = strip_footer_artifacts("just one line", "38"); + assert_eq!(result, "just one line"); + } + + #[test] + fn consecutive_pairs() { + // Two pairs back to back: `/\n38\n/\n38` + let text = "start\n/\n38\n/\n38\nend"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, "start\nend"); + } + + #[test] + fn slash_not_followed_by_count() { + let text = "Some content\n/\n99\nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, text); + } + + #[test] + fn slash_with_extra_text_not_stripped() { + let text = "Some content\n/ extra\n38\nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, text); + } + + #[test] + fn count_with_extra_text_not_stripped() { + let text = "Some content\n/\n38 pages\nMore content"; + let result = strip_footer_artifacts(text, "38"); + assert_eq!(result, text); + } +} diff --git a/src/lib.rs b/src/lib.rs index 36b7d8e..c8153a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,1130 +4,50 @@ use napi::bindgen_prelude::*; use napi::{Env, Task}; use napi_derive::napi; -use image::{DynamicImage, ImageBuffer, ImageFormat}; -use lopdf::{Document, Object, ObjectId}; -use rayon::prelude::*; -use std::collections::HashSet; -use std::io::Cursor; +use lopdf::Document; use std::sync::Arc; -// ── Step 1: Compile-time assertion that lopdf::Document is Send + Sync ── -const _: () = { - fn assert_send_sync() {} - fn check() { - assert_send_sync::(); - } -}; - -// ── Napi object types (JS boundary) ───────────────────────────── - -#[napi(object)] -pub struct PageText { - pub page: u32, - pub text: String, -} +mod core; +mod types; -#[napi(object)] -pub struct StructuredPageText { - pub page: u32, - pub header: String, - pub body: String, - pub footer: String, -} +// Public API types (appear in generated .d.ts) +pub use types::{ + BoxType, PageAnnotation, PageBox, PageImage, PageText, PdfDocument, PdfMeta, StructuredPageText, +}; #[cfg(feature = "ocr")] -#[napi(string_enum)] -pub enum TextSource { - Native, - Ocr, -} +pub use types::{OcrOptions, OcrPageText, OcrStructuredPageText, PdfDocumentOcr, TextSource}; -#[cfg(feature = "ocr")] -#[napi(object)] -pub struct OcrPageText { - pub page: u32, - pub text: String, - pub source: TextSource, -} +// Internal plumbing (used by Task impls in this file — must be `pub` for napi Task trait) +pub use types::{RawPageImage, RawPdfDocument}; #[cfg(feature = "ocr")] -#[napi(object)] -pub struct OcrOptions { - pub lang: Option, - pub min_text_length: Option, - pub max_threads: Option, -} - -#[napi(object)] -pub struct PdfMeta { - pub page_count: u32, - pub version: String, - pub is_linearized: bool, - pub creator: Option, - pub producer: Option, - pub creation_date: Option, - pub modification_date: Option, - pub page_boxes: Vec, -} - -#[napi(string_enum)] -pub enum BoxType { - CropBox, - MediaBox, - Unknown, -} - -#[napi(object)] -pub struct PageBox { - /// Number of pages that share these dimensions. - pub page_count: u32, - pub left: f64, - pub bottom: f64, - pub right: f64, - pub top: f64, - pub width: f64, - pub height: f64, - pub box_type: BoxType, - /// Present only on non-dominant boxes — lists the specific pages with these - /// dimensions. `None` on the first (most frequent) entry means "all pages - /// not listed in any other entry's `pages` array." - pub pages: Option>, -} - -#[napi(object)] -pub struct PageImage { - pub page: u32, - pub image_index: u32, - pub width: u32, - pub height: u32, - pub data: Buffer, - pub color_space: String, - pub bits_per_component: u32, - pub filter: String, - pub xobject_name: String, - pub object_id: String, -} - -#[napi(object)] -pub struct PageAnnotation { - pub page: u32, - pub subtype: String, - pub rect: Vec, - pub uri: Option, - pub dest: Option, - pub content: Option, -} - -#[napi(object)] -pub struct PdfDocument { - pub version: String, - pub is_linearized: bool, - pub page_count: u32, - pub creator: Option, - pub producer: Option, - pub creation_date: Option, - pub modification_date: Option, - pub page_boxes: Vec, - pub total_images: u32, - pub total_annotations: u32, - pub image_pages: Vec, - pub annotation_pages: Vec, - pub text: Vec, - pub structured_text: Vec, - pub images: Vec, - pub annotations: Vec, -} - -#[cfg(feature = "ocr")] -#[napi(object)] -pub struct OcrStructuredPageText { - pub page: u32, - pub header: String, - pub body: String, - pub footer: String, - pub source: TextSource, -} - -#[cfg(feature = "ocr")] -#[napi(object)] -pub struct PdfDocumentOcr { - pub version: String, - pub is_linearized: bool, - pub page_count: u32, - pub creator: Option, - pub producer: Option, - pub creation_date: Option, - pub modification_date: Option, - pub page_boxes: Vec, - pub total_images: u32, - pub total_annotations: u32, - pub image_pages: Vec, - pub annotation_pages: Vec, - pub text: Vec, - pub structured_text: Vec, - pub images: Vec, - pub annotations: Vec, -} - -// ── Step 2: Internal type — no napi types, safe for any thread ── - -pub struct RawPageImage { - pub page: u32, - pub image_index: u32, - pub width: u32, - pub height: u32, - pub data: Vec, - pub color_space: String, - pub bits_per_component: u32, - pub filter: String, - pub xobject_name: String, - pub object_id: String, -} +pub use types::RawPdfDocumentOcr; -impl From for PageImage { - fn from(r: RawPageImage) -> Self { - PageImage { - page: r.page, - image_index: r.image_index, - width: r.width, - height: r.height, - data: r.data.into(), - color_space: r.color_space, - bits_per_component: r.bits_per_component, - filter: r.filter, - xobject_name: r.xobject_name, - object_id: r.object_id, - } +// ── Step 1: Compile-time assertion that lopdf::Document is Send + Sync ── +const _: () = { + fn assert_send_sync() {} + fn check() { + assert_send_sync::(); } -} +}; -pub struct RawPdfDocument { - pub meta: PdfMeta, - pub text: Vec, - pub structured_text: Vec, - pub images: Vec, - pub annotations: Vec, - pub image_pages: Vec, - pub annotation_pages: Vec, -} +// ── Shared helpers ────────────────────────────────────────────── -impl From for PdfDocument { - fn from(r: RawPdfDocument) -> Self { - let total_images = r.images.len() as u32; - let total_annotations = r.annotations.len() as u32; - PdfDocument { - version: r.meta.version, - is_linearized: r.meta.is_linearized, - page_count: r.meta.page_count, - creator: r.meta.creator, - producer: r.meta.producer, - creation_date: r.meta.creation_date, - modification_date: r.meta.modification_date, - page_boxes: r.meta.page_boxes, - total_images, - total_annotations, - image_pages: r.image_pages, - annotation_pages: r.annotation_pages, - text: r.text, - structured_text: r.structured_text, - images: r.images.into_iter().map(PageImage::from).collect(), - annotations: r.annotations, - } - } -} +use crate::core::document::{extract_all, extract_annotations}; +use crate::core::images::extract_images_raw; +use crate::core::meta::extract_metadata; +use crate::core::text::{extract_structured_text, extract_text}; #[cfg(feature = "ocr")] -pub struct RawPdfDocumentOcr { - pub meta: PdfMeta, - pub text: Vec, - pub structured_text: Vec, - pub images: Vec, - pub annotations: Vec, - pub image_pages: Vec, - pub annotation_pages: Vec, -} - +use crate::core::document::extract_all_with_ocr; #[cfg(feature = "ocr")] -impl From for PdfDocumentOcr { - fn from(r: RawPdfDocumentOcr) -> Self { - let total_images = r.images.len() as u32; - let total_annotations = r.annotations.len() as u32; - PdfDocumentOcr { - version: r.meta.version, - is_linearized: r.meta.is_linearized, - page_count: r.meta.page_count, - creator: r.meta.creator, - producer: r.meta.producer, - creation_date: r.meta.creation_date, - modification_date: r.meta.modification_date, - page_boxes: r.meta.page_boxes, - total_images, - total_annotations, - image_pages: r.image_pages, - annotation_pages: r.annotation_pages, - text: r.text, - structured_text: r.structured_text, - images: r.images.into_iter().map(PageImage::from).collect(), - annotations: r.annotations, - } - } -} - -// ── Shared helpers ────────────────────────────────────────────── +use crate::core::ocr::{extract_text_with_ocr, normalize_max_threads}; fn load_doc(buf: &[u8]) -> Result { Document::load_mem(buf).map_err(|e| Error::from_reason(format!("Failed to load PDF: {e}"))) } -fn extract_text(doc: &Document) -> Result> { - let pages = doc.get_pages(); - let page_count_str = pages.len().to_string(); - let page_nums: Vec = pages.keys().copied().collect(); - let mut results: Vec = page_nums - .par_iter() - .map(|&page_num| { - let raw = doc.extract_text(&[page_num]).unwrap_or_default(); - let text = strip_footer_artifacts(&raw, &page_count_str); - PageText { - page: page_num, - text, - } - }) - .collect(); - results.sort_unstable_by_key(|p| p.page); - Ok(results) -} - -/// Normalize a line for header/footer comparison: trim whitespace and replace -/// contiguous digit sequences with `` so "Page 1" matches "Page 42". -fn normalize_header_footer_line(line: &str) -> String { - let trimmed = line.trim(); - let mut out = String::with_capacity(trimmed.len()); - let mut in_digits = false; - for ch in trimmed.chars() { - if ch.is_ascii_digit() { - if !in_digits { - out.push_str(""); - in_digits = true; - } - } else { - in_digits = false; - out.push(ch); - } - } - out -} - -/// Strip Chromium footer artifacts from extracted text. -/// -/// Chromium's Skia PDF renderer writes page footers (e.g., `1 / 38`) as 2-3 -/// separate text operations. `lopdf::extract_text` concatenates these in -/// content-stream order, causing orphaned fragments like ` / \n38\n` to appear -/// mid-text on every page. This function removes the known pattern: a line -/// containing just `/` followed by a line containing just the total page count. -fn strip_footer_artifacts(text: &str, page_count_str: &str) -> String { - let lines: Vec<&str> = text.lines().collect(); - if lines.len() < 2 { - return text.to_string(); - } - let mut skip = vec![false; lines.len()]; - for i in 0..lines.len() - 1 { - if lines[i].trim() == "/" && lines[i + 1].trim() == page_count_str { - skip[i] = true; - skip[i + 1] = true; - } - } - if !skip.iter().any(|&s| s) { - return text.to_string(); - } - lines - .iter() - .zip(skip.iter()) - .filter(|&(_, &s)| !s) - .map(|(&line, _)| line) - .collect::>() - .join("\n") -} - -/// Detect repeated header/footer lines across pages and split each page's text -/// into header, body, and footer sections. -fn detect_headers_footers(pages: &[PageText]) -> Vec { - // For fewer than 3 pages, no meaningful detection — return everything as body - if pages.len() < 3 { - return pages - .iter() - .map(|p| StructuredPageText { - page: p.page, - header: String::new(), - body: p.text.clone(), - footer: String::new(), - }) - .collect(); - } - - let threshold = (pages.len() as f64 * 0.6).ceil() as usize; - let max_check = 3usize; // check up to 3 lines from top/bottom - - // Split each page into lines - let page_lines: Vec> = pages.iter().map(|p| p.text.lines().collect()).collect(); - - // Detect header line count: for each position 0..max_check, check if the - // normalized line at that position appears on >= threshold pages - let mut header_count = 0usize; - for pos in 0..max_check { - let mut freq = std::collections::HashMap::::new(); - for lines in &page_lines { - if let Some(&line) = lines.get(pos) { - let norm = normalize_header_footer_line(line); - if !norm.is_empty() { - *freq.entry(norm).or_insert(0) += 1; - } - } - } - if freq.values().any(|&c| c >= threshold) { - header_count = pos + 1; - } else { - break; - } - } - - // Detect footer line count (from the bottom) - let mut footer_count = 0usize; - for pos in 0..max_check { - let mut freq = std::collections::HashMap::::new(); - for lines in &page_lines { - if lines.len() > pos { - let idx = lines.len() - 1 - pos; - // Don't overlap with headers - if idx >= header_count { - let norm = normalize_header_footer_line(lines[idx]); - if !norm.is_empty() { - *freq.entry(norm).or_insert(0) += 1; - } - } - } - } - if freq.values().any(|&c| c >= threshold) { - footer_count = pos + 1; - } else { - break; - } - } - - pages - .iter() - .zip(page_lines.iter()) - .map(|(p, lines)| { - let total = lines.len(); - let h_end = header_count.min(total); - let f_start = if footer_count > 0 { - total.saturating_sub(footer_count).max(h_end) - } else { - total - }; - - let header = lines[..h_end].join("\n"); - let body = lines[h_end..f_start].join("\n"); - let footer = lines[f_start..].join("\n"); - - StructuredPageText { - page: p.page, - header, - body, - footer, - } - }) - .collect() -} - -fn extract_structured_text(doc: &Document) -> Result> { - let pages = extract_text(doc)?; - Ok(detect_headers_footers(&pages)) -} - -fn extract_info_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option { - match dict.get(key).ok()? { - Object::String(bytes, _) => { - if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { - // UTF-16 BOM - let utf16: Vec = bytes[2..] - .chunks_exact(2) - .map(|c| u16::from_be_bytes([c[0], c[1]])) - .collect(); - Some(String::from_utf16_lossy(&utf16)) - } else { - Some(String::from_utf8_lossy(bytes).to_string()) - } - } - _ => None, - } -} - -/// Convert a PDF date string to ISO 8601. -/// Input format: `D:YYYYMMDDHHmmSS+HH'mm'` (D: prefix optional, timezone optional) -/// Output format: `YYYY-MM-DDTHH:mm:SS+HH:mm` or `…Z` -fn pdf_date_to_iso8601(raw: &str) -> String { - let s = raw.strip_prefix("D:").unwrap_or(raw); - - // Need at least YYYY (4 chars) - if s.len() < 4 { - return raw.to_string(); - } - - let yyyy = &s[..4]; - let mm = s.get(4..6).unwrap_or("01"); - let dd = s.get(6..8).unwrap_or("01"); - let hh = s.get(8..10).unwrap_or("00"); - let min = s.get(10..12).unwrap_or("00"); - let sec = s.get(12..14).unwrap_or("00"); - - let tz_part = &s[14.min(s.len())..]; - let tz = if tz_part.is_empty() { - String::new() - } else if tz_part.starts_with('Z') { - "Z".to_string() - } else { - // e.g. +05'30' or -06'00' → +05:30 or -06:00 - let cleaned = tz_part.replace('\'', ""); - if cleaned.len() >= 3 { - let sign = &cleaned[..1]; - let tzh = &cleaned[1..3]; - let tzm = if cleaned.len() >= 5 { - &cleaned[3..5] - } else { - "00" - }; - format!("{sign}{tzh}:{tzm}") - } else { - String::new() - } - }; - - format!("{yyyy}-{mm}-{dd}T{hh}:{min}:{sec}{tz}") -} - -fn parse_page_box(obj: &Object) -> Option<[f64; 4]> { - let arr = match obj { - Object::Array(a) => a, - _ => return None, - }; - if arr.len() < 4 { - return None; - } - let mut out = [0.0f64; 4]; - for (idx, slot) in out.iter_mut().enumerate().take(4) { - *slot = match arr[idx] { - Object::Integer(v) => v as f64, - Object::Real(v) => v as f64, - _ => return None, - }; - } - Some(out) -} - -/// Walk the page tree to find an inheritable page box (e.g., /MediaBox, /CropBox). -/// Resolves indirect references — some PDFs store the box array via `Object::Reference`. -fn get_inherited_page_box(doc: &Document, page_id: ObjectId, key: &[u8]) -> Option<[f64; 4]> { - let mut current_id = Some(page_id); - while let Some(id) = current_id { - let dict = doc.get_dictionary(id).ok()?; - if let Ok(obj) = dict.get(key) { - // Resolve indirect reference if the box value is stored as one - let resolved = match obj { - Object::Reference(ref_id) => doc.get_object(*ref_id).ok().cloned(), - other => Some(other.clone()), - }; - if let Some(ref val) = resolved - && let Some(rect) = parse_page_box(val) - { - return Some(rect); - } - } - // Walk up to /Parent - current_id = dict.get(b"Parent").ok().and_then(|p| match p { - Object::Reference(ref_id) => Some(*ref_id), - _ => None, - }); - } - None -} - -/// Key type for grouping page boxes by geometry. -/// Uses `to_bits()` so NaN/negative-zero edge cases hash correctly. -#[derive(Clone, PartialEq, Eq, Hash)] -struct PageBoxKey { - left: u64, - bottom: u64, - right: u64, - top: u64, - box_type: u8, // 0=CropBox, 1=MediaBox, 2=Unknown -} - -/// Intermediate representation before we decide which group is dominant. -struct PageBoxGroup { - left: f64, - bottom: f64, - right: f64, - top: f64, - box_type: BoxType, - page_nums: Vec, -} - -fn extract_page_boxes( - doc: &Document, - pages: &std::collections::BTreeMap, -) -> Vec { - let mut page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); - page_entries.sort_unstable_by_key(|(page, _)| *page); - - // Maintain insertion order via Vec + HashMap index - let mut groups: Vec = Vec::new(); - let mut key_to_idx: std::collections::HashMap = - std::collections::HashMap::new(); - - for (page_num, page_id) in page_entries { - let (box_type, rect) = if let Some(rect) = get_inherited_page_box(doc, page_id, b"CropBox") { - (BoxType::CropBox, rect) - } else if let Some(rect) = get_inherited_page_box(doc, page_id, b"MediaBox") { - (BoxType::MediaBox, rect) - } else { - (BoxType::Unknown, [0.0, 0.0, 0.0, 0.0]) - }; - - let (left, right) = if rect[0] <= rect[2] { - (rect[0], rect[2]) - } else { - (rect[2], rect[0]) - }; - let (bottom, top) = if rect[1] <= rect[3] { - (rect[1], rect[3]) - } else { - (rect[3], rect[1]) - }; - - let key = PageBoxKey { - left: left.to_bits(), - bottom: bottom.to_bits(), - right: right.to_bits(), - top: top.to_bits(), - box_type: match box_type { - BoxType::CropBox => 0, - BoxType::MediaBox => 1, - BoxType::Unknown => 2, - }, - }; - - if let Some(&idx) = key_to_idx.get(&key) { - groups[idx].page_nums.push(page_num); - } else { - let idx = groups.len(); - key_to_idx.insert(key, idx); - groups.push(PageBoxGroup { - left, - bottom, - right, - top, - box_type, - page_nums: vec![page_num], - }); - } - } - - // Find the dominant group (most pages) - let dominant_idx = groups - .iter() - .enumerate() - .max_by_key(|(_, g)| g.page_nums.len()) - .map(|(i, _)| i) - .unwrap_or(0); - - groups - .into_iter() - .enumerate() - .map(|(i, g)| { - let count = g.page_nums.len() as u32; - let pages = if i == dominant_idx { - None - } else { - Some(g.page_nums) - }; - PageBox { - page_count: count, - left: g.left, - bottom: g.bottom, - right: g.right, - top: g.top, - width: g.right - g.left, - height: g.top - g.bottom, - box_type: g.box_type, - pages, - } - }) - .collect() -} - -fn extract_metadata(doc: &Document) -> PdfMeta { - let pages = doc.get_pages(); - let page_count = pages.len() as u32; - let version = doc.version.clone(); - let is_linearized = doc.trailer.get(b"Linearized").is_ok(); - - let info_dict = doc.trailer.get(b"Info").ok().and_then(|obj| match obj { - Object::Reference(id) => doc.get_dictionary(*id).ok(), - _ => None, - }); - - let (creator, producer, creation_date, modification_date) = match info_dict { - Some(d) => ( - extract_info_string(d, b"Creator"), - extract_info_string(d, b"Producer"), - extract_info_string(d, b"CreationDate").map(|s| pdf_date_to_iso8601(&s)), - extract_info_string(d, b"ModDate").map(|s| pdf_date_to_iso8601(&s)), - ), - None => (None, None, None, None), - }; - - let page_boxes = extract_page_boxes(doc, &pages); - - PdfMeta { - page_count, - version, - is_linearized, - creator, - producer, - creation_date, - modification_date, - page_boxes, - } -} - -fn extract_images_raw(doc: &Document) -> Vec { - let pages = doc.get_pages(); - let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); - let mut results: Vec = page_entries - .par_iter() - .flat_map(|&(page_num, page_id)| collect_page_images_raw(doc, page_id, page_num)) - .collect(); - results.sort_unstable_by_key(|r| (r.page, r.image_index)); - results -} - -fn collect_page_annotations( - doc: &Document, - page_id: ObjectId, - page_num: u32, -) -> Vec { - let annots = match doc.get_page_annotations(page_id) { - Ok(a) => a, - Err(_) => return Vec::new(), - }; - - let mut results = Vec::new(); - for annot in annots { - let subtype = annot - .get(b"Subtype") - .ok() - .and_then(|v| { - if let Object::Name(n) = v { - Some(String::from_utf8_lossy(n).to_string()) - } else { - None - } - }) - .unwrap_or_default(); - - let rect = annot - .get(b"Rect") - .ok() - .and_then(|v| { - if let Object::Array(arr) = v { - Some( - arr - .iter() - .filter_map(|o| match o { - Object::Real(f) => Some(*f as f64), - Object::Integer(i) => Some(*i as f64), - _ => None, - }) - .collect::>(), - ) - } else { - None - } - }) - .unwrap_or_default(); - - // Extract URI from /A action dictionary - let uri = annot.get(b"A").ok().and_then(|action| { - let action_dict = match action { - Object::Dictionary(d) => Some(d), - Object::Reference(id) => doc.get_dictionary(*id).ok(), - _ => None, - }?; - let uri_obj = action_dict.get(b"URI").ok()?; - match uri_obj { - Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), - _ => None, - } - }); - - // Extract /Dest (named or direct destination) - let dest = annot.get(b"Dest").ok().and_then(|d| match d { - Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), - Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()), - _ => None, - }); - - // Extract /Contents (tooltip / alt text) - let content = annot.get(b"Contents").ok().and_then(|c| match c { - Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()), - _ => None, - }); - - results.push(PageAnnotation { - page: page_num, - subtype, - rect, - uri, - dest, - content, - }); - } - - results -} - -fn extract_annotations(doc: &Document) -> Vec { - let pages = doc.get_pages(); - let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); - let mut results: Vec = page_entries - .par_iter() - .flat_map(|&(page_num, page_id)| collect_page_annotations(doc, page_id, page_num)) - .collect(); - results.sort_unstable_by_key(|a| a.page); - results -} - -fn extract_all(doc: &Document) -> Result { - let meta = extract_metadata(doc); - let ((text, images), annotations) = rayon::join( - || rayon::join(|| extract_text(doc), || extract_images_raw(doc)), - || extract_annotations(doc), - ); - let text = text?; - let structured_text = detect_headers_footers(&text); - - let mut image_pages: Vec = images - .iter() - .map(|i| i.page) - .collect::>() - .into_iter() - .collect(); - image_pages.sort_unstable(); - - let mut annotation_pages: Vec = annotations - .iter() - .map(|a| a.page) - .collect::>() - .into_iter() - .collect(); - annotation_pages.sort_unstable(); - - Ok(RawPdfDocument { - meta, - text, - structured_text, - images, - annotations, - image_pages, - annotation_pages, - }) -} - -// ── OCR fallback (feature-gated) ──────────────────────────────── - -#[cfg(feature = "ocr")] -fn normalize_max_threads(v: Option) -> u32 { - let default = 4u32; - let max = std::thread::available_parallelism() - .map(|n| n.get() as u32) - .unwrap_or(default); - v.unwrap_or(default).clamp(1, max) -} - -#[cfg(feature = "ocr")] -fn get_ocr_pool(threads: usize) -> Arc { - use std::collections::HashMap; - use std::sync::{Mutex, OnceLock}; - - static POOLS: OnceLock>>> = OnceLock::new(); - let map = POOLS.get_or_init(|| Mutex::new(HashMap::new())); - let mut guard = map.lock().unwrap(); - Arc::clone(guard.entry(threads).or_insert_with(|| { - Arc::new( - rayon::ThreadPoolBuilder::new() - .num_threads(threads) - .build() - .expect("failed to build OCR thread pool"), - ) - })) -} - -/// Decode all image XObjects on a page to DynamicImages (no PNG encoding). -/// Used by OCR to avoid the PNG encode→decode roundtrip. -#[cfg(feature = "ocr")] -fn collect_page_decoded_images(doc: &Document, page_id: ObjectId) -> Vec { - let mut decoded = Vec::new(); - - let xobjects = match get_page_xobjects(doc, page_id) { - Some(x) => x, - None => return decoded, - }; - - let referenced_names = get_referenced_xobject_names(doc, page_id); - - for (name, obj_ref) in xobjects.iter() { - if !referenced_names.is_empty() && !referenced_names.contains(name) { - continue; - } - - let obj_id = match obj_ref { - Object::Reference(id) => *id, - _ => continue, - }; - - let stream = match doc.get_object(obj_id) { - Ok(Object::Stream(s)) => s, - _ => continue, - }; - - let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { - if let Object::Name(n) = v { - Some(n.as_slice()) - } else { - None - } - }); - if subtype != Some(b"Image") { - continue; - } - - let width = get_dict_int(&stream.dict, b"Width").unwrap_or(0) as u32; - let height = get_dict_int(&stream.dict, b"Height").unwrap_or(0) as u32; - let bpc = get_dict_int(&stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; - if width == 0 || height == 0 { - continue; - } - - let color_space = resolve_color_space(&stream.dict, doc); - let filter = resolve_filter(&stream.dict); - - let content = match filter.as_str() { - "DCTDecode" | "JPXDecode" => stream.content.clone(), - _ => { - let mut s = stream.clone(); - if s.decompress().is_ok() { - s.content - } else { - stream.content.clone() - } - } - }; - - // Skip SMask for OCR — to_rgb8() drops alpha anyway - if let Some(img) = - decode_xobject_to_dynamic_image(&content, width, height, bpc, &color_space, &filter, None) - { - decoded.push(img); - } - } - - decoded -} - -/// Auto-detect tessdata path, cached for the lifetime of the process. -/// Checks `TESSDATA_PREFIX` env var first (user override), then falls back to -/// parsing the output of `tesseract --list-langs` (e.g. -/// `List of available languages in "/usr/share/tesseract-ocr/5/tessdata/" (161):`). -/// Returns `None` if neither source yields a path, letting tesseract use its -/// compiled-in default. -#[cfg(feature = "ocr")] -fn get_tessdata_prefix() -> Option<&'static str> { - use std::sync::OnceLock; - static TESSDATA_PATH: OnceLock> = OnceLock::new(); - - TESSDATA_PATH - .get_or_init(|| { - // User-provided override takes priority - if let Ok(path) = std::env::var("TESSDATA_PREFIX") { - return Some(path); - } - - // Auto-detect from tesseract --list-langs - let output = std::process::Command::new("tesseract") - .arg("--list-langs") - .output() - .ok()?; - - // tesseract writes the path header to stderr - let stderr = String::from_utf8_lossy(&output.stderr); - let text = if stderr.contains('"') { - stderr - } else { - String::from_utf8_lossy(&output.stdout) - }; - - // Parse: `List of available languages in "/path/to/tessdata/" (N):` - let start = text.find('"')?; - let end = text[start + 1..].find('"')?; - Some(text[start + 1..start + 1 + end].to_string()) - }) - .as_deref() -} - -#[cfg(feature = "ocr")] -fn ocr_page_images(doc: &Document, page_id: ObjectId, lang: &str) -> String { - let datapath = get_tessdata_prefix().unwrap_or(""); - let images = collect_page_decoded_images(doc, page_id); - let mut texts = Vec::new(); - - for dyn_img in &images { - let rgb = dyn_img.to_rgb8(); - let (w, h) = rgb.dimensions(); - let pixels = rgb.as_raw(); - - let tess = tesseract_rs::TesseractAPI::new(); - if tess.init(datapath, lang).is_err() { - continue; - } - if tess - .set_image(pixels, w as i32, h as i32, 3, (w * 3) as i32) - .is_err() - { - continue; - } - if let Ok(text) = tess.get_utf8_text() { - let trimmed = text.trim(); - if !trimmed.is_empty() { - texts.push(trimmed.to_string()); - } - } - } - - texts.join("\n") -} - -#[cfg(feature = "ocr")] -fn extract_text_with_ocr( - doc: &Document, - lang: &str, - min_len: u32, - max_threads: u32, -) -> Result> { - let pages = doc.get_pages(); - let page_count_str = pages.len().to_string(); - let page_entries: Vec<(u32, ObjectId)> = pages.iter().map(|(&k, &v)| (k, v)).collect(); - - let pool = get_ocr_pool(max_threads as usize); - - let mut results: Vec = pool.install(|| { - page_entries - .par_iter() - .map(|&(page_num, page_id)| { - let raw = doc.extract_text(&[page_num]).unwrap_or_default(); - let native = strip_footer_artifacts(&raw, &page_count_str); - let non_ws: usize = native.chars().filter(|c| !c.is_whitespace()).count(); - if non_ws >= min_len as usize { - OcrPageText { - page: page_num, - text: native, - source: TextSource::Native, - } - } else { - let ocr_text = ocr_page_images(doc, page_id, lang); - OcrPageText { - page: page_num, - text: ocr_text, - source: TextSource::Ocr, - } - } - }) - .collect() - }); - results.sort_unstable_by_key(|r| r.page); - Ok(results) -} - -#[cfg(feature = "ocr")] -fn detect_headers_footers_ocr(pages: &[OcrPageText]) -> Vec { - // Convert to PageText for header/footer detection - let as_page_text: Vec = pages - .iter() - .map(|p| PageText { - page: p.page, - text: p.text.clone(), - }) - .collect(); - let structured = detect_headers_footers(&as_page_text); - // Zip back with source info - structured - .into_iter() - .zip(pages.iter()) - .map(|(s, ocr)| OcrStructuredPageText { - page: s.page, - header: s.header, - body: s.body, - footer: s.footer, - source: match ocr.source { - TextSource::Native => TextSource::Native, - TextSource::Ocr => TextSource::Ocr, - }, - }) - .collect() -} - -#[cfg(feature = "ocr")] -fn extract_all_with_ocr( - doc: &Document, - lang: &str, - min_len: u32, - max_threads: u32, -) -> Result { - let meta = extract_metadata(doc); - let (text, (images, annotations)) = rayon::join( - || extract_text_with_ocr(doc, lang, min_len, max_threads), - || rayon::join(|| extract_images_raw(doc), || extract_annotations(doc)), - ); - let text = text?; - let structured_text = detect_headers_footers_ocr(&text); - - let mut image_pages: Vec = images - .iter() - .map(|i| i.page) - .collect::>() - .into_iter() - .collect(); - image_pages.sort_unstable(); - - let mut annotation_pages: Vec = annotations - .iter() - .map(|a| a.page) - .collect::>() - .into_iter() - .collect(); - annotation_pages.sort_unstable(); - - Ok(RawPdfDocumentOcr { - meta, - text, - structured_text, - images, - annotations, - image_pages, - annotation_pages, - }) -} - // ── Standalone sync functions ─────────────────────────────────── #[napi] @@ -1205,674 +125,6 @@ pub fn pdf_document_ocr(buffer: Buffer, opts: Option) -> Result Vec { - let mut images = Vec::new(); - - // Get XObjects from page resources (with parent inheritance) - let xobjects = match get_page_xobjects(doc, page_id) { - Some(x) => x, - None => return images, - }; - - // Get the set of XObject names actually referenced by Do operators in the content stream - let referenced_names = get_referenced_xobject_names(doc, page_id); - - let mut img_index = 0u32; - - for (name, obj_ref) in xobjects.iter() { - // Only process XObjects actually painted on the page via Do operators - if !referenced_names.is_empty() && !referenced_names.contains(name) { - continue; - } - - let obj_id = match obj_ref { - Object::Reference(id) => *id, - _ => continue, - }; - - let stream = match doc.get_object(obj_id) { - Ok(Object::Stream(s)) => s, - _ => continue, - }; - - // Only process Image XObjects - let subtype = stream.dict.get(b"Subtype").ok().and_then(|v| { - if let Object::Name(n) = v { - Some(n.as_slice()) - } else { - None - } - }); - - if subtype != Some(b"Image") { - continue; - } - - let width = get_dict_int(&stream.dict, b"Width").unwrap_or(0) as u32; - let height = get_dict_int(&stream.dict, b"Height").unwrap_or(0) as u32; - let bpc = get_dict_int(&stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; - - if width == 0 || height == 0 { - continue; - } - - let color_space = resolve_color_space(&stream.dict, doc); - let filter = resolve_filter(&stream.dict); - - let channels: u32 = match color_space.as_str() { - "DeviceRGB" | "ICCBased3" | "CalRGB" => 3, - "DeviceGray" | "ICCBased1" | "CalGray" => 1, - "DeviceCMYK" | "ICCBased4" => 4, - _ => 3, - }; - - // Step 4: Skip the full stream clone for DCT/JPX — they're already in their - // target encoded format and don't need lopdf decompression. - let content = match filter.as_str() { - "DCTDecode" | "JPXDecode" => stream.content.clone(), - _ => decompress_stream_content(doc, stream, width, height, channels, bpc), - }; - - // Check for SMask (alpha channel) - let smask_data = get_smask_data(doc, &stream.dict); - - let png_data = match encode_to_png( - &content, - width, - height, - bpc, - &color_space, - &filter, - smask_data.as_deref(), - ) { - Some(data) => data, - None => continue, - }; - - let xobject_name = String::from_utf8_lossy(name).to_string(); - let object_id_str = format!("{} {} obj", obj_id.0, obj_id.1); - - images.push(RawPageImage { - page: page_num, - image_index: img_index, - width, - height, - data: png_data, - color_space, - bits_per_component: bpc, - filter, - xobject_name, - object_id: object_id_str, - }); - - img_index += 1; - } - - images -} - -/// Walk the page tree to find /Resources (handles inheritance from /Parent) -fn get_page_xobjects(doc: &Document, page_id: ObjectId) -> Option { - let resources = get_inherited_resources(doc, page_id)?; - let xobject_obj = resources.get(b"XObject").ok()?; - resolve_to_dict(doc, xobject_obj) -} - -fn get_inherited_resources(doc: &Document, page_id: ObjectId) -> Option { - let mut current_id = Some(page_id); - while let Some(id) = current_id { - let dict = doc.get_dictionary(id).ok()?; - if let Ok(resources_obj) = dict.get(b"Resources") { - return resolve_to_dict(doc, resources_obj); - } - // Walk up to /Parent - current_id = dict.get(b"Parent").ok().and_then(|p| match p { - Object::Reference(ref_id) => Some(*ref_id), - _ => None, - }); - } - None -} - -/// Parse the page content stream to find XObject names referenced by `Do` operators. -/// This filters out XObjects that are defined in Resources but never actually painted. -fn get_referenced_xobject_names(doc: &Document, page_id: ObjectId) -> HashSet> { - let mut names = HashSet::new(); - - let page_dict = match doc.get_dictionary(page_id) { - Ok(d) => d, - Err(_) => return names, - }; - - let contents = match page_dict.get(b"Contents") { - Ok(c) => c, - Err(_) => return names, - }; - - let stream_ids: Vec = match contents { - Object::Reference(id) => vec![*id], - Object::Array(arr) => arr - .iter() - .filter_map(|o| { - if let Object::Reference(id) = o { - Some(*id) - } else { - None - } - }) - .collect(), - _ => return names, - }; - - let mut all_bytes = Vec::new(); - for stream_id in stream_ids { - if let Ok(Object::Stream(s)) = doc.get_object(stream_id) { - let mut s = s.clone(); - let _ = s.decompress(); - all_bytes.extend_from_slice(&s.content); - } - } - - if let Ok(content) = lopdf::content::Content::decode(&all_bytes) { - for op in &content.operations { - if op.operator == "Do" - && let Some(Object::Name(name)) = op.operands.first() - { - names.insert(name.clone()); - } - } - } - - names -} - -/// Resolve /DecodeParms from a stream dictionary, following indirect references. -fn resolve_decode_parms(doc: &Document, dict: &lopdf::Dictionary) -> Option { - let dp = dict.get(b"DecodeParms").ok()?; - match dp { - Object::Dictionary(d) => Some(d.clone()), - Object::Reference(id) => match doc.get_object(*id) { - Ok(Object::Dictionary(d)) => Some(d.clone()), - _ => None, - }, - Object::Array(arr) => { - // Filter chain: DecodeParms is an array parallel to Filter array. - // Use the first dictionary entry found. - for item in arr { - match item { - Object::Dictionary(d) => return Some(d.clone()), - Object::Reference(id) => { - if let Ok(Object::Dictionary(d)) = doc.get_object(*id) { - return Some(d.clone()); - } - } - _ => {} - } - } - None - } - _ => None, - } -} - -/// Apply PNG predictor unfiltering to raw decompressed data. -/// Each row has a 1-byte filter type prefix followed by `row_bytes` of filtered data. -/// `bytes_per_pixel` is the number of bytes per pixel (channels * ceil(bpc/8)). -fn apply_png_predictor(data: &[u8], bytes_per_pixel: usize, row_bytes: usize) -> Option> { - let src_row_len = row_bytes + 1; // +1 for filter type byte - if !data.len().is_multiple_of(src_row_len) { - return None; - } - let num_rows = data.len() / src_row_len; - let mut output = Vec::with_capacity(num_rows * row_bytes); - let mut prev_row = vec![0u8; row_bytes]; - - for row_idx in 0..num_rows { - let row_start = row_idx * src_row_len; - let filter_byte = data[row_start]; - let mut current_row = data[row_start + 1..row_start + src_row_len].to_vec(); - - match filter_byte { - 0 => { /* None */ } - 1 => { - // Sub - for i in bytes_per_pixel..row_bytes { - current_row[i] = current_row[i].wrapping_add(current_row[i - bytes_per_pixel]); - } - } - 2 => { - // Up - for i in 0..row_bytes { - current_row[i] = current_row[i].wrapping_add(prev_row[i]); - } - } - 3 => { - // Average - for i in 0..bytes_per_pixel { - current_row[i] = current_row[i].wrapping_add(prev_row[i] / 2); - } - for i in bytes_per_pixel..row_bytes { - current_row[i] = current_row[i].wrapping_add( - ((current_row[i - bytes_per_pixel] as u16 + prev_row[i] as u16) / 2) as u8, - ); - } - } - 4 => { - // Paeth - for i in 0..bytes_per_pixel { - current_row[i] = current_row[i].wrapping_add(paeth_predictor(0, prev_row[i], 0)); - } - for i in bytes_per_pixel..row_bytes { - current_row[i] = current_row[i].wrapping_add(paeth_predictor( - current_row[i - bytes_per_pixel], - prev_row[i], - prev_row[i - bytes_per_pixel], - )); - } - } - _ => return None, // Unknown filter type - } - - output.extend_from_slice(¤t_row); - prev_row = current_row; - } - - Some(output) -} - -fn paeth_predictor(a: u8, b: u8, c: u8) -> u8 { - let pa = (b as i16 - c as i16).abs(); - let pb = (a as i16 - c as i16).abs(); - let pc = (a as i16 + b as i16 - 2 * c as i16).abs(); - if pa <= pb && pa <= pc { - a - } else if pb <= pc { - b - } else { - c - } -} - -/// Decompress a stream's content with correct predictor handling. -/// -/// lopdf's built-in `decompress()` attempts PNG predictor unfiltering internally -/// but produces corrupted output for some streams (e.g. xdvipdfmx/pandoc images). -/// We bypass it entirely: raw zlib inflate via `flate2`, then apply our own -/// predictor reversal. -fn decompress_stream_content( - doc: &Document, - stream: &lopdf::Stream, - width: u32, - height: u32, - channels: u32, - bpc: u32, -) -> Vec { - let bytes_per_sample = if bpc > 8 { 2u32 } else { 1u32 }; - let row_bytes = (width * channels * bpc / 8) as usize; - let expected = (width * height * channels * bytes_per_sample) as usize; - let predicted_len = height as usize * (row_bytes + 1); - - // Check if the stream uses FlateDecode - let uses_flate = match stream.dict.get(b"Filter") { - Ok(Object::Name(n)) => n == b"FlateDecode", - Ok(Object::Array(arr)) => arr - .iter() - .any(|o| matches!(o, Object::Name(n) if n == b"FlateDecode")), - _ => false, - }; - - // Step 1: Raw inflate — bypass lopdf's decompress to avoid its buggy predictor handling - let content = if uses_flate { - raw_inflate(&stream.content).unwrap_or_else(|| { - // Fallback: let lopdf try (handles edge cases like chained filters) - let mut s = stream.clone(); - if s.decompress().is_ok() { - s.content - } else { - stream.content.clone() - } - }) - } else { - stream.content.clone() - }; - - // Step 2: Apply predictor reversal if DecodeParms specifies one - if let Some(dp) = resolve_decode_parms(doc, &stream.dict) { - let predictor = get_dict_int(&dp, b"Predictor").unwrap_or(1); - - // TIFF Predictor 2: horizontal differencing (same size as raw pixels) - if predictor == 2 && content.len() == expected { - let bpp = (channels * bpc / 8).max(1) as usize; - let mut data = content; - apply_tiff_predictor2(&mut data, bpp, row_bytes); - return data; - } - - // PNG Predictors 10-15: each row has a leading filter type byte - if (10..=15).contains(&predictor) && content.len() == predicted_len { - let bpp = (channels * bpc / 8).max(1) as usize; - if let Some(unfiltered) = apply_png_predictor(&content, bpp, row_bytes) { - return unfiltered; - } - } - } - - content -} - -/// Raw zlib inflate without any predictor handling. -fn raw_inflate(data: &[u8]) -> Option> { - use std::io::Read; - // Try zlib wrapper first (most common in PDF) - let mut output = Vec::new(); - if flate2::read::ZlibDecoder::new(data) - .read_to_end(&mut output) - .is_ok() - { - return Some(output); - } - // Fallback to raw deflate (no zlib header) - output.clear(); - if flate2::read::DeflateDecoder::new(data) - .read_to_end(&mut output) - .is_ok() - { - return Some(output); - } - None -} - -/// Reverse TIFF Predictor 2 (horizontal differencing) in-place. -/// Each byte after the first `bpp` bytes in each row is a delta from the previous byte. -fn apply_tiff_predictor2(data: &mut [u8], bpp: usize, row_bytes: usize) { - if row_bytes == 0 { - return; - } - let num_rows = data.len() / row_bytes; - for row in 0..num_rows { - let start = row * row_bytes; - for i in (start + bpp)..(start + row_bytes) { - data[i] = data[i].wrapping_add(data[i - bpp]); - } - } -} - -/// Retrieve and decompress the SMask (soft mask / alpha channel) image data if present -fn get_smask_data(doc: &Document, dict: &lopdf::Dictionary) -> Option> { - let smask_ref = dict.get(b"SMask").ok()?; - let smask_id = match smask_ref { - Object::Reference(id) => *id, - _ => return None, - }; - - let smask_stream = match doc.get_object(smask_id) { - Ok(Object::Stream(s)) => s, - _ => return None, - }; - - // Verify it's an Image subtype - let subtype = smask_stream.dict.get(b"Subtype").ok().and_then(|v| { - if let Object::Name(n) = v { - Some(n.as_slice()) - } else { - None - } - }); - if subtype != Some(b"Image") { - return None; - } - - // SMask is always DeviceGray with 1 channel - let smask_width = get_dict_int(&smask_stream.dict, b"Width").unwrap_or(0) as u32; - let smask_height = get_dict_int(&smask_stream.dict, b"Height").unwrap_or(0) as u32; - let smask_bpc = get_dict_int(&smask_stream.dict, b"BitsPerComponent").unwrap_or(8) as u32; - Some(decompress_stream_content( - doc, - smask_stream, - smask_width, - smask_height, - 1, - smask_bpc, - )) -} - -fn resolve_to_dict(doc: &Document, obj: &Object) -> Option { - match obj { - Object::Dictionary(d) => Some(d.clone()), - Object::Reference(id) => match doc.get_object(*id).ok()? { - Object::Dictionary(d) => Some(d.clone()), - _ => None, - }, - _ => None, - } -} - -fn get_dict_int(dict: &lopdf::Dictionary, key: &[u8]) -> Option { - match dict.get(key).ok()? { - Object::Integer(i) => Some(*i), - _ => None, - } -} - -fn resolve_color_space(dict: &lopdf::Dictionary, doc: &Document) -> String { - let cs = match dict.get(b"ColorSpace") { - Ok(obj) => obj, - Err(_) => return "DeviceRGB".to_string(), - }; - - match cs { - Object::Name(name) => String::from_utf8_lossy(name).to_string(), - Object::Reference(id) => match doc.get_object(*id) { - Ok(Object::Name(name)) => String::from_utf8_lossy(name).to_string(), - // ICCBased is typically [/ICCBased ] - Ok(Object::Array(arr)) => parse_color_space_array(arr, doc), - _ => "DeviceRGB".to_string(), - }, - Object::Array(arr) => parse_color_space_array(arr, doc), - _ => "DeviceRGB".to_string(), - } -} - -fn parse_color_space_array(arr: &[Object], doc: &Document) -> String { - if arr.is_empty() { - return "DeviceRGB".to_string(); - } - - let cs_name = match &arr[0] { - Object::Name(n) => String::from_utf8_lossy(n).to_string(), - _ => return "DeviceRGB".to_string(), - }; - - if cs_name == "ICCBased" && arr.len() > 1 { - // Get /N from the ICCBased stream to determine channel count - let stream_id = match &arr[1] { - Object::Reference(id) => *id, - _ => return "ICCBased".to_string(), - }; - - if let Ok(Object::Stream(s)) = doc.get_object(stream_id) { - let n = get_dict_int(&s.dict, b"N").unwrap_or(3); - return format!("ICCBased{n}"); - } - } - - cs_name -} - -fn resolve_filter(dict: &lopdf::Dictionary) -> String { - match dict.get(b"Filter") { - Ok(Object::Name(name)) => String::from_utf8_lossy(name).to_string(), - Ok(Object::Array(arr)) => { - // Filter chain — return the last (innermost) filter for image type detection - if let Some(Object::Name(name)) = arr.last() { - String::from_utf8_lossy(name).to_string() - } else { - "None".to_string() - } - } - _ => "None".to_string(), - } -} - -/// Decode an XObject stream into a DynamicImage (shared by PNG export and OCR). -fn decode_xobject_to_dynamic_image( - content: &[u8], - width: u32, - height: u32, - bpc: u32, - color_space: &str, - filter: &str, - smask: Option<&[u8]>, -) -> Option { - let dynamic_img = if filter == "DCTDecode" { - image::load_from_memory_with_format(content, ImageFormat::Jpeg).ok()? - } else if filter == "JPXDecode" { - decode_jpx(content)? - } else { - decode_raw_pixels(content, width, height, bpc, color_space)? - }; - - Some(if let Some(mask_data) = smask { - apply_smask(dynamic_img, mask_data, width, height) - } else { - dynamic_img - }) -} - -fn encode_to_png( - content: &[u8], - width: u32, - height: u32, - bpc: u32, - color_space: &str, - filter: &str, - smask: Option<&[u8]>, -) -> Option> { - let final_img = - decode_xobject_to_dynamic_image(content, width, height, bpc, color_space, filter, smask)?; - let mut png_buf = Cursor::new(Vec::new()); - final_img.write_to(&mut png_buf, ImageFormat::Png).ok()?; - Some(png_buf.into_inner()) -} - -/// Decode a JPEG 2000 (JPXDecode) stream using hayro-jpeg2000 (pure Rust) -fn decode_jpx(content: &[u8]) -> Option { - let jp2_img = - hayro_jpeg2000::Image::new(content, &hayro_jpeg2000::DecodeSettings::default()).ok()?; - DynamicImage::from_decoder(jp2_img).ok() -} - -/// Decode raw pixel data (FlateDecode / uncompressed) into a DynamicImage -fn decode_raw_pixels( - content: &[u8], - width: u32, - height: u32, - bpc: u32, - color_space: &str, -) -> Option { - let channels: u32 = match color_space { - "DeviceRGB" | "ICCBased3" | "CalRGB" => 3, - "DeviceGray" | "ICCBased1" | "CalGray" => 1, - "DeviceCMYK" | "ICCBased4" => 4, - _ => 3, - }; - let bytes_per_sample = if bpc > 8 { 2u32 } else { 1u32 }; - let expected = (width * height * channels * bytes_per_sample) as usize; - - // Validate buffer size before constructing image - if content.len() < expected { - return None; - } - // Use exactly the expected number of bytes - let pixel_data = &content[..expected]; - - // Downscale 16-bit to 8-bit if needed - let pixel_data_8bit: Vec = if bytes_per_sample == 2 { - pixel_data - .chunks_exact(2) - .map(|pair| (u16::from_be_bytes([pair[0], pair[1]]) >> 8) as u8) - .collect() - } else { - pixel_data.to_vec() - }; - - match color_space { - "DeviceRGB" | "ICCBased3" | "CalRGB" => { - let img: ImageBuffer, Vec> = - ImageBuffer::from_raw(width, height, pixel_data_8bit)?; - Some(DynamicImage::ImageRgb8(img)) - } - "DeviceGray" | "ICCBased1" | "CalGray" => { - let img: ImageBuffer, Vec> = - ImageBuffer::from_raw(width, height, pixel_data_8bit)?; - Some(DynamicImage::ImageLuma8(img)) - } - "DeviceCMYK" | "ICCBased4" => { - let rgb_bytes = cmyk_to_rgb(&pixel_data_8bit); - let img: ImageBuffer, Vec> = - ImageBuffer::from_raw(width, height, rgb_bytes)?; - Some(DynamicImage::ImageRgb8(img)) - } - _ => { - let expected_rgb = (width * height * 3) as usize; - if pixel_data_8bit.len() == expected_rgb { - let img: ImageBuffer, Vec> = - ImageBuffer::from_raw(width, height, pixel_data_8bit)?; - Some(DynamicImage::ImageRgb8(img)) - } else { - None - } - } - } -} - -/// Combine a base RGB image with a grayscale SMask to produce an RGBA PNG -fn apply_smask(base: DynamicImage, mask_data: &[u8], width: u32, height: u32) -> DynamicImage { - let rgb = base.to_rgb8(); - let expected_mask_len = (width * height) as usize; - - if mask_data.len() < expected_mask_len { - return DynamicImage::ImageRgb8(rgb); - } - - let mut rgba_pixels = Vec::with_capacity((width * height * 4) as usize); - for (rgb_pixel, &alpha) in rgb.pixels().zip(mask_data.iter()) { - rgba_pixels.push(rgb_pixel[0]); - rgba_pixels.push(rgb_pixel[1]); - rgba_pixels.push(rgb_pixel[2]); - rgba_pixels.push(alpha); - } - - match ImageBuffer::from_raw(width, height, rgba_pixels) { - Some(img) => DynamicImage::ImageRgba8(img), - None => DynamicImage::ImageRgb8(rgb), - } -} - -fn cmyk_to_rgb(cmyk: &[u8]) -> Vec { - let pixel_count = cmyk.len() / 4; - let mut rgb = Vec::with_capacity(pixel_count * 3); - - for i in 0..pixel_count { - let c = cmyk[i * 4] as f32 / 255.0; - let m = cmyk[i * 4 + 1] as f32 / 255.0; - let y = cmyk[i * 4 + 2] as f32 / 255.0; - let k = cmyk[i * 4 + 3] as f32 / 255.0; - - let r = 255.0 * (1.0 - c) * (1.0 - k); - let g = 255.0 * (1.0 - m) * (1.0 - k); - let b = 255.0 * (1.0 - y) * (1.0 - k); - - rgb.push(r as u8); - rgb.push(g as u8); - rgb.push(b as u8); - } - - rgb -} - // ── Standalone async functions (libuv thread pool via AsyncTask) ─ pub struct ExtractTextTask(Vec); @@ -2407,165 +659,3 @@ impl PdfDown { }) } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn strip_basic_footer_artifact() { - let text = "Some content\n/\n38\nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "Some content\nMore content"); - } - - #[test] - fn strip_footer_artifact_with_whitespace() { - let text = "Some content\n / \n 38 \nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "Some content\nMore content"); - } - - #[test] - fn no_match_passthrough() { - let text = "Some content\nNo footer here\nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, text); - } - - #[test] - fn multiple_occurrences() { - let text = "Page one\n/\n38\nPage two\n/\n38\nPage three"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "Page one\nPage two\nPage three"); - } - - #[test] - fn at_start_of_text() { - let text = "/\n38\nContent after"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "Content after"); - } - - #[test] - fn at_end_of_text() { - let text = "Content before\n/\n38"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "Content before"); - } - - #[test] - fn empty_input() { - let result = strip_footer_artifacts("", "38"); - assert_eq!(result, ""); - } - - #[test] - fn single_line_input() { - let result = strip_footer_artifacts("just one line", "38"); - assert_eq!(result, "just one line"); - } - - #[test] - fn consecutive_pairs() { - // Two pairs back to back: `/\n38\n/\n38` - let text = "start\n/\n38\n/\n38\nend"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, "start\nend"); - } - - #[test] - fn slash_not_followed_by_count() { - let text = "Some content\n/\n99\nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, text); - } - - #[test] - fn slash_with_extra_text_not_stripped() { - let text = "Some content\n/ extra\n38\nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, text); - } - - #[test] - fn count_with_extra_text_not_stripped() { - let text = "Some content\n/\n38 pages\nMore content"; - let result = strip_footer_artifacts(text, "38"); - assert_eq!(result, text); - } - - // ── parse_page_box tests ────────────────────────────────────── - - #[test] - fn parse_page_box_valid_integers() { - let obj = Object::Array(vec![ - Object::Integer(0), - Object::Integer(0), - Object::Integer(612), - Object::Integer(792), - ]); - let result = parse_page_box(&obj); - assert_eq!(result, Some([0.0, 0.0, 612.0, 792.0])); - } - - #[test] - fn parse_page_box_valid_reals() { - let obj = Object::Array(vec![ - Object::Real(0.0), - Object::Real(0.0), - Object::Real(595.0), - Object::Real(842.0), - ]); - let result = parse_page_box(&obj); - assert_eq!(result, Some([0.0, 0.0, 595.0, 842.0])); - } - - #[test] - fn parse_page_box_mixed_types() { - let obj = Object::Array(vec![ - Object::Integer(0), - Object::Real(0.5), - Object::Integer(612), - Object::Real(792.0), - ]); - let result = parse_page_box(&obj); - assert_eq!(result, Some([0.0, 0.5, 612.0, 792.0])); - } - - #[test] - fn parse_page_box_too_short() { - let obj = Object::Array(vec![Object::Integer(0), Object::Integer(0)]); - assert_eq!(parse_page_box(&obj), None); - } - - #[test] - fn parse_page_box_non_numeric() { - let obj = Object::Array(vec![ - Object::Integer(0), - Object::Integer(0), - Object::Name(b"bad".to_vec()), - Object::Integer(792), - ]); - assert_eq!(parse_page_box(&obj), None); - } - - #[test] - fn parse_page_box_not_array() { - let obj = Object::Integer(42); - assert_eq!(parse_page_box(&obj), None); - } - - #[test] - fn parse_page_box_extra_elements_ignored() { - let obj = Object::Array(vec![ - Object::Integer(0), - Object::Integer(0), - Object::Integer(612), - Object::Integer(792), - Object::Integer(999), - ]); - // Only first 4 used - assert_eq!(parse_page_box(&obj), Some([0.0, 0.0, 612.0, 792.0])); - } -} diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 0000000..e84fe63 --- /dev/null +++ b/src/types.rs @@ -0,0 +1,256 @@ +use napi::bindgen_prelude::Buffer; +use napi_derive::napi; + +// ── Napi object types (JS boundary) ───────────────────────────── + +#[napi(object)] +pub struct PageText { + pub page: u32, + pub text: String, +} + +#[napi(object)] +pub struct StructuredPageText { + pub page: u32, + pub header: String, + pub body: String, + pub footer: String, +} + +#[cfg(feature = "ocr")] +#[napi(string_enum)] +pub enum TextSource { + Native, + Ocr, +} + +#[cfg(feature = "ocr")] +#[napi(object)] +pub struct OcrPageText { + pub page: u32, + pub text: String, + pub source: TextSource, +} + +#[cfg(feature = "ocr")] +#[napi(object)] +pub struct OcrOptions { + pub lang: Option, + pub min_text_length: Option, + pub max_threads: Option, +} + +#[napi(object)] +pub struct PdfMeta { + pub page_count: u32, + pub version: String, + pub is_linearized: bool, + pub creator: Option, + pub producer: Option, + pub creation_date: Option, + pub modification_date: Option, + pub page_boxes: Vec, +} + +#[napi(string_enum)] +pub enum BoxType { + CropBox, + MediaBox, + Unknown, +} + +#[napi(object)] +pub struct PageBox { + /// Number of pages that share these dimensions. + pub page_count: u32, + pub left: f64, + pub bottom: f64, + pub right: f64, + pub top: f64, + pub width: f64, + pub height: f64, + pub box_type: BoxType, + /// Present only on non-dominant boxes — lists the specific pages with these + /// dimensions. `None` on the first (most frequent) entry means "all pages + /// not listed in any other entry's `pages` array." + pub pages: Option>, +} + +#[napi(object)] +pub struct PageImage { + pub page: u32, + pub image_index: u32, + pub width: u32, + pub height: u32, + pub data: Buffer, + pub color_space: String, + pub bits_per_component: u32, + pub filter: String, + pub xobject_name: String, + pub object_id: String, +} + +#[napi(object)] +pub struct PageAnnotation { + pub page: u32, + pub subtype: String, + pub rect: Vec, + pub uri: Option, + pub dest: Option, + pub content: Option, +} + +#[napi(object)] +pub struct PdfDocument { + pub version: String, + pub is_linearized: bool, + pub page_count: u32, + pub creator: Option, + pub producer: Option, + pub creation_date: Option, + pub modification_date: Option, + pub page_boxes: Vec, + pub total_images: u32, + pub total_annotations: u32, + pub image_pages: Vec, + pub annotation_pages: Vec, + pub text: Vec, + pub structured_text: Vec, + pub images: Vec, + pub annotations: Vec, +} + +#[cfg(feature = "ocr")] +#[napi(object)] +pub struct OcrStructuredPageText { + pub page: u32, + pub header: String, + pub body: String, + pub footer: String, + pub source: TextSource, +} + +#[cfg(feature = "ocr")] +#[napi(object)] +pub struct PdfDocumentOcr { + pub version: String, + pub is_linearized: bool, + pub page_count: u32, + pub creator: Option, + pub producer: Option, + pub creation_date: Option, + pub modification_date: Option, + pub page_boxes: Vec, + pub total_images: u32, + pub total_annotations: u32, + pub image_pages: Vec, + pub annotation_pages: Vec, + pub text: Vec, + pub structured_text: Vec, + pub images: Vec, + pub annotations: Vec, +} + +// ── Internal type — no napi types, safe for any thread ── + +pub struct RawPageImage { + pub page: u32, + pub image_index: u32, + pub width: u32, + pub height: u32, + pub data: Vec, + pub color_space: String, + pub bits_per_component: u32, + pub filter: String, + pub xobject_name: String, + pub object_id: String, +} + +impl From for PageImage { + fn from(r: RawPageImage) -> Self { + PageImage { + page: r.page, + image_index: r.image_index, + width: r.width, + height: r.height, + data: r.data.into(), + color_space: r.color_space, + bits_per_component: r.bits_per_component, + filter: r.filter, + xobject_name: r.xobject_name, + object_id: r.object_id, + } + } +} + +pub struct RawPdfDocument { + pub meta: PdfMeta, + pub text: Vec, + pub structured_text: Vec, + pub images: Vec, + pub annotations: Vec, + pub image_pages: Vec, + pub annotation_pages: Vec, +} + +impl From for PdfDocument { + fn from(r: RawPdfDocument) -> Self { + let total_images = r.images.len() as u32; + let total_annotations = r.annotations.len() as u32; + PdfDocument { + version: r.meta.version, + is_linearized: r.meta.is_linearized, + page_count: r.meta.page_count, + creator: r.meta.creator, + producer: r.meta.producer, + creation_date: r.meta.creation_date, + modification_date: r.meta.modification_date, + page_boxes: r.meta.page_boxes, + total_images, + total_annotations, + image_pages: r.image_pages, + annotation_pages: r.annotation_pages, + text: r.text, + structured_text: r.structured_text, + images: r.images.into_iter().map(PageImage::from).collect(), + annotations: r.annotations, + } + } +} + +#[cfg(feature = "ocr")] +pub struct RawPdfDocumentOcr { + pub meta: PdfMeta, + pub text: Vec, + pub structured_text: Vec, + pub images: Vec, + pub annotations: Vec, + pub image_pages: Vec, + pub annotation_pages: Vec, +} + +#[cfg(feature = "ocr")] +impl From for PdfDocumentOcr { + fn from(r: RawPdfDocumentOcr) -> Self { + let total_images = r.images.len() as u32; + let total_annotations = r.annotations.len() as u32; + PdfDocumentOcr { + version: r.meta.version, + is_linearized: r.meta.is_linearized, + page_count: r.meta.page_count, + creator: r.meta.creator, + producer: r.meta.producer, + creation_date: r.meta.creation_date, + modification_date: r.meta.modification_date, + page_boxes: r.meta.page_boxes, + total_images, + total_annotations, + image_pages: r.image_pages, + annotation_pages: r.annotation_pages, + text: r.text, + structured_text: r.structured_text, + images: r.images.into_iter().map(PageImage::from).collect(), + annotations: r.annotations, + } + } +}