Skip to content
Open
1 change: 1 addition & 0 deletions lib/docsplit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def self.normalize_value(value)

end

require "#{Docsplit::ROOT}/lib/docsplit/external_process"
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
Expand Down
34 changes: 34 additions & 0 deletions lib/docsplit/external_process.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
module Docsplit
  # Mixin that runs external command lines with a hard timeout and with
  # their captured output de-noised (blank and repeated lines removed).
  module ExternalProcess
    COMMAND_TIMEOUT = 300 # seconds

    # Run an external process and raise an exception if it fails.
    #
    # command - shell command line to execute. It is interpreted by bash, so
    #           redirections such as `2>&1` inside it are honoured.
    # env     - optional string of `NAME=value` assignments that is placed in
    #           front of the command (defaults to none).
    #
    # Returns the filtered output of the command with the trailing newline
    # removed.
    # Raises ExtractionFailed, carrying the captured output as its message,
    # when the timeout-wrapped command does not exit successfully.
    def run(command, env = "")
      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
      # sure we exit at some point.
      #
      # - See https://github.com/GetSilverfin/silverfin/issues/1998
      # - Add timeout so a stuck process doesn't block our Ruby process forever
      # - Remove blank lines
      # - Remove duplicate lines
      pipeline = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"

      # - Run through bash so we can use PIPESTATUS
      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
      script = "#{pipeline}; exit ${PIPESTATUS[0]}"

      # Hand the script to bash as a single argv entry instead of splicing it
      # between single quotes in another shell command line: a command that
      # itself contains single quotes would otherwise terminate the quoting
      # early and be executed incorrectly.
      result = IO.popen(["bash", "-c", script], &:read).chomp

      # IO.popen with a block sets $? once the child has been reaped.
      raise ExtractionFailed, result unless $?.success?
      result
    end

    # Prefix that wraps a command in the timeout utility, limiting it to
    # COMMAND_TIMEOUT seconds.
    def timeout
      "#{timeout_bin} #{COMMAND_TIMEOUT}"
    end

    # Name of the timeout executable to use: "timeout" when an executable of
    # that name is found on PATH, "gtimeout" otherwise.
    def timeout_bin
      # Look the binary up ourselves rather than spawning `which` on every
      # call; this also keeps working on systems without `which`.
      search_dirs = ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).reject(&:empty?)
      found = search_dirs.any? do |dir|
        candidate = File.join(dir, "timeout")
        File.file?(candidate) && File.executable?(candidate)
      end

      # gtimeout on Mac
      found ? "timeout" : "gtimeout"
    end
  end
end
20 changes: 12 additions & 8 deletions lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ module Docsplit
# Delegates to GraphicsMagick in order to convert PDF documents into
# nicely sized images.
class ImageExtractor
include ExternalProcess

MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
DEFAULT_FORMAT = :png
DEFAULT_DENSITY = '150'


# Extract a list of PDFs as rasterized page images, according to the
# configuration in options.
def extract(pdfs, options)
Expand All @@ -34,24 +36,26 @@ def convert(pdf, size, format, previous=nil)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"

if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
raise ExtractionFailed, result if $? != 0
# We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
# By filtering these we avoid memory bloat when the executing process tries to capture stdout.
# See https://github.com/GetSilverfin/silverfin/issues/1998

run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env)
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env)
end
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
end


private

# Extract the relevant GraphicsMagick options from the options hash.
Expand Down
19 changes: 8 additions & 11 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ module Docsplit
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
#
class TextExtractor
include ExternalProcess

NO_TEXT_DETECTED = /---------\n\Z/

Expand Down Expand Up @@ -61,22 +62,25 @@ def extract_from_ocr(pdf, pages)
base_path = File.join(@output, @pdf_name)
escaped_pdf = ESCAPE[pdf]
psm = @detect_orientation ? "-psm 1" : ""
timeout = 5.minutes.to_i
env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"

if pages
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env)
run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env)
#if the user says don't do orientation detection or the plugin is not installed, set psm to 0
run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
clean_text(base_path + '.txt') if @clean_ocr
end
ensure
Expand All @@ -95,13 +99,6 @@ def clean_text(file)
end
end

# Run an external process and raise an exception if it fails.
def run(command)
result = `#{command}`
raise ExtractionFailed, result if $? != 0
result
end

# Extract the full contents of a pdf as a single file, directly.
def extract_full(pdf)
text_path = File.join(@output, "#{@pdf_name}.txt")
Expand Down