tim-vandecasteele · dv · Jun 15, 2017 · Jun 15, 2017 · Jun 15, 2017 · Jun 15, 2017
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -100,6 +100,7 @@ def self.normalize_value(value)
 
 end
 
+require "#{Docsplit::ROOT}/lib/docsplit/external_process"
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb
@@ -0,0 +1,34 @@
+module Docsplit
+  module ExternalProcess
+    COMMAND_TIMEOUT = 300 # seconds
+
+    # Run `env command` under bash; return its chomped output or raise ExtractionFailed.
+    def run(command, env = "")
+      # A corrupt PDF can emit an endless stream of identical warnings with blank
+      # lines in between. Dropping blank lines (grep) and consecutive duplicates
+      # (uniq) avoids memory bloat while the output is captured, and the timeout
+      # keeps a stuck process from blocking our Ruby process forever.
+      # See https://github.com/GetSilverfin/silverfin/issues/1998
+      pipeline = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"
+
+      # bash is needed for PIPESTATUS: exit with the first stage's status, not `uniq`'s.
+      script = "#{pipeline}; exit ${PIPESTATUS[0]}"
+      # The argv array bypasses the outer shell, so a single quote inside the
+      # command can no longer break out of a surrounding `bash -c '...'`.
+      result = IO.popen(["bash", "-c", script], &:read).chomp
+
+      raise ExtractionFailed, result unless $?.success?
+      result
+    end
+
+    # Command prefix: the timeout binary followed by the limit in seconds.
+    def timeout
+      "#{timeout_bin} #{COMMAND_TIMEOUT}"
+    end
+
+    # "timeout" when `which timeout` prints a path, otherwise "gtimeout" (its name on Mac).
+    def timeout_bin
+      `which timeout` != "" ? "timeout" : "gtimeout"
+    end
+  end
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -3,11 +3,13 @@ module Docsplit
   # Delegates to GraphicsMagick in order to convert PDF documents into
   # nicely sized images.
   class ImageExtractor
+    include ExternalProcess
 
     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     DEFAULT_DENSITY = '150'
 
+
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
     def extract(pdfs, options)
@@ -34,24 +36,26 @@ def convert(pdf, size, format, previous=nil)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        # `run` pipes the command's output through `grep -v '^$' | uniq`: if a corrupt PDF is parsed, it generates an endless stream of identical warnings (with blank lines in between).
+        # Filtering these out avoids memory bloat when the executing process captures stdout.
+        # See https://github.com/GetSilverfin/silverfin/issues/1998
+
+        run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env)
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
-          result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env)
         end
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
-
     private
 
     # Extract the relevant GraphicsMagick options from the options hash.

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -13,6 +13,7 @@ module Docsplit
   #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
   #
   class TextExtractor
+    include ExternalProcess
 
     NO_TEXT_DETECTED = /---------\n\Z/
 
@@ -61,22 +62,25 @@ def extract_from_ocr(pdf, pages)
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
       psm = @detect_orientation ? "-psm 1" : ""
+      timeout = 5.minutes.to_i
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
+          run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env)
+          run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env)
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
+        run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -95,13 +99,6 @@ def clean_text(file)
       end
     end
 
-    # Run an external process and raise an exception if it fails.
-    def run(command)
-      result = `#{command}`
-      raise ExtractionFailed, result if $? != 0
-      result
-    end
-
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")