Skip to content

PDF Text extraction trial #19

@WolfgangFahl

Description

@WolfgangFahl
#!/bin/bash
# WF 2020-06-10
# get text from pdf
#
# Walks the current directory tree, and for every file matching '*.pdf'
# runs pdftotext on it, writing the result next to the PDF as
# "<dir>/<name>-content.txt". At most $limit files are processed.
# Everything pdftotext prints on stderr is collected in a log file which is
# shown (filtered) once all files have been handled.
#
# Exits with status 1 when pdftotext is not available or the log file
# cannot be created.

# command -v is the portable way to test for a program on PATH.
if ! command -v pdftotext > /dev/null; then
  echo "you might want to install pdftotext e.g. with sudo apt-get install poppler-utils" 1>&2
  echo "see https://en.wikipedia.org/wiki/Pdftotext" 1>&2
  exit 1
fi

# mktemp picks an unpredictable name, unlike a PID-based path in /tmp.
# The log is deliberately left in place after the run so that the unfiltered
# messages can still be inspected.
log=$(mktemp /tmp/pdf2text.XXXXXX) || {
  echo "could not create a log file in /tmp" 1>&2
  exit 1
}

limit=10000
count=0

# Read NUL-delimited paths so that names containing spaces, newlines or glob
# characters arrive intact; the counter replaces "head" for the file limit.
while IFS= read -r -d '' f; do
  if [ "$count" -ge "$limit" ]; then
    break
  fi
  count=$((count + 1))

  b=$(basename -- "$f" .pdf)
  d=$(dirname -- "$f")
  txt="$d/$b-content.txt"

  echo "extracting text from $f to $txt ..."
  echo "extracting text from $f to $txt ..." >> "$log"
  # Only stderr goes to the log; a failing conversion does not stop the loop.
  pdftotext -layout "$f" "$txt" 2>> "$log"
done < <(find . -name '*.pdf' -print0)

echo "done. "
echo "See log results below ..."
# Hide the progress lines written above and the "Bad annotation" noise.
grep -v -e "Bad annotation" -e "extracting" "$log"
echo "... end of log"

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions