#!/bin/bash
# WF 2020-06-10
# get text from pdf
# Recursively convert every *.pdf below the current directory to plain text.
# For each "<dir>/<name>.pdf" a sibling "<dir>/<name>-content.txt" is written
# using "pdftotext -layout". Progress lines and pdftotext's stderr output are
# collected in a log file; after the run the log is printed with the progress
# lines and "Bad annotation" warnings filtered out.
#
# Exit status: 1 if pdftotext is not installed or the log file cannot be
# created, 0 otherwise (individual conversion failures are only logged).

# command -v is a shell builtin and, unlike which, behaves the same everywhere.
if ! command -v pdftotext > /dev/null; then
  echo "you might want to install pdftotext e.g. with sudo apt-get install poppler-utils" 1>&2
  echo "see https://en.wikipedia.org/wiki/Pdftotext" 1>&2
  exit 1
fi

# mktemp creates the log with an unpredictable name, so another user cannot
# pre-create or symlink the path in a shared /tmp. The log is intentionally
# left in place after the run so it can be inspected later.
log=$(mktemp "${TMPDIR:-/tmp}/pdf2text.XXXXXX") || {
  echo "could not create a log file in ${TMPDIR:-/tmp}" 1>&2
  exit 1
}

# Upper bound on the number of PDF files handled in a single run.
limit=10000
count=0

# find emits NUL-terminated paths and read -d '' consumes them, so file names
# containing spaces, newlines or glob characters survive unchanged. The list
# arrives on fd 3 to keep stdin free for the commands inside the loop.
while IFS= read -r -d '' f <&3; do
  if (( count >= limit )); then
    break
  fi
  count=$(( count + 1 ))

  b=$(basename "$f" .pdf)
  d=$(dirname "$f")
  txt="$d/$b-content.txt"

  echo "extracting text from $f to $txt ..."
  echo "extracting text from $f to $txt ..." >> "$log"
  pdftotext -layout "$f" "$txt" 2>> "$log"
done 3< <(find . -name '*.pdf' -print0)

echo "done. "
echo "See log results below ..."
# Hide our own progress lines and the noisy "Bad annotation" warnings so that
# only the remaining pdftotext diagnostics are shown.
grep -v -e "Bad annotation" -e "extracting" "$log"
echo "... end of log"