forked from gertvermeer/ingcontest
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDocumentProcessor.py
More file actions
41 lines (32 loc) · 1.37 KB
/
DocumentProcessor.py
File metadata and controls
41 lines (32 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
class DocumentProcessor:
    """Extract a contract number from a document image using OCR.

    The image is converted to text with pytesseract, the first line that
    contains one of ``contract_number_starts`` is selected, the words in
    ``filter_list`` are removed from it, and the first remaining token that
    is longer than five characters and not purely alphabetic is returned.
    """

    # Lower-case keywords that mark the line holding the contract number.
    # NOTE(review): the last two entries look like OCR misreadings of
    # "contractnummer" -- confirm before pruning them.
    contract_number_starts = ["contractnummer", "leningnummer", "contractnr", "ingnummer", "kenmerk", "contracmummer", "conracmummer"]

    # Substrings removed (case-sensitively) from the selected line before
    # it is split into tokens.
    filter_list = ["Contractnummer", "Datum:", ":", "."]

    # Sentinel returned whenever no contract number can be determined.
    _NOT_FOUND = "not found"

    def retrieveContractNumber(self, filename):
        """Return the contract number found in the image at *filename*.

        Args:
            filename: Anything accepted by ``Image.open``.

        Returns:
            The extracted contract number as a string, or ``"not found"``
            when no keyword line or no suitable token is present.
        """
        # Close the underlying file as soon as the grayscale copy exists;
        # ``convert`` returns a new image, so it is safe to use afterwards.
        with Image.open(filename) as image:
            grayscale = image.convert("L")

        ocrText = pytesseract.image_to_string(grayscale).split('\n')

        _lineWithContractNumber = self.__findLineContractNumber(ocrText)
        _contractNumber = self.__filterContractNumberFromLine(_lineWithContractNumber)
        _contractNumberWithoutDate = self.__filterDateFromLine(_contractNumber)
        return _contractNumberWithoutDate

    def __findLineContractNumber(self, stringList):
        """Return the first line containing a contract keyword.

        Matching is case-insensitive on the line. Returns ``"not found"``
        when no line matches.
        """
        for line in stringList:
            # Lower-case once per line rather than once per keyword.
            lowered = line.lower()
            if any(start in lowered for start in self.contract_number_starts):
                return line
        return self._NOT_FOUND

    def __filterContractNumberFromLine(self, line):
        """Remove every ``filter_list`` entry from *line* and strip it."""
        for filter_word in self.filter_list:
            line = line.replace(filter_word, "")
        return line.strip()

    def __filterDateFromLine(self, _contractNumber):
        """Return the first token that can be a contract number.

        A token qualifies when it is longer than five characters and is not
        purely alphabetic. Returns ``"not found"`` when no token qualifies
        (previously this raised ``IndexError``, e.g. for the ``"not found"``
        sentinel itself or for empty OCR output).
        """
        candidates = [x for x in _contractNumber.split() if len(x) > 5 and not x.isalpha()]
        if not candidates:
            return self._NOT_FOUND
        return candidates[0]