# Python-Plagiarism-Detector/pdfExtractor.py at master · Arka5/Python-Plagiarism-Detector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from PyPDF2 import PdfFileReader

#
# Getting Metadata from a PDF using PyPDF2
#
# NOTE: get_info is currently disabled -- it is wrapped in a triple-quoted
# string below, so it is never defined or executed. As written it would open
# the file at `path`, read the document info and page count, and return:
#   - the number of pages when mode == 'numPages'
#   - the document info object when mode == 'info'
#   - None (implicitly) for any other mode
# The author/creator/producer/subject/title locals are assigned but unused.
#
"""
def get_info(path, mode):
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        author = info.author
        creator = info.creator
        producer = info.producer
        subject = info.subject
        title = info.title
        if mode == 'numPages':
            return number_of_pages
        if mode == 'info':
            return info
"""

#
# Extracting text from a PDF
#

def textExtract(path):
    """Extract the text of every page of the PDF at ``path``.

    Each page's text is prefixed with a single space and the pages are
    concatenated in order, so a non-empty result starts with a space.

    Failures are reported on stdout (best-effort behaviour kept from the
    original implementation) instead of being raised:

    * the file cannot be opened  -> "Error while opening the file ..."
    * PDF parsing/extraction fails -> "Error while extracting data ..."

    When the module is run as a script (``__name__ == '__main__'``) the
    per-page debug information and the extracted text are also printed.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text as a string, or ``None`` if the file could not
        be opened or its contents could not be extracted.
    """
    debug = __name__ == '__main__'

    # Open separately from parsing so each failure maps to exactly one
    # message. TypeError/ValueError cover invalid path objects (e.g. None or
    # embedded NUL), which the original also reported as an open failure.
    try:
        f = open(path, 'rb')
    except (OSError, TypeError, ValueError):
        print(
            "Error while opening the file at provided path! [Error issued from pdfExtractor module]")
        return None

    with f:
        try:
            # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are
            # the legacy PyPDF2 names -- confirm the pinned PyPDF2 version
            # still provides them.
            pdf = PdfFileReader(f)
            parts = []
            for i in range(pdf.getNumPages()):
                page = pdf.getPage(i)
                parts.append(' ' + page.extractText())
                if debug:
                    print('Page No.- '+str(i))
                    print(page)
                    print('Page type: {}'.format(str(type(page))))
            text = ''.join(parts)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C and
            # SystemExit are no longer swallowed.
            print(
                "Error while extracting data from PDF File! [Error issued from pdfExtractor module]")
            return None

    if debug:
        try:
            print("Extracted text is:" + text)
        except Exception:
            # Printing can fail, e.g. on consoles that cannot encode the text.
            print(
                "Error while printing or returning result! [Error issued from pdfExtractor module]")

    return text

# Debug entry point: when this module is executed directly, run the
# extractor against a sample PDF in the current working directory.
if __name__ == '__main__':
    textExtract('abc.pdf')