Document_Search_Engine/search.py at main · Randrita/Document_Search_Engine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
import pickle
from pprint import pprint
from typing import Dict

# Codecs tried, in this exact order, for every single byte handed to
# smart_decoded(); the first codec that accepts the byte wins.
all_codecs = ['ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125',
'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1',
'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_7',
'utf_8', 'utf_8_sig']


def _decode_single_byte(byte):
    ''' Decode a one-byte `bytes` object with the first codec of all_codecs
        that accepts it; return '' when no codec does '''
    for codec in all_codecs:
        try:
            return byte.decode(codec)
        except (UnicodeError, LookupError):
            # UnicodeError: this codec rejects the byte.
            # LookupError: this codec is not known to the running interpreter.
            continue
    return ''


def smart_decoded(blob):
    ''' Turn a bytes blob into a printable string.

        Every byte is decoded on its own with the first codec from all_codecs
        that accepts it. The decoded text is then re-encoded as UTF-8 and
        rendered through the bytes repr, so anything non-printable shows up as
        a backslash escape; `\\xNN` escapes are removed and every remaining
        backslash + lowercase-letter pair (e.g. the repr of a newline or tab)
        is replaced by one space.

        blob: a bytes-like object supporting len() and slicing.
        Returns the cleaned-up str ('' for an empty blob).
    '''
    # Iterate over *all* bytes: the previous `len(blob) - 1` bound silently
    # dropped the final byte of every blob.
    decoded = ''.join(
        _decode_single_byte(blob[i:i + 1]) for i in range(len(blob))
    )

    # str(b'...') yields "b'...'"; [2:-1] strips the b-prefix and the quotes.
    escaped = str(decoded.encode('utf-8'))[2:-1]
    without_hex = re.sub(r'\\x[0-9a-f][0-9a-f]', '', escaped)
    return re.sub(r'\\[a-z]', ' ', without_hex)

class SearchEngine:
    ''' File-name search engine backed by a pickled os.walk() listing.

        The index is a list of (directory, [file names]) pairs. It can be
        rebuilt from a root directory, persisted to / restored from the pickle
        file named by `indexing`, and searched by file name.
    '''

    def __init__(self, indexing='index.pkl'):
        ''' indexing: path of the pickle file used to persist the index '''
        self.file_index = [] # directory listing returned by os.walk()
        self.indexing = indexing


    def create_new_index(self, values: Dict[str, str]) -> None:
        ''' Create a new file index of the root given in values['PATH']; then
            save it to self.file_index and to the pickle file.

            Directories that contain no files are left out of the index.
        '''
        root_path = values['PATH']
        self.file_index = [
            (root, files) for root, _dirs, files in os.walk(root_path) if files
        ]

        # save index to file
        with open(self.indexing, 'wb') as f:
            pickle.dump(self.file_index, f)


    def load_existing_index(self) -> None:
        ''' Load an existing file index into the program.

            Best effort: if the pickle file is missing, unreadable or corrupt,
            the index is reset to an empty list instead of raising.
        '''
        try:
            with open(self.indexing, 'rb') as f:
                # SECURITY NOTE: unpickling can execute arbitrary code; only
                # point `indexing` at files this program wrote itself.
                self.file_index = pickle.load(f)
        except Exception:
            # `Exception` (not a bare except) keeps the best-effort fallback
            # while letting KeyboardInterrupt / SystemExit propagate.
            self.file_index = []


    def search(self, values: Dict[str, str], smartdecode=True, write_output=False) -> tuple:
        ''' Search the index for file names matching values['TERM'].

            The comparison is case-insensitive. The match types are enabled by
            truthy values['CONTAINS'], values['STARTSWITH'] and
            values['ENDSWITH']; a file matches when any enabled type matches.

            smartdecode:  forwarded to build_result() for every match.
            write_output: when true, the matching paths are also written, one
                          per line, to 'search_results.txt'.

            Returns a tuple (results, matches, records): the build_result()
            dict of every match, the number of matches, and the number of
            indexed files that were examined.
        '''
        results = []
        records = 0
        needle = values['TERM'].lower()
        contains = values.get('CONTAINS', False)
        startswith = values.get('STARTSWITH', False)
        endswith = values.get('ENDSWITH', False)

        # search for matches and count results
        for path, files in self.file_index:
            for file in files:
                records += 1
                name = file.lower()
                if (contains and needle in name or
                        startswith and name.startswith(needle) or
                        endswith and name.endswith(needle)):
                    # always report forward-slash separated paths
                    joined = os.path.join(path.replace('\\', '/'), file)
                    results.append(joined.replace('\\', '/'))

        matches = len(results)

        if write_output:
            # save results to file
            with open('search_results.txt', 'w') as f:
                for row in results:
                    f.write(row + '\n')

        return [ self.build_result(path, smartdecode=smartdecode) for path in results ], matches, records

    def build_result(self, path, N=2048, smartdecode=True):
        ''' Describe one matched file.

            path:        path of the file to describe.
            N:           number of leading bytes to read from the file.
            smartdecode: when true the bytes are passed through
                         smart_decoded(); otherwise they are returned raw.

            Returns {'file': <name>, 'location': <directory>,
                     'content': {'blob': ..., 'data': ...}}.
            Both content values are None when the file cannot be read; 'data'
            alone is None when content_aware() raises.
        '''
        head, tail = os.path.split(path)
        content = {'blob': None, 'data': None}
        result = {'file': tail, 'location': head, 'content': content}

        try:
            with open(path, 'rb') as f:
                blob = f.read(N)
        except Exception:
            # unreadable file: still report it, just without content
            return result

        # Decode once; previously a content_aware() failure triggered a second,
        # identical smart_decoded() call in the fallback branch.
        content['blob'] = smart_decoded(blob) if smartdecode else blob
        try:
            content['data'] = self.content_aware(blob)
        except Exception:
            content['data'] = None
        return result


    def content_aware(self, blob):
        ''' Inspect `blob` with the optional `python-magic` library.

            Returns {'encoding': magic.from_buffer(blob),
                     'type': magic.Magic(raw=True).from_buffer(blob)},
            or an installation hint string when `magic` cannot be imported.
        '''
        try:
            import magic
        except ImportError:
            return 'Python Magic library `python-magic` not found. Follow the instructions from here: https://github.com/ahupp/python-magic to install `python-magic` and libpython.'

        encoding = magic.from_buffer(blob)
        magicraw = magic.Magic(raw=True)
        raw = magicraw.from_buffer(blob)
        return { 'encoding': encoding, 'type': raw }