mini-parser/mini_scanner.py at master · samar1tan/mini-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import os


class MiniScanner:
    """Hand-written lexical scanner for a small C-like language.

    Constructing a scanner reads ``file``, strips ``//`` comments and
    immediately tokenizes the result.  Afterwards:

    * ``tokens`` is a list of ``(token_id, value, line)`` tuples.  For symbols
      and reserved words ``value`` is the lexeme itself; for constants and
      identifiers it is an index into ``constant_table`` / ``identifier_table``.
    * ``constant_table`` and ``identifier_table`` are lists of
      ``(index, lexeme)`` tuples in order of first appearance.

    On a lexical error the message is stored in ``errors``, printed, and the
    process is terminated with exit status 1.
    """

    ###### CONSTANT ######
    # A symbol's token id is its position in this list.  NOTE: '%' is listed
    # twice; ``list.index`` always returns the first occurrence, and the
    # duplicate is kept so that the ids of the symbols after it do not shift.
    symbols = ['#', '+', '-', '*', '/', '%', '=',
               '(', ')', '{', '}', '>', '<', ';',
               '^', '!', '&', '|', '%', '==', '!=',
               '>=', '<=', '&&', '||']
    reserved = ['if', 'else', 'while', 'int',
                'float', 'double', 'bool']
    reserved_id = len(symbols)  # id of the first reserved word
    constant_id = len(symbols) + len(reserved)  # single id shared by all constants
    identifier_id = constant_id + 1  # single id shared by all identifiers
    ###### CONSTANT ######

    ###### PROPERTY ######
    source = None
    errors = None
    # The three result lists are created per instance in __init__; mutable
    # class-level lists would be shared by every scanner object.
    tokens = None
    constant_table = None
    identifier_table = None

    ###### PROPERTY ######

    def __init__(self, file):
        """Read ``file`` and scan it for tokens."""
        self.tokens = []
        self.constant_table = []
        self.identifier_table = []
        self.read(file)
        self.scan_fa()

    def read(self, file):
        """read code from file and preprocess

        Every line is stripped of surrounding whitespace and of anything from
        the first ``//`` onwards.  The result is stored in ``self.source``
        (lines joined with a newline) and a line-numbered listing of the
        preprocessed code is returned.
        """
        source = []
        with open(file, encoding='utf-8') as fp:
            for line in fp:
                comment_index = line.find('//')  # remove comments
                if comment_index < 0:
                    source.append(line.strip())
                elif comment_index == 0:
                    source.append(' ')  # full-line comment, to preserve original line number
                else:
                    source.append(line[:comment_index].strip())
        self.source = '\n'.join(source)
        return '\n'.join(['%2d: %s' % (i + 1, line) for i, line in enumerate(source)])

    def _abort(self):
        """Print the pending error message and terminate the process."""
        # os._exit() does not flush stdio buffers, so flush explicitly or the
        # message can be lost when stdout is not a terminal.
        print(self.errors, flush=True)
        os._exit(1)

    def scan_fa(self):
        """scan for tokens in FA's perspective

        Walks ``self.source`` one character at a time and dispatches on the
        kind of character that starts the next token.  Scanning stops at the
        end of the source or at the first '#'.
        """
        source = self.source
        i = 0
        self.line = 1
        while i < len(source):
            char = source[i]
            if char in self.symbols:  # is symbol
                ret, i = self.scan_symbol(i)
                if ret == 1:
                    return
            elif char.isalpha():  # is alphabet
                ret, i = self.scan_identifier(i)
                if ret != 0:
                    self._abort()
                    return
            elif char.isdecimal():  # is decimal
                ret, i = self.scan_decimal(i)
                if ret != 0:
                    self._abort()
                    return
            elif char == '\n':
                i += 1
                self.line += 1
            elif char.isspace():  # space, tab or other blank inside a line
                i += 1
            else:  # unexpected
                self.errors = 'ERROR: unexpected token \'%s\' in line %d: %s' \
                              % (char, self.line, self.source.split('\n')[self.line - 1])
                self._abort()
                return

    def scan_symbol(self, index):
        """
        scan for symbols
        ret: 1 if it's end of the source code else 0

        Returns ``(ret, next_index)``.  A two-character symbol is preferred
        over its one-character prefix whenever both characters are available.
        """
        source = self.source
        token = source[index]
        if token == '#':  # end of the source code
            self.tokens.append((self.symbols.index(token), token, self.line))
            return 1, index
        # check for longer symbols; the slice is shorter than two characters
        # when the symbol is the very last character of the source
        pair = source[index:index + 2]
        if len(pair) == 2 and pair in self.symbols:
            token = pair
        self.tokens.append((self.symbols.index(token), token, self.line))
        return 0, index + len(token)

    def scan_decimal(self, index):
        """
        scan for decimals
        ret:  0 if nothing goes wrong
             -1 if multiple decimal points in one constant
             -2 if decimal ends with alphabet

        Returns ``(ret, next_index)``.  On success ``next_index`` is the
        position just past the constant; on failure ``self.errors`` is set
        and nothing is recorded.
        """
        source = self.source
        decimal_point = False
        end = index + 1  # one past the last character accepted so far
        while end < len(source):
            char = source[end]
            if char == '.':
                if decimal_point:  # multiple decimal points
                    self.errors = 'ERROR: multiple decimal points in one constant \'%s\'' \
                                  % source[index:end + 1]
                    return -1, end
                decimal_point = True  # the first decimal point
            elif not char.isdecimal():
                break
            end += 1
        token = source[index:end]
        if end < len(source) and source[end].isalpha():  # check if decimal ends with alphabet
            self.errors = 'ERROR: decimal ends with alphabet \'%s\'' % (token + source[end])
            return -2, end
        constant_set = [x[1] for x in self.constant_table]
        if token in constant_set:  # check constant table
            constant_index = constant_set.index(token)
        else:  # it's a new constant
            constant_index = len(self.constant_table)
            self.constant_table.append((constant_index, token))
        self.tokens.append((self.constant_id, constant_index, self.line))
        return 0, end

    def scan_identifier(self, index):
        """
        scan for identifiers
        ret: -1 if met invalid identifier else 0

        Returns ``(ret, next_index)`` where ``next_index`` is the position
        just past the word.  Reserved words are emitted with their own id and
        their lexeme; other words are looked up in (or added to) the
        identifier table and emitted with their table index.
        """
        ret = 0
        source = self.source
        end = index + 1  # one past the last character of the word
        while end < len(source) and (source[end].isalpha() or source[end].isdecimal()):
            end += 1
        token = source[index:end]
        if token in self.reserved:  # check in reserved keywords table
            self.tokens.append((self.reserved_id + self.reserved.index(token), token, self.line))
        else:
            symbol_set = [x[1] for x in self.identifier_table]
            if token in symbol_set:  # existing identifier
                token_index = symbol_set.index(token)
                self.tokens.append((self.identifier_id, token_index, self.line))
            elif self.is_valid(token):  # new identifier
                symbol_index = len(self.identifier_table)
                self.tokens.append((self.identifier_id, symbol_index, self.line))
                self.identifier_table.append((symbol_index, token))
            else:  # invalid token
                self.errors = 'ERROR: \'%s\' is not a valid identifier' % token
                ret = -1
        return ret, end

    @staticmethod
    def is_valid(token):
        """validate token: an identifier must not start with a digit"""
        return not token[0].isdecimal()


if __name__ == '__main__':
    # Demo entry point: scan 'test.txt' from the working directory and dump
    # the token stream followed by the constant and identifier tables.
    mini = MiniScanner('test.txt')
    print('tokens:', mini.tokens)
    for template, table in (('constant:%s', mini.constant_table),
                            ('identifier:%s', mini.identifier_table)):
        print(template % (table,))