regex-parser/regex_parser.py at master · johnshields/regex-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#!/usr/bin/python3
# RegEx Parser
# John Shields

# References
# Functions
# https://www.w3schools.com/python/python_functions.asp
# Shunting Yard Algorithm
# https://www.tldp.org/LDP/Bash-Beginners-Guide/html/sect_04_01.html
# https://web.microsoftstream.com/video/cfc9f4a2-d34f-4cde-afba-063797493a90
# http://www.martinbroadhurst.com/shunting-yard-algorithm-in-python.html
# Thompson's Construction
# https://web.microsoftstream.com/video/5e2a482a-b1c9-48a3-b183-19eb8362abc9
# https://xysun.github.io/posts/regex-parsing-thompsons-algorithm.html
# Matching
# https://web.microsoftstream.com/video/6b4ba6a4-01b7-4bde-8f85-b4b96abc902a
# Prompt
# https://stackoverflow.com/questions/70797/user-input-and-command-line-arguments
# https://stackabuse.com/getting-user-input-in-python/
# https://stackoverflow.com/questions/3754620/a-basic-question-about-while-true
import cmd


# Shunting Yard Algorithm
def shunt(infix):
    # specials dictionary
    specials = {'*': 50, '.': 40, '|': 30, '+': 40, '?': 35}

    pofix = ""
    stack = ""

    for c in infix:
        if c == '(':
            stack = stack + c
        elif c == ')':
            while stack[-1] != '(':
                # putting statements together
                pofix, stack = pofix + stack[-1], stack[:-1]
            stack = stack[:-1]
        elif c in specials:
            # looks for c in specials dictionary, if not found, give value 0
            # looks for top of stack in specials dictionary,  if not found, give value 0
            # n/ compares 1 against the other
            while stack and specials.get(c, 0) <= specials.get(stack[-1], 0):
                pofix, stack = pofix + stack[-1], stack[:-1]
            stack = stack + c

        else:
            pofix = pofix + c

    while stack:
        # putting statements together
        pofix, stack = pofix + stack[-1], stack[:-1]

    return pofix


# Thompson's Construction
# represents a state with two arrows, labelled by label
# use 'None' for a label representing 'e' arrows
class state:
    label = None
    edge1 = None
    edge2 = None


# an NFA is represented by its initial and accepts states
class nfa:
    initial = None
    accept = None

    # mandatory to have 'self' args in constructor
    def __init__(self, initial, accept):
        self.initial = initial
        self.accept = accept


# Algorithm
# function has a stack of NFAs
# pofix allows to loop one character at a time
# until the regular expression is complete
def compile_tom(pofix):
    nfa_stack = []

    for c in pofix:
        if c == '.':
            # stack works as last in first out
            # method to go to stack
            # pop 2 NFAs off the stack.
            nfa2 = nfa_stack.pop()
            nfa1 = nfa_stack.pop()
            # merges them together
            # connect 1st NFAs accept state to the 2nd's initial
            nfa1.accept.edge1 = nfa2.initial

            # push NFA to stack
            # one way to do it ¬ nfa_stack.append(nfa(initial, accept))
            # second way 'easier way'
            new_nfa = nfa(nfa1.initial, nfa2.accept)
            # push new_nfa
            nfa_stack.append(new_nfa)

        elif c == '|':
            # pop 2 NFAs off the stack.
            nfa2 = nfa_stack.pop()
            nfa1 = nfa_stack.pop()
            # create a new initial state, connect it to
            # initial states of the two NFAs popped from the stack
            initial = state()
            initial.edge1 = nfa1.initial
            initial.edge2 = nfa2.initial
            # create new accept state, connecting the accept states
            # of the 2 NFAs popped from the stack, to the new state
            accept = state()
            nfa1.accept.edge1 = accept
            nfa2.accept.edge1 = accept

            # push NFA to stack
            # one way to do it ¬ nfa_stack.append(nfa(initial, accept))
            # second way 'easier way'
            new_nfa = nfa(initial, accept)
            # push new_nfa
            nfa_stack.append(new_nfa)

        elif c == '*':
            # pop a single NFA from the stack
            nfa1 = nfa_stack.pop()
            # create a new initial and accept states
            initial = state()
            accept = state()
            # join the new initial state to nfa1's initial state and the new accept state.
            initial.edge1 = nfa1.initial
            initial.edge2 = accept
            # join the old accept state to the new accept state and nfa1's initial state.
            nfa1.accept.edge1 = nfa.initial
            nfa1.accept.edge2 = accept

            # push new NFA to the stack
            # one way to do it ¬ nfa_stack.append(nfa(initial, accept))
            # second way 'easier way'
            new_nfa = nfa(initial, accept)
            # push new_nfa
            nfa_stack.append(new_nfa)

        elif c == '+':
            # pop nfa from stack
            nfa1 = nfa_stack.pop()
            # create a new initial and accept states
            initial = state()
            accept = state()
            # join the new initial stse to nfa1's initial state and the new accept state.
            initial.edge1 = nfa1.initial
            # join the old accept state to the new accept state and nfa1's initial state.
            nfa1.accept.edge1 = nfa1.initial
            nfa1.accept.edge2 = accept
            # push new NFA to the stack
            new_nfa = nfa(initial, accept)
            # pushes new_nfa
            nfa_stack.append(new_nfa)

        elif c == '?':
            # pop nfa from stack
            nfa1 = nfa_stack.pop()
            # create a new initial and accept states
            initial = state()
            accept = state()
            # join the new initial stse to nfa1's initial state and the new accept state.
            initial.edge1 = nfa1.initial
            initial.edge2 = accept
            # join the old accept state to the new accept state and nfa1's initial state.
            nfa1.accept.edge1 = nfa1.initial
            nfa1.accept.edge2 = accept
            # push new NFA to the stack
            new_nfa = nfa(initial, accept)
            # pushes new_nfa
            nfa_stack.append(new_nfa)

        else:
            # create new initial and accept states
            # creates new nfa
            accept = state()
            # creates new nfa
            initial = state()
            # string character
            # joins the initial states to the accept state
            # using an arrow labelled c.
            initial.label = c
            initial.edge1 = accept
            # going to create a new instance of the NFA class.
            # set its initial state to the
            # initial state

            # push new NFA to the stack # ¬ returns an instance of the nfa class ¬
            # one way to do it ¬ nfa_stack.append(nfa(initial, accept))
            # second way 'easier way'
            new_nfa = nfa(initial, accept)
            # push new_nfa
            nfa_stack.append(new_nfa)

    # nfa_stack  should only have a single nfa on it at the point.
    return nfa_stack.pop()


#
# Return the set of states that can be reached from state following e arrows
def follow_es(following_state):
    # create a new set, with state as its only member
    states = set()
    states.add(following_state)

    # check if state has arrows labeled e from it
    if following_state.label is None:
        # Check if edge1 is a state
        if following_state.edge1 is not None:
            # if there's an edge1, follow it
            states |= follow_es(following_state.edge1)
        # check if edge2 is a state
        if following_state.edge2 is not None:
            # if there's an edge2, follow it
            states |= follow_es(following_state.edge2)

    # return the set of states
    return states


# matching
def match(infix, string_match):
    # Shunt and compile the regular expression
    # calls the shunt function
    postfix = shunt(infix)
    # calls the compile_tom function
    non_fin_auto = compile_tom(postfix)

    # current set of states and the next set of the states
    current = set()
    upcoming = set()

    # add the initial state to the current set
    current |= follow_es(non_fin_auto.initial)

    # Loop through each character in the string
    for s in string_match:
        # Loop through the current set of states
        for c in current:
            # check of that state is labelled s
            if c.label == s:
                # Add edge1 state to the upcoming set
                upcoming |= follow_es(c.edge1)
        # set current to upcoming, and clear out next
        current = upcoming
        upcoming = set()

        # check if the accept state is in the set of current states
    return non_fin_auto.accept in current


# testing code
def executioner():
    print('\nTest Shunting Yard Algorithm \n')
    # test Shunting Yard Algorithm
    print(shunt("(a.b)|(c*.d)"))

    print("\nTest Thompson's Construction \n")
    # test Thompson's Construction
    print(compile_tom("ab.cd.|"))
    print(compile_tom("aa.*"))
    print(compile_tom("(0|(1(01*(00)*0)*1)*)*"))
    print(compile_tom("(0|(1(01*(00)*0)*1)*)*"))

    # test Matching
    # creates list of infix regex
    infixes = ["a.b.c", "a.b.c*, a.(b|d).c*", "(a.(b|d))*", "a.(b.b)*.c"]
    # strings to match against the regex infixes
    strings = ["abc", "abbc", "abcc", "abad", "abbbc"]

    print('\n Test Matching \n')
    # double for loop - loops through the regex and strings
    for exp in infixes:
        for res in strings:
            # function prints out True/False if the infix regex matches the string
            print(match(exp, res), exp, res)

    # input prompt to match Regular Expressions
    print('\nInput a regular expression to be tested \n')
    # prints out True/False if the infix regex matches the string
    while True:
        try:
            # writes an infix regex to match with string
            infixes = (input("Input infix regular expression: "))
            # writes a string to match with infix regex
            string = (input("Input string to match: "))
            print('\nInfix and String match \n')
            # function prints out True/False if the infix regex matches the string
            print(match(infixes, string), infixes, string, '\n')

            a_file = open("bash_tests/output.txt", "w")
            text = match(infixes, string), infixes, string
            print(text, file=a_file)
            a_file.close()
        except (KeyboardInterrupt, EOFError, SystemExit):
            break


if __name__ == "__main__":
    executioner()