-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjson_parser.py
More file actions
144 lines (121 loc) · 4.27 KB
/
json_parser.py
File metadata and controls
144 lines (121 loc) · 4.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import argparse
import sys
import re
# Token types, tried in declaration order by the lexer; first match wins.
# Note: the duplicate 'TRUE'/'FALSE' entries were removed — they were
# unreachable because 'BOOLEAN' (r'true|false') always matched first.
# The zero-width 'EOF': r'$' entry was removed too: it could emit a
# spurious Token('EOF', '') on trailing whitespace; the lexer appends
# the real EOF token itself.
TOKEN_TYPES = {
    'LBRACE': r'\{',
    'RBRACE': r'\}',
    # Double-quoted string: runs of non-quote/non-backslash characters,
    # optionally interleaved with backslash escapes (\" \\ \n \uXXXX ...).
    'STRING': r'"([^"\\]*(\\.[^"\\]*)*)"',
    # JSON number grammar: optional sign, integer part, optional fraction
    # and exponent.
    'NUMBER': r'-?\d+(\.\d+)?([eE][+-]?\d+)?',
    'BOOLEAN': r'true|false',
    'NULL': r'null',
    'COLON': r':',
    'COMMA': r',',
    'LBRACKET': r'\[',
    'RBRACKET': r'\]',
}
class Token:
    """A single lexical token: a type tag (key of TOKEN_TYPES or 'EOF')
    and the matched source text (None for EOF)."""

    def __init__(self, type, value):
        self.type, self.value = type, value

    def __repr__(self):
        return "Token({}, {})".format(self.type, self.value)
def lexer(input):
    """Tokenize a JSON document string into a list of Token objects.

    The returned list always ends with a single Token('EOF', None).
    String tokens carry their *decoded* value: surrounding quotes are
    stripped and JSON backslash escapes (\\" \\\\ \\/ \\b \\f \\n \\r \\t
    \\uXXXX) are resolved — previously escapes were left raw in the value.

    Raises ValueError on an unexpected character or a bad escape.
    """
    def _unescape(raw):
        # Decode JSON escape sequences in the body of a string literal.
        simple = {'"': '"', '\\': '\\', '/': '/', 'b': '\b', 'f': '\f',
                  'n': '\n', 'r': '\r', 't': '\t'}
        out = []
        i = 0
        while i < len(raw):
            ch = raw[i]
            if ch == '\\' and i + 1 < len(raw):
                esc = raw[i + 1]
                if esc == 'u' and i + 6 <= len(raw):
                    out.append(chr(int(raw[i + 2:i + 6], 16)))
                    i += 6
                    continue
                if esc in simple:
                    out.append(simple[esc])
                    i += 2
                    continue
                raise ValueError(f"Invalid escape sequence: \\{esc}")
            out.append(ch)
            i += 1
        return ''.join(out)

    # Compile every pattern once, outside the scan loop (they were being
    # recompiled for each candidate on every token previously).
    compiled = [(type, re.compile(pattern)) for type, pattern in TOKEN_TYPES.items()]
    whitespace_pattern = re.compile(r'\s+')
    tokens = []
    text = input  # local alias; 'input' shadows the builtin
    while text:
        ws = whitespace_pattern.match(text)
        if ws:
            text = text[ws.end():]
            if not text:
                # Trailing whitespace: stop before the zero-width legacy
                # EOF pattern can emit a spurious Token('EOF', '').
                break
        match = None
        for type, regex in compiled:
            match = regex.match(text)
            if match:
                value = match.group(0)
                if type == 'STRING':
                    value = _unescape(value[1:-1])
                tokens.append(Token(type, value))
                text = text[match.end():]
                break
        if not match:
            raise ValueError(f"Unexpected character: {text[0]}")
    tokens.append(Token('EOF', None))
    return tokens
def parse(tokens):
    """Parse a token list (as produced by lexer) into Python objects.

    Accepts any JSON value at the top level — object, array, string,
    number, boolean or null — not only objects (generalized; objects
    still parse exactly as before). Consumes tokens destructively via
    list.pop(0).

    Raises ValueError on malformed input, including trailing commas
    ("[1,]", '{"a":1,}') and missing separators ("[1 2]"), which the
    previous implementation silently accepted.
    """
    def parse_value(tokens):
        # Dispatch on the next token; composites consume their own brackets.
        next_token_type = tokens[0].type
        if next_token_type == 'LBRACE':
            return parse_object(tokens)
        if next_token_type == 'LBRACKET':
            return parse_array(tokens)
        token = tokens.pop(0)
        if token.type == 'STRING':
            return token.value
        if token.type == 'NUMBER':
            try:
                return int(token.value)
            except ValueError:
                # Fractions and exponents fall through to float.
                return float(token.value)
        if token.type == 'BOOLEAN':
            return token.value == 'true'
        if token.type == 'NULL':
            return None
        raise ValueError(f"Unsupported value type: {token.type}")

    def parse_array(tokens):
        array = []
        tokens.pop(0)  # consume the opening '['
        if tokens[0].type == 'RBRACKET':
            tokens.pop(0)  # empty array
            return array
        while True:
            array.append(parse_value(tokens))
            sep = tokens[0].type
            if sep == 'COMMA':
                tokens.pop(0)
                if tokens[0].type == 'RBRACKET':
                    raise ValueError("Trailing comma in array")
            elif sep == 'RBRACKET':
                tokens.pop(0)  # consume the closing ']'
                return array
            else:
                # Covers both missing commas ("[1 2]") and unterminated
                # arrays (EOF reached before ']').
                raise ValueError("Expected ',' or ']' after array element")

    def parse_object(tokens):
        obj = {}
        tokens.pop(0)  # consume the opening '{'
        if tokens[0].type == 'RBRACE':
            tokens.pop(0)  # empty object
            return obj
        while True:
            key_token = tokens.pop(0)
            if key_token.type != 'STRING':
                raise ValueError("Expected a string key")
            if tokens.pop(0).type != 'COLON':
                raise ValueError("Expected ':' after key")
            obj[key_token.value] = parse_value(tokens)
            sep = tokens[0].type
            if sep == 'COMMA':
                tokens.pop(0)
                if tokens[0].type == 'RBRACE':
                    raise ValueError("Trailing comma in object")
            elif sep == 'RBRACE':
                tokens.pop(0)  # consume the closing '}'
                return obj
            else:
                raise ValueError("Expected ',' or '}' after a key-value pair")

    if not tokens:
        raise ValueError("Empty token list")
    ast = parse_value(tokens)
    if tokens[0].type != 'EOF':
        raise ValueError("Expected end of file after JSON value")
    return ast
def main(file_path):
    """Read the file at file_path, tokenize and parse it, print the result.

    Exits with status 0 on success. Exits with status 1 (after printing a
    diagnostic to stderr) on malformed JSON or on file errors — previously
    a missing/unreadable file escaped as an uncaught traceback because
    only ValueError was handled.
    """
    try:
        # JSON is defined over Unicode text; pin UTF-8 rather than relying
        # on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_input = file.read()
        tokens = lexer(json_input)
        parsed = parse(tokens)
    except (ValueError, OSError) as e:
        # ValueError: lexer/parser errors (UnicodeDecodeError is a subclass).
        # OSError: missing file, permission denied, etc.
        print(f"Error parsing JSON: {e}", file=sys.stderr)
        sys.exit(1)
    print(parsed)
    sys.exit(0)
if __name__ == '__main__':
    # Command-line entry point: parse the single JSON file named in argv.
    cli = argparse.ArgumentParser(description='Simple JSON Parser')
    cli.add_argument('file', help='Path to the JSON file to parse')
    args = cli.parse_args()
    main(args.file)