Разбор JSON по одной строке в Python

Question 1

счастливого 2021 года всем!

Я начал писать код несколько месяцев назад ради развлечения, а недавно поставил перед собой задачу создать парсер JSON на Python (v3.8).

Основная идея заключалась в том, чтобы не загружать сразу весь файл, вместо этого анализируя файл построчно. Но я, конечно, еще новичок, так что там, наверное, много нелепостей. Конструктивная критика приветствуется!

Полный код здесь: https://pastebin.com/fEP4n9Gw
Образец JSON, использованный для его тестирования, находится здесь: https://pastebin.com/587jqziH

ИЗМЕНЕНО: переписал основную функцию синтаксического анализа, чтобы она могла обрабатывать и компактные (минифицированные) JSON-файлы. Конечно, до оптимального кода ещё далеко, но дело не в этом. 🙂

import re
import ast

class TranslateJSON(ast.NodeTransformer):
    '''
    NodeTransformer that replaces the JSON names null/true/false with the
    Python constants None, True and False before the tree is evaluated.

    Any other name is left in the tree untouched, so that a later
    ast.literal_eval() rejects it instead of silently losing it.
    '''
    translate_map = {'null': None, 'true': True, 'false': False}

    def visit_Name(self, node):
        '''
        Return a Constant node for null/true/false, or the node itself otherwise.

        Arguments:
        node: the ast.Name node being visited.
        '''
        # Returning None from a NodeTransformer visitor deletes the node, so
        # unknown names must be returned explicitly to stay in the tree.
        if node.id not in self.translate_map:
            return node
        replacement = ast.Constant(value=self.translate_map[node.id], kind=None)
        # copy_location carries over lineno/col_offset and their end_* twins.
        return ast.copy_location(replacement, node)

class JSON_parser():
    '''
    Looks up values in a JSON file, either by scanning it line by line or by
    evaluating the whole document at once.

    Class has two attributes other than its methods:
    'file_path': path of the json file to parse
    'map': None until buildDict() is called; afterwards the evaluated document.

    The methods that should be called directly are:
    read(): accepts one argument, an iterable containing the whole hierarchy of
    keys to query (from the outermost to the innermost). It scans the file one
    line at a time instead of building the whole document in memory.

    buildDict(): evaluates the whole JSON file into a Python object, storing it
    in self.map. This method can also be used to parse JSON strings directly.
    '''

    # Scalar value at the start of the remaining text: a double-quoted string
    # (escaped quotes allowed, empty string allowed), null/true/false, or a
    # number with optional sign, fraction and exponent. The string part is
    # deliberately not greedy across quotes so that a compact line such as
    # "a":"x","b":"y" yields only "x".
    full_value_regex = re.compile(
        r'^\s*("(?:[^"\\]|\\.)*"|null|true|false|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
    )
    # First non-whitespace character when the value is an object or an array.
    first_char_regex = re.compile(r'^\s*([{\[])')
    # Failures documented for ast.literal_eval on malformed input.
    _EVAL_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    def __init__(self, file_path):
        '''
        Arguments:
        file_path: path of the JSON file; it is only opened by read()/buildDict().
        '''
        self.file_path = file_path
        self.map = None

    def cleanString(self, line):
        '''
        Prepares a string to be parsed (leading/trailing whitespace is stripped).
        '''
        return line.strip()

    def translate_and_eval(self, value):
        '''
        Evaluates a captured JSON value into the matching Python data type.

        The text is first evaluated as a plain Python literal. If that fails,
        null/true/false are translated to None/True/False and it is tried again.

        Arguments:
        value: the JSON text to evaluate.

        Raises:
        SyntaxError: if the text cannot be parsed as an expression at all.
        ValueError: if the parsed expression is not a literal even after translation.
        '''
        ast_obj = ast.parse(value, mode="eval")
        try:
            # Fast path: values without null/true/false need no tree rewrite.
            return ast.literal_eval(ast_obj)
        except self._EVAL_ERRORS:
            pass
        try:
            TranslateJSON().visit(ast_obj)
            return ast.literal_eval(ast_obj)
        except self._EVAL_ERRORS as err:
            raise ValueError(f"JSON malformed. Error evaluating {value}") from err

    def buildDict(self, string_to_eval=""):
        '''
        Reads the whole file and evaluates it, storing the result in self.map.
        Alternatively, you can pass a JSON as a string argument, in which case
        the file is not opened.

        Arguments:
        string_to_eval: optional JSON text; when empty, self.file_path is read.
        '''
        if not string_to_eval:
            with open(self.file_path, encoding="utf-8") as source:
                # join() avoids rebuilding the string once per line.
                string_to_eval = "".join(self.cleanString(line) for line in source)
        self.map = self.translate_and_eval(string_to_eval)

    def read(self, keys):
        '''
        Master method to access a value of a JSON file without loading the whole file at once.
        'keys' has to be a list of all the keys being searched, from outer to innermost.
        Ex.: self.read(['outerkey','middlekey','finalkey'])
        The captured text is evaluated literally before being returned.

        Raises:
        ValueError: if 'keys' is empty or the value cannot be captured/evaluated.
        KeyError: if one of the keys is not found.
        '''
        with open(self.file_path, encoding="utf-8") as file:
            value = self._search(keys, file)
        return self.translate_and_eval(value)

    def _search(self, keys, file):
        '''
        Iteratively finds all keys of the hierarchy that is being searched, the last of
        which will have its position passed to the function _getValue().

        Arguments:
        keys: list of keys to search, from outer to innermost.
        file: open, seekable text file object; it is rewound for every key.

        Returns the raw text of the value, as produced by _getValue().

        Raises:
        ValueError: if 'keys' is empty.
        KeyError: if a key cannot be found at the expected depth.
        '''
        if not keys:
            raise ValueError("At least one key is required to search the file.")

        # The variables below help limit the search to a specific part of the file
        open_bracket_count = 0
        inside_quotes = False  # Toggle to ignore curly brackets inside quotes
        escaped = False  # True right after a backslash inside quotes
        start_is_set = False  # When True, the desired hierarchy depth has been reached and the search can begin
        end_is_set = False  # Toggles off the search (when a lower/higher hierarchy level is reached)
        last_endpos = [0, 0]  # [line, char index] of the ':' that closed the last found key
        haystack = ''

        for key_index, key in enumerate(keys):
            # The key is escaped so regex metacharacters in it are matched literally.
            key_regex = re.compile('"' + re.escape(key) + r'"\s*:')
            match = None
            file.seek(0)
            for line_number, line in enumerate(file):
                if line_number < last_endpos[0]:  # skips previous lines
                    continue
                clean_line = self.cleanString(line)
                if line_number == last_endpos[0]:
                    # Resume right where the previous key ended; the offset keeps
                    # character indexes absolute so the same line can be parsed again.
                    clean_line = clean_line[last_endpos[1]:]
                    char_index_offset = last_endpos[1]
                else:
                    char_index_offset = 0

                for char_index, char in enumerate(clean_line):

                    if escaped:
                        # Character following a backslash is part of the string.
                        escaped = False
                    elif inside_quotes and char == '\\':
                        escaped = True
                    elif char == '"':
                        inside_quotes = not inside_quotes
                    elif char == '}' and not inside_quotes:
                        if open_bracket_count-1 == key_index+1 and not start_is_set:
                            start_is_set = True
                        elif open_bracket_count-1 == key_index and not end_is_set:
                            end_is_set = True
                        open_bracket_count -= 1

                    elif char == '{' and not inside_quotes:
                        if open_bracket_count+1 == key_index+1 and not start_is_set:
                            start_is_set = True
                        elif open_bracket_count+1 == key_index+2 and not end_is_set:
                            end_is_set = True
                        open_bracket_count += 1

                    if start_is_set:
                        haystack += char
                        # The pattern always ends with ':', so a match can only
                        # appear on the character that appends a ':'.
                        match = key_regex.search(haystack) if char == ':' else None
                        if match:
                            last_endpos = [line_number, char_index+char_index_offset]
                            start_is_set, end_is_set = False, False
                            haystack = ''
                            break
                        elif end_is_set:
                            start_is_set, end_is_set = False, False
                            haystack = ''
                if match:
                    break
            if not match:
                raise KeyError(f"{key} not found in file. Last valid key found at line {last_endpos[0]+1} and endchar index {last_endpos[1]}")
        return self._getValue(last_endpos, file)

    def _getValue(self, match_end, file):
        '''
        Once the final key has been found, _getValue() is called to return the raw text of its value.
        The function tries to capture the value directly with a regex (null, true, false, a string or a number).
        If this fails, it assumes the value is either a JSON object or an array (starting with { or [ respectively)
        and collects characters until the matching closing bracket.

        Arguments:
        match_end: a list containing the line where the key was found and the index of the ':' that ends
        the key in that (stripped) line. Parsing will start right after it.
        file: open, seekable text file object; it is rewound before scanning.

        Raises:
        ValueError: if no value can be recognised or the object/array is never closed.
        '''

        file.seek(0)
        # The variables below help determine which type of data is being parsed (JSON object or array),
        # and whether the object/array has been fully captured.
        open_bracket = ""
        closing_bracket = ""
        bracket_map = {'{': '}', '[': ']'}
        open_bracket_count = 0
        close_bracket_count = 0
        in_string = False  # brackets inside string values must not be counted
        escaped = False  # True right after a backslash inside a string
        value = ""

        for line_number, line in enumerate(file):
            if line_number < match_end[0]:
                continue

            clean_line = self.cleanString(line)
            if line_number == match_end[0]:
                clean_line = clean_line[match_end[1]+1:]  # starts parsing the line after the key's ':'

            if not open_bracket:
                if not clean_line.strip():
                    continue  # nothing after the key on this line; the value starts further down

                # first try to match a simple value, instead of obj/array
                full_value_match = self.full_value_regex.match(clean_line)
                if full_value_match:
                    return full_value_match.group(1)

                # If direct match fails, look at first non-whitespace character to determine
                # whether value is a JSON object or array
                first_char_match = self.first_char_regex.match(clean_line)
                if first_char_match is None:
                    raise ValueError(f"Could not retrieve value. JSON is probably malformed. Line: {line_number + 1}")
                open_bracket = first_char_match.group(1)
                closing_bracket = bracket_map[open_bracket]

            # the loop below adds characters to the variable 'value' until the whole object/array is captured.
            for char in clean_line:
                if escaped:
                    escaped = False
                elif in_string:
                    if char == '\\':
                        escaped = True
                    elif char == '"':
                        in_string = False
                elif char == '"':
                    in_string = True
                elif char == open_bracket:
                    open_bracket_count += 1
                elif char == closing_bracket:
                    close_bracket_count += 1

                if open_bracket_count > 0:
                    value += char
                    if close_bracket_count == open_bracket_count:
                        return value

        raise ValueError("Could not retrieve value. JSON is probably malformed: value is missing or never closed.")




# Script entry point. Every statement in the body is a bare triple-quoted
# string literal, so running the file executes nothing: the strings hold
# disabled benchmark and demo snippets kept for reference.
if __name__ == '__main__':
    # Disabled benchmark: times four ways of reading the same three values with
    # timeit, 500 runs each -- read() on 'pop_map.json' (a), buildDict() on the
    # same file (b), read() on 'pop_map_compact.json' (c) and json.load (d).
    '''
    import timeit

    a="""
pop_map = JSON_parser('pop_map.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'fish'])
z = pop_map.read(['scholar'])
"""
    b="""
pop_map = JSON_parser('pop_map.json')
pop_map.buildDict()
x = pop_map.map['investor']['jewellery']['consumption']
y = pop_map.map['worker']['fish']
z = pop_map.map['scholar']
"""

    c="""
pop_map = JSON_parser('pop_map_compact.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'fish'])
z = pop_map.read(['scholar'])
"""

    d="""
import json
with open('pop_map.json') as js:
    data = json.load(js)
x = data['investor']['jewellery']['consumption']
y = data['worker']['fish']
z = data['scholar']
"""

    print(timeit.timeit(stmt=a, setup="from __main__ import JSON_parser", number=500))
    print(timeit.timeit(stmt=b, setup="from __main__ import JSON_parser", number=500))
    print(timeit.timeit(stmt=c, setup="from __main__ import JSON_parser", number=500))
    print(timeit.timeit(stmt=d, setup="from __main__ import JSON_parser", number=500))
    '''


    # Disabled demo: read() lookups on 'pop_map.json', results printed.
    # NOTE(review): sep='nn' / end='nnn' look like '\n\n' / '\n\n\n' that lost
    # their backslashes -- confirm before re-enabling any of the demos below.
    '''
    pop_map = JSON_parser('pop_map.json')
    x = pop_map.read(['investor', 'jewellery', 'consumption'])
    y = pop_map.read(['worker', 'market'])
    z = pop_map.read(['scholar'])
    print(x,y,z, sep='nn', end='nnn')
    '''
    # Disabled demo: same file evaluated in one go with buildDict(), then
    # accessed through the 'map' attribute.
    '''
    pop_map = JSON_parser('pop_map.json')
    pop_map.buildDict()
    x = pop_map.map['investor']['jewellery']['consumption']
    y = pop_map.map['worker']['fish']
    z = pop_map.map['scholar']
    print(x,y,z, sep='nn')
    '''

    # Disabled demo: read() lookups on the compact file 'pop_map_compact.json'.
    '''
    pop_map = JSON_parser('pop_map_compact.json')
    x = pop_map.read(['investor', 'jewellery', 'consumption'])
    y = pop_map.read(['worker', 'market'])
    z = pop_map.read(['scholar'])
    print(x,y,z, sep='nn', end='nnn')
    '''

С уважением,

Бернардо

Question 2

Ваше заявление об ускорении:

Избегайте одновременной загрузки всего файла, вместо этого анализируйте файл построчно, что, по данным тестирования, кажется в 2 раза быстрее

кажется маловероятным, и я хотел бы видеть доказательства этого. Скомпилированный и настроенный встроенный анализатор JSON, который работает с буфером в памяти, почти наверняка превзойдет некомпилированный, невстроенный построчный синтаксический анализатор. Единственное преимущество, которое, вероятно, будет иметь ваш код, — это уменьшение занимаемой памяти для огромных файлов.

Правильность важнее скорости, и, похоже, вы уже обнаружили случаи, когда ваш парсер просто ломается.

Также стоит отметить, что ваш парсер рекурсивен по вызовам. Его тривиально легко вывести из строя, подав на вход JSON-файл с достаточно глубокой вложенностью, который переполнит стек вызовов; такие файлы действительно встречаются на практике — в том числе в ситуациях, связанных с безопасностью.

В принципе, кроме как в учебных целях, этого делать вообще не следует. В 99% случаев, когда это работает, используйте встроенный синтаксический анализ JSON. В 1% случаев, когда проблемы с памятью действительно требуют итеративного анализа, ваша проблема уже была решена, и не один раз.

Похожие записи:

Добавить комментарий Отменить ответ