GraphCoder/build_graph_database.py at main · miosomos/GraphCoder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import json
from tqdm import tqdm
from networkx.readwrite import json_graph
from utils.utils import CONSTANTS, CodexTokenizer
from utils.slicing import Slicing
from utils.ccg import create_graph
from utils.utils import iterate_repository_file, make_needed_dir, set_default, dump_jsonl, graph_to_json


class GraphDatabaseBuilder:

    def __init__(self, repo_base_dir=CONSTANTS.repo_base_dir,
                 graph_database_save_dir=CONSTANTS.graph_database_save_dir):
        self.repo_base_dir = repo_base_dir
        self.graph_database_save_dir = graph_database_save_dir
        return

    def build_full_graph_database(self, repo_name):
        code_files = iterate_repository_file(self.repo_base_dir, repo_name)
        file_num = 0
        make_needed_dir(os.path.join(self.graph_database_save_dir, repo_name))
        with tqdm(total=len(code_files)) as pbar:
            for file in code_files:
                with open(file, 'r') as f:
                    src_lines = f.readlines()
                ccg = create_graph(src_lines, repo_name)
                if ccg is None:
                    pbar.update(1)
                    continue
                save_path = os.path.join(self.graph_database_save_dir, repo_name, f"{file_num}.json")
                file_num += 1
                make_needed_dir(save_path)
                with open(save_path, 'w') as f:
                    f.write(json.dumps(json_graph.node_link_data(pdg), default=set_default))
                pbar.update(1)
        return

    def build_slicing_graph_database(self, repo_name):

        slicer = Slicing()
        repo_dict = []

        # Get all file
        code_files = iterate_repository_file(self.repo_base_dir, repo_name)
        repo_base_dir_len = len(self.repo_base_dir.split('/'))
        tokenizer = CodexTokenizer()
        with tqdm(total=len(code_files)) as pbar:
            for file in code_files:
                # read file
                pbar.set_description(file)
                with open(file, 'r') as f:
                    src_lines = f.readlines()
                # get graph
                ccg = create_graph(src_lines, repo_name)
                if ccg is None:
                    pbar.update(1)
                    continue

                # slicing for each statement
                for v in ccg.nodes:
                    curr_dict = dict()
                    forward_context, forward_line, forward_graph = slicer.forward_dependency_slicing(v, ccg,
                                                                                                     contain_node=False)
                    curr_dict['key_forward_graph'] = graph_to_json(forward_graph)
                    curr_dict['key_forward_context'] = forward_context
                    curr_dict['key_forward_encoding'] = tokenizer.tokenize(forward_context)
                    curr_dict['statement'] = "".join(ccg.nodes[v]['sourceLines'])
                    statement_line_row = ccg.nodes[v]['startRow']
                    start_line_row = max(0, statement_line_row-11)
                    end_line_row = min(statement_line_row+10, len(src_lines))
                    curr_dict['val'] = "".join(src_lines[start_line_row:end_line_row])
                    curr_dict['fpath_tuple'] = tuple(os.path.relpath(file, self.repo_base_dir).split(os.sep))
                    max_forward_line = 0
                    if len(forward_line) != 0:
                        max_forward_line = max(forward_line)
                    curr_dict['max_line_no'] = max(max_forward_line, end_line_row)
                    repo_dict.append(curr_dict.copy())
                pbar.update(1)

        save_name = os.path.join(self.graph_database_save_dir, f"{repo_name}.jsonl")
        make_needed_dir(os.path.dirname(save_name))
        dump_jsonl(repo_dict, save_name)
        return


if __name__ == '__main__':
    graph_db_builder = GraphDatabaseBuilder()
    for repo in CONSTANTS.repos:
        print(f'Processing repo {repo}')
        graph_db_builder.build_slicing_graph_database(repo)