camel_parser/handle_multiple_conll_files.py at main · CAMeL-Lab/camel_parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Script to handle multiple conll files.

Usage:
    text_to_conll_cli (-i <input> | --input=<input>)
        (-o <output> | --output=<output>)
        [-m <model> | --model=<model>]
    text_to_conll_cli (-h | --help)

Options:
    -i <input> --input=<input>
        A directory of conll files
    -o <output> --output=<output>
        The directory to save the parsed CoNLL-X files
    -m <model> --model=<model>
        The name of the BERT model used to parse (to be placed in the model directory) [default: catib]
    -h --help
        Show this screen.
"""

import os
from pathlib import Path
import re
from typing import List
from src.classes import ConllParams
from src.conll_output import save_to_file, text_tuples_to_string
from src.data_preparation import parse_text
from src.utils.model_downloader import get_model_name
from docopt import docopt
from pandas import read_csv
from transformers.utils import logging

# Only surface error-level messages from the transformers library.
logging.set_verbosity_error()

# Parse the command line against the module docstring (the docopt usage text).
# NOTE: this runs at import time, so importing this module also triggers
# command-line parsing; main() reads its options from this mapping.
arguments = docopt(__doc__)

def get_list_of_comments(conll_lines: str) -> List[str]:
    """Extract the sentence text of every ``# text = ...`` comment.

    Scans CoNLL-formatted content for comment lines of the form
    ``# text = <sentence>`` and returns the ``<sentence>`` parts in the
    order they appear. Other comment lines (including keys that merely
    start with ``text``, such as ``# text_en``) are ignored.

    Args:
        conll_lines: the content of a CoNLL file as a single string, with
            lines separated by newlines.

    Returns:
        List[str]: one string per ``# text`` comment; an empty list when the
        content has no such comment.
    """
    # Capture only what follows the '=' instead of slicing at a fixed offset,
    # so the result does not depend on the exact spacing around '='.
    # [ \t]* is used rather than \s* so that the match can never run across
    # a newline into the next line when the sentence text is empty.
    matcher = re.compile(r'^# text[ \t]*=[ \t]*(.*)$', re.MULTILINE)
    return matcher.findall(conll_lines)


def main():
    """Parse every CoNLL file under the input directory and save the results.

    Reads ``--input``, ``--output`` and ``--model`` from the module-level
    ``arguments`` mapping, walks the input directory recursively, parses each
    file found, and writes the result to ``<file stem>.conllx`` inside the
    output directory. The ``# text`` comments of each input file are passed
    along as the ``sentences`` of the parsed trees.
    """
    root_dir = Path(__file__).parent
    model_path = root_dir / "models"

    #
    ### cli user input ###
    #
    input_path = arguments['--input']
    output_dir = Path(arguments['--output'])
    parse_model = arguments['--model']

    #
    ### Set up parsing model
    # (download defaults models, and get correct model name from the models directory)
    #
    model_name = get_model_name(parse_model, model_path=model_path)

    # Make sure there is somewhere to write to before doing any parsing work.
    output_dir.mkdir(parents=True, exist_ok=True)

    #
    ### main code ###
    #
    for root, _, files in os.walk(input_path):
        for text_file in files:
            print(f'processing {text_file}')

            # os.walk descends into subdirectories, so the file lives under
            # `root`, which is not necessarily the top-level input directory.
            file_path = Path(root) / text_file

            file_type_params = ConllParams(str(file_path), model_path / model_name)
            parsed_text_tuples = parse_text("conll", file_type_params)

            # Explicit encoding so reading does not depend on the platform
            # locale; blank lines are dropped before extracting the comments.
            with open(file_path, 'r', encoding='utf-8') as f:
                conll_content = ''.join(line for line in f if line.strip())

            sentences = get_list_of_comments(conll_content)

            # Path.stem drops only the last extension and keeps the whole
            # name when the file has no extension at all.
            conll_name = f"{Path(text_file).stem}.conllx"
            save_to_file(
                text_tuples_to_string(parsed_text_tuples, file_type='conll', sentences=sentences),
                output_dir / conll_name
            )

# Run the batch parse only when this file is executed as a script.
if __name__ == '__main__':
    main()