# parser/hyperparam_search.py at main · dayne-2stacks/parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import itertools
import json
import time
from typing import Dict, Any, List
import torch
import yaml
import nltk
import logging
from collections import defaultdict
from nltk import Nonterminal, induce_pcfg, ProbabilisticProduction, Production
from nltk.tree import Tree
from nltk.parse import ViterbiParser
from viterbi import TokenLevelViterbiParser
from provider import TokenLevelProbabilityProvider
from gpu_logging_utils import (
    log_gpu_memory_nvidia_smi,
    log_cuda_memory_pytorch,
    flush_cuda_cache,
)
from nltk.tokenize import TreebankWordTokenizer
from local_llm import LocalLLM
import matplotlib.pyplot as plt
import os

# The log directory must exist before the file handler below opens its file.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(level=logging.INFO)

# File handler that timestamps every record written by this script.
handler = logging.FileHandler("logs/hyperparam_search.log")
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))

# Script-wide logger; records go to the file handler in addition to the
# root configuration installed by basicConfig above.
logger = logging.getLogger("hyperparam_search")
logger.addHandler(handler)


def get_constituents(tree: Tree) -> set:
    """Collect the labelled spans of every multi-leaf constituent in ``tree``.

    Each constituent is reported as ``(label, start, end)``, where ``start``
    and ``end`` are offsets into ``tree.leaves()`` and ``end`` is exclusive.
    Subtrees that cover fewer than two leaves are ignored.

    The previous implementation indexed leaves by the individual integers of
    their tree positions and then looked them up with whole (subtree-relative)
    position tuples, which raised ``KeyError`` for any multi-leaf tree. Spans
    are now computed directly while walking the tree.

    Args:
        tree: The parse tree to inspect.

    Returns:
        A set of ``(label, start, end)`` tuples.
    """
    constituents = set()

    def _collect(node, start: int) -> int:
        """Record the spans at and below ``node``; return the offset just past it."""
        if not isinstance(node, Tree):
            # Anything that is not a Tree is a leaf and occupies one position.
            return start + 1
        end = start
        for child in node:
            end = _collect(child, end)
        if end - start > 1:
            constituents.add((node.label(), start, end))
        return end

    _collect(tree, 0)
    return constituents


def load_config(path: str) -> Dict[str, List[Any]]:
    """Read the search configuration stored at ``path``.

    Files whose name ends in ``.json`` are parsed as JSON; every other file is
    parsed as YAML with ``yaml.safe_load``.
    """
    with open(path, "r") as handle:
        # Pick the parser from the file extension; only the chosen one is used.
        parse = json.load if path.endswith(".json") else yaml.safe_load
        return parse(handle)


def get_treebank_splits(dev_ratio: float = 0.1, test_ratio: float = 0.1) -> (List[Tree], List[Tree], List[Tree]):
    """Get train, dev and test splits of the NLTK Treebank corpus.

    The corpus is split in order: the first ``1 - dev_ratio - test_ratio``
    share is the training set, the last ``test_ratio`` share is the test set
    and whatever lies in between is the dev set.

    Args:
        dev_ratio: Fraction of the corpus reserved for the dev set.
        test_ratio: Fraction of the corpus reserved for the test set.

    Returns:
        A ``(train_trees, dev_trees, test_trees)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if dev_ratio < 0 or test_ratio < 0 or dev_ratio + test_ratio > 1:
        raise ValueError(
            f"dev_ratio and test_ratio must be non-negative and sum to at most 1, "
            f"got dev_ratio={dev_ratio}, test_ratio={test_ratio}"
        )

    trees = list(nltk.corpus.treebank.parsed_sents())
    total = len(trees)
    split_dev = int(total * (1 - dev_ratio - test_ratio))
    # Use an explicit start index for the test slice. Slicing with a negative
    # count breaks when the count rounds down to 0: ``trees[a:-0]`` is empty
    # and ``trees[-0:]`` is the whole corpus.
    split_test = total - int(total * test_ratio)
    return trees[:split_dev], trees[split_dev:split_test], trees[split_test:]


def build_pcfg(train_trees: List[Tree]):
    """Induce a PCFG from ``train_trees`` using sanitized non-terminal names.

    Symbol rewrites are read from ``grammar/changes.json``. Each training tree
    is converted to Chomsky normal form in place (``horzMarkov=2``) before its
    productions are collected. When the trees have more than one distinct root
    label, a ``ROOT`` start symbol is introduced with one ``ROOT -> label``
    production per tree; otherwise the single sanitized root label is the
    start symbol.
    """
    changes_path = os.path.join("grammar", "changes.json")
    with open(changes_path, "r") as fh:
        special_nt_map = json.load(fh)

    def _format_nonterminal(nt: Nonterminal) -> Nonterminal:
        """Return a non-terminal whose symbol has been sanitized."""
        name = nt.symbol()

        # Drop a wrapping pair of dashes (a lone "-" is left untouched).
        if len(name) > 1 and name.startswith("-") and name.endswith("-"):
            name = name[1:-1]

        # Whole-symbol replacement, when the mapping has an entry for it.
        name = special_nt_map.get(name, name)

        # Prefix symbols that do not start with an alphanumeric or underscore.
        head = name[0]
        if not head.isalnum() and head != "_":
            name = "X" + name

        # Single-character keys of the mapping are replaced wherever they occur.
        for needle, substitute in special_nt_map.items():
            if len(needle) == 1:
                name = name.replace(needle, substitute)

        return Nonterminal(name)

    productions = []
    root_counts = defaultdict(int)

    for tree in train_trees:
        # Count the root label first, then binarize the tree in place.
        root_counts[tree.label()] += 1
        tree.chomsky_normal_form(horzMarkov=2)
        for rule in tree.productions():
            clean_rhs = [
                _format_nonterminal(item) if isinstance(item, Nonterminal) else item
                for item in rule.rhs()
            ]
            productions.append(Production(_format_nonterminal(rule.lhs()), clean_rhs))

    if len(root_counts) <= 1:
        start = _format_nonterminal(Nonterminal(next(iter(root_counts))))
    else:
        start = Nonterminal("ROOT")
        for label, count in root_counts.items():
            target = _format_nonterminal(Nonterminal(label))
            # One production per observed tree keeps the induced probabilities
            # proportional to how often each root label occurred.
            productions.extend(Production(start, [target]) for _ in range(count))

    return induce_pcfg(start, productions)


def evaluate(parser: ViterbiParser, dev_trees: List[Tree], *, theta: float, vocab: set):
    """Parse every tree in ``dev_trees`` and score the parser against the gold trees.

    Each gold tree's leaves are joined with spaces, re-tokenized with
    ``TreebankWordTokenizer`` and handed to ``parser``. The first parse that
    the parser yields is compared to the gold tree for exact match and for
    labelled-constituent precision/recall/F1.

    When ``theta == 1.0``, sentences containing a token that is not in
    ``vocab`` are skipped without being parsed. Skipped and failed sentences
    still count in the denominators of accuracy and average time.

    Args:
        parser: Parser exposing ``parse(tokens)`` that yields trees.
        dev_trees: Gold trees to evaluate on.
        theta: Interpolation setting of the current run; only used here to
            decide whether out-of-vocabulary sentences are skipped.
        vocab: Terminal symbols known to the grammar.

    Returns:
        ``(accuracy, avg_time, precision, recall, f1)``. All values are 0 when
        ``dev_trees`` is empty.
    """
    log_gpu_memory_nvidia_smi("evaluate_start")
    log_cuda_memory_pytorch("evaluate_start")
    correct = 0
    total_time = 0.0
    total_gold_constituents = 0
    total_pred_constituents = 0
    total_correct_constituents = 0

    # The tokenizer is reusable, so build it once instead of per sentence.
    tokenizer = TreebankWordTokenizer()

    for gold in dev_trees:
        sentence = " ".join(gold.leaves())
        tokens = tokenizer.tokenize(sentence)
        logger.info("Evaluating sentence: %s", sentence)

        if theta == 1.0 and any(w not in vocab for w in tokens):
            continue

        t0 = time.perf_counter()
        try:
            parses = list(parser.parse(tokens))
            logger.debug("Parsed tree: %s", parses)
        except Exception:
            # Best effort: a failed sentence is logged and scored as "no parse".
            logger.exception("An error occurred")
            parses = []
        total_time += time.perf_counter() - t0
        log_cuda_memory_pytorch("after_parse")

        # ``parse`` yields trees; score the first one. The previous code passed
        # the whole list to ``get_constituents`` and compared it to the gold
        # tree's children, so neither metric could be computed.
        best = parses[0] if parses else None

        if best is not None:
            # Exact match. Both sides are converted to plain ``Tree`` because
            # tree equality also compares the tree class.
            if Tree.convert(best) == Tree.convert(gold):
                correct += 1

            # F1 calculation
            gold_constituents = get_constituents(gold)
            pred_constituents = get_constituents(best)
            correct_constituents = gold_constituents.intersection(pred_constituents)

            total_gold_constituents += len(gold_constituents)
            total_pred_constituents += len(pred_constituents)
            total_correct_constituents += len(correct_constituents)
        log_gpu_memory_nvidia_smi("sentence_end")
        log_cuda_memory_pytorch("sentence_end")
        flush_cuda_cache()

    # Calculate metrics (guarding against an empty dev set)
    n_sentences = len(dev_trees)
    acc = correct / n_sentences if n_sentences else 0.0
    avg_time = total_time / n_sentences if n_sentences else 0.0

    precision = total_correct_constituents / total_pred_constituents if total_pred_constituents > 0 else 0.0
    recall = total_correct_constituents / total_gold_constituents if total_gold_constituents > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    log_gpu_memory_nvidia_smi("evaluate_end")
    log_cuda_memory_pytorch("evaluate_end")
    flush_cuda_cache()
    return acc, avg_time, precision, recall, f1


def main(cfg_path: str):
    """Run the theta/model grid described by the config file at ``cfg_path``.

    The config must provide a ``data_score`` list (theta values) and a
    ``model`` list (model names). For every ``(theta, model)`` pair the dev
    split is evaluated, the metrics are logged, and the F1-versus-theta plot
    in ``results/f1_scores.png`` is rewritten with the results so far.

    ``theta == 1.0`` uses a plain ``ViterbiParser``; any other value uses a
    ``TokenLevelViterbiParser`` backed by a ``LocalLLM`` for the model.

    Args:
        cfg_path: Path to a JSON or YAML configuration file.
    """
    log_gpu_memory_nvidia_smi("main_start")
    log_cuda_memory_pytorch("main_start")
    config = load_config(cfg_path)
    train_trees, dev_trees, _test_trees = get_treebank_splits()
    grammar = build_pcfg(train_trees)

    # Ensure results directory exists
    os.makedirs("results", exist_ok=True)

    lexical_vocab = {
        prod.rhs()[0] for prod in grammar.productions() if len(prod.rhs()) == 1 and isinstance(prod.rhs()[0], str)
    }

    results = []
    current_model = None
    llm = None
    provider = None
    parser = None

    nts = {str(p.lhs()) for p in grammar.productions()}

    # NOTE(review): the model is the inner loop variable, so with several
    # models the LLM is reloaded on almost every iteration. Iterating models
    # in the outer position would avoid that; confirm ordering is not relied on.
    for theta, model_name in itertools.product(config["data_score"], config["model"]):
        logger.info("Running evaluation with theta=%s, model=%s", theta, model_name)
        log_gpu_memory_nvidia_smi(f"loop_theta_{theta}_model_{model_name}")
        log_cuda_memory_pytorch(f"loop_theta_{theta}_model_{model_name}")

        if theta == 1.0:
            parser = ViterbiParser(grammar)
        else:
            # Check if model has changed
            if model_name != current_model:
                if llm is not None:
                    # The provider was built from the old model and the
                    # previous parser from that provider, so drop all three
                    # references before asking CUDA to release memory.
                    parser = None
                    provider = None
                    llm = None
                    torch.cuda.empty_cache()
                    flush_cuda_cache()
                # Create new LLM instance and provider
                llm = LocalLLM(model_name=model_name)
                provider = TokenLevelProbabilityProvider(llm, nts)
                current_model = model_name

            # The provider is reused while the model stays the same.
            parser = TokenLevelViterbiParser(grammar, provider, theta=theta)

        acc, avg_time, precision, recall, f1 = evaluate(parser, dev_trees, theta=theta, vocab=lexical_vocab)
        results.append(
            {
                "data_score": theta,
                "model": model_name,
                "accuracy": acc,
                "avg_inference_time": avg_time,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            }
        )
        logger.info(
            "Results for theta=%s, model=%s: accuracy=%.4f, avg_time=%.4fs, precision=%.4f, recall=%.4f, f1=%.4f",
            theta,
            model_name,
            acc,
            avg_time,
            precision,
            recall,
            f1,
        )
        log_cuda_memory_pytorch("after_eval")
        flush_cuda_cache()

        # Refresh the F1 plot with everything measured so far. The figure is
        # closed afterwards so that figures do not pile up across iterations.
        fig = plt.figure()
        try:
            # dict.fromkeys keeps first-seen order, giving a stable legend.
            for model in dict.fromkeys(r["model"] for r in results):
                vals = sorted([r for r in results if r["model"] == model], key=lambda x: x["data_score"])
                thetas = [r["data_score"] for r in vals]
                f1_scores = [r["f1"] for r in vals]
                plt.plot(thetas, f1_scores, marker="o", label=model)

            plt.xlabel("theta")
            plt.ylabel("F1 Score")
            plt.legend()
            plt.savefig("results/f1_scores.png")
        finally:
            plt.close(fig)
    log_gpu_memory_nvidia_smi("main_end")
    log_cuda_memory_pytorch("main_end")


if __name__ == "__main__":
    import argparse

    # Fetch the NLTK "treebank" corpus before anything tries to read it.
    nltk.download("treebank")

    cli = argparse.ArgumentParser(description="Hyperparameter search for parsing")
    cli.add_argument("--config", default="configs/sample.yaml", help="Path to YAML config file")
    main(cli.parse_args().config)