ask-a-local/optuna_optimization.py at main · ACMCMC/ask-a-local · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
import optuna
import logging
import dotenv
import os
import json
import pandas as pd
import argparse
from typing import List, Dict, Any, Tuple, Mapping
import numpy as np
import torch
from scipy.stats import spearmanr
from pipeline import run_hallucination_detection

# Load environment variables
dotenv.load_dotenv(override=True)

# Set up logging
logging.basicConfig(level=logging.INFO)

# If we're running in a notebook, we need to redirect the logs to stdout
if "get_ipython" in globals():
    logging.getLogger().handlers = [logging.StreamHandler()]
    logging.getLogger().setLevel(logging.INFO)


def score_iou(ref_dict, pred_dict):
    """computes intersection-over-union between reference and predicted hard labels, for a single datapoint.
    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the IoU, or 1.0 if neither the reference nor the prediction contain hallucinations
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict["id"] == pred_dict["id"]
    # convert annotations to sets of indices
    ref_indices = {idx for span in ref_dict["hard_labels"] for idx in range(*span)}
    pred_indices = {idx for span in pred_dict["hard_labels"] for idx in range(*span)}
    # avoid division by zero
    if not pred_indices and not ref_indices:
        return 1.0
    # otherwise compute & return IoU
    return len(ref_indices & pred_indices) / len(ref_indices | pred_indices)


def score_cor(ref_dict, pred_dict):
    """computes Spearman correlation between predicted and reference soft labels, for a single datapoint.
    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the Spearman correlation, or a binarized exact match (0.0 or 1.0) if the reference or prediction contains no variation
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict["id"] == pred_dict["id"]
    # convert annotations to vectors of observations
    ref_vec = [0.0] * ref_dict["text_len"]
    pred_vec = [0.0] * ref_dict["text_len"]
    for span in ref_dict["soft_labels"]:
        for idx in range(span["start"], span["end"]):
            ref_vec[idx] = span["prob"]
    for span in pred_dict["soft_labels"]:
        for idx in range(span["start"], span["end"]):
            pred_vec[idx] = span["prob"]
    # constant series (i.e., no hallucination) => cor is undef
    if (
        len({round(flt, 8) for flt in pred_vec}) == 1
        or len({round(flt, 8) for flt in ref_vec}) == 1
    ):
        return float(
            len({round(flt, 8) for flt in ref_vec})
            == len({round(flt, 8) for flt in pred_vec})
        )
    # otherwise compute Spearman's rho
    return spearmanr(ref_vec, pred_vec).correlation


# Define the objective function for Optuna
def objective(trial: optuna.Trial) -> float:
    HALLUCINATION_THRESHOLD = trial.suggest_float("HALLUCINATION_THRESHOLD", 0.0, 2.0)
    HALLUCINATION_BETA = trial.suggest_float("HALLUCINATION_BETA", 0.0, 1.0)
    WEIGHT_TEMPERATURE = trial.suggest_float("WEIGHT_TEMPERATURE", 0.0, 10.0)
    normalize_hallucination_probabilities_to_1 = trial.suggest_categorical(
        "normalize_hallucination_probabilities_to_1", [True, False]
    )
    # translate_to_english = trial.suggest_categorical(
    #     "translate_to_english", [True, False]
    # )

    translate_to_english = False  # Bypassing this for now

    ALL_POSSIBLE_LANGUAGE_MODELS_1 = {
        "Arabia": ["inceptionai/jais-family-1p3b-chat"],
        # "Asia": [
        # ],
        "Basque Country": ["HiTZ/latxa-7b-v1.2"],
        "Catalonia": ["projecte-aina/FLOR-1.3B"],
        "China": ["IDEA-CCNL/Wenzhong-GPT2-110M"],
        "Czechia": [
            "lchaloupsky/czech-gpt2-oscar",
            "spital/gpt2-small-czech-cs",
        ],
        "Netherlands": ["GroNLP/gpt2-small-dutch"],
        "English": [
            "EleutherAI/pythia-2.8b",
            # "meta-llama/Llama-3.1-8B-Instruct",
        ],
        "Persian": ["ai-forever/mGPT-1.3B-persian"],
        "Finland": ["TurkuNLP/gpt3-finnish-medium"],
        "France": [
            "antoinelouis/belgpt2",
            # "asi/gpt-fr-cased-small",
            "asi/gpt-fr-cased-base",
            "OpenLLM-France/Lucie-7B",
        ],
        "Germany": [
            "DiscoResearch/Llama3-German-8B",
            "benjamin/gerpt2-large",
        ],
        "India": [
            "NebulaByte/hindi_gpt2",
            "KathirKs/gemma-2b-hindi",
        ],
        "Italy": [
            "GroNLP/gpt2-small-italian",
            "swap-uniba/bloom-1b7-it",
        ],
        "Portugal": [
            "Nos-PT/Carvalho_pt-gl-1.3B",
            "NOVA-vision-language/GlorIA-1.3B",
            "egonrp/gpt2-wikiwriter-medium-portuguese",
            "pierreguillou/gpt2-small-portuguese",
        ],
        "Russia": ["ai-forever/rugpt3small_based_on_gpt2"],
        "Spain": [
            "DeepESP/gpt2-spanish",
            "BSC-LT/salamandra-2b",
            "BSC-LT/salamandra-7b",
        ],
        "Sweden": [
            "neph1/bellman-mistral-7b-instruct-v0.3",
            "dandanw/bloom-3b-sv",
        ],
        "Wikipedia": [
            "utter-project/EuroLLM-9B-Instruct",
            "allknowingroger/Qwenslerp4-14B",
            "microsoft/Phi-3.5-mini-instruct",
            "microsoft/Phi-3-medium-4k-instruct",
        ],
    }

    GOLDFISH_LANGUAGE_MODELS = {
        "Arabic": ["goldfish-models/arb_arab_1000mb"],
        "Basque": ["goldfish-models/eus_latn_1000mb"],
        "Bengali": ["goldfish-models/ben_beng_1000mb"],
        "Catalan": ["goldfish-models/cat_latn_1000mb"],
        "Chinese": ["goldfish-models/zho_hans_1000mb"],
        "Czech": ["goldfish-models/ces_latn_1000mb"],
        "Dutch": ["goldfish-models/nld_latn_1000mb"],
        "English": ["goldfish-models/eng_latn_1000mb"],
        "Finnish": ["goldfish-models/fin_latn_1000mb"],
        "French": ["goldfish-models/fra_latn_1000mb"],
        "German": ["goldfish-models/deu_latn_1000mb"],
        "Greek": ["goldfish-models/ell_grek_1000mb"],
        "Hebrew": ["goldfish-models/heb_hebr_1000mb"],
        "Hindi": ["goldfish-models/hin_deva_1000mb"],
        "Hungarian": ["goldfish-models/hun_latn_1000mb"],
        "Italian": ["goldfish-models/ita_latn_1000mb"],
        "Japanese": ["goldfish-models/jpn_jpan_1000mb"],
        "Korean": ["goldfish-models/kor_hang_1000mb"],
        "Math": [
            "deepseek-ai/DeepSeek-V2-Lite",
            "deepseek-ai/DeepSeek-Prover-V1.5-SFT",
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
            "deepseek-ai/deepseek-math-7b-base",
            "deepseek-ai/deepseek-math-7b-instruct",
        ],
        "Norwegian": ["goldfish-models/nor_latn_1000mb"],
        "Persian": ["goldfish-models/fas_arab_1000mb"],
        "Polish": ["goldfish-models/pol_latn_1000mb"],
        "Portuguese": ["goldfish-models/por_latn_1000mb"],
        "Romanian": ["goldfish-models/ron_latn_1000mb"],
        "Russian": ["goldfish-models/rus_cyrl_1000mb"],
        "Spanish": ["goldfish-models/spa_latn_1000mb"],
        "Swedish": ["goldfish-models/swe_latn_1000mb"],
        "Thai": ["goldfish-models/tha_thai_1000mb"],
        "Turkish": ["goldfish-models/tur_latn_1000mb"],
        "Ukrainian": ["goldfish-models/ukr_cyrl_1000mb"],
        "Urdu": ["goldfish-models/urd_arab_1000mb"],
        "Vietnamese": ["goldfish-models/vie_latn_1000mb"],
        "Wikipedia": [
            "facebook/xglm-7.5B",
            "utter-project/EuroLLM-9B-Instruct",
            "allknowingroger/Qwenslerp4-14B",
            "microsoft/Phi-3.5-mini-instruct",
            "microsoft/Phi-3-medium-4k-instruct",
        ],
    }

    ALL_POSSIBLE_LANGUAGE_MODELS = GOLDFISH_LANGUAGE_MODELS

    languages_to_models = {}

    for language, models in ALL_POSSIBLE_LANGUAGE_MODELS.items():
        chosen_model = trial.suggest_categorical(language, [None] + models)
        if chosen_model is not None:
            languages_to_models[language] = chosen_model

    # Sort the languages to models dictionary by key
    languages_to_models = dict(sorted(languages_to_models.items()))

    # Placeholder for the actual hallucination detection logic
    # This should return a score indicating the performance of the model with the given parameters
    # For demonstration purposes, we'll use a dummy score
    score = perform_hallucination_detection(
        HALLUCINATION_THRESHOLD,
        HALLUCINATION_BETA,
        WEIGHT_TEMPERATURE,
        normalize_hallucination_probabilities_to_1,
        translate_to_english,
        languages_to_models,
        study_name=trial.study.study_name,
        trial_id=trial.number,
    )

    return score


validation_file = "mushroom/val/mushroom.en-val-mini.v2.jsonl"


# Dummy function to simulate hallucination detection
def perform_hallucination_detection(
    threshold,
    beta,
    weight_temperature,
    normalize_hallucination_probabilities_to_1,
    translate_to_english,
    languages_to_models: Mapping[str, str],
    study_name: str,
    trial_id: int,
):
    logging.info(f"Processing file: {validation_file}")

    references_df = pd.read_json(validation_file, lines=True)
    references_df["text_len"] = references_df.model_output_text.apply(len)
    references_df = references_df[["id", "soft_labels", "hard_labels", "text_len"]]

    input_df = pd.read_json(validation_file, lines=True)

    # Columns: Index(['id', 'lang', 'model_input', 'model_output_text', 'model_id', 'soft_labels', 'hard_labels', 'model_output_tokens', 'model_output_logits'], dtype='object')
    qa_pairs = [
        {
            "model_input": row["model_input"],
            "model_output_text": row["model_output_text"],
            "id": row["id"],
            "lang": row["lang"],
            "model_id": row["model_id"],
        }
        for _, row in input_df.iterrows()
    ]

    kwargs = {
        "HALLUCINATION_THRESHOLD": threshold,
        "HALLUCINATION_BETA": beta,
        "WEIGHT_TEMPERATURE": weight_temperature,
        "normalize_hallucination_probabilities_to_1": normalize_hallucination_probabilities_to_1,
        "translate_to_english": translate_to_english,
    }

    logging.info(f"Languages to models: {languages_to_models}")
    logging.info(f"Parameters: {kwargs}")

    results = run_hallucination_detection(
        qa_pairs, languages_to_models=languages_to_models, **kwargs
    )

    # Clear the cache
    torch.cuda.empty_cache()

    results_df = pd.DataFrame(results)
    results_df = results_df[["id", "soft_labels", "hard_labels"]]

    results_dict = results_df.sort_values("id").to_dict(orient="records")
    references_dict = references_df.sort_values("id").to_dict(orient="records")

    ious = np.array([score_iou(r, d) for r, d in zip(references_dict, results_dict)])
    cors = np.array([score_cor(r, d) for r, d in zip(references_dict, results_dict)])

    logging.info(f"IoU: {ious}")
    logging.info(f"Cor: {cors}")
    # The combined score is the mean of the IoU and Correlation scores
    combined_scores = (ious + cors) / 2

    # Store as a CSV with example id, question, answer, and cors
    scores_df = results_df.copy()
    scores_df["combined_scores"] = combined_scores
    scores_df["cors"] = cors
    scores_df["ious"] = ious
    scores_df["model_input"] = input_df["model_input"]
    scores_df["model_output_text"] = input_df["model_output_text"]
    scores_df["model_id"] = input_df["model_id"]
    import os
    from pathlib import Path

    scores_results_dir = Path(f"optuna") / f"{study_name}"
    if not os.path.exists(scores_results_dir):
        os.makedirs(scores_results_dir)
    scores_df.to_csv(scores_results_dir / f"{trial_id}.csv", index=False)

    # Return the mean of the combined scores
    mean_combined_score = combined_scores.mean()

    return mean_combined_score


# Create a study and optimize the objective function
study = optuna.create_study(
    direction="maximize",
    study_name="hallucinAItor_17",
    storage="sqlite:///optuna.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=1500, catch=(Exception,))

# logging.info the best parameters
logging.info("Best parameters: ", study.best_params)

# Store all the results in a CSV file
try:
    dataframe = study.trials_dataframe()
    dataframe.to_csv("optuna_results.csv")
except:
    pass

# Save the best parameters in a .json file
try:
    with open("optuna_best_params.json", "w") as f:
        json.dump(study.best_params, f)
except:
    pass