diff --git a/engine/fof_compare.py b/engine/fof_compare.py index ae47e99c..00064d29 100644 --- a/engine/fof_compare.py +++ b/engine/fof_compare.py @@ -6,84 +6,87 @@ Veri data are not considered, only reports and observations are compared. """ +import tempfile + import click +import pandas as pd import xarray as xr +from util.click_util import CommaSeparatedStrings, cli_help +from util.dataframe_ops import check_file_with_tolerances from util.fof_utils import ( - compare_var_and_attr_ds, - primary_check, - split_feedback_dataset, + get_log_file_name, ) +from util.log_handler import initialize_detailed_logger, logger +from util.utils import FileInfo @click.command() -@click.argument("file1", type=click.Path(exists=True)) -@click.argument("file2", type=click.Path(exists=True)) @click.option( - "--print-lines", - is_flag=True, - help="Prints the lines where there are differences. " - "If --lines is not specified, then the first 10 " - "differences per variables are shown.", + "--file1", + required=True, + help="Path to the file 1; it must contain the {fof_type} placeholder.", ) @click.option( - "--lines", - "-n", - default=10, - help="Option to specify how many lines to print " "with the --print-lines option", + "--file2", + required=True, + help="Path to the file 2; it must contain the {fof_type} placeholder.", ) @click.option( - "--output", - "-o", - is_flag=True, - help="Option to save differences in a CSV file. " - "If the location is not specified, the file " - "is saved in the same location as this code. ", + "--fof-types", + type=CommaSeparatedStrings(), + required=True, + help=cli_help["fof_types"], ) @click.option( - "--location", - "-l", - default=None, - help="If specified, location where to save the CSV file with the differences.", + "--tolerance", + default=1e-12, ) -def fof_compare( - file1, file2, print_lines, lines, output, location -): # pylint: disable=too-many-positional-arguments - - if not primary_check(file1, file2): - print("Different types of files") - return - - ds1 = xr.open_dataset(file1) - ds2 = xr.open_dataset(file2) - - ds_reports1_sorted, ds_obs1_sorted = split_feedback_dataset(ds1) - ds_reports2_sorted, ds_obs2_sorted = split_feedback_dataset(ds2) - - total_elements_all, equal_elements_all = 0, 0 - - if print_lines: - nl = lines - else: - nl = 0 - - for ds1, ds2 in [ - (ds_reports1_sorted, ds_reports2_sorted), - (ds_obs1_sorted, ds_obs2_sorted), - ]: - t, e = compare_var_and_attr_ds(ds1, ds2, nl, output, location) - total_elements_all += t - equal_elements_all += e - - if total_elements_all > 0: - percent_equal_all = (equal_elements_all / total_elements_all) * 100 - percent_diff_all = 100 - percent_equal_all - print(f"Total percentage of equality: {percent_equal_all:.2f}%") - print(f"Total percentage of difference: {percent_diff_all:.2f}%") - if equal_elements_all == total_elements_all: - print("Files are consistent!") - else: - print("Files are NOT consistent!") +@click.option("--rules", default="") +def fof_compare(file1, file2, fof_types, tolerance, rules): + + for fof_type in fof_types: + file1_path = file1.format(fof_type=fof_type) + file2_path = file2.format(fof_type=fof_type) + + n_rows_file1 = xr.open_dataset(file1_path).sizes["d_body"] + n_rows_file2 = xr.open_dataset(file2_path).sizes["d_body"] + + if n_rows_file1 != n_rows_file2: + raise ValueError("Files have different numbers of lines!") + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", delete=True, dir="/dev/shm" + ) as tmp: + df = pd.DataFrame({"tolerance": [tolerance] * n_rows_file1}) + df.to_csv(tmp.name) + + out, err, tol = check_file_with_tolerances( + tmp.name, + FileInfo(file1_path), + FileInfo(file2_path), + factor=1, + rules=rules, + ) + + if out: + logger.info("Files are consistent!") + + else: + logger.info("Files are NOT consistent!") + + log_file_name = get_log_file_name(file1_path) + logger.info("Complete output available in %s", log_file_name) + if not err.empty: + detailed_logger = initialize_detailed_logger( + "DETAILS", log_level="DEBUG", log_file=log_file_name + ) + + detailed_logger.info( + "Differences, veri_data outside of tolerance range" + ) + detailed_logger.info(err) + detailed_logger.info(tol) if __name__ == "__main__": diff --git a/tests/engine/test_fof_compare.py b/tests/engine/test_fof_compare.py new file mode 100644 index 00000000..e51fc25d --- /dev/null +++ b/tests/engine/test_fof_compare.py @@ -0,0 +1,135 @@ +""" +This module contains test cases to validate the functionality +of fof-compare CLI commands. +""" + +import logging +import os +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from engine.fof_compare import fof_compare + + +@pytest.fixture(name="fof_datasets", scope="function") +def fixture_fof_datasets(fof_datasets_base, tmp_dir): + """ + FOF datasets written to disk, returns file paths. + """ + ds1, ds2, _, _ = fof_datasets_base + ds3 = ds2.copy(deep=True) + ds3["flags"] = (("d_body",), ds3["flags"].values * 1.55) + + ds1_file = os.path.join(tmp_dir, "fof1_SYNOP.nc") + ds2_file = os.path.join(tmp_dir, "fof2_SYNOP.nc") + ds3_file = os.path.join(tmp_dir, "fof3_SYNOP.nc") + + ds1.to_netcdf(ds1_file) + ds2.to_netcdf(ds2_file) + ds3.to_netcdf(ds3_file) + + yield ds1_file, ds2_file, ds3_file + + +def test_fof_compare_works(fof_datasets, tmp_dir, monkeypatch): + """ + Test that fof-compare works and produces a log file. + """ + + df1, df2, _ = fof_datasets + + df1 = df1.replace("SYNOP", "{fof_type}") + df2 = df2.replace("SYNOP", "{fof_type}") + monkeypatch.chdir(tmp_dir) + rules = "" + runner = CliRunner() + + result = runner.invoke( + fof_compare, + [ + "--file1", + df1, + "--file2", + df2, + "--fof-types", + "SYNOP", + "--tolerance", + "1e-12", + "--rules", + rules, + ], + ) + + assert result.exit_code == 0 + + log_file = Path(tmp_dir + "/error_fof1_SYNOP.log") + + assert (log_file).exists() + + +def test_fof_compare_not_consistent(fof_datasets, tmp_dir, monkeypatch, caplog): + """ + Test that if there are differences in the files, then fof-compare writes + in the log file that the files are not consistent. + """ + + df1, _, df3 = fof_datasets + df1 = df1.replace("SYNOP", "{fof_type}") + df3 = df3.replace("SYNOP", "{fof_type}") + monkeypatch.chdir(tmp_dir) + + rules = "" + runner = CliRunner() + with caplog.at_level(logging.INFO): + runner.invoke( + fof_compare, + [ + "--file1", + df1, + "--file2", + df3, + "--fof-types", + "SYNOP", + "--tolerance", + "5", + "--rules", + rules, + ], + ) + + assert "Files are NOT consistent!" in caplog.text + + +def test_fof_compare_consistent(fof_datasets, tmp_dir, monkeypatch, caplog): + """ + Test that if there are no differences in the files and the tolerance is big + enough, then fof-compare writes in the log file that the files are consistent. + """ + + df1, df2, _ = fof_datasets + df1 = df1.replace("SYNOP", "{fof_type}") + df2 = df2.replace("SYNOP", "{fof_type}") + monkeypatch.chdir(tmp_dir) + + rules = "" + runner = CliRunner() + with caplog.at_level(logging.INFO): + runner.invoke( + fof_compare, + [ + "--file1", + df1, + "--file2", + df2, + "--fof-types", + "SYNOP", + "--tolerance", + "5", + "--rules", + rules, + ], + ) + + assert "Files are consistent!" in caplog.text diff --git a/tests/util/test_dataframe_ops.py b/tests/util/test_dataframe_ops.py index 2a8e2178..517020e1 100644 --- a/tests/util/test_dataframe_ops.py +++ b/tests/util/test_dataframe_ops.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Optional -from unittest.mock import patch +from unittest.mock import mock_open, patch import numpy as np import pandas as pd @@ -843,8 +843,12 @@ def test_multiple_solutions_from_dict_no_rules(dataframes_dict): dict_cur = {key: df.copy() for key, df in dict_ref.items()} rules = "" - errors = check_multiple_solutions_from_dict(dict_ref, dict_cur, rules) - assert errors == [] + with patch("builtins.open", mock_open()): + errors = check_multiple_solutions_from_dict( + dict_ref, dict_cur, rules, log_file_name="file_name.log" + ) + + assert errors is False def test_multiple_solutions_from_dict_with_rules(dataframes_dict): @@ -855,8 +859,11 @@ def test_multiple_solutions_from_dict_with_rules(dataframes_dict): rules = {"check": [9, 1], "state": [13, 14]} - errors = check_multiple_solutions_from_dict(dict_ref, dict_cur, rules) - assert errors == [] + with patch("builtins.open", mock_open()): + errors = check_multiple_solutions_from_dict( + dict_ref, dict_cur, rules, log_file_name="file_name.log" + ) + assert errors is False def test_multiple_solutions_from_dict_with_rules_wrong(dataframes_dict): @@ -867,15 +874,9 @@ def test_multiple_solutions_from_dict_with_rules_wrong(dataframes_dict): rules = {"check": [9, 1], "state": [13, 14]} - errors = check_multiple_solutions_from_dict(dict_ref, dict_cur, rules) + with patch("builtins.open", mock_open()): + errors = check_multiple_solutions_from_dict( + dict_ref, dict_cur, rules, log_file_name="file_name.log" + ) - expected = [ - { - "row": 1, - "column": "check", - "file1": np.int64(9), - "file2": np.int64(6), - "error": "values different and not admitted", - } - ] - assert errors == expected + assert errors is True diff --git a/tests/util/test_fof_utils.py b/tests/util/test_fof_utils.py index bc57bec4..3ecd8a7b 100644 --- a/tests/util/test_fof_utils.py +++ b/tests/util/test_fof_utils.py @@ -2,21 +2,21 @@ This module contains unit tests for the `util/fof_utils.py` module. """ +from unittest.mock import mock_open, patch + import numpy as np import pytest -from util.fof_utils import ( +from util.fof_utils import ( # write_lines, clean_value, compare_arrays, compare_var_and_attr_ds, fill_nans_for_float32, get_observation_variables, get_report_variables, - primary_check, - print_entire_line, split_feedback_dataset, - write_lines, ) +from util.log_handler import initialize_detailed_logger @pytest.fixture(name="ds1", scope="function") @@ -237,95 +237,22 @@ def fixture_sample_dataset_2(sample_dataset_fof): return data -def test_print_entire_line(ds1, ds2, capsys): - """ - Test that in case of differences, these are printed correctly. - """ - diff = np.array([5]) - print_entire_line(ds1, ds2, diff) - captured = capsys.readouterr() - output = captured.out.splitlines() - - assert output[0] == ( - "\x1b[1mid\x1b[0m : d_hdr |d_body |lat |lon " - "|varno |statid |time_nomi |codetype |level " - "|l_body |i_body |veri_data |obs |bcor " - "|level_typ |level_sig |state |flags |check " - "|e_o |qual |plevel " - ) - assert output[1] == ( - "\x1b[1mref\x1b[0m : 0 |5 |1 |5 " - "|4 |a |0 |5 |750 " - "|1 |1 |78 |0.155 |0.969 " - "|0.524 |0.366 |1 |9 |13 " - "|0.52 |0.138 |0.755 " - ) - assert output[2] == ( - "\x1b[1mcur\x1b[0m : 0 |5 |1 |5 " - "|4 |a |0 |5 |750 " - "|1 |1 |78 |0.155 |0.969 " - "|0.524 |0.366 |1 |9 |13 " - "|0.52 |0.138 |0.755 " - ) - assert output[3] == ( - "\x1b[1mdiff\x1b[0m: 0 |0 |0 |0 " - "|0 |nan |0 |0 |0 " - "|0 |0 |0 |0.0 |0.0 " - "|0.0 |0.0 |0 |0 |0 " - "|0.0 |0.0 |0.0 " - ) - - -def test_write_lines(ds1, ds2, tmp_path): - """ - Test that if there are any differences, they are saved in a separate csv file. - """ - file_path = tmp_path / "differences.csv" - diff = np.array([5]) - write_lines(ds1, ds2, diff, file_path) - - content = file_path.read_text(encoding="utf-8") - - expected = ( - "id : d_hdr |d_body |lat |lon |varno " - "|statid |time_nomi |codetype |level |l_body " - "|i_body |veri_data |obs |bcor |level_typ " - "|level_sig |state |flags |check |e_o " - "|qual |plevel \n" - "ref : 0 |5 |1 |5 |4 " - "|a |0 |5 |750 |1 " - "|1 |78 |0.155 |0.969 |0.524 " - "|0.366 |1 |9 |13 |0.52 " - "|0.138 |0.755 \n" - "cur : 0 |5 |1 |5 |4 " - "|a |0 |5 |750 |1 " - "|1 |78 |0.155 |0.969 |0.524 " - "|0.366 |1 |9 |13 |0.52 " - "|0.138 |0.755 \n" - "diff : 0 |0 |0 |0 |0 " - "|nan |0 |0 |0 |0 " - "|0 |0 |0.0 |0.0 |0.0 " - "|0.0 |0 |0 |0 |0.0 " - "|0.0 |0.0 \n" - ) - assert content == expected - - -def test_compare_var_and_attr_ds(ds1, ds2, tmp_path): +def test_compare_var_and_attr_ds(ds1, ds2): """ Test that, given two datasets, returns the number of elements in which the variables are the same and in which they differ. """ + with patch("builtins.open", mock_open()): - file_path = tmp_path / "differences.csv" + detailed_logger = initialize_detailed_logger( + "DETAILS", log_level="DEBUG", log_file="test_log.log" + ) - total1, equal1 = compare_var_and_attr_ds( - ds1, ds2, nl=0, output=True, location=file_path - ) - total2, equal2 = compare_var_and_attr_ds(ds1, ds2, nl=4, output=True, location=None) + total1, equal1 = compare_var_and_attr_ds(ds1, ds2, detailed_logger) + total2, equal2 = compare_var_and_attr_ds(ds1, ds2, detailed_logger) - assert (total1, equal1) == (104, 103) - assert (total2, equal2) == (104, 103) + assert (total1, equal1) == (103, 102) + assert (total2, equal2) == (103, 102) @pytest.fixture(name="ds3") @@ -337,17 +264,3 @@ def fixture_sample_dataset_3(sample_dataset_fof): ds.attrs["plevel"] = np.array([0.374, 0.950, 0.731, 0.598, 0.156]) return ds - - -def test_primary_check(tmp_path): - """ - Note that if two fof files are not of the same type, then the primary_check fails. - """ - test_fof1 = tmp_path / "fofAIREP.nc" - test_fof2 = tmp_path / "fofAIREP.nc" - test_fof3 = tmp_path / "fofPILOT.nc" - - assert primary_check(test_fof1, test_fof2) - - false_result = primary_check(test_fof1, test_fof3) - assert false_result is False diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py index 69a33881..7720ab85 100644 --- a/util/dataframe_ops.py +++ b/util/dataframe_ops.py @@ -15,8 +15,13 @@ from util.constants import CHECK_THRESHOLD, compute_statistics from util.file_system import file_names_from_pattern -from util.fof_utils import compare_var_and_attr_ds, split_feedback_dataset -from util.log_handler import logger +from util.fof_utils import ( + clean_logger_file_if_only_details, + compare_var_and_attr_ds, + get_log_file_name, + split_feedback_dataset, +) +from util.log_handler import initialize_detailed_logger, logger from util.model_output_parser import model_output_parser from util.utils import FileInfo, FileType @@ -336,7 +341,7 @@ def check_file_with_tolerances( if input_file_ref.file_type != input_file_cur.file_type: logger.critical( "The current and the reference files are not of the same type; " - "it is impossible to calculate the tolerances. Abort." + "it is impossible to compare them. Abort." ) sys.exit(1) @@ -345,12 +350,15 @@ def check_file_with_tolerances( ) if input_file_ref.file_type == FileType.FOF: - errors = check_multiple_solutions_from_dict(df_ref, df_cur, rules) + log_file_name = get_log_file_name(input_file_ref.path) + errors = check_multiple_solutions_from_dict( + df_ref, df_cur, rules, log_file_name + ) if errors: logger.error("RESULT: check FAILED") - sys.exit(1) - + err = pd.DataFrame() + return False, err, 0 else: # check if variables are available in reference file skip_test, df_ref, df_cur = check_intersection(df_ref, df_cur) @@ -402,97 +410,83 @@ def has_enough_data(dfs): def parse_rules(rules): - if isinstance(rules, str): - rules = rules.strip() - return ast.literal_eval(rules) if rules else {} if isinstance(rules, dict): return rules + + if isinstance(rules, str) and rules.strip(): + return ast.literal_eval(rules) + return {} -def compare_cells(ref_df, cur_df, cols_present, rules_dict): +def compare_cells_rules(ref_df, cur_df, cols, rules_dict, detailed_logger): """ This function compares two DataFrames cell by cell for a selected set of columns. For each row and column, it ignores values that are equal or whose differences are allowed by predefined rules. - All other differences are collected and returned as a list of error descriptions. + All other differences not admitted are stored in a log file. """ - errors = [] - for i in range(len(ref_df)): - row1 = ref_df.iloc[i] - row2 = cur_df.iloc[i] - - for col in cols_present: - val1 = row1[col] - val2 = row2[col] + errors = False + for row_idx, (row1, row2) in enumerate( + zip(ref_df.itertuples(), cur_df.itertuples()) + ): + for col in cols: + val1 = getattr(row1, col) + val2 = getattr(row2, col) if val1 == val2: continue - if val1 in rules_dict[col] and val2 in rules_dict[col]: + + allowed = rules_dict.get(col, []) + if val1 in allowed and val2 in allowed: continue - errors.append( - { - "row": i, - "column": col, - "file1": val1, - "file2": val2, - "error": "values different and not admitted", - } + detailed_logger.info( + "Values different and not admitted | " + "row=%s, column=%s, file1=%s, file2=%s", + row_idx, + col, + val1, + val2, ) + errors = True return errors -def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules): +def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules, log_file_name): """ - This function compares two Python dictionaries—each containing DataFrames under - the keys "reports" and "observation"—row by row and column by column, according - to rules defined in a separate dictionary. If the corresponding cells are - different and the values are not allowed by the rules, it records an error. - It returns a list indicating the row, the column and which values are wrong. + This function compares two Python dictionaries, each containing DataFrames under + the keys "reports" and "observation", row by row and column by column, according + to rules defined in a separate dictionary. If the variable does not need to follow + specific rules, the values must be identical. + It records the row, column and invalid values in a log file. """ rules_dict = parse_rules(rules) - errors = [] + errors = False + detailed_logger = initialize_detailed_logger( + "DETAILS", log_level="DEBUG", log_file=log_file_name + ) - for key in dict_ref.keys(): - ref_df = dict_ref[key] + for key, ref_df in dict_ref.items(): cur_df = dict_cur[key] + common_cols = [col for col in ref_df.columns if col in cur_df.columns] - cols_present = [ - col - for col in rules_dict.keys() - if col in ref_df.columns and col in cur_df.columns - ] - - cols_other = [ - col - for col in ref_df.columns - if col not in cols_present and col in cur_df.columns - ] - - if cols_other: - ref_df_xr = ref_df[cols_other].to_xarray() - cur_df_xr = cur_df[cols_other].to_xarray() + cols_with_rules = [col for col in common_cols if col in rules_dict] + cols_without_rules = [col for col in common_cols if col not in rules_dict] + if cols_without_rules: t, e = compare_var_and_attr_ds( - ref_df_xr, cur_df_xr, nl=5, output=False, location=None + ref_df[list(cols_without_rules)].to_xarray(), + cur_df[list(cols_without_rules)].to_xarray(), + detailed_logger, ) if t != e: - return errors == 1 - - if cols_present: - errors.extend(compare_cells(ref_df, cur_df, cols_present, rules_dict)) + return True - if errors: - logger.error("Errors found while comparing the files:") - for e in errors: - logger.error( - "Row %s - Column '%s': file1=%s, file2=%s → %s", - e["row"], - e["column"], - e["file1"], - e["file2"], - e["error"], + if cols_with_rules: + errors = compare_cells_rules( + ref_df, cur_df, cols_with_rules, rules_dict, detailed_logger ) + clean_logger_file_if_only_details(log_file_name) return errors diff --git a/util/fof_utils.py b/util/fof_utils.py index 9400b00d..f9e544d0 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -3,12 +3,13 @@ """ import os -import shutil import numpy as np import pandas as pd import xarray as xr +from util.log_handler import logger + def get_report_variables(ds): """ @@ -92,12 +93,13 @@ def compare_arrays(arr1, arr2, var_name): mask_equal = arr1 == arr2 equal = mask_equal.sum() percent = (equal / total) * 100 - print( - f"Differences in '{var_name}': {percent:.2f}% equal. " - f"{total} total entries for this variable" + logger.info( + "Differences in '%s': %.2f%% equal. %s total entries for this variable", + var_name, + percent, + total, ) - diff_idx = np.where(~mask_equal.ravel())[0] - diff = diff_idx + diff = np.where(~mask_equal.ravel())[0] return total, equal, diff @@ -117,167 +119,128 @@ def clean_value(x): alignment when printing the value. """ if isinstance(x, bytes): - return x.decode().rstrip(" '") + return x.decode("utf-8", errors="replace").rstrip(" '") return str(x).rstrip(" '") -def print_entire_line(ds1, ds2, diff): +def write_lines_log(ds1, ds2, diff, detailed_logger): """ - If the specific option is called, this function print - the entire line in which differences are found. + This function writes the differences detected between + two files to a detailed log file. """ - if diff.size > 0: - da1 = ds1.to_dataframe().reset_index() - da2 = ds2.to_dataframe().reset_index() - - for i in diff: - col_width = 13 - row1 = "|".join(f"{clean_value(x):<{col_width}}" for x in da1.loc[i]) - - row2 = "|".join(f"{clean_value(x):<{col_width}}" for x in da2.loc[i]) - diff_row = [] - for x, y in zip(da1.loc[i], da2.loc[i]): - if pd.api.types.is_number(x) and pd.api.types.is_number(y): - row_diff = x - y - else: - row_diff = "nan" + da1 = ds1.to_dataframe().reset_index() + da2 = ds2.to_dataframe().reset_index() + col_width = 13 + index = "|".join(f"{str(x):<{col_width}}" for x in da1.columns) - diff_row.append(row_diff) + for i in diff: + row1 = "|".join(f"{clean_value(x):<{col_width}}" for x in da1.loc[i]) + row2 = "|".join(f"{clean_value(x):<{col_width}}" for x in da2.loc[i]) - row_diff = "|".join(f"{str(x):<{col_width}}" for x in diff_row) + diff_vals = [] + for x, y in zip(da1.loc[i], da2.loc[i]): + if pd.api.types.is_number(x) and pd.api.types.is_number(y): + diff_vals.append(x - y) + else: + diff_vals.append("nan") - index = "|".join(f"{str(x):<{col_width}}" for x in da1.columns) + row_diff = "|".join(f"{str(x):<{col_width}}" for x in diff_vals) - print(f"\033[1mid\033[0m : {index}") - print(f"\033[1mref\033[0m : {row1}") - print(f"\033[1mcur\033[0m : {row2}") - print(f"\033[1mdiff\033[0m: {row_diff}") - term_width = shutil.get_terminal_size().columns - print("-" * term_width) + detailed_logger.info("id : %s", index) + detailed_logger.info("ref : %s", row1) + detailed_logger.info("cur : %s", row2) + detailed_logger.info("diff : %s", row_diff) + detailed_logger.info("") -def write_lines(ds1, ds2, diff, path_name): +def write_different_size_log(var, size1, size2, detailed_logger): """ - If the specific option is called, this function save - the lines in which differences are found. + This function is triggered when the array sizes do not match and records + in the log file that a comparison is not possible. """ - if diff.size > 0: - da1 = ds1.to_dataframe().reset_index() - da2 = ds2.to_dataframe().reset_index() - col_width = 13 - index = "|".join(f"{str(x):<{col_width}}" for x in da1.columns) - for i in diff: - - row1 = "|".join(f"{clean_value(x):<{col_width}}" for x in da1.loc[i]) - - row2 = "|".join(f"{clean_value(x):<{col_width}}" for x in da2.loc[i]) - - diff_row = [] - for x, y in zip(da1.loc[i], da2.loc[i]): - if pd.api.types.is_number(x) and pd.api.types.is_number(y): - row_diff = x - y - else: - row_diff = "nan" - - diff_row.append(row_diff) - - row_diff = "|".join(f"{str(x):<{col_width}}" for x in diff_row) - - with open(path_name, "a", encoding="utf-8") as f: - f.write(f"id : {index}" + "\n") - f.write(f"ref : {row1}" + "\n") - f.write(f"cur : {row2}" + "\n") - f.write(f"diff : {row_diff}" + "\n") - - -def write_different_size(output, nl, path_name, var, sizes): - if output: - with open(path_name, "a", encoding="utf-8") as f: - f.write( - f"variable : {var} -> datasets have different lengths " - f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" + "\n" - ) - if nl != 0: - print( - f"\033[1mvar\033[0m : {var} -> datasets have different lengths " - f"({sizes[0]} vs. {sizes[1]} ), comparison not possible" - ) - - -def compare_var_and_attr_ds(ds1, ds2, nl, output, location): + + detailed_logger.info( + "variable : %s -> datasets have different lengths " + "(%s vs. %s), comparison not possible\n", + var, + size1, + size2, + ) + + +def compare_var_and_attr_ds(ds1, ds2, detailed_logger): """ Variable by variable and attribute by attribute, - comparison of the two files. + comparison of the two datasets. """ total_all, equal_all = 0, 0 - list_to_skip = ["source", "i_body", "l_body"] - - if output: - if location: - path_name = location - else: - script_dir = os.path.dirname(os.path.abspath(__file__)) - path_name = os.path.join(script_dir, "differences.csv") - - with open(path_name, "w", encoding="utf-8") as f: - f.write("Differences\n") + list_to_skip = ["source", "i_body", "l_body", "veri_data"] - for var in set(ds1.data_vars).union(ds2.data_vars): + for var in sorted(set(ds1.data_vars).union(ds2.data_vars)): if var in ds1.data_vars and var in ds2.data_vars and var not in list_to_skip: - arr1 = fill_nans_for_float32(ds1[var].values) - arr2 = fill_nans_for_float32(ds2[var].values) + total, equal = process_var(ds1, ds2, var, detailed_logger) + total_all += total + equal_all += equal - if arr1.size == arr2.size: - t, e, diff = compare_arrays(arr1, arr2, var) + if var in ds1.attrs and var in ds2.attrs and var not in list_to_skip: - if output: - write_lines(ds1, ds2, diff, path_name) + total, equal = process_var(ds1, ds2, var, detailed_logger) + total_all += total + equal_all += equal - if nl != 0: - diff = diff[:nl] - print_entire_line(ds1, ds2, diff) + return total_all, equal_all - else: - t, e = max(arr1.size, arr2.size), 0 - write_different_size(output, nl, path_name, var, [arr1.size, arr2.size]) - total_all += t - equal_all += e +def process_var(ds1, ds2, var, detailed_logger): + """ + This function first checks whether two arrays have the same size. + If they do, their values are compared. + If they don't, the differences are written to a log file. + The function outputs the total number of elements and the + number of matching elements. + """ - if var in ds1.attrs and var in ds2.attrs and var not in list_to_skip: - arr1 = np.array(ds1.attrs[var], dtype=object) - arr2 = np.array(ds2.attrs[var], dtype=object) - if arr1.size == arr2.size: - t, e, diff = compare_arrays(arr1, arr2, var) + arr1 = fill_nans_for_float32(ds1[var].values) + arr2 = fill_nans_for_float32(ds2[var].values) + if arr1.size == arr2.size: + t, e, diff = compare_arrays(arr1, arr2, var) + if diff.size != 0: + write_lines_log(ds1, ds2, diff, detailed_logger) - if output: - write_lines(ds1, ds2, diff, path_name) + else: + t, e = max(arr1.size, arr2.size), 0 + write_different_size_log(var, arr1.size, arr2.size, detailed_logger) - if nl != 0: - diff = diff[:nl] - print_entire_line(ds1, ds2, diff) + return t, e - else: - t, e = max(arr1.size, arr2.size), 0 - write_different_size(output, nl, path_name, var, [arr1.size, arr2.size]) - total_all += t - equal_all += e +def get_log_file_name(file_path): + """ + This function gives the name of the detailed log file, + according to the file path. + """ - return total_all, equal_all + core_name = os.path.basename(file_path).replace(".nc", "") + log_file_name = f"error_{core_name}.log" + return log_file_name -def primary_check(file1, file2): +def clean_logger_file_if_only_details(file_path): """ - Test that the two files are of the same type. + This function deletes the detailed log file if it doesn't + contain anything. """ - name1 = os.path.basename(file1) - name2 = os.path.basename(file2) + target_line = "initialized named logger 'DETAILS'" - name1_core = name1.replace("fof", "").replace(".nc", "") - name2_core = name2.replace("fof", "").replace(".nc", "") + with open(file_path, "r", encoding="utf-8") as f: + lines = f.readlines() - return name1_core == name2_core + stripped_lines = [line.strip() for line in lines if line.strip()] + + if 0 < len(stripped_lines) <= 2 and all( + line == target_line for line in stripped_lines + ): + os.remove(file_path) diff --git a/util/log_handler.py b/util/log_handler.py index f3f25585..f66bb241 100644 --- a/util/log_handler.py +++ b/util/log_handler.py @@ -28,3 +28,32 @@ def initialize_logger(log_level="DEBUG", log_file="probtest.log"): logger.setLevel(log_level) logger.info("initialized logger with level %s", log_level) + + +def initialize_detailed_logger( + name, + log_level="DEBUG", + log_file=None, +): + detailed_logger = logging.getLogger(name) + detailed_logger.setLevel(log_level) + detailed_logger.propagate = False + + existing_handlers = [ + h + for h in detailed_logger.handlers + if getattr(h, "baseFilename", None) == log_file + ] + + if existing_handlers: + return detailed_logger + + formatter = logging.Formatter("%(message)s") + + if log_file: + file_handler = logging.FileHandler(log_file, mode="w") + file_handler.setFormatter(formatter) + detailed_logger.addHandler(file_handler) + + detailed_logger.info("initialized named logger '%s'", name) + return detailed_logger diff --git a/util/utils.py b/util/utils.py index af3123f2..d57905fe 100644 --- a/util/utils.py +++ b/util/utils.py @@ -326,7 +326,7 @@ def __post_init__(self): name = self.path.lower() - if "fof" in name: + if "fof" in name or "ekf" in name: self.file_type = FileType.FOF return if "csv" in name or "stats" in name: