From fd5ea7fbeccf56a082050fe98e0ccd4bb1bf7910 Mon Sep 17 00:00:00 2001 From: Daniel Hupp Date: Tue, 19 Nov 2024 15:08:02 +0100 Subject: [PATCH 1/4] add performance stats --- engine/performance_stats.py | 90 +++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 engine/performance_stats.py diff --git a/engine/performance_stats.py b/engine/performance_stats.py new file mode 100644 index 00000000..f95e3501 --- /dev/null +++ b/engine/performance_stats.py @@ -0,0 +1,90 @@ +""" +CLI for performance checks + +This module provides a command-line interface (CLI) tool for evaluating the +performance of a current experiment by comparing its timing data with a +reference. +It assesses whether the current experiment's runtime is within acceptable limits +based on specified parameters and provides feedback on whether the performance +check has passed or failed. +""" + +import sys + +import click + +from util.click_util import cli_help +from util.log_handler import logger +from util.tree import TimingTree + + +@click.command() +@click.option( + "--timing-current", + help=cli_help["timing_current"], +) +@click.option( + "--timing-reference", + help=cli_help["timing_reference"], +) +@click.option( + "--measurement-uncertainty", + help=cli_help["measurement_uncertainty"], + type=float, + default=2, +) +@click.option( + "--tolerance-factor", help=cli_help["tolerance_factor"], type=float, default=1.1 +) +@click.option( + "--new-reference-threshold", + help=cli_help["new_reference_threshold"], + type=float, + default=0.95, +) +@click.option("--i-table", type=int, help=cli_help["i_table"], default=-1) +def performance_check( + timing_current, + timing_reference, + i_table, + measurement_uncertainty, + tolerance_factor, + new_reference_threshold, +): # pylint: disable=too-many-positional-arguments + ttcur = TimingTree.from_json(timing_current) + ttref = TimingTree.from_json(timing_reference) + + total_time_cur = ( + ttcur.data[i_table].loc[("total", slice(None)), "total max (s)"].values[0] + ) + total_time_ref = ( + ttref.data[i_table].loc[("total", slice(None)), "total max (s)"].values[0] + ) + + if measurement_uncertainty < 0: + logger.error("measurement_uncertainty needs to be positive") + if tolerance_factor < 1: + logger.error("tolerance_factor needs to be greater than 1") + if new_reference_threshold < 0 or new_reference_threshold > 1: + logger.error("new_reference_threshold needs to be between 0 and 1") + + allowed_time = (total_time_ref + measurement_uncertainty) * tolerance_factor + + logger.info("Current runtime") + logger.info(total_time_cur) + logger.info("Allowed runtime") + logger.info(allowed_time) + logger.info("Reference runtime") + logger.info(total_time_ref) + + if total_time_cur <= (allowed_time): + logger.info("RESULT: performance_check PASSED!") + if total_time_cur < total_time_ref * new_reference_threshold: + logger.info( + "The current experiment ran a lot faster than the reference. " + + "Consider updating the reference." + ) + sys.exit(0) + else: + logger.info("RESULT: performance_check FAILED") + sys.exit(1) From df1a4d5006ef5869cb2a87628447a09fa560912e Mon Sep 17 00:00:00 2001 From: Daniel Hupp Date: Tue, 19 Nov 2024 16:56:25 +0100 Subject: [PATCH 2/4] first draft --- engine/performance_stats.py | 114 ++++++++++++++++++------------------ probtest.py | 2 + 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/engine/performance_stats.py b/engine/performance_stats.py index f95e3501..b45447b7 100644 --- a/engine/performance_stats.py +++ b/engine/performance_stats.py @@ -12,6 +12,8 @@ import sys import click +import numpy as np +import xarray as xr from util.click_util import cli_help from util.log_handler import logger @@ -23,68 +25,64 @@ "--timing-current", help=cli_help["timing_current"], ) -@click.option( - "--timing-reference", - help=cli_help["timing_reference"], -) -@click.option( - "--measurement-uncertainty", - help=cli_help["measurement_uncertainty"], - type=float, - default=2, -) -@click.option( - "--tolerance-factor", help=cli_help["tolerance_factor"], type=float, default=1.1 -) -@click.option( - "--new-reference-threshold", - help=cli_help["new_reference_threshold"], - type=float, - default=0.95, -) @click.option("--i-table", type=int, help=cli_help["i_table"], default=-1) -def performance_check( +def performance_stats( timing_current, - timing_reference, i_table, - measurement_uncertainty, - tolerance_factor, - new_reference_threshold, ): # pylint: disable=too-many-positional-arguments - ttcur = TimingTree.from_json(timing_current) - ttref = TimingTree.from_json(timing_reference) - total_time_cur = ( - ttcur.data[i_table].loc[("total", slice(None)), "total max (s)"].values[0] - ) - total_time_ref = ( - ttref.data[i_table].loc[("total", slice(None)), "total max (s)"].values[0] + + + timing_tree = TimingTree.from_json(timing_current) + + timer_names = ['model_init', 'total', 'integrate_nh', 'nh_solve', 'nh_hdiff', 'transport', 'physics'] + + aggregate_timer_names = {'dycore':('nh_solve', 'nh_hdiff', 'transport')} + + all_timer_names = timer_names + list(aggregate_timer_names.keys()) + + dims = ('name', 'metric') + + coords = {'name': all_timer_names, 'metric': ['mean', 'std']} + + print(all_timer_names) + + # Create an empty DataArray with np.nan + timer_stats = xr.DataArray( + data=np.full((len(coords['name']), len(coords['metric'])), np.nan), # Fill with NaN + coords=coords, + dims=dims ) - if measurement_uncertainty < 0: - logger.error("measurement_uncertainty needs to be positive") - if tolerance_factor < 1: - logger.error("tolerance_factor needs to be greater than 1") - if new_reference_threshold < 0 or new_reference_threshold > 1: - logger.error("new_reference_threshold needs to be between 0 and 1") - - allowed_time = (total_time_ref + measurement_uncertainty) * tolerance_factor - - logger.info("Current runtime") - logger.info(total_time_cur) - logger.info("Allowed runtime") - logger.info(allowed_time) - logger.info("Reference runtime") - logger.info(total_time_ref) - - if total_time_cur <= (allowed_time): - logger.info("RESULT: performance_check PASSED!") - if total_time_cur < total_time_ref * new_reference_threshold: - logger.info( - "The current experiment ran a lot faster than the reference. " - + "Consider updating the reference." - ) - sys.exit(0) - else: - logger.info("RESULT: performance_check FAILED") - sys.exit(1) + + for timer_name in timer_names: + times = np.asarray( + timing_tree.data[i_table].loc[(timer_name, slice(None)), "total max (s)"].values + ) + timer_stats.loc[timer_name, 'mean'] = np.mean(times) + timer_stats.loc[timer_name, 'std'] = np.std(times) + + + + for aggregated_timer_name in aggregate_timer_names.keys(): + + times = np.zeros_like(np.asarray( + timing_tree.data[i_table].loc[(aggregate_timer_names[aggregated_timer_name][0], slice(None)), "total max (s)"].values + )) + for timer_name in aggregate_timer_names[aggregated_timer_name]: + times += np.asarray( + timing_tree.data[i_table].loc[(timer_name, slice(None)), "total max (s)"].values + ) + timer_stats.loc[aggregated_timer_name, 'mean'] = np.mean(times) + timer_stats.loc[aggregated_timer_name, 'std'] = np.std(times) + + + # Save the dataset to a NetCDF file + timer_stats.to_netcdf('timer_stats.nc') + + # Load it back + loaded_dataarray = xr.open_dataarray('timer_stats.nc') + + print(timer_stats) + print(loaded_dataarray) + print(timer_stats-loaded_dataarray) \ No newline at end of file diff --git a/probtest.py b/probtest.py index 2dc0f7e2..f5312a00 100755 --- a/probtest.py +++ b/probtest.py @@ -14,6 +14,7 @@ from engine.init import init from engine.performance import performance from engine.performance_check import performance_check +from engine.performance_stats import performance_stats from engine.perturb import perturb from engine.run_ensemble import run_ensemble from engine.select_members import select_members @@ -57,6 +58,7 @@ def cli(ctx, log_level, log_file): cli.add_command(performance_plot) cli.add_command(performance_meta_data) cli.add_command(performance_check) +cli.add_command(performance_stats) cli.add_command(cdo_table) cli.add_command(cdo_table_reader) From b52d33414e4271d8cf2d3f5ed0f829c6c44b9483 Mon Sep 17 00:00:00 2001 From: Daniel Hupp Date: Tue, 19 Nov 2024 17:54:58 +0100 Subject: [PATCH 3/4] minor cleanup --- engine/performance_stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/engine/performance_stats.py b/engine/performance_stats.py index b45447b7..0fc3ad54 100644 --- a/engine/performance_stats.py +++ b/engine/performance_stats.py @@ -63,7 +63,6 @@ def performance_stats( timer_stats.loc[timer_name, 'std'] = np.std(times) - for aggregated_timer_name in aggregate_timer_names.keys(): times = np.zeros_like(np.asarray( From ee3b2939206ea692db70471a1ec4cb21ef09f063 Mon Sep 17 00:00:00 2001 From: Daniel Hupp Date: Tue, 4 Feb 2025 11:30:06 +0100 Subject: [PATCH 4/4] add option --- engine/performance_stats.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/engine/performance_stats.py b/engine/performance_stats.py index 0fc3ad54..536bf616 100644 --- a/engine/performance_stats.py +++ b/engine/performance_stats.py @@ -23,16 +23,21 @@ @click.command() @click.option( "--timing-current", + default="database", help=cli_help["timing_current"], ) +@click.option( + "--timing-stats", + default='timer_stats.nc', + help='name of the timing stats file', +) @click.option("--i-table", type=int, help=cli_help["i_table"], default=-1) def performance_stats( timing_current, + timing_stats, i_table, ): # pylint: disable=too-many-positional-arguments - - timing_tree = TimingTree.from_json(timing_current) timer_names = ['model_init', 'total', 'integrate_nh', 'nh_solve', 'nh_hdiff', 'transport', 'physics'] @@ -77,11 +82,4 @@ def performance_stats( # Save the dataset to a NetCDF file - timer_stats.to_netcdf('timer_stats.nc') - - # Load it back - loaded_dataarray = xr.open_dataarray('timer_stats.nc') - - print(timer_stats) - print(loaded_dataarray) - print(timer_stats-loaded_dataarray) \ No newline at end of file + timer_stats.to_netcdf(timing_stats) \ No newline at end of file