From 0d67640287380a62a8675b5c0dde80876005b604 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Mon, 8 Aug 2022 15:35:19 +0200 Subject: [PATCH 01/31] kurtosis check --- mlmc/estimator.py | 84 +++++++++++++++++++++++++++++- mlmc/plot/diagnostic_plots.py | 0 mlmc/quantity/quantity_estimate.py | 53 +++++++++++++++++-- mlmc/tool/hdf5.py | 2 +- 4 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 mlmc/plot/diagnostic_plots.py diff --git a/mlmc/estimator.py b/mlmc/estimator.py index af4f6e03..9e9fad64 100644 --- a/mlmc/estimator.py +++ b/mlmc/estimator.py @@ -3,6 +3,7 @@ import scipy.integrate as integrate import mlmc.quantity.quantity_estimate as qe import mlmc.tool.simple_distribution +from mlmc.quantity.quantity_estimate import mask_nan_samples from mlmc.quantity.quantity_types import ScalarType from mlmc.plot import plots from mlmc.quantity.quantity_spec import ChunkSpec @@ -16,6 +17,7 @@ def __init__(self, quantity, sample_storage, moments_fn=None): self._quantity = quantity self._sample_storage = sample_storage self._moments_fn = moments_fn + self._moments_mean = None @property def quantity(self): @@ -29,6 +31,16 @@ def quantity(self, quantity): def n_moments(self): return self._moments_fn.size + @property + def moments_mean_obj(self): + return self._moments_mean + + @moments_mean_obj.setter + def moments_mean_obj(self, moments_mean): + if not isinstance(moments_mean, mlmc.quantity.quantity.QuantityMean): + raise TypeError + self._moments_mean = moments_mean + def estimate_moments(self, moments_fn=None): """ Use collected samples to estimate moments and variance of this estimate. 
def consistency_check(quantity, sample_storage=None):
    """
    MLMC telescoping-sum consistency check, following M. Giles' mlmc_test
    diagnostic: for each level l > 0 the three independent estimates
    E[P_{l-1}] (fine values on level l-1), E[P_l] (fine values on level l)
    and E[Y_l] = E[P_l - P_{l-1}] (level-l differences) should satisfy
    E[P_{l-1}] - E[P_l] + E[Y_l] ~ 0 up to sampling error.

    Note: comparing coarse and fine means of the SAME level is an exact
    identity (mean(coarse) - mean(fine) + mean(fine - coarse) == 0), so the
    previous level's fine samples must be used as the independent estimate.

    :param quantity: Quantity instance providing the level samples
    :param sample_storage: SampleStorage used to obtain the number of levels
                           NOTE(review): dereferenced unconditionally; the
                           None default is unusable — confirm intended API
    :return: dict {level_id: check value} for level_id > 0; values
             significantly above 1 indicate an inconsistent level coupling
    """
    fine_samples = {}
    coarse_samples = {}
    for chunk_spec in quantity.get_quantity_storage().chunks():
        samples = quantity.samples(chunk_spec)
        chunk, n_mask_samples = mask_nan_samples(samples)

        # Skip chunks with no (valid) samples.
        if chunk.shape[1] == 0:
            continue

        fine_samples.setdefault(chunk_spec.level_id, []).extend(chunk[:, :, 0])
        if chunk_spec.level_id > 0:
            coarse_samples.setdefault(chunk_spec.level_id, []).extend(chunk[:, :, 1])

    cons_check_val = {}
    for level_id in range(1, sample_storage.get_n_levels()):
        fine = np.array(fine_samples[level_id])
        coarse = np.array(coarse_samples[level_id])
        prev_fine = np.array(fine_samples[level_id - 1])

        # Three independent estimates entering the telescoping identity.
        prev_fine_mean = np.mean(prev_fine)   # E[P_{l-1}] from level l-1
        fine_mean = np.mean(fine)             # E[P_l] from level l
        diff_mean = np.mean(fine - coarse)    # E[Y_l] from level l

        prev_fine_var = np.var(prev_fine)
        fine_var = np.var(fine)
        diff_var = np.var(fine - coarse)

        # Standard errors of the three mean estimates (CLT scaling).
        n_prev = prev_fine.shape[0]
        n_cur = fine.shape[0]
        std_err_sum = (np.sqrt(prev_fine_var / n_prev)
                       + np.sqrt(fine_var / n_cur)
                       + np.sqrt(diff_var / n_cur))

        # Value > 1 means the identity is violated by more than three
        # combined standard deviations — likely a coupling bug.
        cons_check_val[level_id] = np.abs(prev_fine_mean - fine_mean + diff_mean) / (3 * std_err_sum)

    return cons_check_val
def coping_with_high_kurtosis(vars, costs, kurtosis, kurtosis_threshold=100):
    """
    Heuristic treatment of levels with high sample kurtosis, as recommended
    by prof. M. Giles in http://people.maths.ox.ac.uk/~gilesm/talks/MCQMC_22_b.pdf
    With high kurtosis the sample variance estimate is unreliable (typically
    underestimated), so the level variance is bounded from below by an
    extrapolation from the previous level.

    :param vars: vars[L, M] for all levels L and moments_fn M except the
                 (zeroth) constant moment with zero variance
    :param costs: cost of a single sample at each level
    :param kurtosis: sample kurtosis at each level
    :param kurtosis_threshold: kurtosis above this threshold is considered too high
    :return: np.ndarray, adjusted copy of vars; the caller's array is left untouched
    """
    # Work on a copy — the original mutated the caller's array in place.
    vars = np.array(vars, dtype=float, copy=True)
    for l_id in range(2, vars.shape[0]):
        if kurtosis[l_id] > kurtosis_threshold:
            # Lower bound: half the previous level's variance scaled by the cost ratio.
            vars[l_id] = np.maximum(vars[l_id], 0.5 * vars[l_id - 1] * costs[l_id - 1] / costs[l_id])
    return vars
0.5 + :param kurtosis: levels' kurtosis :return: np.array with number of optimal samples for individual levels and moments_fn, array (LxR) """ vars = prescribe_vars + + if kurtosis is not None and len(vars) == len(kurtosis): + vars = coping_with_high_kurtosis(vars, n_ops, kurtosis) + sqrt_var_n = np.sqrt(vars.T * n_ops) # moments_fn in rows, levels in cols total = np.sum(sqrt_var_n, axis=1) # sum over levels n_samples_estimate = np.round((sqrt_var_n / n_ops).T * total / target_variance).astype(int) # moments_fn in cols + n_samples_estimate = 1/(1-theta) * n_samples_estimate + # Limit maximal number of samples per level n_samples_estimate_safe = np.maximum( np.minimum(n_samples_estimate, vars * n_levels / target_variance), 2) diff --git a/mlmc/plot/diagnostic_plots.py b/mlmc/plot/diagnostic_plots.py new file mode 100644 index 00000000..e69de29b diff --git a/mlmc/quantity/quantity_estimate.py b/mlmc/quantity/quantity_estimate.py index 2436d7b9..a1f63ddd 100644 --- a/mlmc/quantity/quantity_estimate.py +++ b/mlmc/quantity/quantity_estimate.py @@ -19,13 +19,17 @@ def cache_clear(): mlmc.quantity.quantity.QuantityConst.samples.cache_clear() -def estimate_mean(quantity): +def estimate_mean(quantity, form="diff", operation_func=None, **kwargs): """ MLMC mean estimator. The MLMC method is used to compute the mean estimate to the Quantity dependent on the collected samples. The squared error of the estimate (the estimator variance) is estimated using the central limit theorem. Data is processed by chunks, so that it also supports big data processing :param quantity: Quantity + :param form: if "diff" estimates based on difference between fine and coarse data = MLMC approach + "fine" estimates based on level's fine data + "coarse" estimates based on level's coarse data + :param operation_func: function to process level data, e.g. 
def kurtosis_numerator(chunk_diff, chunk_spec, l_means):
    """
    Estimate the sample kurtosis numerator:
    E[(Y_l - E[Y_l])^4]
    Transforms a chunk of level differences into fourth-power deviations from
    the level mean; averaging the result yields the numerator estimate.

    :param chunk_diff: np.ndarray, [quantity shape, number of samples]
    :param chunk_spec: quantity_spec.ChunkSpec, provides the level id
    :param l_means: per-level means E[Y_l], indexed by level id
    :return: np.ndarray, unchanged shape
    """
    return (chunk_diff - l_means[chunk_spec.level_id]) ** 4
def level_kurtosis(quantity, means_obj):
    """
    Estimate the sample kurtosis of the level differences Y_l = fine_l - coarse_l:
    kappa_l = E[(Y_l - E[Y_l])^4] / (Var[Y_l])^2

    :param quantity: Quantity
    :param means_obj: Quantity.QuantityMean holding level means and variances
    :return: np.ndarray, kurtosis per level
    """
    # Level means of the fourth-power deviations give the kurtosis numerator.
    fourth_moment = estimate_mean(quantity, operation_func=kurtosis_numerator,
                                  l_means=means_obj.l_means)
    return fourth_moment.l_means / means_obj.l_vars ** 2
mlmc.moments import Legendre +from mlmc.plot.plots import Distribution + + +# Tutorial class for 1D shooting simulation, includes +# - samples scheduling +# - process results: +# - create Quantity instance +# - approximate density +class ProcessShooting1D: + + def __init__(self): + n_levels = 3 + # Number of MLMC levels + step_range = [1, 1e-3] + # step_range [simulation step at the coarsest level, simulation step at the finest level] + level_parameters = ProcessShooting1D.determine_level_parameters(n_levels, step_range) + # Determine each level parameters (in this case, simulation step at each level), level_parameters should be + # simulation dependent + self._sample_sleep = 0#30 + # Time to do nothing just to make sure the simulations aren't constantly checked, useful mainly for PBS run + self._sample_timeout = 60 + # Maximum waiting time for running simulations + self._adding_samples_coef = 0.1 + self._n_moments = 20 + # number of generalized statistical moments used for MLMC number of samples estimation + self._quantile = 0.01 + # Setting parameters that are utilized when scheduling samples + ### + # MLMC run + ### + sampler = self.create_sampler(level_parameters=level_parameters) + # Create sampler (mlmc.Sampler instance) - crucial class that controls MLMC run + self.generate_samples(sampler, n_samples=None, target_var=1e-3) + # Generate MLMC samples, there are two ways: + # 1) set exact number of samples at each level, + # e.g. for 5 levels - self.generate_samples(sampler, n_samples=[1000, 500, 250, 100, 50]) + # 2) set target variance of MLMC estimates, + # e.g. 
def create_sampler(self, level_parameters):
    """
    Create:
    # sampling pool - the way sample simulations are executed
    # sample storage - stores sample results
    # sampler - controls MLMC execution
    :param level_parameters: list of lists
    :return: mlmc.sampler.Sampler instance
    """
    # All samples run sequentially in the current process.
    # mlmc.sampling_pool.ProcessPool(n) would instead run n simulations
    # in parallel (n depends on the computer architecture).
    sampling_pool = OneProcessPool()

    # Configuration passed to the simulation constructor.
    simulation_config = {
        "start_position": np.array([0, 0]),
        "start_velocity": np.array([10, 0]),
        "area_borders": np.array([-100, 200, -300, 400]),
        "max_time": 10,
        "complexity": 2,  # used for initial estimate of number of operations per sample
        'fields_params': dict(model='gauss', dim=1, sigma=1, corr_length=0.1),
    }

    # Simulation factory, instance of a class derived from mlmc.sim.simulation.
    simulation_factory = ShootingSimulation1D(config=simulation_config)

    # Keep samples in main memory; mlmc.sample_storage_hdf.SampleStorageHDF
    # (file_path=...) is the persistent alternative.
    sample_storage = Memory()

    # The sampler drives the whole MLMC execution.
    return Sampler(sample_storage=sample_storage, sampling_pool=sampling_pool,
                   sim_factory=simulation_factory, level_parameters=level_parameters)
MLMC samples + :param sampler: mlmc.sampler.Sampler instance + :param n_samples: None or list, number of samples at each level + :param target_var: target variance of MLMC estimates + :return: None + """ + # The number of samples is set by user + if n_samples is not None: + sampler.set_initial_n_samples(n_samples) + # The number of initial samples is determined automatically + else: + sampler.set_initial_n_samples() + # Samples are scheduled and the program is waiting for all of them to be completed. + sampler.schedule_samples() + sampler.ask_sampling_pool_for_samples(sleep=self._sample_sleep, timeout=self._sample_timeout) + self.all_collect(sampler) + + # MLMC estimates target variance is set + if target_var is not None: + # The mlmc.quantity.quantity.Quantity instance is created + # parameters 'storage' and 'q_specs' are obtained from sample_storage, + # originally 'q_specs' is set in the simulation class + root_quantity = make_root_quantity(storage=sampler.sample_storage, + q_specs=sampler.sample_storage.load_result_format()) + + # Moment functions object is created + # The MLMC algorithm determines number of samples according to the moments variance, + # Type of moment functions (Legendre by default) might affect the total number of MLMC samples + moments_fn = self.set_moments(root_quantity, sampler.sample_storage, n_moments=self._n_moments) + estimate_obj = Estimate(root_quantity, sample_storage=sampler.sample_storage, + moments_fn=moments_fn) + + # Initial estimation of the number of samples at each level + variances, n_ops = estimate_obj.estimate_diff_vars_regression(sampler.n_finished_samples) + # Firstly, the variance of moments and execution time of samples at each level are calculated from already finished samples + n_estimated = estimate_n_samples_for_target_variance(target_var, variances, n_ops, + n_levels=sampler.n_levels) + + ##### + # MLMC sampling algorithm - gradually schedules samples and refines the total number of samples + ##### + # Loop until 
def all_collect(self, sampler):
    """
    Collect samples, wait until all samples are finished
    :param sampler: mlmc.sampler.Sampler object
    :return: None
    """
    # Poll the sampling pool until no samples remain running.
    n_running = 1
    while n_running > 0:
        n_running = sampler.ask_sampling_pool_for_samples()
        print("N running: ", n_running)
Estimate(quantity=q_value, sample_storage=sample_storage, moments_fn=moments_fn) + # Estimate moment means and variances + means, vars = estimator.estimate_moments(moments_fn) + + est.plot_checks(quantity=q_value, sample_storage=sample_storage, moments_fn=moments_fn) + est.consistency_check(quantity=q_value, sample_storage=sample_storage) + estimator.kurtosis_check(q_value) + + # Generally, root quantity has different domain than its items + root_quantity_estimated_domain = Estimate.estimate_domain(root_quantity, sample_storage, + quantile=self._quantile) + root_quantity_moments_fn = Legendre(self._n_moments, root_quantity_estimated_domain) + + # There is another possible approach to calculating all moments at once and then select desired quantity + moments_quantity = moments(root_quantity, moments_fn=root_quantity_moments_fn, mom_at_bottom=True) + moments_mean = estimate_mean(moments_quantity) + target_mean = moments_mean['target'] + time_mean = target_mean[10] # times: [1] + location_mean = time_mean['0'] # locations: ['0'] + value_mean = location_mean[0] # result shape: (1,) + + assert value_mean.mean[0] == 1 + self.approx_distribution(estimator, n_levels, tol=1e-8) + + def approx_distribution(self, estimator, n_levels, tol=1.95): + """ + Probability density function approximation + :param estimator: mlmc.estimator.Estimate instance, it contains quantity for which the density is approximated + :param n_levels: int, number of MLMC levels + :param tol: Tolerance of the fitting problem, with account for variances in moments. 
def determine_level_parameters(n_levels, step_range):
    """
    Determine level parameters,
    In this case, a step of fine simulation at each level.
    Steps are geometrically interpolated between the coarsest and finest step.
    :param n_levels: number of MLMC levels
    :param step_range: simulation step range [coarsest, finest], coarsest > finest
    :return: list of single-item lists, one step per level
    """
    assert step_range[0] > step_range[1]
    # Interpolation exponents in [0, 1]; a single level uses the finest step.
    if n_levels == 1:
        exponents = [1]
    else:
        exponents = [i_level / (n_levels - 1) for i_level in range(n_levels)]
    return [[step_range[0] ** (1 - t) * step_range[1] ** t] for t in exponents]
def log_var_per_level(l_vars, err_variances=0, err_l_vars=0, moments=(1, 2, 3, 4)):
    """
    Plot log2 of the level variances for the selected moments.

    :param l_vars: np.ndarray [level, moment], moment variance at each level
    :param err_variances: unused, kept for interface compatibility
    :param err_l_vars: unused, kept for interface compatibility
    :param moments: indices of moments to plot (tuple default avoids the
                    mutable-default-argument pitfall)
    :return: None, shows the figure
    """
    fig, ax1 = plt.subplots(figsize=(8, 5))
    for m in moments:
        # One line per moment, level index on the x-axis.
        ax1.plot(np.log2(l_vars[:, m]), label="m={}".format(m), marker="s")

    ax1.set_ylabel('log' + r'$_2$' + 'variance')
    ax1.set_xlabel('level' + r'$l$')
    plt.legend()
    plt.show()
def sample_cost_per_level(costs):
    """
    Plot log2 of the cost of a single sample at each level.

    :param costs: per-level sample costs
    :return: None, shows the figure
    """
    fig, axes = plt.subplots(figsize=(8, 5))
    axes.plot(np.log2(costs), marker="s")
    axes.set_xlabel('level' + r'$l$')
    axes.set_ylabel('log' + r'$_2$' + 'cost per sample')
    plt.legend()
    plt.show()
def hdf_storage_factory(file_name="mlmc_test.hdf5"):
    """
    Create a fresh HDF5 sample storage inside a clean temporary work directory.

    :param file_name: name of the HDF5 file inside the work directory
    :return: SampleStorageHDF instance
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(base_dir)
    tmp_dir = os.path.join(base_dir, '_test_tmp')
    # Start from an empty directory so previous runs do not interfere.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Create sample storages
    return SampleStorageHDF(file_path=os.path.join(tmp_dir, file_name))
def mlmc_test_mcqmc(test_case):
    """
    Run an MLMC test with the improved sample-number estimation:
    the sample counts use the theta control parameter and the level variances
    are corrected for high sample kurtosis (Giles, MCQMC'22).

    Fix: the original called the nonexistent
    estimate_n_samples_for_target_variance_giles; the estimator module
    defines estimate_n_samples_for_target_variance with theta/kurtosis support.

    :param test_case: (simulation_factory, sample_storage, sampling_pool)
    :return: (QuantityMean, n_collected per level, n_ops per level)
    """
    # np.random.seed(1234)
    n_moments = 10
    step_range = [[0.1], [0.005], [0.00025]]

    simulation_factory, sample_storage, sampling_pool = test_case

    if simulation_factory.need_workspace:
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        shutil.copyfile('synth_sim_config.yaml', os.path.join(work_dir, 'synth_sim_config.yaml'))

    sampler = Sampler(sample_storage=sample_storage, sampling_pool=sampling_pool, sim_factory=simulation_factory,
                      level_parameters=step_range)

    true_domain = distr.ppf([0.0001, 0.9999])
    moments_fn = Legendre(n_moments, true_domain)
    # moments_fn = Monomial(n_moments, true_domain)

    sampler.set_initial_n_samples([100, 50, 25])
    # sampler.set_initial_n_samples([10000])
    sampler.schedule_samples()
    sampler.ask_sampling_pool_for_samples()

    target_var = 1e-4
    sleep = 0
    add_coef = 0.1

    quantity = make_root_quantity(sample_storage, q_specs=simulation_factory.result_format())

    length = quantity['length']
    time = length[1]
    location = time['10']
    value_quantity = location[0]

    estimator = mlmc.estimator.Estimate(value_quantity, sample_storage, moments_fn)

    # Initial estimation according to already finished samples
    variances, n_ops = estimator.estimate_diff_vars_regression(sampler._n_scheduled_samples)
    n_estimated = mlmc.estimator.estimate_n_samples_for_target_variance(target_var, variances, n_ops,
                                                                        n_levels=sampler.n_levels, theta=0.25)

    # Loop until number of estimated samples is greater than the number of scheduled samples
    while not sampler.process_adding_samples(n_estimated, sleep, add_coef):
        # New estimation according to already finished samples,
        # with level variances corrected for high kurtosis.
        kurtosis = estimator.kurtosis_check()
        variances, n_ops = estimator.estimate_diff_vars_regression(sampler._n_scheduled_samples)
        n_estimated = mlmc.estimator.estimate_n_samples_for_target_variance(target_var, variances, n_ops,
                                                                            n_levels=sampler.n_levels, theta=0.25,
                                                                            kurtosis=kurtosis)

    means, vars = estimator.estimate_moments(moments_fn)

    estimator.kurtosis_check(value_quantity)

    # The zeroth moment is the constant one with zero variance.
    assert means[0] == 1
    assert vars[0] == 0

    return estimator.moments_mean_obj, sample_storage.get_n_collected(), sample_storage.get_n_ops()
axis=0), moments=[1, 2, 5]) + #sample_cost_level(costs=np.mean(n_ops, axis=0)) + + print("n collected ", np.mean(n_collected, axis=0)) + print("cost on level ", np.mean(n_ops, axis=0)) + print("total cost ", np.sum(np.mean(n_ops * np.array(n_collected), axis=0))) + + +if __name__ == "__main__": + # means_mlmc = [] + # means_mlmc_giles = [] + # n_collected_mlmc = [] + # n_collected_mlmc_giles = [] + # vars_mlmc = [] + # vars_mlmc_giles = [] + for i in range(3): + # print("############### Original estimator ###################") + # mlmc_test_data(mlmc_test) + print("######################################################") + print("############### Improved estimator ###################") + mlmc_test_data(mlmc_test_mcqmc) + # mean, var, n_estimated, n_ops = mlmc_test((SynthSimulationForTests(simulation_config), + # hdf_storage_factory(file_name="mlmc_test.hdf5"), + # OneProcessPool())) + # means_mlmc.append(mean) + # vars_mlmc.append(var) + # n_collected_mlmc.append(n_estimated) + # mean, var, n_estimated, n_ops = mlmc_test_giles((SynthSimulationForTests(simulation_config), + # hdf_storage_factory(file_name="mlmc_giles_test.hdf5"), + # OneProcessPool())) + # means_mlmc_giles.append(mean) + # vars_mlmc_giles.append(var) + # n_collected_mlmc_giles.append(n_estimated) + + # + # print("mlmc means ", np.mean(means_mlmc, axis=0)) + # print("mlmc vars ", np.mean(vars_mlmc, axis=0)) + # print("mlmc n collected ", np.mean(n_collected_mlmc, axis=0)) + # + # print("mlmc giles means ", np.mean(means_mlmc_giles, axis=0)) + # print("mlmc giles vars ", np.mean(vars_mlmc_giles, axis=0)) + # print("mlmc giles n collected ", np.mean(n_collected_mlmc_giles, axis=0)) + + + + + + + + + + + diff --git a/test/test_sampler.py b/test/test_sampler.py index dc846f3a..d559b6ea 100644 --- a/test/test_sampler.py +++ b/test/test_sampler.py @@ -1,5 +1,8 @@ +import os +import shutil import numpy as np from scipy import stats +import mlmc from mlmc.sample_storage import Memory from 
mlmc.sim.synth_simulation import SynthSimulation from mlmc.sampling_pool import OneProcessPool @@ -34,3 +37,43 @@ def test_sampler(): n_estimated = np.array([100, 50, 20]) sampler.process_adding_samples(n_estimated, 0, 0.1) assert np.allclose(sampler._n_target_samples, init_samples + (n_estimated * 0.1), atol=1) + + +def test_sampler_hdf(): + # Create simulations + failed_fraction = 0.1 + distr = stats.norm() + simulation_config = dict(distr=distr, complexity=2, nan_fraction=failed_fraction, sim_method='_sample_fn') + simulation = SynthSimulation(simulation_config) + + work_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '_test_tmp') + if os.path.exists(work_dir): + shutil.rmtree(work_dir) + os.makedirs(work_dir) + file_path = os.path.join(work_dir, "mlmc_test.hdf5") + storage = mlmc.SampleStorageHDF(file_path=file_path) + sampling_pool = OneProcessPool() + + step_range = [[0.1], [0.01], [0.001]] + + sampler = Sampler(sample_storage=storage, sampling_pool=sampling_pool, sim_factory=simulation, + level_parameters=step_range) + + assert len(sampler._level_sim_objects) == len(step_range) + for step, level_sim in zip(step_range, sampler._level_sim_objects): + assert step[0] == level_sim.config_dict['fine']['step'] + + init_samples = list(np.ones(len(step_range)) * 10) + + sampler.set_initial_n_samples(init_samples) + assert np.allclose(sampler._n_target_samples, init_samples) + assert 0 == sampler.ask_sampling_pool_for_samples() + sampler.schedule_samples() + assert np.allclose(sampler._n_scheduled_samples, init_samples) + + n_estimated = np.array([100, 50, 20]) + sampler.process_adding_samples(n_estimated, 0, 0.1) + assert np.allclose(sampler._n_target_samples, init_samples + (n_estimated * 0.1), atol=1) + + +test_sampler_hdf() \ No newline at end of file From 067bbce3a984919ec118814250e6b0b52fa31570 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Tue, 9 May 2023 14:31:56 +0200 Subject: [PATCH 03/31] numpy 1.24 fix --- mlmc/sample_storage.py | 9 
++++++--- mlmc/sample_storage_hdf.py | 8 ++++---- mlmc/tool/hdf5.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mlmc/sample_storage.py b/mlmc/sample_storage.py index 01be9082..ec05080f 100644 --- a/mlmc/sample_storage.py +++ b/mlmc/sample_storage.py @@ -168,12 +168,15 @@ def _save_successful(self, samples): :return: None """ for level_id, res in samples.items(): - res = np.array(res) + res = np.array(res, dtype=object) fine_coarse_res = res[:, 1] - result_type = np.dtype((np.float, np.array(fine_coarse_res[0]).shape)) + result_type = np.dtype((float, np.array(fine_coarse_res[0], dtype=object).shape)) results = np.empty(shape=(len(res),), dtype=result_type) - results[:] = [val for val in fine_coarse_res] + + for idx, val in enumerate(fine_coarse_res): + results[idx, 0] = val[0] + results[idx, 1] = val[1] # Save sample ids self._successful_sample_ids.setdefault(level_id, []).extend(res[:, 0]) diff --git a/mlmc/sample_storage_hdf.py b/mlmc/sample_storage_hdf.py index 7e5fbef5..68b1af9d 100644 --- a/mlmc/sample_storage_hdf.py +++ b/mlmc/sample_storage_hdf.py @@ -39,7 +39,7 @@ def _hdf_result_format(self, locations, times): :return: """ if len(locations[0]) == 3: - tuple_dtype = np.dtype((np.float, (3,))) + tuple_dtype = np.dtype((float, (3,))) loc_dtype = np.dtype((tuple_dtype, (len(locations),))) else: loc_dtype = np.dtype(('S50', (len(locations),))) @@ -48,14 +48,14 @@ def _hdf_result_format(self, locations, times): 'formats': ('S50', 'S50', np.dtype((np.int32, (2,))), - np.dtype((np.float, (len(times),))), + np.dtype((float, (len(times),))), loc_dtype ) } return result_dtype - def save_global_data(self, level_parameters: List[np.float], result_format: List[QuantitySpec]): + def save_global_data(self, level_parameters: List[float], result_format: List[QuantitySpec]): """ Save hdf5 file global attributes :param level_parameters: list of simulation steps @@ -125,7 +125,7 @@ def save_samples(self, successful, failed): def _save_succesful(self, 
successful_samples): for level, samples in successful_samples.items(): if len(samples) > 0: - self._level_groups[level].append_successful(np.array(samples)) + self._level_groups[level].append_successful(np.array(samples, dtype=object)) def _save_failed(self, failed_samples): for level, samples in failed_samples.items(): diff --git a/mlmc/tool/hdf5.py b/mlmc/tool/hdf5.py index f6f3219c..32e24149 100644 --- a/mlmc/tool/hdf5.py +++ b/mlmc/tool/hdf5.py @@ -309,7 +309,7 @@ def append_successful(self, samples: np.array): self._append_dataset(self.collected_ids_dset, samples[:, 0]) values = samples[:, 1] - result_type = np.dtype((np.float, np.array(values[0]).shape)) + result_type = np.dtype((float, np.array(values[0]).shape)) # Create dataset for failed samples self._make_dataset(name='collected_values', shape=(0,), From f8b7f3fa40812cd5beb5dd2a6693713aa0501eb8 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Thu, 3 Aug 2023 13:09:06 +0200 Subject: [PATCH 04/31] pbs run fix --- mlmc/random/correlated_field.py | 6 +- mlmc/sampler.py | 25 ++- mlmc/sampling_pool_pbs.py | 266 ++++++++++++++++++++++++-------- mlmc/tool/pbs_job.py | 13 +- 4 files changed, 240 insertions(+), 70 deletions(-) diff --git a/mlmc/random/correlated_field.py b/mlmc/random/correlated_field.py index ba83e15b..b5c809a7 100644 --- a/mlmc/random/correlated_field.py +++ b/mlmc/random/correlated_field.py @@ -226,7 +226,11 @@ def sample(self): for field in self.fields: sample = field.sample() if field.is_outer: - result[field.name] = np.zeros(self.n_elements) + if field.name == "cond_tn": + result[field.name] = np.zeros((self.n_elements, 3)) + else: + result[field.name] = np.zeros(self.n_elements) + #result[field.name] = np.zeros(self.n_elements) result[field.name][field.full_sample_ids] = sample return result diff --git a/mlmc/sampler.py b/mlmc/sampler.py index 6a550726..17b7dce0 100644 --- a/mlmc/sampler.py +++ b/mlmc/sampler.py @@ -119,7 +119,7 @@ def _get_sample_tag(self, level_id): """ return 
"L{:02d}_S{:07d}".format(level_id, int(self._n_scheduled_samples[level_id])) - def schedule_samples(self, timeout=None): + def schedule_samples(self, timeout=None, level_id=None, n_samples=None): """ Create simulation samples, loop through "levels" and its samples (given the number of target samples): 1) generate sample tag (same for fine and coarse simulation) @@ -132,7 +132,9 @@ def schedule_samples(self, timeout=None): self.ask_sampling_pool_for_samples(timeout=timeout) plan_samples = self._n_target_samples - self._n_scheduled_samples - for level_id, n_samples in enumerate(plan_samples): + if level_id is None: + level_id = len(plan_samples) - 1 + if n_samples is not None: samples = [] for _ in range(int(n_samples)): # Unique sample id @@ -143,11 +145,28 @@ def schedule_samples(self, timeout=None): self._sampling_pool.schedule_sample(sample_id, level_sim) # Increment number of created samples at current level self._n_scheduled_samples[level_id] += 1 - samples.append(sample_id) # Store scheduled samples self.sample_storage.save_scheduled_samples(level_id, samples) + else: + for n_samples in np.flip(plan_samples): + samples = [] + for _ in range(int(n_samples)): + # Unique sample id + sample_id = self._get_sample_tag(level_id) + level_sim = self._level_sim_objects[level_id] + + # Schedule current sample + self._sampling_pool.schedule_sample(sample_id, level_sim) + # Increment number of created samples at current level + self._n_scheduled_samples[level_id] += 1 + + samples.append(sample_id) + + # Store scheduled samples + self.sample_storage.save_scheduled_samples(level_id, samples) + level_id -= 1 def _check_failed_samples(self): """ diff --git a/mlmc/sampling_pool_pbs.py b/mlmc/sampling_pool_pbs.py index 01c4128b..2142fdcd 100644 --- a/mlmc/sampling_pool_pbs.py +++ b/mlmc/sampling_pool_pbs.py @@ -5,6 +5,8 @@ import pickle import json import glob +import time +import numpy as np from mlmc.level_simulation import LevelSimulation from mlmc.sampling_pool import 
SamplingPool from mlmc.tool.pbs_job import PbsJob @@ -133,9 +135,17 @@ def pbs_common_setting(self, **kwargs): :return: None """ # Script header - select_flags_list = kwargs.get('select_flags', []) - if select_flags_list: - kwargs['select_flags'] = ":" + ":".join(select_flags_list) + select_flags_dict = kwargs.get('select_flags', {}) + + # Set scratch dir + if any(re.compile('scratch.*').match(flag) for flag in list(select_flags_dict.keys())): + if kwargs['scratch_dir'] is None: + kwargs['scratch_dir'] = "$SCRATCHDIR" + else: + kwargs['scratch_dir'] = '' + + if select_flags_dict: + kwargs['select_flags'] = ":" + ':'.join('{}={}'.format(*item) for item in select_flags_dict.items()) else: kwargs['select_flags'] = "" @@ -162,7 +172,7 @@ def pbs_common_setting(self, **kwargs): kwargs['optional_pbs_requests']) # e.g. ['#PBS -m ae'] means mail is sent when the job aborts or terminates self._pbs_header_template.extend(('MLMC_WORKDIR=\"{}\"'.format(self._work_dir),)) self._pbs_header_template.extend(kwargs['env_setting']) - self._pbs_header_template.extend(('{python} -m mlmc.tool.pbs_job {output_dir} {job_name} >' + self._pbs_header_template.extend(('{python} -m mlmc.tool.pbs_job {output_dir} {job_name} {scratch_dir} >' '{pbs_output_dir}/{job_name}_STDOUT 2>&1',)) self._pbs_config = kwargs @@ -221,28 +231,31 @@ def execute(self): script_content = "\n".join(self.pbs_script) self.write_script(script_content, job_file) - process = subprocess.run(['qsub', job_file], stderr=subprocess.PIPE, stdout=subprocess.PIPE) - try: - if process.returncode != 0: - raise Exception(process.stderr.decode('ascii')) - # Find all finished jobs - self._qsub_failed_n = 0 - # Write current job count - self._job_count += 1 - - # Get pbs_id from qsub output - pbs_id = process.stdout.decode("ascii").split(".")[0] - # Store pbs id for future qstat calls - self._pbs_ids.append(pbs_id) - pbs_process.write_pbs_id(pbs_id) - - self._current_job_weight = 0 - self._n_samples_in_job = 0 - self._scheduled = [] 
- except: - self._qsub_failed_n += 1 - if self._qsub_failed_n > SamplingPoolPBS.QSUB_FAILED_MAX_N: - raise Exception(process.stderr.decode("ascii")) + while self._qsub_failed_n <= SamplingPoolPBS.QSUB_FAILED_MAX_N: + process = subprocess.run(['qsub', job_file], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + try: + if process.returncode != 0: + raise Exception(process.stderr.decode('ascii')) + # Find all finished jobs + self._qsub_failed_n = 0 + # Write current job count + self._job_count += 1 + + # Get pbs_id from qsub output + pbs_id = process.stdout.decode("ascii").split(".")[0] + # Store pbs id for future qstat calls + self._pbs_ids.append(pbs_id) + pbs_process.write_pbs_id(pbs_id) + + self._current_job_weight = 0 + self._n_samples_in_job = 0 + self._scheduled = [] + break + except: + self._qsub_failed_n += 1 + time.sleep(30) + if self._qsub_failed_n > SamplingPoolPBS.QSUB_FAILED_MAX_N: + raise Exception(process.stderr.decode("ascii")) def _create_script(self): """ @@ -277,6 +290,64 @@ def get_finished(self): finished_pbs_jobs, unfinished_pbs_jobs = self._qstat_pbs_job() return self._get_result_files(finished_pbs_jobs, unfinished_pbs_jobs) + def collect_data(self): + successful_results = {} + failed_results = {} + times = {} + sim_data_results = {} + # running_times = {} + # extract_mesh_times = {} + # make_field_times = {} + # generate_rnd_times = {} + # fine_flow_times = {} + # coarse_flow_times = {} + n_running = 0 + + os.chdir(self._jobs_dir) + for file in glob.glob("*_STDOUT"): + job_id = re.findall(r'(\d+)_STDOUT', file)[0] + + successful, failed, time = PbsJob.read_results(job_id, self._jobs_dir) + + # Split results to levels + for level_id, results in successful.items(): + successful_results.setdefault(level_id, []).extend(results) + for level_id, results in failed.items(): + failed_results.setdefault(level_id, []).extend(results) + for level_id, results in time.items(): + if level_id in times: + times[level_id][0] += results[-1][0] + 
times[level_id][1] += results[-1][1] + else: + times[level_id] = list(results[-1]) + + # # Optional simulation data + # for level_id, results in sim_data.items(): + # sim_data_results.setdefault(level_id, []).extend(results) + + # for level_id, results in running_time.items(): + # running_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in extract_mesh.items(): + # extract_mesh_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in make_field.items(): + # make_field_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in generate_rnd.items(): + # generate_rnd_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in fine_flow.items(): + # fine_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in coarse_flow.items(): + # coarse_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + + return successful_results, failed_results, n_running #, sim_data_results #list(times.items()), list(running_times.items()), \ + # list(extract_mesh_times.items()), list(make_field_times.items()), list(generate_rnd_times.items()), \ + # list(fine_flow_times.items()), \ + # list(coarse_flow_times.items()) + def _qstat_pbs_job(self): """ Parse qstat output and get all unfinished job ids @@ -286,23 +357,36 @@ def _qstat_pbs_job(self): if len(self._pbs_ids) > 0: # Get PBS id's status, # '-x' - displays status information for finished and moved jobs in addition to queued and running jobs. 
- qstat_call = ["qstat", "-x"] + qstat_call = ["qstat", "-xs"] qstat_call.extend(self._pbs_ids) - # qstat call - process = subprocess.run(qstat_call, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - try: - if process.returncode != 0: - raise Exception(process.stderr.decode("ascii")) - output = process.stdout.decode("ascii") - # Find all finished jobs - finished_pbs_jobs = re.findall(r"(\d+)\..*\d+ F", output) - self._qstat_failed_n = 0 - except: - self._qstat_failed_n += 1 - if self._qstat_failed_n > SamplingPoolPBS.QSTAT_FAILED_MAX_N: - raise Exception(process.stderr.decode("ascii")) - finished_pbs_jobs = [] + while self._qstat_failed_n <= SamplingPoolPBS.QSTAT_FAILED_MAX_N: + # qstat call + unknown_job_ids = [] + process = subprocess.run(qstat_call, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + try: + if process.returncode != 0: + err_output = process.stderr.decode("ascii") + # Presumably, Job Ids are 'unknown' for PBS after some time of their inactivity + unknown_job_ids = re.findall(r"Unknown Job Id (\d+)\.", err_output) + + if len(unknown_job_ids) == 0: + raise Exception(process.stderr.decode("ascii")) + + output = process.stdout.decode("ascii") + # Find all finished jobs + finished_pbs_jobs = re.findall(r"(\d+)\..*\d+ F", output) + finished_moved_pbs_jobs = re.findall(r"(\d+)\..*\d+ M.*\n.*Job finished", output) + finished_pbs_jobs.extend(finished_moved_pbs_jobs) + finished_pbs_jobs.extend(unknown_job_ids) + self._qstat_failed_n = 0 + break + except: + self._qstat_failed_n += 1 + time.sleep(30) + if self._qstat_failed_n > SamplingPoolPBS.QSTAT_FAILED_MAX_N: + raise Exception(process.stderr.decode("ascii")) + finished_pbs_jobs = [] # Get unfinished as diff between planned and finished unfinished_pbs_jobs = [] @@ -327,7 +411,7 @@ def _get_result_files(self, finished_pbs_jobs, unfinished_pbs_jobs): :return: successful_results: Dict[level_id, List[Tuple[sample_id: str, Tuple[fine_result: np.ndarray, coarse_result: n.ndarray]]]] failed_results: Dict[level_id, 
List[Tuple[sample_id: str, err_msg: str]]] n_running: int, number of running samples - times: + times: """ os.chdir(self._jobs_dir) @@ -343,6 +427,14 @@ def _get_result_files(self, finished_pbs_jobs, unfinished_pbs_jobs): successful_results = {} failed_results = {} times = {} + #sim_data_results = {} + # running_times = {} + # extract_mesh_times = {} + # make_field_times = {} + # generate_rnd_times = {} + # fine_flow_times = {} + # coarse_flow_times = {} + for pbs_id in finished_pbs_jobs: reg = "*_{}".format(pbs_id) # JobID_PbsId file file = glob.glob(reg) @@ -359,6 +451,8 @@ def _get_result_files(self, finished_pbs_jobs, unfinished_pbs_jobs): successful_results.setdefault(level_id, []).extend(results) for level_id, results in failed.items(): failed_results.setdefault(level_id, []).extend(results) + # for level_id, results in sim_data.items(): + # sim_data_results.setdefault(level_id, []).extend(results) for level_id, results in time.items(): if level_id in times: times[level_id][0] += results[-1][0] @@ -366,14 +460,38 @@ def _get_result_files(self, finished_pbs_jobs, unfinished_pbs_jobs): else: times[level_id] = list(results[-1]) + # for level_id, results in running_time.items(): + # running_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in extract_mesh.items(): + # extract_mesh_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in make_field.items(): + # make_field_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in generate_rnd.items(): + # generate_rnd_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in fine_flow.items(): + # fine_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in coarse_flow.items(): + # coarse_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] # Delete pbsID file - it means job is finished 
SamplingPoolPBS.delete_pbs_id_file(file) if self._unfinished_sample_ids: successful_results, failed_results, times = self._collect_unfinished(successful_results, - failed_results, times) + failed_results, times, + ) + # running_times + # extract_mesh_times, + # make_field_times, + # generate_rnd_times, + # fine_flow_times, + # coarse_flow_times) - return successful_results, failed_results, n_running, list(times.items()) + return successful_results, failed_results, n_running, list(times.items())#, sim_data_results def _collect_unfinished(self, successful_results, failed_results, times): """ @@ -384,42 +502,64 @@ def _collect_unfinished(self, successful_results, failed_results, times): :return: all input dictionaries """ already_collected = set() + for sample_id in self._unfinished_sample_ids: if sample_id in already_collected: continue - job_id = PbsJob.job_id_from_sample_id(sample_id, self._jobs_dir) + try: + job_id = PbsJob.job_id_from_sample_id(sample_id, self._jobs_dir) + except (FileNotFoundError, KeyError) as e: + level_id = int(re.findall(r'L0?(\d*)', sample_id)[0]) + failed_results.setdefault(level_id, []).append((sample_id, "".format(e))) + continue + successful, failed, time = PbsJob.read_results(job_id, self._jobs_dir) # Split results to levels for level_id, results in successful.items(): - for res in results: - if res[0] in self._unfinished_sample_ids: - already_collected.add(res[0]) - successful_results.setdefault(level_id, []).append(res) - - for level_id, results in failed_results.items(): - for res in results: - if res[0] in self._unfinished_sample_ids: - already_collected.add(res[0]) - failed_results.setdefault(level_id, []).append(res) - - for level_id, results in times.items(): - for res in results: - if res[0] in self._unfinished_sample_ids: - times.setdefault(level_id, []).append(res) - times[level_id] = results + successful_results.setdefault(level_id, []).extend(results) + for level_id, results in failed.items(): + 
failed_results.setdefault(level_id, []).extend(results) + for level_id, results in time.items(): + times[level_id] = results[-1] + + # for level_id, results in sim_data.items(): + # sim_data_results.setdefault(level_id, []).extend(results) + + # for level_id, results in running_time.items(): + # running_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in extract_mesh.items(): + # extract_mesh_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in make_field.items(): + # make_field_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in generate_rnd.items(): + # generate_rnd_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in fine_flow.items(): + # fine_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + # + # for level_id, results in coarse_flow.items(): + # coarse_flow_times[level_id] = [np.sum(results, axis=0)[0], results[-1][1]] + + level_id_sample_id_seed = PbsJob.get_scheduled_sample_ids(job_id, self._jobs_dir) + + for level_id, sample_id, _ in level_id_sample_id_seed: + already_collected.add(sample_id) # Delete pbsID file - it means job is finished # SamplingPoolPBS.delete_pbs_id_file(file) self._unfinished_sample_ids = set() - return successful_results, failed_results, times + return successful_results, failed_results, times#, sim_data_results def have_permanent_samples(self, sample_ids): """ - List of unfinished sample ids, the corresponding samples are collecting in next get_finished() call . 
+ List of unfinished sample ids, the corresponding samples are collecting in next get_finished() call """ self._unfinished_sample_ids = set(sample_ids) diff --git a/mlmc/tool/pbs_job.py b/mlmc/tool/pbs_job.py index a39b8646..6b8a6535 100644 --- a/mlmc/tool/pbs_job.py +++ b/mlmc/tool/pbs_job.py @@ -149,11 +149,13 @@ def calculate_samples(self): current_samples = [] # Currently saved samples start_time = time.time() + successful_samples_time = 0 times = [] # Sample calculation time - Tuple(level_id, [n samples, cumul time for n sample]) n_times = 0 successful_dest_dir = os.path.join(self._output_dir, SamplingPool.SEVERAL_SUCCESSFUL_DIR) for level_id, sample_id, seed in level_id_sample_id_seed: + start_time = time.time() # Deserialize level simulation config if level_id not in self._level_simulations: self._get_level_sim(level_id) @@ -161,9 +163,10 @@ def calculate_samples(self): # Start measuring time if current_level != level_id: # Save previous level times - times.append((current_level, time.time() - start_time, n_times)) + times.append((current_level, successful_samples_time, n_times)) n_times = 0 start_time = time.time() + successful_samples_time = 0 current_level = level_id level_sim = self._level_simulations[current_level] @@ -178,6 +181,10 @@ def calculate_samples(self): SamplingPool.move_successful_rm(sample_id, level_sim, output_dir=self._output_dir, dest_dir=SamplingPool.SEVERAL_SUCCESSFUL_DIR) + n_times += 1 + successful_samples_time += (time.time() - start_time) + print("sample time ", time.time() - start_time) + # times.append((current_level, time.time() - start_time, n_times)) else: failed.append((current_level, sample_id, err_msg)) SamplingPool.move_failed_rm(sample_id, level_sim, @@ -185,8 +192,8 @@ def calculate_samples(self): dest_dir=SamplingPool.FAILED_DIR) current_samples.append(sample_id) - n_times += 1 - times.append((current_level, time.time() - start_time, n_times)) + #n_times += 1 + times.append((current_level, successful_samples_time, 
n_times)) self._save_to_file(success, failed, times, current_samples) success = [] From 95633ded1e80f0640218d70694fa77063841eb53 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Mon, 12 Aug 2024 12:08:54 +0200 Subject: [PATCH 05/31] scikit-learn --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 765d3159..f9301bd2 100644 --- a/setup.py +++ b/setup.py @@ -61,5 +61,5 @@ def read(*names, **kwargs): # include automatically all files in the template MANIFEST.in include_package_data=True, zip_safe=False, - install_requires=['numpy', 'scipy', 'sklearn', 'h5py>=3.1.0', 'ruamel.yaml', 'attrs', 'gstools', 'memoization'], + install_requires=['numpy', 'scipy', 'scikit-learn', 'h5py>=3.1.0', 'ruamel.yaml', 'attrs', 'gstools', 'memoization'], ) From 9d94434dda74ad50eb06eec227a1d01998c2b74d Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Mon, 12 Aug 2024 12:47:41 +0200 Subject: [PATCH 06/31] hdf estimator fix --- mlmc/estimator.py | 66 ++++++++++++++++++++++++++++------------------- mlmc/tool/hdf5.py | 2 +- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/mlmc/estimator.py b/mlmc/estimator.py index 9e9fad64..b5957f80 100644 --- a/mlmc/estimator.py +++ b/mlmc/estimator.py @@ -82,7 +82,9 @@ def estimate_diff_vars_regression(self, n_created_samples, moments_fn=None, raw_ raw_vars, n_samples = self.estimate_diff_vars(moments_fn) sim_steps = np.squeeze(self._sample_storage.get_level_parameters()) + # print("sim steps ", sim_steps) vars = self._all_moments_variance_regression(raw_vars, sim_steps) + # We need to get n_ops_estimate from storage return vars, self._sample_storage.get_n_ops() @@ -95,6 +97,8 @@ def estimate_diff_vars(self, moments_fn=None): n_samples - shape L, num samples for individual levels. 
""" moments_mean = qe.estimate_mean(qe.moments(self._quantity, moments_fn)) + # print("moments_mean.l_vars ", moments_mean.l_vars) + # print("moments_mean.n_samples ", moments_mean.n_samples) return moments_mean.l_vars, moments_mean.n_samples def _all_moments_variance_regression(self, raw_vars, sim_steps): @@ -196,7 +200,7 @@ def est_bootstrap(self, n_subsamples=100, sample_vector=None, moments_fn=None): bs_l_means = [] bs_l_vars = [] for i in range(n_subsamples): - quantity_subsample = self.quantity.select(self.quantity.subsample(sample_vec=sample_vector)) + quantity_subsample = self.quantity.subsample(sample_vec=sample_vector) moments_quantity = qe.moments(quantity_subsample, moments_fn=moments_fn, mom_at_bottom=False) q_mean = qe.estimate_mean(moments_quantity) @@ -205,6 +209,9 @@ def est_bootstrap(self, n_subsamples=100, sample_vector=None, moments_fn=None): bs_l_means.append(q_mean.l_means) bs_l_vars.append(q_mean.l_vars) + self.bs_mean = bs_mean + self.bs_var = bs_var + self.mean_bs_mean = np.mean(bs_mean, axis=0) self.mean_bs_var = np.mean(bs_var, axis=0) self.mean_bs_l_means = np.mean(bs_l_means, axis=0) @@ -217,14 +224,15 @@ def est_bootstrap(self, n_subsamples=100, sample_vector=None, moments_fn=None): self._bs_level_mean_variance = self.var_bs_l_means * np.array(self._sample_storage.get_n_collected())[:, None] - def bs_target_var_n_estimated(self, target_var, sample_vec=None): + def bs_target_var_n_estimated(self, target_var, sample_vec=None, n_subsamples=100): sample_vec = determine_sample_vec(n_collected_samples=self._sample_storage.get_n_collected(), n_levels=self._sample_storage.get_n_levels(), sample_vector=sample_vec) - self.est_bootstrap(n_subsamples=300, sample_vector=sample_vec) + self.est_bootstrap(n_subsamples=n_subsamples, sample_vector=sample_vec) variances, n_ops = self.estimate_diff_vars_regression(sample_vec, raw_vars=self.mean_bs_l_vars) + n_estimated = estimate_n_samples_for_target_variance(target_var, variances, n_ops, 
n_levels=self._sample_storage.get_n_levels()) @@ -304,10 +312,17 @@ def estimate_domain(quantity, sample_storage, quantile=None): except AttributeError: print("No collected values for level {}".format(level_id)) break - chunk_spec = next(sample_storage.chunks(n_samples=sample_storage.get_n_collected()[level_id])) - fine_samples = quantity.samples(chunk_spec)[..., 0] # Fine samples at level 0 + #print("sample_storage.get_n_collected()[level_id] ", type(sample_storage.get_n_collected()[level_id])) + print("sample_storage.get_n_collected() ", type(sample_storage.get_n_collected()[0])) + + if isinstance(sample_storage.get_n_collected()[level_id], AttributeError): + print("continue") + continue + chunk_spec = next(sample_storage.chunks(level_id=level_id, n_samples=sample_storage.get_n_collected()[level_id])) + fine_samples = quantity.samples(chunk_spec)[..., 0] # Fine samples at level 0 fine_samples = np.squeeze(fine_samples) + print("fine samples ", fine_samples) fine_samples = fine_samples[~np.isnan(fine_samples)] # remove NaN ranges.append(np.percentile(fine_samples, [100 * quantile, 100 * (1 - quantile)])) @@ -399,26 +414,26 @@ def consistency_check(quantity, sample_storage=None): return cons_check_val -def estimate_domain(quantity, sample_storage, quantile=None): - """ - Estimate moments domain from MLMC samples. 
- :param quantity: mlmc.quantity.Quantity instance, represents the real quantity - :param sample_storage: mlmc.sample_storage.SampleStorage instance, provides all the samples - :param quantile: float in interval (0, 1), None means whole sample range - :return: lower_bound, upper_bound - """ - ranges = [] - if quantile is None: - quantile = 0.01 - - for level_id in range(sample_storage.get_n_levels()): - fine_samples = quantity.samples(ChunkSpec(level_id=level_id, n_samples=sample_storage.get_n_collected()[0]))[..., 0] - - fine_samples = np.squeeze(fine_samples) - ranges.append(np.percentile(fine_samples, [100 * quantile, 100 * (1 - quantile)])) - - ranges = np.array(ranges) - return np.min(ranges[:, 0]), np.max(ranges[:, 1]) +# def estimate_domain(quantity, sample_storage, quantile=None): +# """ +# Estimate moments domain from MLMC samples. +# :param quantity: mlmc.quantity.Quantity instance, represents the real quantity +# :param sample_storage: mlmc.sample_storage.SampleStorage instance, provides all the samples +# :param quantile: float in interval (0, 1), None means whole sample range +# :return: lower_bound, upper_bound +# """ +# ranges = [] +# if quantile is None: +# quantile = 0.01 +# +# for level_id in range(sample_storage.get_n_levels()): +# fine_samples = quantity.samples(ChunkSpec(level_id=level_id, n_samples=sample_storage.get_n_collected()[0]))[..., 0] +# +# fine_samples = np.squeeze(fine_samples) +# ranges.append(np.percentile(fine_samples, [100 * quantile, 100 * (1 - quantile)])) +# +# ranges = np.array(ranges) +# return np.min(ranges[:, 0]), np.max(ranges[:, 1]) def coping_with_high_kurtosis(vars, costs, kurtosis, kurtosis_threshold=100): @@ -531,4 +546,3 @@ def determine_n_samples(n_levels, n_samples=None): return n_samples - diff --git a/mlmc/tool/hdf5.py b/mlmc/tool/hdf5.py index 62a9ac99..32e24149 100644 --- a/mlmc/tool/hdf5.py +++ b/mlmc/tool/hdf5.py @@ -357,7 +357,7 @@ def chunks(self, n_samples=None): dataset = 
hdf_file["/".join([self.level_group_path, "collected_values"])] if n_samples is not None: - yield ChunkSpec(chunk_id=0, chunk_slice=slice(0, n_samples, ...), level_id=int(self.level_id)) + yield ChunkSpec(chunk_id=0, chunk_slice=slice(0, n_samples, 1), level_id=int(self.level_id)) else: for chunk_id, chunk in enumerate(dataset.iter_chunks()): yield ChunkSpec(chunk_id=chunk_id, chunk_slice=chunk[0], level_id=int(self.level_id)) # slice, level_id From edf1f5a4160eb68e34879024acf4904fdc0ced8e Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Mon, 12 Aug 2024 13:13:05 +0200 Subject: [PATCH 07/31] srf --- mlmc/random/correlated_field.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mlmc/random/correlated_field.py b/mlmc/random/correlated_field.py index b5c809a7..cebd1572 100644 --- a/mlmc/random/correlated_field.py +++ b/mlmc/random/correlated_field.py @@ -198,7 +198,9 @@ def set_points(self, points, region_ids=[], region_map={}): :return: """ self.n_elements = len(points) - assert len(points) == len(region_ids) + print("n elements: {}, len(points): {}".format(self.n_elements, len(points))) + + #assert len(points) == len(region_ids) reg_points = {} for i, reg_id in enumerate(region_ids): reg_list = reg_points.get(reg_id, []) @@ -504,14 +506,14 @@ def _sample(self): class GSToolsSpatialCorrelatedField(RandomFieldBase): - def __init__(self, model, mode_no=1000, log=False, sigma=1): + def __init__(self, model, mode_no=1000, log=False, sigma=1, seed=None): """ :param model: instance of covariance model class, which parent is gstools.covmodel.CovModel :param mode_no: number of Fourier modes, default: 1000 as in gstools package """ self.model = model self.mode_no = mode_no - self.srf = gstools.SRF(model, mode_no=mode_no) + self.srf = gstools.SRF(model, mode_no=mode_no, seed=seed) self.mu = self.srf.mean self.sigma = sigma self.dim = model.dim @@ -753,7 +755,3 @@ def _sample(self): :return: Random field evaluated in points given by 
'set_points'. """ return self.random_field() - - if not self.log: - return field - return np.exp(field) From 6e06ecc6e8a3640a05b3dc8bfd68136b23f89051 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 21 Aug 2024 14:34:09 +0200 Subject: [PATCH 08/31] ruamel.yaml lower version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ad19c689..83fe5a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy scipy sklearn h5py>=3.1.0 -ruamel.yaml +ruamel.yaml<0.18.0 attrs gstools memoization From bc22fa969fb4f56ceec5ffe181f182aa42ab415c Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 21 Aug 2024 14:48:10 +0200 Subject: [PATCH 09/31] remove py36 from tox --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 7a4d4ef6..704732ff 100644 --- a/tox.ini +++ b/tox.ini @@ -3,12 +3,11 @@ # content of: tox.ini , put in same dir as setup.py [tox] -envlist = py36, py37, py38 +envlist = py37, py38 #envlist = py36 [gh-actions] python = - 3.6: py36 3.7: py37 3.8: py38 From b6f5df1e43806935bf7c2a0264f92e0842f24a30 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 21 Aug 2024 15:10:21 +0200 Subject: [PATCH 10/31] ruaml.yaml 0.17 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 83fe5a4a..8931ac1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy scipy sklearn h5py>=3.1.0 -ruamel.yaml<0.18.0 +ruamel.yaml==0.17.26 attrs gstools memoization From 9bc3168df84b64d69bf662a2247d537d3ecc2675 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Mon, 14 Jul 2025 15:37:39 +0200 Subject: [PATCH 11/31] error msg print --- mlmc/sampling_pool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlmc/sampling_pool.py b/mlmc/sampling_pool.py index 0d402325..cbbbb360 100644 --- a/mlmc/sampling_pool.py +++ b/mlmc/sampling_pool.py @@ -122,6 +122,7 @@ 
def calculate_sample(sample_id, level_sim, work_dir=None, seed=None): except Exception: str_list = traceback.format_exception(*sys.exc_info()) err_msg = "".join(str_list) + print("Error msg: ", err_msg) return sample_id, res, err_msg, running_time From b2b0a9038458fa4cfe2e6b8c69e067f8f8b05b40 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 16 Jul 2025 10:51:07 +0200 Subject: [PATCH 12/31] tox update --- mlmc/sample_storage_hdf.py | 2 -- mlmc/sim/synth_simulation.py | 3 ++- test/test_hdf.py | 4 ++-- test/test_sampling_pools.py | 4 +++- tox.ini | 10 +++------- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/mlmc/sample_storage_hdf.py b/mlmc/sample_storage_hdf.py index 68b1af9d..a53a8c64 100644 --- a/mlmc/sample_storage_hdf.py +++ b/mlmc/sample_storage_hdf.py @@ -4,8 +4,6 @@ from mlmc.sample_storage import SampleStorage from mlmc.quantity.quantity_spec import QuantitySpec, ChunkSpec import mlmc.tool.hdf5 as hdf -import warnings -warnings.simplefilter("ignore", np.VisibleDeprecationWarning) class SampleStorageHDF(SampleStorage): diff --git a/mlmc/sim/synth_simulation.py b/mlmc/sim/synth_simulation.py index 53417219..0a5176c6 100644 --- a/mlmc/sim/synth_simulation.py +++ b/mlmc/sim/synth_simulation.py @@ -1,5 +1,5 @@ import os -import ruamel.yaml as yaml +import ruamel.yaml as ruyaml import numpy as np from typing import List import scipy.stats as stats @@ -291,6 +291,7 @@ def n_ops_estimate(self, step): @staticmethod def _read_config(): with open(os.path.join(os.getcwd(), SynthSimulationWorkspace.CONFIG_FILE)) as file: + yaml = ruyaml.YAML(typ='rt') config = yaml.load(file) return config diff --git a/test/test_hdf.py b/test/test_hdf.py index 8778a60b..f06ec98e 100644 --- a/test/test_hdf.py +++ b/test/test_hdf.py @@ -87,10 +87,10 @@ def load_from_file(hdf_obj, obligatory_attributes): SCHEDULED_SAMPLES = ['L00_S0000000', 'L00_S0000001', 'L00_S0000002', 'L00_S0000003', 'L00_S0000004'] -RESULT_DATA_DTYPE = [("value", np.float), ("time", np.float)] 
+RESULT_DATA_DTYPE = [("value", np.float64), ("time", np.float64)] COLLECTED_SAMPLES = np.array([['L00S0000000', (np.array([10, 20]), np.array([5, 6]))], - ['L00S0000001', (np.array([1, 2]), np.array([50, 60]))]]) + ['L00S0000001', (np.array([1, 2]), np.array([50, 60]))]], dtype=object) diff --git a/test/test_sampling_pools.py b/test/test_sampling_pools.py index 7d87251f..f3b72361 100644 --- a/test/test_sampling_pools.py +++ b/test/test_sampling_pools.py @@ -33,7 +33,9 @@ simulation_config = dict(distr='norm', complexity=2, nan_fraction=failed_fraction, sim_method='_sample_fn') with open('synth_sim_config_test.yaml', "w") as file: - yaml.dump(simulation_config, file, default_flow_style=False) + yaml = yaml.YAML(typ='full') + yaml.dump(simulation_config, file) + shutil.copyfile('synth_sim_config_test.yaml', os.path.join(work_dir, 'synth_sim_config.yaml')) sim_config_workspace = {"config_yaml": os.path.join(work_dir, 'synth_sim_config.yaml')} diff --git a/tox.ini b/tox.ini index 704732ff..71e6f5f3 100644 --- a/tox.ini +++ b/tox.ini @@ -1,15 +1,11 @@ - - - # content of: tox.ini , put in same dir as setup.py [tox] -envlist = py37, py38 -#envlist = py36 +envlist = py310, py312 [gh-actions] python = - 3.7: py37 - 3.8: py38 + 3.10: py310 + 3.12: py312 [testenv] From a12dfc4bffbdd47e930b34e963aba044ff7131a9 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 16 Jul 2025 10:55:29 +0200 Subject: [PATCH 13/31] gh actions python version update --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index e3bddcb2..9b13faa5 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.10, 3.12] steps: - uses: actions/checkout@v2 From 7f16981868f0a27d64039603a1050dd7d8e956d4 Mon Sep 17 00:00:00 2001 From: 
martinspetlik Date: Wed, 16 Jul 2025 10:58:25 +0200 Subject: [PATCH 14/31] gh actions python version update fix --- .github/workflows/pythonpackage.yml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 9b13faa5..6da8b668 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10, 3.12] + python-version: [3.10.18, 3.12] steps: - uses: actions/checkout@v2 diff --git a/requirements.txt b/requirements.txt index 8931ac1d..f57915c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy scipy -sklearn +scikit-learn h5py>=3.1.0 ruamel.yaml==0.17.26 attrs From d15e24e92f1498f6a646b7295a60b7f8eceaef68 Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 16 Jul 2025 11:26:43 +0200 Subject: [PATCH 15/31] sampler test ignore rm err --- test/test_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_sampler.py b/test/test_sampler.py index d559b6ea..e03dab97 100644 --- a/test/test_sampler.py +++ b/test/test_sampler.py @@ -48,7 +48,7 @@ def test_sampler_hdf(): work_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '_test_tmp') if os.path.exists(work_dir): - shutil.rmtree(work_dir) + shutil.rmtree(work_dir, ignore_errors=True) os.makedirs(work_dir) file_path = os.path.join(work_dir, "mlmc_test.hdf5") storage = mlmc.SampleStorageHDF(file_path=file_path) From b3d6367dbcf174bbbf579e1d4348704111c5d0fc Mon Sep 17 00:00:00 2001 From: martinspetlik Date: Wed, 15 Oct 2025 14:28:35 +0200 Subject: [PATCH 16/31] improved comments generated --- mlmc/level_simulation.py | 39 +- mlmc/moments.py | 249 +++++++-- mlmc/plot/diagnostic_plots.py | 187 +++++-- mlmc/plot/violinplot.py | 145 ++++- mlmc/quantity/quantity.py | 459 ++++++++++------ mlmc/quantity/quantity_estimate.py | 192 ++++--- 
mlmc/quantity/quantity_spec.py | 46 +- mlmc/quantity/quantity_types.py | 276 +++++++--- mlmc/random/correlated_field.py | 571 ++++++++------------ mlmc/random/frac_geom.py | 140 ----- mlmc/sample_storage.py | 88 +-- mlmc/sample_storage_hdf.py | 204 ++++--- mlmc/sampler.py | 203 ++++--- mlmc/sampling_pool.py | 419 +++++++++----- mlmc/sim/simulation.py | 42 +- mlmc/sim/synth_simulation.py | 225 ++++++++ mlmc/tool/context_statprof.py | 13 - mlmc/tool/distribution.py | 39 -- mlmc/tool/flow_mc.py | 287 +++++----- mlmc/tool/gmsh_io.py | 180 ++++--- mlmc/tool/hdf5.py | 293 ++++++---- mlmc/tool/pbs_job.py | 227 ++++---- mlmc/tool/process_base.py | 251 ++++----- mlmc/tool/simple_distribution.py | 840 +++++++++++------------------ mlmc/tool/stats_tests.py | 78 +-- 25 files changed, 3176 insertions(+), 2517 deletions(-) delete mode 100644 mlmc/random/frac_geom.py delete mode 100644 mlmc/tool/context_statprof.py diff --git a/mlmc/level_simulation.py b/mlmc/level_simulation.py index 67c5c23d..593f28c8 100644 --- a/mlmc/level_simulation.py +++ b/mlmc/level_simulation.py @@ -1,34 +1,37 @@ import attr -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional, Callable from mlmc.quantity.quantity_spec import QuantitySpec @attr.s(auto_attribs=True) class LevelSimulation: """ - This class is used to pass simulation data at a given level between a Sampler and a SamplingPool - User shouldn't change this class + Class for passing simulation configuration and metadata for a given level between + a Sampler and a SamplingPool. + + User shouldn't modify this class manually. """ + config_dict: Dict[Any, Any] - # Calculate configuration. + # Level-specific simulation configuration dictionary. - common_files: List[str] = None - # List of files in the level workspace to copy/symlink to the sample workspace. + common_files: Optional[List[str]] = None + # List of files in the level workspace to copy or symlink to the sample workspace. 
need_sample_workspace: bool = False - # If the simulation needs sample workspace at all. + # Whether the simulation requires an individual workspace for each sample. - task_size: int = 0 - # Relative size of the simulation at this level. - # When using PBS, keep in mind that the pbs job size is the sum of task_sizes, and if this sum is above 1, - # the job is scheduled and PBS scheduler manages it + task_size: float = 0.0 + # Relative size (or computational cost) of the simulation task at this level. + # When using PBS or SLURM, note that the job size is the sum of task_sizes. + # If this sum exceeds 1.0, the job is queued and scheduled by the system. - ### User shouldn't modify the following attributes ### - _calculate: Any = None - # Calculate method + ### Internal attributes — users should not modify these ### + _calculate: Optional[Callable] = None + # Calculation method used internally by the sampler. - _level_id: int = None - # Level id is set by mlmc.sampler.Sampler. It is internal variable and user shouldn't change it. + _level_id: Optional[int] = None + # Level identifier, set automatically by mlmc.sampler.Sampler. - _result_format: List[QuantitySpec] = None - # Simulation result format + _result_format: Optional[List[QuantitySpec]] = None + # Format specification for simulation results (defined by QuantitySpec instances). diff --git a/mlmc/moments.py b/mlmc/moments.py index 2329566e..ba1f932a 100644 --- a/mlmc/moments.py +++ b/mlmc/moments.py @@ -5,9 +5,26 @@ class Moments: """ - Class for calculating moments of a random variable + Base class for computing moment functions of a random variable. + + Provides transformation, scaling, and evaluation utilities common + to various types of generalized moment bases (monomial, Fourier, Legendre, etc.). """ + def __init__(self, size, domain, log=False, safe_eval=True): + """ + Initialize the moment function set. + + :param size: int + Number of moment functions. 
+ :param domain: tuple(float, float) + Domain of the input variable (min, max). + :param log: bool + If True, use logarithmic transformation of the domain. + :param safe_eval: bool + If True, clip transformed values outside the reference domain + and replace them with NaN. + """ assert size > 0 self.size = size self.domain = domain @@ -25,6 +42,7 @@ def __init__(self, size, domain, log=False, safe_eval=True): self._linear_scale = (self.ref_domain[1] - self.ref_domain[0]) / diff self._linear_shift = lin_domain[0] + # Define transformation and inverse transformation functions if safe_eval and log: self.transform = lambda val: self.clip(self.linear(np.log(val))) self.inv_transform = lambda ref: np.exp(self.inv_linear(ref)) @@ -40,78 +58,153 @@ def __init__(self, size, domain, log=False, safe_eval=True): def __eq__(self, other): """ - Compare two moment functions. Equal if they returns same values. + Compare two Moments objects for equality. + + :param other: Moments + Another Moments instance. + :return: bool + True if both instances have the same parameters and configuration. """ - return type(self) is type(other) \ - and self.size == other.size \ - and np.all(self.domain == other.domain) \ - and self._is_log == other._is_log \ - and self._is_clip == other._is_clip + return ( + type(self) is type(other) + and self.size == other.size + and np.all(self.domain == other.domain) + and self._is_log == other._is_log + and self._is_clip == other._is_clip + ) def change_size(self, size): """ - Return moment object with different size. - :param size: int, new number of _moments_fn + Return a new moment object with a different number of basis functions. + + :param size: int + New number of moment functions. + :return: Moments + New instance of the same class with updated size. 
""" return self.__class__(size, self.domain, self._is_log, self._is_clip) def clip(self, value): """ - Remove outliers and replace them with NaN - :param value: array of numbers - :return: masked_array, out + Clip values to the reference domain, replacing outliers with NaN. + + :param value: array-like + Input data to be clipped. + :return: ndarray + Array with out-of-bound values replaced by NaN. """ - # Masked array out = ma.masked_outside(value, self.ref_domain[0], self.ref_domain[1]) - # Replace outliers with NaN return ma.filled(out, np.nan) def linear(self, value): + """Apply linear transformation to reference domain.""" return (value - self._linear_shift) * self._linear_scale + self.ref_domain[0] def inv_linear(self, value): + """Inverse linear transformation back to the original domain.""" return (value - self.ref_domain[0]) / self._linear_scale + self._linear_shift def __call__(self, value): + """Evaluate all moment functions for the given value(s).""" return self._eval_all(value, self.size) def eval(self, i, value): - return self._eval_all(value, i+1)[:, -1] + """ + Evaluate the i-th moment function. + + :param i: int + Index of the moment function to evaluate (0-based). + :param value: float or array-like + Input value(s). + :return: ndarray + Values of the i-th moment function. + """ + return self._eval_all(value, i + 1)[:, -1] def eval_single_moment(self, i, value): """ - Be aware this implementation is inefficient for large i - :param i: int, order of moment - :param value: float - :return: np.ndarray + Evaluate a single moment function (less efficient for large i). + + :param i: int + Order of the moment. + :param value: float or array-like + Input value(s). + :return: ndarray + Evaluated moment values. """ - return self._eval_all(value, i+1)[..., i] + return self._eval_all(value, i + 1)[..., i] def eval_all(self, value, size=None): + """ + Evaluate all moments up to the specified size. + + :param value: float or array-like + Input value(s). 
+ :param size: int or None + Number of moments to evaluate. If None, use self.size. + :return: ndarray + Matrix of evaluated moments. + """ if size is None: size = self.size return self._eval_all(value, size) def eval_all_der(self, value, size=None, degree=1): + """ + Evaluate derivatives of all moment functions. + + :param value: float or array-like + Input value(s). + :param size: int or None + Number of moments to evaluate. + :param degree: int + Derivative degree (1 for first derivative, etc.). + :return: ndarray + Matrix of evaluated derivatives. + """ if size is None: size = self.size return self._eval_all_der(value, size, degree) def eval_diff(self, value, size=None): + """ + Evaluate first derivatives of all moment functions. + + :param value: float or array-like + Input value(s). + :param size: int or None + Number of moments to evaluate. + :return: ndarray + Matrix of first derivatives. + """ if size is None: size = self.size return self._eval_diff(value, size) def eval_diff2(self, value, size=None): + """ + Evaluate second derivatives of all moment functions. + + :param value: float or array-like + Input value(s). + :param size: int or None + Number of moments to evaluate. + :return: ndarray + Matrix of second derivatives. + """ if size is None: size = self.size return self._eval_diff2(value, size) +# ------------------------------------------------------------------------- +# Specific moment types +# ------------------------------------------------------------------------- class Monomial(Moments): """ - Monomials generalized moments + Monomial basis functions for generalized moment evaluation. 
""" + def __init__(self, size, domain=(0, 1), ref_domain=None, log=False, safe_eval=True): if ref_domain is not None: self.ref_domain = ref_domain @@ -120,33 +213,49 @@ def __init__(self, size, domain=(0, 1), ref_domain=None, log=False, safe_eval=Tr super().__init__(size, domain, log=log, safe_eval=safe_eval) def _eval_all(self, value, size): - # Create array from values and transform values outside the ref domain + """ + Evaluate monomial basis (Vandermonde matrix). + + :param value: array-like + Input values. + :param size: int + Number of moments to compute. + :return: ndarray + Vandermonde matrix of monomials. + """ t = self.transform(np.atleast_1d(value)) - # Vandermonde matrix return np.polynomial.polynomial.polyvander(t, deg=size - 1) def eval(self, i, value): + """Evaluate the i-th monomial t^i.""" t = self.transform(np.atleast_1d(value)) - return t**i + return t ** i class Fourier(Moments): """ - Fourier functions generalized moments + Fourier basis functions for generalized moment evaluation. """ - def __init__(self, size, domain=(0, 2*np.pi), ref_domain=None, log=False, safe_eval=True): + + def __init__(self, size, domain=(0, 2 * np.pi), ref_domain=None, log=False, safe_eval=True): if ref_domain is not None: self.ref_domain = ref_domain else: - self.ref_domain = (0, 2*np.pi) - + self.ref_domain = (0, 2 * np.pi) super().__init__(size, domain, log=log, safe_eval=safe_eval) def _eval_all(self, value, size): - # Transform values + """ + Evaluate Fourier moment basis (cosine/sine terms). + + :param value: array-like + Input values. + :param size: int + Number of moments to compute. + :return: ndarray + Matrix of evaluated Fourier functions. 
+ """ t = self.transform(np.atleast_1d(value)) - - # Half the number of moments R = int(size / 2) shorter_sin = 1 - int(size % 2) k = np.arange(1, R + 1) @@ -154,26 +263,33 @@ def _eval_all(self, value, size): res = np.empty((len(t), size)) res[:, 0] = 1 - - # Odd column index res[:, 1::2] = np.cos(kx[:, :]) - # Even column index res[:, 2::2] = np.sin(kx[:, : R - shorter_sin]) return res def eval(self, i, value): + """ + Evaluate a single Fourier basis function. + + :param i: int + Index of the moment function. + :param value: float or array-like + Input values. + :return: ndarray + Evaluated function values. + """ t = self.transform(np.atleast_1d(value)) if i == 0: return 1 elif i % 2 == 1: - return np.sin( (i - 1) / 2 * t) + return np.sin((i - 1) / 2 * t) else: return np.cos(i / 2 * t) class Legendre(Moments): """ - Legendre polynomials generalized moments + Legendre polynomial basis functions for generalized moments. """ def __init__(self, size, domain, ref_domain=None, log=False, safe_eval=True): @@ -182,6 +298,7 @@ def __init__(self, size, domain, ref_domain=None, log=False, safe_eval=True): else: self.ref_domain = (-1, 1) + # Precompute derivative matrices self.diff_mat = np.zeros((size, size)) for n in range(size - 1): self.diff_mat[n, n + 1::2] = 2 * n + 1 @@ -190,19 +307,26 @@ def __init__(self, size, domain, ref_domain=None, log=False, safe_eval=True): super().__init__(size, domain, log, safe_eval) def _eval_value(self, x, size): - return np.polynomial.legendre.legvander(x, deg=size-1) + """Evaluate Legendre polynomials up to the given order.""" + return np.polynomial.legendre.legvander(x, deg=size - 1) def _eval_all(self, value, size): + """Evaluate all Legendre polynomials.""" value = self.transform(np.atleast_1d(value)) return np.polynomial.legendre.legvander(value, deg=size - 1) def _eval_all_der(self, value, size, degree=1): """ - Derivative of Legendre polynomials - :param value: values to evaluate - :param size: number of moments - :param degree: 
degree of derivative - :return: + Evaluate derivatives of Legendre polynomials. + + :param value: array-like + Points at which to evaluate. + :param size: int + Number of moment functions. + :param degree: int + Derivative order. + :return: ndarray + Matrix of derivative values. """ value = self.transform(np.atleast_1d(value)) eval_values = np.empty((value.shape + (size,))) @@ -211,7 +335,7 @@ def _eval_all_der(self, value, size, degree=1): if s == 0: coef = [1] else: - coef = np.zeros(s+1) + coef = np.zeros(s + 1) coef[-1] = 1 coef = np.polynomial.legendre.legder(coef, degree) @@ -219,26 +343,38 @@ def _eval_all_der(self, value, size, degree=1): return eval_values def _eval_diff(self, value, size): + """Evaluate first derivatives using precomputed differentiation matrix.""" t = self.transform(np.atleast_1d(value)) P_n = np.polynomial.legendre.legvander(t, deg=size - 1) return P_n @ self.diff_mat def _eval_diff2(self, value, size): + """Evaluate second derivatives using precomputed differentiation matrix.""" t = self.transform(np.atleast_1d(value)) P_n = np.polynomial.legendre.legvander(t, deg=size - 1) return P_n @ self.diff2_mat class TransformedMoments(Moments): + """ + Linearly transformed moment basis. + + Creates a new set of moment functions as linear combinations + of another existing set of basis functions. + """ + def __init__(self, other_moments, matrix): """ - Set a new moment functions as linear combination of the previous. - new_moments = matrix . old_moments + Initialize transformed moment functions. + + :param other_moments: Moments + Original set of moment functions. + :param matrix: ndarray + Linear transformation matrix where: + new_moments = matrix @ old_moments - We assume that new_moments[0] is still == 1. That means - first row of the matrix must be (1, 0 , ...). - :param other_moments: Original _moments_fn. - :param matrix: Linear combinations of the original _moments_fn. 
+ The first row must correspond to (1, 0, 0, ...), + ensuring that new_moments[0] = 1. """ n, m = matrix.shape assert m == other_moments.size @@ -248,27 +384,34 @@ def __init__(self, other_moments, matrix): self._transform = matrix def __eq__(self, other): - return type(self) is type(other) \ - and self.size == other.size \ - and self._origin == other._origin \ - and np.all(self._transform == other._transform) + """Check equality with another TransformedMoments object.""" + return ( + type(self) is type(other) + and self.size == other.size + and self._origin == other._origin + and np.all(self._transform == other._transform) + ) def _eval_all(self, value, size): + """Evaluate all transformed moment functions.""" orig_moments = self._origin._eval_all(value, self._origin.size) x1 = np.matmul(orig_moments, self._transform.T) return x1[..., :size] def _eval_all_der(self, value, size, degree=1): + """Evaluate derivatives of transformed moment functions.""" orig_moments = self._origin._eval_all_der(value, self._origin.size, degree=degree) x1 = np.matmul(orig_moments, self._transform.T) return x1[..., :size] def _eval_diff(self, value, size): + """Evaluate first derivatives of transformed moment functions.""" orig_moments = self._origin.eval_diff(value, self._origin.size) x1 = np.matmul(orig_moments, self._transform.T) return x1[..., :size] def _eval_diff2(self, value, size): + """Evaluate second derivatives of transformed moment functions.""" orig_moments = self._origin.eval_diff2(value, self._origin.size) x1 = np.matmul(orig_moments, self._transform.T) return x1[..., :size] diff --git a/mlmc/plot/diagnostic_plots.py b/mlmc/plot/diagnostic_plots.py index 5d6a844e..cefd0520 100644 --- a/mlmc/plot/diagnostic_plots.py +++ b/mlmc/plot/diagnostic_plots.py @@ -1,97 +1,166 @@ import numpy as np -import scipy.stats as st -from scipy import interpolate import matplotlib matplotlib.rcParams.update({'font.size': 22}) -from matplotlib.patches import Patch import matplotlib.pyplot as 
plt -# def log_var_level(variances, l_vars, err_variances=0, err_l_vars=0, moments=[1,2,3,4]): -# fig, ax1 = plt.subplots(figsize=(8, 5)) -# for m in moments: -# # line1, = ax1.errorbar(np.log2(variances[m]), yerr=err_variances, label="m={}".format(m), marker="o") -# # line2, = ax1.errorbar(np.log2(l_vars[:, m]), yerr=err_l_vars, label="m={}".format(m), marker="s") -# #line1, = ax1.plot(np.log2(variances[m]), label="m={}".format(m), marker="o") -# line2, = ax1.plot(np.log2(l_vars[:, m]), label="m={}".format(m), marker="s") -# -# ax1.set_ylabel('log' + r'$_2$' + 'variance') -# ax1.set_xlabel('level' + r'$l$') -# plt.legend() -# #plt.savefig("MLMC_cost_saves.pdf") -# plt.show() +def log_var_per_level(l_vars, levels=None, moments=[0], err_l_vars=None): + """ + Plot log₂ of variance per level and fit a slope to estimate the decay rate β. + The function plots the base-2 logarithm of the variance for each level + and fits a linear model to estimate the convergence rate β, based on + the slope of log₂(variance) vs. level. -def log_var_per_level(l_vars, err_variances=0, err_l_vars=0, moments=[1, 2, 3, 4]): - fig, ax1 = plt.subplots(figsize=(8, 5)) - for m in moments: - # line1, = ax1.errorbar(np.log2(variances[m]), yerr=err_variances, label="m={}".format(m), marker="o") - # line2, = ax1.errorbar(np.log2(l_vars[:, m]), yerr=err_l_vars, label="m={}".format(m), marker="s") - #line1, = ax1.plot(np.log2(variances[m]), label="m={}".format(m), marker="o") - line2, = ax1.plot(np.log2(l_vars[:, m]), label="m={}".format(m), marker="s") - - ax1.set_ylabel('log' + r'$_2$' + 'variance') - ax1.set_xlabel('level' + r'$l$') - plt.legend() - #plt.savefig("MLMC_cost_saves.pdf") - plt.show() + :param l_vars: Array of shape (n_levels, n_moments) representing + the variance of each moment at each level. + :param levels: Optional array of level indices (default: np.arange(n_levels)). + :param moments: List of moment indices to include in the plot. 
+ :param err_l_vars: Optional array of errors corresponding to l_vars. + :return: None + """ + n_levels = l_vars.shape[0] + if levels is None: + levels = np.arange(n_levels) + fig, ax = plt.subplots(figsize=(8, 5)) -# def log_mean_level(means, l_means, err_means=0, err_l_means=0, moments=[1,2,3,4]): -# fig, ax1 = plt.subplots(figsize=(8, 5)) -# for m in moments: -# # line1, = ax1.errorbar(np.log2(variances[m]), yerr=err_variances, label="m={}".format(m), marker="o") -# # line2, = ax1.errorbar(np.log2(l_vars[:, m]), yerr=err_l_vars, label="m={}".format(m), marker="s") -# #line1, = ax1.plot(np.log2(variances[m]), label="m={}".format(m), marker="o") -# line2, = ax1.plot(np.log2(np.abs(l_means[:, m])), label="m={}".format(m), marker="s") -# -# ax1.set_ylabel('log' + r'$_2$' + 'mean') -# ax1.set_xlabel('level' + r'$l$') -# plt.legend() -# #plt.savefig("MLMC_cost_saves.pdf") -# plt.show() + for m in moments: + y = np.log2(l_vars[:, m]) + ax.plot(levels, y, 'o-', label=f'm={m}') + + slope, intercept = np.polyfit(levels, y, 1) + beta = -slope + ax.plot( + levels, + slope * levels + intercept, + '--', + label=f'fit m={m}: slope={slope:.2f}, beta≈{beta:.2f}' + ) + + ax.set_ylabel(r'$\log_2 \, V_\ell$') + ax.set_xlabel('level $\ell$') + ax.legend() + ax.grid(True, which="both") + plt.tight_layout() + plt.show() def log_mean_per_level(l_means, err_means=0, err_l_means=0, moments=[1, 2, 3, 4]): + """ + Plot log₂ of absolute mean per level for specified statistical moments. + + :param l_means: Array of mean values per level and moment. + :param err_means: Optional array of mean estimation errors (unused). + :param err_l_means: Optional array of level-mean estimation errors (unused). + :param moments: List of moment indices to include in the plot. 
+ :return: None + """ fig, ax1 = plt.subplots(figsize=(8, 5)) + print("l means ", l_means) for m in moments: - # line1, = ax1.errorbar(np.log2(variances[m]), yerr=err_variances, label="m={}".format(m), marker="o") - # line2, = ax1.errorbar(np.log2(l_vars[:, m]), yerr=err_l_vars, label="m={}".format(m), marker="s") - #line1, = ax1.plot(np.log2(variances[m]), label="m={}".format(m), marker="o") - line2, = ax1.plot(np.log2(np.abs(l_means[:, m])), label="m={}".format(m), marker="s") + line2, = ax1.plot(np.log2(np.abs(l_means[:, m])), label=f"m={m}", marker="s") ax1.set_ylabel('log' + r'$_2$' + 'mean') ax1.set_xlabel('level' + r'$l$') plt.legend() - #plt.savefig("MLMC_cost_saves.pdf") + plt.tight_layout() + plt.show() + + +def sample_cost_per_level(costs, levels=None): + """ + Plot log₂ of sample cost per level and fit a slope to estimate γ. + + The slope of the linear regression line provides an estimate of the + cost scaling parameter γ. + + :param costs: Array of computational costs per sample for each level. + :param levels: Optional array of level indices (default: 0, 1, ...). + :return: Estimated γ (float), the slope of the fitted line. + """ + n_levels = len(costs) + if levels is None: + levels = np.arange(n_levels) + + y = np.log2(costs) + slope, intercept = np.polyfit(levels, y, 1) + gamma = slope + + fig, ax = plt.subplots(figsize=(8, 5)) + ax.plot(levels, y, 'o-', label='log2(cost)') + ax.plot( + levels, + slope * levels + intercept, + '--', + label=f'fit: slope={slope:.2f}, gamma≈{gamma:.2f}' + ) + + ax.set_ylabel(r'$\log_2 \, C_\ell$') + ax.set_xlabel('level $\ell$') + ax.legend() + ax.grid(True, which="both") + plt.tight_layout() plt.show() + return gamma + + +def variance_to_cost_ratio(l_vars, costs, moments=[1, 2, 3, 4]): + """ + Plot the log₂ of variance-to-cost ratio per level for given statistical moments. 
-def sample_cost_per_level(costs): + The ratio Vₗ/Cₗ is computed for each level, and the slope of its + log₂-linear fit indicates the decay behavior relative to computational cost. + + :param l_vars: Array of variances per level and moment (shape: n_levels × n_moments). + :param costs: Array of costs per sample for each level. + :param moments: List of moment indices to include in the plot. + :return: None + """ + print("l_vars ", l_vars) + print(costs) + n_levels = l_vars.shape[0] + levels = np.arange(n_levels) fig, ax1 = plt.subplots(figsize=(8, 5)) - line2, = ax1.plot(np.log2(costs), marker="s") + print('costs ', costs) + print("levels ", levels) + for m in moments: + line2, = ax1.plot(np.log2(l_vars[:, m] / costs), label=f"m={m}", marker="s") - ax1.set_ylabel('log' + r'$_2$' + 'cost per sample') + print("l vars ", l_vars[:, m]) + print("np.log2(l_vars[:, m]/costs) ", np.log2(l_vars[:, m] / costs)) + + # Fit a straight line: log2(V/C) ≈ a + b * level + coeffs = np.polyfit(levels, np.log2(l_vars[:, m] / costs), 1) + slope, intercept = coeffs[0], coeffs[1] + ax1.plot(levels, slope * levels + intercept, '--', label=f'fit: slope={slope:.2f}') + + ax1.set_ylabel('log' + r'$_2$' + 'variance to cost ratio') ax1.set_xlabel('level' + r'$l$') plt.legend() - #plt.savefig("MLMC_cost_saves.pdf") + plt.tight_layout() plt.show() def kurtosis_per_level(means, l_means, err_means=0, err_l_means=0, moments=[1, 2, 3, 4]): + """ + Plot log₂ of mean values per level (often used for analyzing kurtosis trends). + + :param means: Array of global mean values per moment (unused in plotting). + :param l_means: Array of level-wise mean values per moment. + :param err_means: Optional array of mean estimation errors (unused). + :param err_l_means: Optional array of level-mean estimation errors (unused). + :param moments: List of moment indices to include in the plot. 
+ :return: None + """ fig, ax1 = plt.subplots(figsize=(8, 5)) for m in moments: - # line1, = ax1.errorbar(np.log2(variances[m]), yerr=err_variances, label="m={}".format(m), marker="o") - # line2, = ax1.errorbar(np.log2(l_vars[:, m]), yerr=err_l_vars, label="m={}".format(m), marker="s") - #line1, = ax1.plot(np.log2(variances[m]), label="m={}".format(m), marker="o") - line2, = ax1.plot(np.log2(np.abs(l_means[:, m])), label="m={}".format(m), marker="s") + line2, = ax1.plot(np.log2(np.abs(l_means[:, m])), label=f"m={m}", marker="s") ax1.set_ylabel('log ' + r'$_2$ ' + 'mean') ax1.set_xlabel('level ' + r'$l$') plt.legend() - #plt.savefig("MLMC_cost_saves.pdf") + plt.tight_layout() plt.show() - - - diff --git a/mlmc/plot/violinplot.py b/mlmc/plot/violinplot.py index 85e9a1b1..74dfe836 100644 --- a/mlmc/plot/violinplot.py +++ b/mlmc/plot/violinplot.py @@ -2,18 +2,56 @@ import seaborn from mlmc.plot.plots import _show_and_save import matplotlib + +# Set default font size for all plots matplotlib.rcParams.update({'font.size': 22}) + import matplotlib.pyplot as plt class ViolinPlotter(seaborn.categorical._ViolinPlotter): + """ + Custom subclass of seaborn's internal _ViolinPlotter to modify how quartiles + and mean lines are drawn inside a violin plot. + + This class extends the default behavior by drawing the 25th, 50th, and 75th + percentiles as dashed lines, and the mean as a solid line across the violin body. + """ + def draw_quartiles(self, ax, data, support, density, center, split=False): + """ + Draw quartile and mean lines on the violin plot. + + Parameters + ---------- + ax : matplotlib.axes.Axes + The axes object to draw on. + data : array-like + Input data for a single violin. + support : array-like + Grid over which the kernel density was evaluated. + density : array-like + Corresponding kernel density values. + center : float + Position of the violin on the categorical axis. + split : bool, default=False + Whether the violin is split by hue (two sides). 
+ + Notes + ----- + - The mean is drawn as a solid line. + - Quartiles (25%, 50%, 75%) are drawn as dashed lines. + - The density scaling follows seaborn’s internal behavior. + """ + # Compute quartiles and mean of the data q25, q50, q75 = np.percentile(data, [25, 50, 75]) mean = np.mean(data) + # Draw mean line (solid) self.draw_to_density(ax, center, mean, support, density, split, linewidth=self.linewidth) + # Draw quartile lines (dashed) self.draw_to_density(ax, center, q25, support, density, split, linewidth=self.linewidth, dashes=[self.linewidth * 1.5] * 2) @@ -33,39 +71,110 @@ def violinplot( bw="scott", cut=2, scale="area", scale_hue=True, gridsize=100, width=.8, inner="box", split=False, dodge=True, orient=None, linewidth=None, color=None, palette=None, saturation=.75, - ax=None, **kwargs,): - - plotter = ViolinPlotter(x, y, hue, data, order, hue_order, - bw, cut, scale, scale_hue, gridsize, - width, inner, split, dodge, orient, linewidth, - color, palette, saturation) - + ax=None, **kwargs, +): + """ + Wrapper around the custom ViolinPlotter class to generate a violin plot. + + Parameters + ---------- + x, y, hue : str, optional + Variable names for the categorical axis, numeric axis, and hue grouping. + data : DataFrame, optional + Dataset containing the variables. + order, hue_order : list, optional + Order of categories for x and hue variables. + bw : str or float, default="scott" + Bandwidth method or scalar for kernel density estimation. + cut : float, default=2 + How far the violin extends beyond extreme data points. + scale : {"area", "count", "width"}, default="area" + Method for scaling the width of each violin. + scale_hue : bool, default=True + Whether to scale by hue levels within each category. + gridsize : int, default=100 + Number of points in the KDE grid. + width : float, default=0.8 + Width of each violin. + inner : {"box", "quartile", "point", "stick", None}, default="box" + Representation inside each violin. 
+ split : bool, default=False + Draw half-violins when hue is used. + dodge : bool, default=True + Separate violins for each hue level. + orient : {"v", "h"}, optional + Plot orientation; inferred if not specified. + linewidth : float, optional + Width of the line used for drawing violins and quartiles. + color : matplotlib color, optional + Color for all violins. + palette : str or sequence, optional + Color palette for hue levels. + saturation : float, default=0.75 + Saturation for colors. + ax : matplotlib.axes.Axes, optional + Axes object to draw on; created if None. + **kwargs : + Additional arguments passed to seaborn’s internal methods. + + Returns + ------- + ax : matplotlib.axes.Axes + The axes containing the drawn violin plot. + """ + # Initialize a custom violin plotter instance + plotter = ViolinPlotter( + x, y, hue, data, order, hue_order, + bw, cut, scale, scale_hue, gridsize, + width, inner, split, dodge, orient, linewidth, + color, palette, saturation + ) + + # Create a new axes if none provided if ax is None: ax = plt.gca() + # Draw the plot using the seaborn-based custom plotter plotter.plot(ax) return ax def fine_coarse_violinplot(data_frame): + """ + Generate a split violin plot comparing fine and coarse simulation samples per level. + + Parameters + ---------- + data_frame : pandas.DataFrame + Must contain the columns: + - 'level' : int, simulation level + - 'samples' : float, sample values + - 'type' : str, either 'fine' or 'coarse' + + Notes + ----- + - Uses log scale on the y-axis. + - Calls `_show_and_save` to display and save the resulting plot. + - Produces a split violin plot (fine vs coarse) for each level. 
+ """ + # Create a single subplot for the violin plot fig, axes = plt.subplots(1, 1, figsize=(22, 10)) - # mean with confidence interval - # sns.pointplot(x='level', y='samples', hue='type', data=data_frame, estimator=np.mean, - # palette="Set2", join=False, ax=axes) - - # line is not suitable for our purpose - # sns.lineplot(x="level", y="samples", hue="type",# err_style="band", ci='sd' - # estimator=np.median, data=data_frame, ax=axes) - - violinplot(x="level", y="samples", hue='type', data=data_frame, palette="Set2", - split=True, scale="area", inner="quartile", ax=axes) + # Draw split violin plot for 'fine' and 'coarse' samples per level + violinplot( + x="level", y="samples", hue='type', data=data_frame, + palette="Set2", split=True, scale="area", + inner="quartile", ax=axes + ) + # Use logarithmic y-scale (typical for MLMC variance/cost visualizations) axes.set_yscale('log') axes.set_ylabel('') axes.set_xlabel('') + + # Remove legend frame and content axes.legend([], [], frameon=False) + # Display and save plot using utility function _show_and_save(fig, "violinplot", "violinplot") _show_and_save(fig, None, "violinplot") - diff --git a/mlmc/quantity/quantity.py b/mlmc/quantity/quantity.py index bdeddb96..6a1696f4 100644 --- a/mlmc/quantity/quantity.py +++ b/mlmc/quantity/quantity.py @@ -13,12 +13,13 @@ def make_root_quantity(storage: SampleStorage, q_specs: List[QuantitySpec]): """ - Create a root quantity that has QuantityStorage as the input quantity, + Create a root quantity that has QuantityStorage as the input quantity. QuantityStorage is the only class that directly accesses the stored data. - Quantity type is created based on the q_spec parameter - :param storage: SampleStorage - :param q_specs: same as result format in simulation class - :return: QuantityStorage + The returned QuantityStorage uses a QType built from provided QuantitySpec objects. 
+ + :param storage: SampleStorage instance that provides stored samples + :param q_specs: list of QuantitySpec describing the simulation result format + :return: QuantityStorage that wraps the provided SampleStorage with a matching QType """ dict_types = [] for q_spec in q_specs: @@ -33,29 +34,37 @@ def make_root_quantity(storage: SampleStorage, q_specs: List[QuantitySpec]): class Quantity: + """ + Represents a quantity (a measurable value or expression) constructed from a QType, + an operation (callable) and zero-or-more input quantities. Quantities are lazy: + their actual data are returned by calling `samples(chunk_spec)`. + + - qtype: structure description (QType) + - _operation: callable that takes sample-chunks from input_quantities and returns result chunks + - _input_quantities: dependencies (other Quantity instances) + """ + def __init__(self, quantity_type, operation, input_quantities=[]): """ - Quantity class represents real quantity and also provides operation that can be performed with stored values. - Each Quantity has Qtype which describes its structure. - :param quantity_type: QType instance - :param operation: function - :param input_quantities: List[Quantity] + :param quantity_type: QType instance describing the shape/structure + :param operation: callable implementing the transform on input chunks + :param input_quantities: List[Quantity] dependencies (may be empty for constants) """ self.qtype = quantity_type self._operation = operation self._input_quantities = input_quantities - # List of quantities on which the 'self' depends, their number have to match number of arguments - # to the operation. 
+ # Underlying QuantityStorage (inherited from one of the inputs, if present) self._storage = self.get_quantity_storage() - # QuantityStorage instance + # Selection identifier - used to tie selections together (set by select) self._selection_id = self.set_selection_id() - # Identifier of selection, should be set in select() method + # Validate that input quantities use consistent selection/storage self._check_selection_ids() def get_quantity_storage(self): """ - Get QuantityStorage instance - :return: None, QuantityStorage + Find the first QuantityStorage among inputs (if any) and return it. + + :return: QuantityStorage instance or None if not found """ if len(self._input_quantities) == 0: return None @@ -68,9 +77,11 @@ def get_quantity_storage(self): def set_selection_id(self): """ - Set selection id - selection id is None by default, - but if we create new quantity from quantities that are result of selection we need to pass selection id + Determine the selection id for this Quantity. If inputs have a selection id + (created by select), inherit it; if multiple different selection ids are + present among inputs, raise an exception. + + :return: selection id or None """ selection_id = None for input_quantity in self._input_quantities: @@ -82,12 +93,11 @@ def set_selection_id(self): def _check_selection_ids(self): """ - Make sure the all input quantities come from the same QuantityStorage + Ensure that all input quantities that have selection ids share the same one. + If no QuantityStorage is present, nothing to check. """ - # All input quantities are QuantityConst instances if self._storage is None: return - # Check selection ids otherwise for input_quantity in self._input_quantities: sel_id = input_quantity.selection_id() if sel_id is None: @@ -97,8 +107,10 @@ def _check_selection_ids(self): def selection_id(self): """ - Get storage ids of all input quantities - :return: List[int] + Return this Quantity's selection id. 
If not set, use id(self._storage) to + identify the underlying storage instance. + + :return: selection identifier (int or None) """ if self._selection_id is not None: return self._selection_id @@ -109,71 +121,93 @@ def selection_id(self): def size(self) -> int: """ - Quantity size from qtype + Return the number of scalar components described by the QType. + :return: int """ return self.qtype.size() def get_cache_key(self, chunk_spec): """ - Create cache key + Create a cache key used by memoization for samples. We include: + - level id + - chunk id + - chunk size (derived from slice) + - id(self) to distinguish different quantity instances + + :param chunk_spec: ChunkSpec + :return: tuple key """ chunk_size = None if chunk_spec.chunk_slice is not None: chunk_size = chunk_spec.chunk_slice.stop - chunk_spec.chunk_slice.start - return (chunk_spec.level_id, chunk_spec.chunk_id, chunk_size, id(self)) # redundant parentheses needed due to py36, py37 + return (chunk_spec.level_id, chunk_spec.chunk_id, chunk_size, id(self)) # py36/37 compatibility @cached(custom_key_maker=get_cache_key) def samples(self, chunk_spec): """ - Return list of sample chunks for individual levels. - Possibly calls underlying quantities. - :param chunk_spec: object containing chunk identifier level identifier and chunk_slice - slice() object - :return: np.ndarray or None + Evaluate and return the data chunk for this quantity at the specified chunk_spec. + Calls samples(chunk_spec) recursively on inputs and passes the results to _operation. + + :param chunk_spec: ChunkSpec object with level_id, chunk_id, and optional slice + :return: np.ndarray (M, chunk_size, 2) or None """ chunks_quantity_level = [q.samples(chunk_spec) for q in self._input_quantities] return self._operation(*chunks_quantity_level) def _reduction_op(self, quantities, operation): """ + Helper for building a reduction Quantity from many inputs. 
+ + If any input is a non-constant Quantity, return a Quantity with the operation and inputs. + If all inputs are QuantityConst, evaluate the operation immediately and return QuantityConst. + :param quantities: List[Quantity] - :param operation: function which is run with given quantities + :param operation: Callable to apply :return: Quantity or QuantityConst """ for quantity in quantities: if not isinstance(quantity, QuantityConst): return Quantity(quantity.qtype, operation=operation, input_quantities=quantities) - # Quantity from QuantityConst instances + # All constant -> precompute value return QuantityConst(quantities[0].qtype, value=operation(*[q._value for q in quantities])) def select(self, *args): """ - Performs sample selection based on conditions - :param args: Quantity - :return: Quantity + Apply boolean selection masks to this Quantity's samples. + + :param args: One or more Quantity instances with BoolType that act as masks. + :return: Quantity representing the selected samples (mask applied on sample axis) """ - # args always has len() at least 1 + # First mask masks = args[0] + # Validate masks are BoolType for quantity in args: if not isinstance(quantity.qtype.base_qtype(), qt.BoolType): raise Exception("Quantity: {} doesn't have BoolType, instead it has QType: {}" .format(quantity, quantity.qtype.base_qtype())) - # More conditions leads to default AND + # Combine multiple masks with logical AND if len(args) > 1: for m in args[1:]: - masks = np.logical_and(masks, m) # method from this module + masks = np.logical_and(masks, m) def op(x, mask): - return x[..., mask, :] # [...sample size, cut number of samples, 2] + # Mask samples (reduce number of sample columns) + return x[..., mask, :] # [..., selected_samples, 2] + q = Quantity(quantity_type=self.qtype, input_quantities=[self, masks], operation=op) - q._selection_id = id(q) + q._selection_id = id(q) # mark selection id to ensure consistency return q def __array_ufunc__(self, ufunc, method, *args, 
**kwargs): + """ + Support numpy ufuncs by routing them through _method which constructs a new Quantity. + """ return Quantity._method(ufunc, method, *args, **kwargs) + # Arithmetic operator wrappers - build new Quantities or constants as needed. def __add__(self, other): return Quantity.create_quantity([self, Quantity.wrap(other)], Quantity.add_op) @@ -207,19 +241,17 @@ def __rmod__(self, other): @staticmethod def create_quantity(quantities, operation): """ - Create new quantity (Quantity or QuantityConst) based on given quantities and operation. - There are two scenarios: - 1. At least one of quantities is Quantity instance then all quantities are considered to be input_quantities - of new Quantity - 2. All of quantities are QuantityConst instances then new QuantityConst is created - :param quantities: List[Quantity] - :param operation: function which is run with given quantities - :return: Quantity + Create a new Quantity or QuantityConst. If any input is non-constant, return + a Quantity that will evaluate lazily. If all are constant, return QuantityConst. 
+ + :param quantities: list-like of Quantity / QuantityConst + :param operation: callable to combine inputs + :return: Quantity or QuantityConst """ for quantity in quantities: if not isinstance(quantity, QuantityConst): return Quantity(quantity.qtype, operation=operation, input_quantities=quantities) - # Quantity from QuantityConst instances + # all constant -> precompute return QuantityConst(quantities[0].qtype, value=operation(*[q._value for q in quantities])) @staticmethod @@ -245,35 +277,40 @@ def mod_op(x, y): @staticmethod def _process_mask(x, y, operator): """ - Create samples mask - All values for sample must meet given condition, if any value doesn't meet the condition, - whole sample is eliminated - :param x: Quantity chunk - :param y: Quantity chunk or int, float - :param operator: operator module function - :return: np.ndarray of bools + Create a boolean mask that marks full samples passing the given per-element condition. + + The operator is applied elementwise; then we require that *every* element within the sample + passes to keep that sample. This collapses non-sample axes and returns a 1-D boolean array. + + :param x: Quantity chunk (ndarray) + :param y: Quantity chunk or scalar + :param operator: operator module function like operator.lt + :return: 1-D boolean numpy array indexing samples """ mask = operator(x, y) + # collapse over spatial/time axes and per-sample axis, keep sample index axis return mask.all(axis=tuple(range(mask.ndim - 2))).all(axis=1) def _mask_quantity(self, other, op): """ - Create quantity that represent bool mask - :param other: number or Quantity - :param op: operation - :return: Quantity + Helper to build a BoolType Quantity representing comparisons (>, <, ==, etc.) 
+ + :param other: Quantity or scalar to compare with + :param op: operation callable that builds the boolean mask from chunked arrays + :return: Quantity producing a boolean mask per sample """ bool_type = qt.BoolType() - new_qtype = self.qtype - new_qtype = new_qtype.replace_scalar(bool_type) + new_qtype = self.qtype.replace_scalar(bool_type) other = Quantity.wrap(other) + # Only scalar base types support comparison if not isinstance(self.qtype.base_qtype(), qt.ScalarType) or not isinstance(other.qtype.base_qtype(), qt.ScalarType): raise TypeError("Quantity has base qtype {}. " "Quantities with base qtype ScalarType are the only ones that support comparison". format(self.qtype.base_qtype())) return Quantity(quantity_type=new_qtype, input_quantities=[self, other], operation=op) + # Comparison operators returning boolean mask Quantities def __lt__(self, other): def lt_op(x, y): return Quantity._process_mask(x, y, operator.lt) @@ -281,59 +318,66 @@ def lt_op(x, y): def __le__(self, other): def le_op(x, y): - return self._process_mask(x, y, operator.le) + return Quantity._process_mask(x, y, operator.le) return self._mask_quantity(other, le_op) def __gt__(self, other): def gt_op(x, y): - return self._process_mask(x, y, operator.gt) + return Quantity._process_mask(x, y, operator.gt) return self._mask_quantity(other, gt_op) def __ge__(self, other): def ge_op(x, y): - return self._process_mask(x, y, operator.ge) + return Quantity._process_mask(x, y, operator.ge) return self._mask_quantity(other, ge_op) def __eq__(self, other): def eq_op(x, y): - return self._process_mask(x, y, operator.eq) + return Quantity._process_mask(x, y, operator.eq) return self._mask_quantity(other, eq_op) def __ne__(self, other): def ne_op(x, y): - return self._process_mask(x, y, operator.ne) + return Quantity._process_mask(x, y, operator.ne) return self._mask_quantity(other, ne_op) @staticmethod def pick_samples(chunk, subsample_params): """ - Pick samples some samples from chunk in order to have 
'k' samples from 'n' after all chunks are processed - Inspired by https://dl.acm.org/doi/10.1145/23002.23003 method S + Subsample a chunk using Method S (hypergeometric sampling) so that across chunks + we end up with k samples from n total. - :param chunk: np.ndarray, shape M, N, 2, where N denotes number of samples in chunk - :param subsample_params: instance of SubsampleParams class, it has two parameters: - k: number of samples which we want to get from all chunks - n: number of all samples among all chunks - :return: np.ndarray + :param chunk: ndarray of shape (M, N, 2) where N is number of samples in this chunk + :param subsample_params: object with attributes k (remaining desired) and n (remaining available) + :return: selected sub-chunk array with shape (M, m, 2), where m is chosen by hypergeometric draw """ + # Draw how many to pick from this chunk using hypergeometric distribution size = scipy.stats.hypergeom(subsample_params.n, subsample_params.k, chunk.shape[1]).rvs(size=1) out = RNG.choice(chunk, size=size, axis=1) - subsample_params.k -= out.shape[1] - subsample_params.n -= chunk.shape[1] + subsample_params.k -= out.shape[1] # reduce remaining desired + subsample_params.n -= chunk.shape[1] # reduce remaining available return out def subsample(self, sample_vec): """ - Subsampling - :param sample_vec: list of number of samples at each level - :return: Quantity + Build a Quantity that implements subsampling across levels to obtain a specified + number of samples per level (sample_vec). + + Returns a Quantity whose operation will pick samples according to subsample params + stored per-level. Uses QuantityConst with a level-aware _adjust_value to pass + different parameters to each level chunk. + + :param sample_vec: list-like of desired numbers of samples per level + :return: Quantity producing subsampled chunks """ class SubsampleParams: + """ + Small helper to carry per-level parameters while subsampling across chunks. 
+ """ def __init__(self, num_subsample, num_collected): """ - Auxiliary object for subsampling - :param num_subsample: the number of samples we want to obtain from all samples - :param num_collected: total number of samples + :param num_subsample: desired number of samples to pick from this level + :param num_collected: total available samples on this level """ self._orig_k = num_subsample self._orig_n = num_collected @@ -342,36 +386,41 @@ def __init__(self, num_subsample, num_collected): self.n = num_collected self.total_n = num_collected - # SubsampleParams for each level + # Build params per level using level collected counts from the storage subsample_level_params = {key: SubsampleParams(sample_vec[key], value) for key, value in enumerate(self.get_quantity_storage().n_collected())} - # Create a QuantityConst of dictionary in the sense of hashing dictionary items + + # Wrap a hashed version of this parameters dict in a QuantityConst to feed into operation quantity_subsample_params = Quantity.wrap(hash(frozenset(subsample_level_params.items()))) def adjust_value(values, level_id): """ - Custom implementation of QuantityConst.adjust_value() - It allows us to get different parameters for different levels + Method assigned to QuantityConst._adjust_value so each level receives its own SubsampleParams. + Re-initializes k/n for repeated calls. 
""" subsample_l_params_obj = subsample_level_params[level_id] subsample_l_params_obj.k = subsample_l_params_obj._orig_k subsample_l_params_obj.n = subsample_l_params_obj._orig_n subsample_l_params_obj.total_n = subsample_l_params_obj._orig_total_n return subsample_l_params_obj + quantity_subsample_params._adjust_value = adjust_value + # Build resulting Quantity that uses pick_samples as its operation return Quantity(quantity_type=self.qtype.replace_scalar(qt.BoolType()), input_quantities=[self, quantity_subsample_params], operation=Quantity.pick_samples) def __getitem__(self, key): """ - Get items from Quantity, quantity type must support brackets access - :param key: str, int, tuple - :return: Quantity + Create a Quantity representing indexed/ sliced access into this quantity (similar to numpy slicing). + + :param key: index or slice or tuple interpreted by qtype.get_key + :return: Quantity restricted to the requested key """ - new_qtype, start = self.qtype.get_key(key) # New quantity type + new_qtype, start = self.qtype.get_key(key) # New quantity type for selection if not isinstance(self.qtype, qt.ArrayType): + # Convert key to a slice covering the sub-array if base is not ArrayType key = slice(start, start + new_qtype.size()) def _make_getitem_op(y): @@ -380,7 +429,11 @@ def _make_getitem_op(y): return Quantity(quantity_type=new_qtype, input_quantities=[self], operation=_make_getitem_op) def __getattr__(self, name): - static_fun = getattr(self.qtype, name) # We support only static function call forwarding + """ + Forward static QType methods as Quantity methods so that QType-level helpers are available + as operations on quantities (e.g., aggregation helpers). 
+ """ + static_fun = getattr(self.qtype, name) def apply_on_quantity(*attr, **d_attr): return static_fun(self, *attr, **d_attr) @@ -389,11 +442,12 @@ def apply_on_quantity(*attr, **d_attr): @staticmethod def _concatenate(quantities, qtype, axis=0): """ - Concatenate level_chunks - :param quantities: list of quantities - :param qtype: QType - :param axis: int - :return: Quantity + Construct a Quantity that concatenates multiple quantities along a given axis. + + :param quantities: sequence of Quantity instances + :param qtype: QType describing result shape + :param axis: axis along which concatenation happens + :return: Quantity that when evaluated concatenates input chunks """ def op_concatenate(*chunks): y = np.concatenate(tuple(chunks), axis=axis) @@ -403,12 +457,12 @@ def op_concatenate(*chunks): @staticmethod def _get_base_qtype(args_quantities): """ - Get quantities base Qtype - :param args_quantities: list of quantities and other passed arguments, - we expect at least one of the arguments is Quantity - :return: base QType, ScalarType if any quantity has that base type, otherwise BoolType + Determine base QType for arithmetic/ufunc results: if any argument has a ScalarType base + return ScalarType(), otherwise BoolType(). + + :param args_quantities: iterable containing Quantity instances and possibly other values + :return: base QType instance """ - # Either all quantities are BoolType or it is considered to be ScalarType for quantity in args_quantities: if isinstance(quantity, Quantity): if type(quantity.qtype.base_qtype()) == qt.ScalarType: @@ -418,14 +472,17 @@ def _get_base_qtype(args_quantities): @staticmethod def _method(ufunc, method, *args, **kwargs): """ - Process input parameters to perform numpy ufunc. - Get base QType of passed quantities, QuantityStorage instance, ... 
- Determine the resulting QType from the first few samples - :param ufunc: ufunc object that was called - :param method: string, indicating which Ufunc method was called - :param args: tuple of the input arguments to the ufunc - :param kwargs: dictionary containing the optional input arguments of the ufunc - :return: Quantity + Generic handler for numpy ufunc operations mapped to Quantities. + + 1) Wrap inputs as Quantities. + 2) Determine the result QType by calling the ufunc on a small sample. + 3) Return a new Quantity that performs the ufunc at evaluation time. + + :param ufunc: numpy ufunc object + :param method: method name to call on ufunc (e.g., '__call__' or 'reduce') + :param args: positional arguments passed to ufunc (may include Quantities) + :param kwargs: optional ufunc kwargs + :return: Quantity representing ufunc applied to inputs """ def _ufunc_call(*input_quantities_chunks): return getattr(ufunc, method)(*input_quantities_chunks, **kwargs) @@ -438,9 +495,10 @@ def _ufunc_call(*input_quantities_chunks): @staticmethod def wrap(value): """ - Convert flat, bool or array (list) to Quantity - :param value: flat, bool, array (list) or Quantity - :return: Quantity + Convert a primitive (int, float, bool), a numpy/list array, or an existing Quantity into a Quantity. + + :param value: scalar, bool, list/ndarray, or Quantity + :return: Quantity or QuantityConst wrapping the value """ if isinstance(value, Quantity): return value @@ -459,27 +517,32 @@ def wrap(value): @staticmethod def _result_qtype(method, quantities): """ - Determine QType from evaluation with given method and first few samples from storage - :param quantities: list of Quantities - :param method: ufunc function - :return: QType + Infer the resulting QType for an operation by evaluating the operation on the first + available chunk from each input quantity. 
+ + :param method: callable that takes input chunks and returns sample chunk result + :param quantities: list of Quantity instances + :return: inferred QType (ArrayType) """ chunks_quantity_level = [] for q in quantities: quantity_storage = q.get_quantity_storage() - # QuantityConst doesn't have QuantityStorage + # If QuantityConst (no storage), use an empty default ChunkSpec if quantity_storage is None: chunk_spec = ChunkSpec() else: chunk_spec = next(quantity_storage.chunks()) chunks_quantity_level.append(q.samples(chunk_spec)) - result = method(*chunks_quantity_level) # numpy array of [M, <=10, 2] + result = method(*chunks_quantity_level) # expect shape [M, <=10, 2] qtype = qt.ArrayType(shape=result.shape[0], qtype=Quantity._get_base_qtype(quantities)) return qtype @staticmethod def QArray(quantities): + """ + Build a Quantity representing an array-of-quantities aggregated into a single QType. + """ flat_quantities = np.array(quantities).flatten() qtype = Quantity._check_same_qtype(flat_quantities) array_type = qt.ArrayType(np.array(quantities).shape, qtype) @@ -487,24 +550,40 @@ def QArray(quantities): @staticmethod def QDict(key_quantity): + """ + Build a Quantity representing a dictionary of quantities. + :param key_quantity: iterable of (key, Quantity) + """ dict_type = qt.DictType([(key, quantity.qtype) for key, quantity in key_quantity]) return Quantity._concatenate(np.array(key_quantity)[:, 1], qtype=dict_type) @staticmethod def QTimeSeries(time_quantity): + """ + Build a Quantity representing a time series constructed from (time, Quantity) pairs. + """ qtype = Quantity._check_same_qtype(np.array(time_quantity)[:, 1]) times = np.array(time_quantity)[:, 0] return Quantity._concatenate(np.array(time_quantity)[:, 1], qtype=qt.TimeSeriesType(times=times, qtype=qtype)) - @staticmethod def QField(key_quantity): + """ + Build a Quantity representing a field (mapping of locations to quantities). 
+ """ Quantity._check_same_qtype(np.array(key_quantity)[:, 1]) field_type = qt.FieldType([(key, quantity.qtype) for key, quantity in key_quantity]) return Quantity._concatenate(np.array(key_quantity)[:, 1], qtype=field_type) @staticmethod def _check_same_qtype(quantities): + """ + Validate that all provided quantities share the same QType. + + :param quantities: sequence of Quantity instances + :return: the shared QType + :raise ValueError: if a mismatch is found + """ qtype = quantities[0].qtype for quantity in quantities[1:]: if qtype != quantity.qtype: @@ -513,26 +592,28 @@ def _check_same_qtype(quantities): class QuantityConst(Quantity): + """ + Represents a constant quantity whose value is stored directly in the instance. + The samples() method returns the constant value broadcasted to the requested chunk shape. + """ + def __init__(self, quantity_type, value): """ - QuantityConst class represents constant quantity and also provides operation - that can be performed with quantity values. - The quantity is constant, meaning that this class stores the data itself - :param quantity_type: QType instance - :param value: quantity value + :param quantity_type: QType describing the const + :param value: scalar or array-like value """ self.qtype = quantity_type self._value = self._process_value(value) + # No input dependencies for a constant self._input_quantities = [] - # List of input quantities should be empty, - # but we still need this attribute due to storage_id() and level_ids() method self._selection_id = None def _process_value(self, value): """ - Reshape value if array, otherwise create array first - :param value: quantity value - :return: value with shape [M, 1, 1] which suitable for further broadcasting + Ensure the constant is stored as an array with axes [M, 1, 1] suitable for broadcasting. 
+ + :param value: scalar or array-like + :return: ndarray shaped for broadcasting into (M, chunk_size, 2) """ if isinstance(value, (int, float, bool)): value = np.array([value]) @@ -540,42 +621,50 @@ def _process_value(self, value): def selection_id(self): """ - Get storage ids of all input quantities - :return: List[int] + Constants have no selection id (they are independent of storage). """ return self._selection_id def _adjust_value(self, value, level_id=None): """ - Allows process value based on chunk_epc params (such as level_id, ...). - The custom implementation is used in Qunatity.subsample method - :param value: np.ndarray - :param level_id: int - :return: np.ndarray, particular type depends on implementation + Hook to adjust constant value per-level. By default returns the stored value unchanged. + This method gets overridden by consumers (e.g., subsample) to provide level-specific params. + + :param value: constant value array + :param level_id: int, level index (optional) + :return: possibly adjusted value """ return value @cached(custom_key_maker=Quantity.get_cache_key) def samples(self, chunk_spec): """ - Get constant values with an enlarged number of axes - :param chunk_spec: object containing chunk identifier level identifier and chunk_slice - slice() object - :return: np.ndarray + Return the constant value, optionally adjusted for the given level via _adjust_value. + + :param chunk_spec: ChunkSpec with level_id + :return: ndarray representing the constant for this chunk """ return self._adjust_value(self._value, chunk_spec.level_id) class QuantityMean: + """ + Container for aggregated mean/variance results computed by mlmc.quantity.quantity_estimate.estimate_mean. 
+ + - qtype: QType of the quantity + - _l_means: per-level mean contributions (L x M flattened) + - _l_vars: per-level variance contributions (L x M flattened) + - _n_samples: number of samples used per level + - _n_rm_samples: number of removed samples per level + """ def __init__(self, quantity_type, l_means, l_vars, n_samples, n_rm_samples): """ - QuantityMean represents results of mlmc.quantity_estimate.estimate_mean method :param quantity_type: QType - :param l_means: np.ndarray, shape: L, M - :param l_vars: np.ndarray, shape: L, M - :param n_samples: List, number of samples that were used for means at each level - :param n_rm_samples: List, number of removed samples at each level, - n_samples + n_rm_samples = all successfully collected samples + :param l_means: ndarray shape (L, M_flat) of level-wise mean contributions + :param l_vars: ndarray shape (L, M_flat) of level-wise variance contributions + :param n_samples: list/ndarray length L with number of samples used per level + :param n_rm_samples: list/ndarray length L with removed samples count per level """ self.qtype = quantity_type self._mean = None @@ -587,29 +676,43 @@ def __init__(self, quantity_type, l_means, l_vars, n_samples, n_rm_samples): def _calculate_mean_var(self): """ - Calculates the overall estimates of the mean and the variance from the means and variances at each level + Compute overall mean and variance from per-level contributions: + mean = sum_l l_means[l] + var = sum_l (l_vars[l] / n_samples[l]) """ self._mean = np.sum(self._l_means, axis=0) self._var = np.sum(self._l_vars / self._n_samples[:, None], axis=0) @property def mean(self): + """ + Reshaped overall mean according to QType. + """ if self._mean is None: self._calculate_mean_var() return self._reshape(self._mean) @property def var(self): + """ + Reshaped overall variance according to QType. 
+ """ if self._var is None: self._calculate_mean_var() return self._reshape(self._var) @property def l_means(self): + """ + Level means reshaped according to QType for each level. + """ return np.array([self._reshape(means) for means in self._l_means]) @property def l_vars(self): + """ + Level variances reshaped according to QType for each level. + """ return np.array([self._reshape(vars) for vars in self._l_vars]) @property @@ -622,74 +725,96 @@ def n_rm_samples(self): def _reshape(self, data): """ - Reshape passed data, expected means or vars - :param data: flatten np.ndarray - :return: np.ndarray, reshaped data, the final data shape depends on the particular QType - there is currently a reshape for ArrayType only + Reshape a flat data vector (flattened M) into the structure determined by qtype. + + :param data: flattened ndarray + :return: reshaped ndarray according to qtype """ return self.qtype.reshape(data) def __getitem__(self, key): """ - Get item from current QuantityMean, quantity type must support brackets access - All levels means and vars are reshaped to their QType shape and then the item is gotten, - ath the end, new QuantityMean instance is created with flatten selected means and vars - :param key: str, int, tuple - :return: QuantityMean + Index into QuantityMean similarly to Quantity.__getitem__: + reshape level-wise means/vars and select the requested key, then return a new QuantityMean. + + :param key: indexing key (int, slice, str, etc.) 
+ :return: QuantityMean restricted to the requested key """ new_qtype, start = self.qtype.get_key(key) # New quantity type if not isinstance(self.qtype, qt.ArrayType): key = slice(start, start + new_qtype.size()) - # Getting items, it performs reshape inside + # Selecting and reshaping level arrays l_means = self.l_means[:, key] l_vars = self.l_vars[:, key] - return QuantityMean(quantity_type=new_qtype, l_means=l_means.reshape((l_means.shape[0], -1)), - l_vars=l_vars.reshape((l_vars.shape[0], -1)), n_samples=self._n_samples, + return QuantityMean(quantity_type=new_qtype, + l_means=l_means.reshape((l_means.shape[0], -1)), + l_vars=l_vars.reshape((l_vars.shape[0], -1)), + n_samples=self._n_samples, n_rm_samples=self._n_rm_samples) class QuantityStorage(Quantity): + """ + Special Quantity that provides direct access to SampleStorage. + It implements the bridge between storage and the Quantity abstraction. + """ + def __init__(self, storage, qtype): """ - Special Quantity for direct access to SampleStorage - :param storage: mlmc._sample_storage.SampleStorage child - :param qtype: QType + :param storage: SampleStorage instance (in-memory or HDF5, etc.) + :param qtype: QType describing stored data structure """ + # Store underlying storage reference and QType self._storage = storage self.qtype = qtype + # No operation or inputs required for storage root self._input_quantities = [] self._operation = None def level_ids(self): """ - Number of levels + Return list of available level ids from the SampleStorage. :return: List[int] """ return self._storage.get_level_ids() def selection_id(self): """ - Identity of QuantityStorage instance + Identity of this QuantityStorage (unique by object id). :return: int """ return id(self) def get_quantity_storage(self): + """ + For QuantityStorage the storage is itself. + :return: self + """ return self def chunks(self, level_id=None): + """ + Proxy to SampleStorage.chunks which yields ChunkSpec instances describing available chunks. 
+ :param level_id: optional level id to restrict chunks + :return: generator of ChunkSpec + """ return self._storage.chunks(level_id) def samples(self, chunk_spec): """ - Get results for given level id and chunk id - :param chunk_spec: object containing chunk identifier level identifier and chunk_slice - slice() object - :return: Array[M, chunk size, 2] + Retrieve stored sample pairs for the requested level/chunk. + + :param chunk_spec: ChunkSpec describing (level, chunk slice) + :return: ndarray shaped [M, chunk_size, 2] where M is number of result quantities """ return self._storage.sample_pairs_level(chunk_spec) # Array[M, chunk size, 2] def n_collected(self): + """ + Return number of collected results per level from the underlying SampleStorage. + :return: list of ints + """ return self._storage.get_n_collected() diff --git a/mlmc/quantity/quantity_estimate.py b/mlmc/quantity/quantity_estimate.py index a1f63ddd..e2553138 100644 --- a/mlmc/quantity/quantity_estimate.py +++ b/mlmc/quantity/quantity_estimate.py @@ -5,122 +5,146 @@ def mask_nan_samples(chunk): """ - Mask out samples that contain NaN in either fine or coarse part of the result - :param chunk: np.ndarray [M, chunk_size, 2] - :return: chunk: np.ndarray, number of masked samples: int + Remove (mask out) samples containing NaN values in either the fine or coarse part of the result. + + :param chunk: np.ndarray of shape [M, chunk_size, 2] + M - quantity size (number of scalar components), + chunk_size - number of samples in the chunk, + 2 - fine and coarse parts of the result. + :return: (filtered_chunk, n_masked) + filtered_chunk: np.ndarray with invalid samples removed, + n_masked: int, number of masked (removed) samples. """ - # Fine and coarse moments_fn mask + # Identify any sample with NaNs in its fine or coarse component mask = np.any(np.isnan(chunk), axis=0).any(axis=1) return chunk[..., ~mask, :], np.count_nonzero(mask) def cache_clear(): + """ + Clear cached Quantity sample evaluations. 
+ + Used before running MLMC estimations to ensure fresh data is fetched from storage. + """ mlmc.quantity.quantity.Quantity.samples.cache_clear() mlmc.quantity.quantity.QuantityConst.samples.cache_clear() def estimate_mean(quantity, form="diff", operation_func=None, **kwargs): """ - MLMC mean estimator. - The MLMC method is used to compute the mean estimate to the Quantity dependent on the collected samples. - The squared error of the estimate (the estimator variance) is estimated using the central limit theorem. - Data is processed by chunks, so that it also supports big data processing - :param quantity: Quantity - :param form: if "diff" estimates based on difference between fine and coarse data = MLMC approach - "fine" estimates based on level's fine data - "coarse" estimates based on level's coarse data - :param operation_func: function to process level data, e.g. kurtosis estimation - :return: QuantityMean which holds both mean and variance + Estimate the MLMC mean (and variance) of a Quantity using multilevel sampling. + + The function computes per-level means and variances from simulation results. + Supports large datasets via chunked processing and handles NaN-masked samples. + + :param quantity: Quantity instance to estimate. + :param form: str, type of estimation: + - "diff": estimate based on differences (fine - coarse) → standard MLMC approach. + - "fine": estimate using fine-level data only. + - "coarse": estimate using coarse-level data only. + :param operation_func: Optional transformation applied to chunk data before accumulation + (e.g., for moment or kurtosis computation). + :param kwargs: Additional keyword arguments passed to operation_func. + :return: QuantityMean object containing mean, variance, and sample statistics per level. 
""" + # Reset cached quantity evaluations cache_clear() + quantity_vec_size = quantity.size() sums = None sums_of_squares = None - # initialization + # Initialize level-specific storage quantity_storage = quantity.get_quantity_storage() level_ids = quantity_storage.level_ids() n_levels = np.max(level_ids) + 1 n_samples = [0] * n_levels n_rm_samples = [0] * n_levels + # Iterate through data chunks for chunk_spec in quantity_storage.chunks(): samples = quantity.samples(chunk_spec) chunk, n_mask_samples = mask_nan_samples(samples) n_samples[chunk_spec.level_id] += chunk.shape[1] n_rm_samples[chunk_spec.level_id] += n_mask_samples - # No samples in chunk + # Skip empty chunks if chunk.shape[1] == 0: continue - assert (chunk.shape[0] == quantity_vec_size) + assert chunk.shape[0] == quantity_vec_size - # Set variables for level sums and sums of squares + # Allocate accumulators at first valid chunk if sums is None: sums = [np.zeros(chunk.shape[0]) for _ in range(n_levels)] sums_of_squares = [np.zeros(chunk.shape[0]) for _ in range(n_levels)] - # Estimates of level's fine data + # Select appropriate data form for the estimator if form == "fine": - if chunk_spec.level_id == 0: - chunk_diff = chunk[:, :, 0] - else: - chunk_diff = chunk[:, :, 0] - # Estimate of level's coarse data + chunk_diff = chunk[:, :, 0] elif form == "coarse": - if chunk_spec.level_id == 0: - chunk_diff = np.zeros(chunk[:, :, 0].shape) - else: - chunk_diff = chunk[:, :, 1] + chunk_diff = np.zeros_like(chunk[:, :, 0]) if chunk_spec.level_id == 0 else chunk[:, :, 1] else: - if chunk_spec.level_id == 0: - chunk_diff = chunk[:, :, 0] - else: - chunk_diff = chunk[:, :, 0] - chunk[:, :, 1] + # Default MLMC difference (fine - coarse) + chunk_diff = chunk[:, :, 0] if chunk_spec.level_id == 0 else chunk[:, :, 0] - chunk[:, :, 1] + # Optional user-defined transformation of data if operation_func is not None: chunk_diff = operation_func(chunk_diff, chunk_spec, **kwargs) + # Accumulate sums and squared sums for 
this level sums[chunk_spec.level_id] += np.sum(chunk_diff, axis=1) - sums_of_squares[chunk_spec.level_id] += np.sum(chunk_diff**2, axis=1) + sums_of_squares[chunk_spec.level_id] += np.sum(chunk_diff ** 2, axis=1) if sums is None: - raise Exception("All samples were masked") + raise Exception("All samples were masked (no valid data found).") + # Compute means and variances for each level l_means = [] l_vars = [] for s, sp, n in zip(sums, sums_of_squares, n_samples): l_means.append(s / n) if n > 1: - l_vars.append((sp - (s ** 2 / n)) / (n-1)) + l_vars.append((sp - (s ** 2 / n)) / (n - 1)) else: l_vars.append(np.full(len(s), np.inf)) - return mlmc.quantity.quantity.QuantityMean(quantity.qtype, l_means=l_means, l_vars=l_vars, n_samples=n_samples, - n_rm_samples=n_rm_samples) + # Construct QuantityMean object with level statistics + return mlmc.quantity.quantity.QuantityMean( + quantity.qtype, + l_means=l_means, + l_vars=l_vars, + n_samples=n_samples, + n_rm_samples=n_rm_samples + ) def moment(quantity, moments_fn, i=0): """ - Create quantity with operation that evaluates particular moment - :param quantity: Quantity instance - :param moments_fn: mlmc.moments.Moments child - :param i: index of moment - :return: Quantity + Construct a Quantity that represents a single statistical moment. + + :param quantity: Base Quantity instance. + :param moments_fn: Instance of mlmc.moments.Moments defining the moment computation. + :param i: Index of the moment to compute. + :return: New Quantity that computes the i-th moment. 
""" def eval_moment(x): return moments_fn.eval_single_moment(i, value=x) - return mlmc.quantity.quantity.Quantity(quantity_type=quantity.qtype, input_quantities=[quantity], operation=eval_moment) + + return mlmc.quantity.quantity.Quantity( + quantity_type=quantity.qtype, + input_quantities=[quantity], + operation=eval_moment + ) def moments(quantity, moments_fn, mom_at_bottom=True): """ - Create quantity with operation that evaluates moments_fn - :param quantity: Quantity - :param moments_fn: mlmc.moments.Moments child - :param mom_at_bottom: bool, if True moments are underneath, - a scalar is substituted with an array of moments of that scalar - :return: Quantity + Construct a Quantity representing all moments defined by a given Moments object. + + :param quantity: Base Quantity. + :param moments_fn: mlmc.moments.Moments child defining moment evaluations. + :param mom_at_bottom: bool, if True, moments are added at the lowest (scalar) level of the Quantity type. + :return: Quantity that computes all defined moments. 
""" def eval_moments(x): if mom_at_bottom: @@ -129,73 +153,85 @@ def eval_moments(x): mom = moments_fn.eval_all(x).transpose((3, 0, 1, 2)) # [R, M, N, 2] return mom.reshape((np.prod(mom.shape[:-2]), mom.shape[-2], mom.shape[-1])) # [M, N, 2] - # Create quantity type which has moments_fn at the bottom + # Define new Quantity type according to desired hierarchy if mom_at_bottom: moments_array_type = qt.ArrayType(shape=(moments_fn.size,), qtype=qt.ScalarType()) moments_qtype = quantity.qtype.replace_scalar(moments_array_type) - # Create quantity type that has moments_fn on the surface else: moments_qtype = qt.ArrayType(shape=(moments_fn.size,), qtype=quantity.qtype) - return mlmc.quantity.quantity.Quantity(quantity_type=moments_qtype, input_quantities=[quantity], operation=eval_moments) + + return mlmc.quantity.quantity.Quantity( + quantity_type=moments_qtype, + input_quantities=[quantity], + operation=eval_moments + ) def covariance(quantity, moments_fn, cov_at_bottom=True): """ - Create quantity with operation that evaluates covariance matrix - :param quantity: Quantity - :param moments_fn: mlmc.moments.Moments child - :param cov_at_bottom: bool, if True cov matrices are underneath, - a scalar is substituted with a matrix of moments of that scalar - :return: Quantity + Construct a Quantity representing covariance matrices of the given moments. + + :param quantity: Base Quantity. + :param moments_fn: mlmc.moments.Moments child defining moment evaluations. + :param cov_at_bottom: bool, if True covariance matrices are attached at the scalar level of the Quantity type. + :return: Quantity that computes covariance matrices. 
""" def eval_cov(x): + # Compute all moments (fine and coarse) moments = moments_fn.eval_all(x) mom_fine = moments[..., 0, :] cov_fine = np.einsum('...i,...j', mom_fine, mom_fine) if moments.shape[-2] == 1: + # Single level (no coarse) cov = np.array([cov_fine]) else: mom_coarse = moments[..., 1, :] cov_coarse = np.einsum('...i,...j', mom_coarse, mom_coarse) cov = np.array([cov_fine, cov_coarse]) + # Reshape covariance according to desired data layout if cov_at_bottom: - cov = cov.transpose((1, 3, 4, 2, 0)) # [M, R, R, N, 2] + cov = cov.transpose((1, 3, 4, 2, 0)) # [M, R, R, N, 2] else: - cov = cov.transpose((3, 4, 1, 2, 0)) # [R, R, M, N, 2] + cov = cov.transpose((3, 4, 1, 2, 0)) # [R, R, M, N, 2] return cov.reshape((np.prod(cov.shape[:-2]), cov.shape[-2], cov.shape[-1])) - # Create quantity type which has covariance matrices at the bottom + # Adjust Quantity type for covariance structure if cov_at_bottom: - moments_array_type = qt.ArrayType(shape=(moments_fn.size, moments_fn.size, ), qtype=qt.ScalarType()) + moments_array_type = qt.ArrayType(shape=(moments_fn.size, moments_fn.size), qtype=qt.ScalarType()) moments_qtype = quantity.qtype.replace_scalar(moments_array_type) - # Create quantity type that has covariance matrices on the surface else: - moments_qtype = qt.ArrayType(shape=(moments_fn.size, moments_fn.size, ), qtype=quantity.qtype) - return mlmc.quantity.quantity.Quantity(quantity_type=moments_qtype, input_quantities=[quantity], operation=eval_cov) + moments_qtype = qt.ArrayType(shape=(moments_fn.size, moments_fn.size), qtype=quantity.qtype) + + return mlmc.quantity.quantity.Quantity( + quantity_type=moments_qtype, + input_quantities=[quantity], + operation=eval_cov + ) def kurtosis_numerator(chunk_diff, chunk_spec, l_means): """ - Estimate sample kurtosis nominator: - E[(Y_l - E[Y_l])^4] - :param chunk_diff: np.ndarray, [quantity shape, number of samples] - :param chunk_spec: quantity_spec.ChunkSpec - :return: np.ndarray, unchanged shape + Compute the 
numerator for the sample kurtosis: + E[(Y_l - E[Y_l])^4] + :param chunk_diff: np.ndarray [quantity shape, number of samples] + :param chunk_spec: quantity_spec.ChunkSpec describing current level and chunk. + :param l_means: List of per-level means used for centering. + :return: np.ndarray of the same shape as input. """ - chunk_diff = (chunk_diff - l_means[chunk_spec.level_id]) ** 4 - return chunk_diff + return (chunk_diff - l_means[chunk_spec.level_id]) ** 4 def level_kurtosis(quantity, means_obj): """ - Estimate sample kurtosis at each level as: - E[(Y_l - E[Y_l])^4] / (Var[Y_l])^2, where Y_l = fine_l - coarse_l - :param quantity: Quantity - :param means_obj: Quantity.QuantityMean - :return: np.ndarray, kurtosis per level + Estimate the sample kurtosis for each level: + E[(Y_l - E[Y_l])^4] / (Var[Y_l])^2, where Y_l = fine_l - coarse_l + + :param quantity: Quantity instance. + :param means_obj: QuantityMean object containing level means and variances. + :return: np.ndarray of kurtosis values per level. """ numerator_means_obj = estimate_mean(quantity, operation_func=kurtosis_numerator, l_means=means_obj.l_means) - kurtosis = numerator_means_obj.l_means / (means_obj.l_vars)**2 + kurtosis = numerator_means_obj.l_means / (means_obj.l_vars) ** 2 return kurtosis diff --git a/mlmc/quantity/quantity_spec.py b/mlmc/quantity/quantity_spec.py index ea25dc66..377adad8 100644 --- a/mlmc/quantity/quantity_spec.py +++ b/mlmc/quantity/quantity_spec.py @@ -5,24 +5,56 @@ @attr.s(auto_attribs=True, eq=False) class QuantitySpec: + """ + Specification of a physical quantity for simulation or data storage. + + :param name: Name of the quantity (e.g. 'pressure', 'velocity'). + :param unit: Unit of the quantity (e.g. 'm/s', 'Pa'). + :param shape: Tuple describing the shape of the data (e.g. (64, 64)). + :param times: List of time points associated with this quantity. + :param locations: List of either string-based identifiers or 3D coordinates + (x, y, z) where the quantity is defined. 
+ """ + name: str unit: str shape: Tuple[int, int] times: List[float] locations: Union[List[str], List[Tuple[float, float, float]]] - # Note: auto generated eq raises ValueError def __eq__(self, other): - if (self.name, self.unit) == (other.name, other.unit) \ - and np.array_equal(self.shape, other.shape)\ - and np.array_equal(self.times, other.times)\ - and not (set(self.locations) - set(other.locations)): - return True - return False + """ + Compare two QuantitySpec instances for equality. + + :param other: Another QuantitySpec instance to compare with. + :return: True if both instances describe the same quantity, False otherwise. + """ + if not isinstance(other, QuantitySpec): + return False + + # Compare name, unit, shape, and times + same_basic_attrs = ( + (self.name, self.unit) == (other.name, other.unit) + and np.array_equal(self.shape, other.shape) + and np.array_equal(self.times, other.times) + ) + + # Compare locations (set difference = ∅ → same) + same_locations = not (set(self.locations) - set(other.locations)) + + return same_basic_attrs and same_locations @attr.s(auto_attribs=True) class ChunkSpec: + """ + Specification of a simulation or dataset chunk. + + :param chunk_id: Integer identifier of the chunk. + :param chunk_slice: Slice object defining the range of data indices in the chunk. + :param level_id: Identifier of the refinement or simulation level. + """ + chunk_id: int = None chunk_slice: slice = None level_id: int = None diff --git a/mlmc/quantity/quantity_types.py b/mlmc/quantity/quantity_types.py index 41d6ea7f..56b42b69 100644 --- a/mlmc/quantity/quantity_types.py +++ b/mlmc/quantity/quantity_types.py @@ -7,23 +7,37 @@ class QType(metaclass=abc.ABCMeta): + """ + Base class for quantity types. + + :param qtype: inner/contained QType or Python type + """ + def __init__(self, qtype): self._qtype = qtype def size(self) -> int: """ - Size of type + Size of the type in flattened units. 
+ :return: int """ + raise NotImplementedError def base_qtype(self): + """ + Return the base scalar/bool type for nested types. + + :return: QType + """ return self._qtype.base_qtype() def replace_scalar(self, substitute_qtype): """ - Find ScalarType and replace it with substitute_qtype - :param substitute_qtype: QType, replaces ScalarType - :return: QType + Find ScalarType and replace it with substitute_qtype. + + :param substitute_qtype: QType that replaces ScalarType + :return: QType (new instance with scalar replaced) """ inner_qtype = self._qtype.replace_scalar(substitute_qtype) new_qtype = copy.deepcopy(self) @@ -31,85 +45,121 @@ def replace_scalar(self, substitute_qtype): return new_qtype @staticmethod - def keep_dims(chunk): + def keep_dims(chunk: np.ndarray) -> np.ndarray: """ - Always keep chunk shape to be [M, chunk size, 2]! - For scalar quantities, the input block can have the shape (chunk size, 2) - Sometimes we need to 'flatten' first few shape to have desired chunk shape - :param chunk: list - :return: list + Ensure chunk has shape [M, chunk size, 2]. + + For scalar quantities the input block can have shape (chunk size, 2). + Sometimes we need to 'flatten' first few dimensions to achieve desired chunk shape. 
+ + :param chunk: numpy array + :return: numpy array with shape [M, chunk size, 2] + :raises ValueError: if chunk.ndim < 2 """ # Keep dims [M, chunk size, 2] if len(chunk.shape) == 2: chunk = chunk[np.newaxis, :] elif len(chunk.shape) > 2: - chunk = chunk.reshape((np.prod(chunk.shape[:-2]), chunk.shape[-2], chunk.shape[-1])) + chunk = chunk.reshape((int(np.prod(chunk.shape[:-2])), chunk.shape[-2], chunk.shape[-1])) else: - raise ValueError("Chunk shape not supported") + raise ValueError("Chunk shape not supported: need ndim >= 2") return chunk - def _make_getitem_op(self, chunk, key): + def _make_getitem_op(self, chunk: np.ndarray, key): """ - Operation - :param chunk: level chunk, list with shape [M, chunk size, 2] - :param key: parent QType's key, needed for ArrayType - :return: list + Extract a slice from chunk while preserving chunk dims. + + :param chunk: level chunk, numpy array with shape [M, chunk size, 2] + :param key: index/slice used by parent QType + :return: numpy array with shape [M', chunk size', 2] """ return QType.keep_dims(chunk[key]) - def reshape(self, data): + def reshape(self, data: np.ndarray) -> np.ndarray: + """ + Default reshape (identity). + + :param data: numpy array + :return: numpy array + """ return data class ScalarType(QType): + """ + Scalar quantity type (leaf type). 
+ """ + def __init__(self, qtype=float): + """ + :param qtype: Python type or nested type used as underlying scalar type + """ self._qtype = qtype def base_qtype(self): + """ + :return: base scalar QType (self or underlying BoolType base) + """ if isinstance(self._qtype, BoolType): return self._qtype.base_qtype() return self def size(self) -> int: - if hasattr(self._qtype, 'size'): + """ + :return: int size of the scalar (defaults to 1 or uses `_qtype.size()` if present) + """ + if hasattr(self._qtype, "size"): return self._qtype.size() return 1 def replace_scalar(self, substitute_qtype): """ - Find ScalarType and replace it with substitute_qtype - :param substitute_qtype: QType, replaces ScalarType - :return: QType + Replace ScalarType with substitute type. + + :param substitute_qtype: QType that replaces ScalarType + :return: substitute_qtype """ return substitute_qtype class BoolType(ScalarType): + """ + Boolean scalar type (inherits ScalarType). + """ pass class ArrayType(QType): - def __init__(self, shape, qtype: QType): + """ + Array quantity type. + :param shape: int or tuple describing array shape + :param qtype: contained QType for array elements + """ + + def __init__(self, shape, qtype: QType): if isinstance(shape, int): shape = (shape,) - self._shape = shape self._qtype = qtype def size(self) -> int: - return np.prod(self._shape) * self._qtype.size() + """ + :return: total flattened size (product of shape * inner qtype size) + """ + return int(np.prod(self._shape)) * int(self._qtype.size()) def get_key(self, key): """ - ArrayType indexing + ArrayType indexing. 
+ :param key: int, tuple of ints or slice objects - :return: QuantityType - ArrayType or self._qtype + :return: Tuple (QuantityType, offset) where offset is 0 for this implementation """ - # Get new shape + # Get new shape by applying indexing on an empty array of the target shape new_shape = np.empty(self._shape)[key].shape - # One selected item is considered to be a scalar QType + # If one selected item is considered to be a scalar QType if len(new_shape) == 1 and new_shape[0] == 1: new_shape = () @@ -121,26 +171,42 @@ def get_key(self, key): q_type = self._qtype return q_type, 0 - def _make_getitem_op(self, chunk, key): + def _make_getitem_op(self, chunk: np.ndarray, key): """ - Operation - :param chunk: list [M, chunk size, 2] - :param key: slice - :return: + Slice operation for ArrayType while restoring original shape. + + :param chunk: numpy array [M, chunk size, 2] + :param key: slice or index to apply on the array-shaped leading dims + :return: numpy array with preserved dims via QType.keep_dims """ - # Reshape M to original shape to allow access assert self._shape is not None chunk = chunk.reshape((*self._shape, chunk.shape[-2], chunk.shape[-1])) return QType.keep_dims(chunk[key]) - def reshape(self, data): + def reshape(self, data: np.ndarray) -> np.ndarray: + """ + Reshape flattened data to array shape. + + :param data: numpy array + :return: reshaped numpy array + """ if isinstance(self._qtype, ScalarType): return data.reshape(self._shape) else: - return data.reshape((*self._shape, np.prod(data.shape) // np.prod(self._shape))) + # assume trailing dimension belongs to inner types + total = np.prod(data.shape) + leading = int(np.prod(self._shape)) + return data.reshape((*self._shape, int(total // leading))) class TimeSeriesType(QType): + """ + Time-series quantity type. 
+ + :param times: iterable of time points + :param qtype: QType for each time slice + """ + def __init__(self, times, qtype): if isinstance(times, np.ndarray): times = times.tolist() @@ -148,96 +214,176 @@ def __init__(self, times, qtype): self._qtype = qtype def size(self) -> int: - return len(self._times) * self._qtype.size() + """ + :return: total size = number of time points * inner qtype.size() + """ + return len(self._times) * int(self._qtype.size()) def get_key(self, key): + """ + Get a qtype and offset corresponding to a given time key. + + :param key: time value to locate + :return: Tuple (q_type, offset) + """ q_type = self._qtype try: position = self._times.index(key) - except KeyError: - print("Item " + str(key) + " was not found in TimeSeries" + ". Available items: " + str(list(self._times))) + except ValueError: + # keep behavior similar to original: print available items + print( + "Item " + + str(key) + + " was not found in TimeSeries" + + ". Available items: " + + str(list(self._times)) + ) + # raise to make the error explicit + raise return q_type, position * q_type.size() @staticmethod def time_interpolation(quantity, value): """ - Interpolation in time - :param quantity: Quantity instance - :param value: point where to interpolate - :return: Quantity + Interpolate a time-series quantity to a single time value. 
+ + :param quantity: Quantity instance with qtype being a TimeSeriesType + :param value: float time value where to interpolate + :return: Quantity object representing interpolated value """ def interp(y): - split_indeces = np.arange(1, len(quantity.qtype._times)) * quantity.qtype._qtype.size() - y = np.split(y, split_indeces, axis=-3) + split_indices = np.arange(1, len(quantity.qtype._times)) * quantity.qtype._qtype.size() + y = np.split(y, split_indices, axis=-3) f = interpolate.interp1d(quantity.qtype._times, y, axis=0) return f(value) - return mlmc.quantity.quantity.Quantity(quantity_type=quantity.qtype._qtype, input_quantities=[quantity], operation=interp) + + return mlmc.quantity.quantity.Quantity( + quantity_type=quantity.qtype._qtype, + input_quantities=[quantity], + operation=interp + ) class FieldType(QType): + """ + Field type composed of named entries each having the same base qtype. + + :param args: List of (name, QType) pairs + """ + def __init__(self, args: List[Tuple[str, QType]]): - """ - QType must have same structure - :param args: - """ self._dict = dict(args) self._qtype = args[0][1] assert all(q_type.size() == self._qtype.size() for _, q_type in args) def size(self) -> int: - return len(self._dict.keys()) * self._qtype.size() + """ + :return: total size = number of fields * inner qtype size + """ + return len(self._dict.keys()) * int(self._qtype.size()) def get_key(self, key): + """ + Access sub-field by name. + + :param key: field name + :return: Tuple (q_type, offset) + """ q_type = self._qtype try: position = list(self._dict.keys()).index(key) - except KeyError: - print("Key " + str(key) + " was not found in FieldType" + - ". Available keys: " + str(list(self._dict.keys())[:5]) + "...") + except ValueError: + print( + "Key " + + str(key) + + " was not found in FieldType" + + ". Available keys: " + + str(list(self._dict.keys())[:5]) + + "..." 
+ ) + raise return q_type, position * q_type.size() class DictType(QType): + """ + Dictionary-like type of named QTypes which may differ in size. + + :param args: List of (name, QType) pairs + """ + def __init__(self, args: List[Tuple[str, QType]]): - self._dict = dict(args) # Be aware we it is ordered dictionary + self._dict = dict(args) # keep ordered mapping semantics self._check_base_type() def _check_base_type(self): + """ + Ensure all contained qtypes share the same base_qtype. + + :raises TypeError: if base_qtypes differ + """ qtypes = list(self._dict.values()) qtype_0_base_type = qtypes[0].base_qtype() for qtype in qtypes[1:]: if not isinstance(qtype.base_qtype(), type(qtype_0_base_type)): - raise TypeError("qtype {} has base QType {}, expecting {}. " - "All QTypes must have same base QType, either SacalarType or BoolType". - format(qtype, qtype.base_qtype(), qtype_0_base_type)) + raise TypeError( + "qtype {} has base QType {}, expecting {}. " + "All QTypes must have same base QType, either ScalarType or BoolType".format( + qtype, qtype.base_qtype(), qtype_0_base_type + ) + ) def base_qtype(self): + """ + :return: base_qtype of the first element + """ return next(iter(self._dict.values())).base_qtype() def size(self) -> int: + """ + :return: total flattened size (sum of sizes of contained qtypes) + """ return int(sum(q_type.size() for _, q_type in self._dict.items())) def get_qtypes(self): + """ + :return: iterable of contained qtypes + """ return self._dict.values() def replace_scalar(self, substitute_qtype): """ - Find ScalarType and replace it with substitute_qtype - :param substitute_qtype: QType, replaces ScalarType - :return: DictType + Replace scalar types recursively inside dict entries. 
+ + :param substitute_qtype: QType that replaces ScalarType + :return: new DictType instance """ dict_items = [] for key, qtype in self._dict.items(): new_qtype = qtype.replace_scalar(substitute_qtype) - dict_items.append((key, new_qtype)) + dict_items.append((key, new_qtype)) return DictType(dict_items) def get_key(self, key): + """ + Return the QType and starting offset for a named key. + + :param key: name of entry + :return: Tuple (q_type, start_offset) + """ try: q_type = self._dict[key] except KeyError: - print("Key " + str(key) + " was not found in DictType" + - ". Available keys: " + str(list(self._dict.keys())[:5]) + "...") + print( + "Key " + + str(key) + + " was not found in DictType" + + ". Available keys: " + + str(list(self._dict.keys())[:5]) + + "..." + ) + raise + start = 0 for k, qt in self._dict.items(): if k == key: diff --git a/mlmc/random/correlated_field.py b/mlmc/random/correlated_field.py index cebd1572..7867c7b4 100644 --- a/mlmc/random/correlated_field.py +++ b/mlmc/random/correlated_field.py @@ -12,16 +12,17 @@ def kozeny_carman(porosity, m, factor, viscosity): """ Kozeny-Carman law. Empirical relationship between porosity and conductivity. + :param porosity: Porosity value. :param m: Power. Suitable values are 1 < m < 4 - :param factor: [m^2] - E.g. 1e-7 , m = 3.48; juta fibers - 2.2e-8 , 1.46; glass fibers - 1.8e-13, 2.89; erruptive material - 1e-12 2.76; erruptive material - 1.8e-12 1.99; basalt - :param viscosity: [Pa . s], water: 8.90e-4 - :return: + :param factor: Factor [m^2]. 
Examples:
+                       1e-7    , m = 3.48; jute fibers
+                       2.2e-8  , m = 1.46; glass fibers
+                       1.8e-13, m = 2.89; eruptive material
+                       1e-12  , m = 2.76; eruptive material
+                       1.8e-12, m = 1.99; basalt
+    :param viscosity: Fluid viscosity [Pa.s], e.g., water: 8.90e-4
+    :return: Conductivity
     """
     assert np.all(viscosity > 1e-10)
     porosity = np.minimum(porosity, 1-1e-10)
@@ -33,10 +34,12 @@ def kozeny_carman(porosity, m, factor, viscosity):
 
 def positive_to_range(exp, a, b):
     """
-    Mapping a positive parameter 'exp' from the interval <0, \infty) to the interval .
+
+    :param exp: Positive parameter (e.g., lognormal variable)
+    :param a: Lower bound of target interval
+    :param b: Upper bound of target interval
+    :return: Mapped value in [a, b)
     """
     return b * (1 - (b - a) / (b + (b - a) * exp))
 
@@ -44,12 +47,12 @@ def positive_to_range(exp, a, b):
 class Field:
     def __init__(self, name, field=None, param_fields=[], regions=[]):
         """
-        :param name: Name of the field.
-        :param field: scalar (const field), or instance of SpatialCorrelatedField, or a callable
-        for evaluation of the field from its param_fields.
-        :param regions: Domain where field is sampled.
-        :param param_fields: List of names of parameter fields, dependees.
-        TODO: consider three different derived classes for: const, random and func fields.
+        Initialize a Field object.
+ + :param name: Name of the field + :param field: Scalar (const), RandomFieldBase, or callable function + :param param_fields: List of dependent parameter fields + :param regions: List of region names where the field is defined """ self.correlated_field = None self.const = None @@ -67,8 +70,6 @@ def __init__(self, name, field=None, param_fields=[], regions=[]): assert len(param_fields) == 0 else: assert len(param_fields) > 0, field - - # check callable try: params = [np.ones(2) for i in range(len(param_fields))] field(*params) @@ -81,74 +82,61 @@ def __init__(self, name, field=None, param_fields=[], regions=[]): def set_points(self, points): """ - Internal method to set evaluation points. See Fields.set_points. + Set points for field evaluation. + + :param points: Array of points where the field will be evaluated """ if self.const is not None: self._sample = self.const * np.ones(len(points)) elif self.correlated_field is not None: self.correlated_field.set_points(points) - if type(self.correlated_field) is SpatialCorrelatedField: - # TODO: make n_terms_range an optianal parmater for SpatialCorrelatedField + if type(self.correlated_field) is SpatialCorrelatedField: self.correlated_field.svd_dcmp(n_terms_range=(10, 100)) else: pass def sample(self): """ - Internal method to generate/compute new sample. - :return: + Generate or compute a new sample of the field. + + :return: Sample values of the field """ if self.const is not None: return self._sample elif self.correlated_field is not None: self._sample = self.correlated_field.sample() else: - params = [ pf._sample for pf in self.param_fields] + params = [pf._sample for pf in self.param_fields] self._sample = self._func(*params) return self._sample class Fields: - def __init__(self, fields): """ - Creates a new set of cross dependent random fields. - Currently no support for cross-correlated random fields. - A set of independent basic random fields must exist - other fields can be dependent in deterministic way. 
- - :param fields: A list of dependent fields. + Create a set of cross-dependent random fields. - Example: - rf = SpatialCorrelatedField(log=True) - Fields([ - Field('por_top', rf, regions='ground_0'), - Field('porosity_top', positive_to_range, ['por_top', 0.02, 0.1], regions='ground_0'), - Field('por_bot', rf, regions='ground_1'), - Field('porosity_bot', positive_to_range, ['por_bot', 0.01, 0.05], regions='ground_1'), - Field('conductivity_top', cf.kozeny_carman, ['porosity_top', 1, 1e-8, water_viscosity], regions='ground_0'), - Field('conductivity_bot', cf.kozeny_carman, ['porosity_bot', 1, 1e-10, water_viscosity],regions='ground_1') - ]) - - TODO: use topological sort to fix order of 'fields' - TODO: syntactic sugar for calculating with fields (like with np.arrays). + :param fields: List of Field objects """ self.fields_orig = fields self.fields_dict = {} self.fields = [] - # Have to make a copy of the fields since we want to generate the samples in them - # and the given instances of Field can be used by an independent FieldSet instance. for field in self.fields_orig: new_field = copy.copy(field) if new_field.param_fields: - new_field.param_fields = [self._get_field_obj(field, new_field.regions) for field in new_field.param_fields] + new_field.param_fields = [self._get_field_obj(f, new_field.regions) + for f in new_field.param_fields] self.fields_dict[new_field.name] = new_field self.fields.append(new_field) def _get_field_obj(self, field_name, regions): """ - Get fields by name, replace constants by constant fields for unification. + Get Field object by name or create constant field. 
+ + :param field_name: Field name or constant + :param regions: Regions of the field + :return: Field object """ if type(field_name) in [float, int]: const_field = Field("const_{}".format(field_name), field_name, regions=regions) @@ -156,56 +144,31 @@ def _get_field_obj(self, field_name, regions): self.fields_dict[const_field.name] = const_field return const_field else: - assert field_name in self.fields_dict, "name: {} dict: {}".format(field_name, self.fields_dict) + assert field_name in self.fields_dict return self.fields_dict[field_name] - @property - def names(self): - return self.fields_dict.keys() - - # def iterative_dfs(self, graph, start, path=[]): - # q = [start] - # while q: - # v = q.pop(0) - # if v not in path: - # path = path + [v] - # q = graph[v] + q - # - # return path - def set_outer_fields(self, outer): """ - Set fields that will be in a dictionary produced by FieldSet.sample() call. - :param outer: A list of names of fields that are sampled. - :return: + Set fields to be included in the sampled dictionary. + + :param outer: List of outer field names """ outer_set = set(outer) for f in self.fields: - if f.name in outer_set: - f.is_outer = True - else: - f.is_outer = False + f.is_outer = f.name in outer_set def set_points(self, points, region_ids=[], region_map={}): """ - Set mesh related data to fields. - - set points for sample evaluation - - translate region names to region ids in fields - - create maps from region constraned point sets of fields to full point set - :param points: np array of points for field evaluation - :param regions: regions of the points; - empty means no points for fields restricted to regions and all points for unrestricted fields - :return: + Assign evaluation points to each field. 
+ + :param points: Array of points for field evaluation + :param region_ids: Optional array of region ids for each point + :param region_map: Mapping from region name to region id """ self.n_elements = len(points) - print("n elements: {}, len(points): {}".format(self.n_elements, len(points))) - - #assert len(points) == len(region_ids) reg_points = {} for i, reg_id in enumerate(region_ids): - reg_list = reg_points.get(reg_id, []) - reg_list.append(i) - reg_points[reg_id] = reg_list + reg_points.setdefault(reg_id, []).append(i) for field in self.fields: point_ids = [] @@ -221,73 +184,42 @@ def set_points(self, points, region_ids=[], region_map={}): def sample(self): """ - Return dictionary of sampled fields. - :return: { 'field_name': sample, ...} + Sample all outer fields. + + :return: Dictionary with field names as keys and sampled arrays as values """ result = {} for field in self.fields: sample = field.sample() if field.is_outer: - if field.name == "cond_tn": - result[field.name] = np.zeros((self.n_elements, 3)) - else: - result[field.name] = np.zeros(self.n_elements) - #result[field.name] = np.zeros(self.n_elements) + shape = (self.n_elements, 3) if field.name == "cond_tn" else self.n_elements + result[field.name] = np.zeros(shape) result[field.name][field.full_sample_ids] = sample return result class RandomFieldBase: """ - Base class for various methods for generating random fields. - - Generating realizations of a spatially correlated random field F for a fixed set of points at X. - E[F(x)] = mu(x) - Cov_ij = Cov[x_i,x_j] = E[(F(x_i) - mu(x))(F(x_j) - mu(x))] - - We assume stationary random field with covariance matrix Cov_ij: - Cov_i,j = c(x_i - x_j) - where c(X) is the "stationary covariance" function. 
We assume: - c(X) = sigma^2 exp( -|X^t K X|^(alpha/2) ) - for spatially heterogeneous sigma(X) we consider particular non-stationary generalization:\ - Cov_i,i = sigma(x_i)*sigma(x_j) exp( -|X^t K X|^(alpha/2) ); X = x_i - x_j - - where: - - sigma(X) is the standard deviance of the single uncorrelated value - - K is a positive definite tensor with eigen vectors corresponding to - main directions and eigen values equal to (1/l_i)^2, where l_i is correlation - length in singel main direction. - - alpha is =1 for "exponential" and =2 for "Gauss" correlation - - SVD decomposition: - Considering first m vectors, such that lam(m)/lam(0) <0.1 - - Example: - ``` - field = SpatialCorrelatedField(corr_exp='exp', corr_length=1.5) - X, Y = np.mgrid[0:1:10j, 0:1:10j] - points = np.vstack([X.ravel(), Y.ravel()]) - field.set_points(points) - sample = field.sample() - - ``` + Base class for generating spatially correlated random fields. + + Random field F(x) with mean E[F(x)] = mu(x) and covariance Cov[x_i,x_j]. + Stationary covariance: Cov_ij = sigma^2 * exp(-|X^T K X|^(alpha/2)), + X = x_i - x_j. + Supports optional non-stationary variance sigma(X). """ def __init__(self, corr_exp='gauss', dim=2, corr_length=1.0, aniso_correlation=None, mu=0.0, sigma=1.0, log=False, **kwargs): """ - :param corr_exp: 'gauss', 'exp' or a float (should be >= 1) - :param dim: dimension of the domain (size of point coords) - :param corr_length: scalar, correlation length L > machine epsilon; tensor K = (1/L)^2 - :param aniso_correlation: 3x3 array; K tensor, overrides correlation length - :param mu - mu field (currently just a constant) - :param sigma - sigma field (currently just a constant) + Initialize a random field. 
- TODO: - - implement anisotropy in the base class using transformation matrix for the points - - use transformation matrix also for the corr_length - - replace corr_exp by aux classes for various correlation functions and pass them here - - more general set of correlation functions + :param corr_exp: 'gauss', 'exp', or float >=1 (correlation exponent) + :param dim: Dimension of the domain + :param corr_length: Scalar correlation length + :param aniso_correlation: Optional anisotropic 3x3 correlation tensor + :param mu: Mean (scalar or array) + :param sigma: Standard deviation (scalar or array) + :param log: If True, output field is exponentiated """ self.dim = dim self.log = log @@ -299,8 +231,6 @@ def __init__(self, corr_exp='gauss', dim=2, corr_length=1.0, else: self.correlation_exponent = float(corr_exp) - # TODO: User should prescribe scaling for main axis and their rotation. - # From this we should construct the transformation matrix for the points self._corr_length = corr_length if aniso_correlation is None: assert corr_length > np.finfo(float).eps @@ -308,31 +238,26 @@ def __init__(self, corr_exp='gauss', dim=2, corr_length=1.0, self._max_corr_length = corr_length else: self.correlation_tensor = aniso_correlation - self._max_corr_length = la.norm(aniso_correlation, ord=2) # largest eigen value + self._max_corr_length = la.norm(aniso_correlation, ord=2) - #### Attributes set through `set_points`. self.points = None - # Evaluation points of the field. self.mu = mu - # Mean in points. Or scalar. self.sigma = sigma - # Standard deviance in points. Or scalar. - - self._initialize(**kwargs) # Implementation dependent initialization. + self._initialize(**kwargs) def _initialize(self, **kwargs): + """Implementation-specific initialization. To be overridden in subclasses.""" raise NotImplementedError() def set_points(self, points, mu=None, sigma=None): """ - :param points: N x d array. Points X_i where the field will be evaluated. d is the dimension. 
- :param mu: Scalar or N array. Mean value of uncorrelated field: E( F(X_i)). - :param sigma: Scalar or N array. Standard deviance of uncorrelated field: sqrt( E ( F(X_i) - mu_i )^2 ) - :return: None + Set points for field evaluation. + + :param points: Array of points (N x dim) + :param mu: Optional mean at points + :param sigma: Optional standard deviation at points """ points = np.array(points, dtype=float) - - assert len(points.shape) >= 1 assert points.shape[1] == self.dim self.n_points, self.dimension = points.shape self.points = points @@ -345,49 +270,43 @@ def set_points(self, points, mu=None, sigma=None): if sigma is not None: self.sigma = sigma self.sigma = np.array(self.sigma, dtype=float) - assert self.sigma.shape == () or sigma.shape == (len(points),) + assert self.sigma.shape == () or self.sigma.shape == (len(points),) def _set_points(self): + """Optional internal method to update points. Can be overridden.""" pass def sample(self): """ - :param uncorelated: Random samples from standard normal distribution. - Removed as the spectral method do not support it. - :return: Random field evaluated in points given by 'set_points'. - """ - # if uncorelated is None: - # uncorelated = np.random.normal(0, 1, self.n_approx_terms) - # else: - # assert uncorelated.shape == (self.n_approx_terms,) + Generate a realization of the random field. + :return: Array of field values at set points + """ field = self._sample() field = self.sigma * field + self.mu - - if not self.log: - return field - return np.exp(field) + return np.exp(field) if self.log else field def _sample(self, uncorrelated): + """ + Implementation-specific sample generation. To be overridden. + + :param uncorrelated: Array of uncorrelated standard normal samples + :return: Field sample + """ raise NotImplementedError() class SpatialCorrelatedField(RandomFieldBase): + """ + Generate spatially correlated fields using covariance matrix and KL decomposition. 
+ """ def _initialize(self, **kwargs): - """ - Called after initialization in common constructor. - """ - - ### Attributes computed in precalculation. + """Initialization specific to SVD/KL-based spatial correlation.""" self.cov_mat = None - # Covariance matrix (dense). self._n_approx_terms = None - # Length of the sample vector, number of KL (Karhunen-Loe?ve) expansion terms. self._cov_l_factor = None - # (Reduced) L factor of the SVD decomposition of the covariance matrix. self._sqrt_ev = None - # (Reduced) square roots of singular values. def _set_points(self): self.cov_mat = None @@ -395,17 +314,16 @@ def _set_points(self): def cov_matrix(self): """ - Setup dense covariance matrix for given set of points. - :return: None. + Compute dense covariance matrix for current points. + + :return: Covariance matrix """ assert self.points is not None, "Points not set, call set_points." - - self._points_bbox = box = (np.min(self.points, axis=0), np.max(self.points, axis=0)) - diameter = np.max(np.abs(box[1] - box[0])) + self._points_bbox = (np.min(self.points, axis=0), np.max(self.points, axis=0)) + diameter = np.max(np.abs(self._points_bbox[1] - self._points_bbox[0])) self._relative_corr_length = self._max_corr_length / diameter - - # sigma_sqr_mat = np.outer(self.sigma, self.sigma.T) self._sigma_sqr_max = np.max(self.sigma) ** 2 + n_pt = len(self.points) self.cov_mat = np.empty((n_pt, n_pt)) corr_exp = self.correlation_exponent / 2.0 @@ -415,19 +333,16 @@ def cov_matrix(self): diff_row = self.points - pt len_sqr_row = np.sum(diff_row.dot(self.correlation_tensor) * diff_row, axis=-1) self.cov_mat[i_row, :] = np.exp(-len_sqr_row ** corr_exp) + return self.cov_mat def _eigen_value_estimate(self, m): """ - Estimate of the m-th eigen value of the covariance matrix. - According to paper: Schwab, Thodor: KL Approximation of Random Fields by ... - However for small gamma the asimtotics holds just for to big values of 'm'. - We rather need to find a semiempricial formula. 
- greater - :param m: - :return: + Semi-empirical estimate of the m-th eigenvalue of covariance matrix. + + :param m: Eigenvalue index + :return: Estimated eigenvalue """ - assert self.cov_mat is not None d = self.dimension alpha = self.correlation_exponent gamma = self._relative_corr_length @@ -435,24 +350,11 @@ def _eigen_value_estimate(self, m): def svd_dcmp(self, precision=0.01, n_terms_range=(1, np.inf)): """ - Does decomposition of covariance matrix defined by set of points - :param precision: Desired accuracy of the KL approximation, smaller eigen values are dropped. - :param n_terms_range: (min, max) number of terms in KL expansion to use. The number of terms estimated from - given precision is snapped to the given interval. - - truncated SVD: - cov_mat = U*diag(ev) * V, - _cov_l_factor = U[:,0:m]*sqrt(ev[0:m]) + Perform truncated SVD for Karhunen-Loeve decomposition. - Note on number of terms: - According to: C. Schwab and R. A. Todor: KL Approximation of Random Fields by Generalized Fast Multiploe Method - the eigen values should decay as (Proposition 2.18): - lambda_m ~ sigma^2 * ( 1/gamma ) **( m**(1/d) + alpha ) / Gamma(0.5 * m**(1/d) ) - where gamma = correlation length / domain diameter - ans alpha is the correlation exponent. Gamma is the gamma function. - ... 
should be checked experimantaly and generalized for sigma(X) - - :return: + :param precision: Desired accuracy + :param n_terms_range: Min/max number of KL terms + :return: (_cov_l_factor, singular values) """ if self.cov_mat is None: self.cov_matrix() @@ -461,55 +363,58 @@ def svd_dcmp(self, precision=0.01, n_terms_range=(1, np.inf)): U, ev, VT = np.linalg.svd(self.cov_mat) m = self.n_points else: - range = list(n_terms_range) - range[0] = max(1, range[0]) - range[1] = min(self.n_points, range[1]) - - prec_range = (self._eigen_value_estimate(range[0]), self._eigen_value_estimate(range[1])) + range_vals = [max(1, n_terms_range[0]), min(self.n_points, n_terms_range[1])] + prec_range = (self._eigen_value_estimate(range_vals[0]), self._eigen_value_estimate(range_vals[1])) if precision < prec_range[0]: - m = range[0] + m = range_vals[0] elif precision > prec_range[1]: - m = range[1] + m = range_vals[1] else: f = lambda m: self._eigen_value_estimate(m) - precision - m = sp.optmize.bisect(f, range[0], range[1], xtol=0.5, ) - - m = max(m, range[0]) + m = sp.optimize.bisect(f, range_vals[0], range_vals[1], xtol=0.5) + m = max(m, range_vals[0]) threshold = 2 * precision - # TODO: Test if we should cut eigen values by relative (like now) or absolute value - while threshold >= precision and m <= range[1]: - #print("treshold: {} m: {} precision: {} max_m: {}".format(threshold, m, precision, range[1])) + while threshold >= precision and m <= range_vals[1]: U, ev, VT = randomized_svd(self.cov_mat, n_components=m, n_iter=3, random_state=None) threshold = ev[-1] / ev[0] m = int(np.ceil(1.5 * m)) - m = len(ev) - m = min(m, range[1]) + m = min(len(ev), range_vals[1]) - #print("KL approximation: {} for {} points.".format(m, self.n_points)) self.n_approx_terms = m - self._sqrt_ev = np.sqrt(ev[0:m]) - self._cov_l_factor = U[:, 0:m].dot(np.diag(self._sqrt_ev)) + self._sqrt_ev = np.sqrt(ev[:m]) + self._cov_l_factor = U[:, :m].dot(np.diag(self._sqrt_ev)) self.cov_mat = None - return 
self._cov_l_factor, ev[0:m] + return self._cov_l_factor, ev[:m] def _sample(self): """ - :param uncorelated: Random samples from standard normal distribution. - :return: Random field evaluated in points given by 'set_points'. + Generate a field realization using KL decomposition. + + :return: Field sample array """ if self._cov_l_factor is None: self.svd_dcmp() - uncorelated = np.random.normal(0, 1, self.n_approx_terms) - return self._cov_l_factor.dot(uncorelated) + uncorrelated = np.random.normal(0, 1, self.n_approx_terms) + return self._cov_l_factor.dot(uncorrelated) class GSToolsSpatialCorrelatedField(RandomFieldBase): + """ + Spatially correlated random field using the GSTools library. + + Uses Fourier modes to generate spatial random fields efficiently. + """ def __init__(self, model, mode_no=1000, log=False, sigma=1, seed=None): """ - :param model: instance of covariance model class, which parent is gstools.covmodel.CovModel - :param mode_no: number of Fourier modes, default: 1000 as in gstools package + Initialize GSTools-based spatial random field. + + :param model: gstools covariance model (subclass of gstools.CovModel) + :param mode_no: Number of Fourier modes (default 1000) + :param log: If True, output field is exponentiated + :param sigma: Standard deviation + :param seed: Optional random seed for reproducibility """ self.model = model self.mode_no = mode_no @@ -521,172 +426,164 @@ def __init__(self, model, mode_no=1000, log=False, sigma=1, seed=None): def change_srf(self, seed): """ - Spatial random field with new seed - :param seed: int, random number generator seed - :return: None + Generate a new spatial random field with a different seed. + + :param seed: Random seed """ self.srf = gstools.SRF(self.model, seed=seed, mode_no=self.mode_no) def random_field(self): """ - Generate the spatial random field - :return: field, np.ndarray + Evaluate the spatial random field at the current points. 
+ + :return: Field values (np.ndarray) """ if self.dim == 1: x = self.points - x.reshape(len(x)) - field = self.srf((x,)) + x = x.reshape(len(x),) + return self.srf((x,)) elif self.dim == 2: x, y = self.points.T x = x.reshape(len(x), 1) y = y.reshape(len(y), 1) - field = self.srf((x, y)) - else: + return self.srf((x, y)) + else: # dim == 3 x, y, z = self.points.T x = x.reshape(len(x), 1) y = y.reshape(len(y), 1) z = z.reshape(len(z), 1) - field = self.srf((x, y, z)) - - return field + return self.srf((x, y, z)) def sample(self): """ - :return: Random field evaluated in points given by 'set_points' + Generate a realization of the GSTools spatial random field. + + :return: Field values (np.ndarray) """ - if not self.log: - return self.sigma * self.random_field() + self.mu - return np.exp(self.sigma * self.random_field() + self.mu) + field = self.random_field() + field = self.sigma * field + self.mu + return np.exp(field) if self.log else field class FourierSpatialCorrelatedField(RandomFieldBase): """ - Generate spatial random fields + Deprecated: Fourier-based spatial random field generator. + + Generates spatial random fields using a truncated Fourier series. + Use GSToolsSpatialCorrelatedField instead. """ def _initialize(self, **kwargs): """ - Own intialization. - :param mode_no: Number of Fourier modes + Initialization specific to Fourier-based spatial fields. 
+ + :param mode_no: Number of Fourier modes (default 1000) """ - warnings.warn("FourierSpatialCorrelatedField class is deprecated, try to use GSToolsSpatialCorrelatedField class instead", - DeprecationWarning) - self.len_scale = self._corr_length * 2*np.pi + warnings.warn( + "FourierSpatialCorrelatedField class is deprecated, use GSToolsSpatialCorrelatedField instead", + DeprecationWarning + ) + self.len_scale = self._corr_length * 2 * np.pi self.mode_no = kwargs.get("mode_no", 1000) def get_normal_distr(self): """ - Normal distributed arrays - :return: np.ndarray + Generate normal distributed random coefficients for Fourier modes. + + :return: Array of shape (2, mode_no) """ Z = np.empty((2, self.mode_no)) rng = self._get_random_stream() for i in range(2): Z[i] = rng.normal(size=self.mode_no) - return Z def _sample_sphere(self, mode_no): - """Uniform sampling on a d-dimensional sphere - Parameters - ---------- - mode_no : :class:`int`, optional - number of the Fourier modes - Returns - ------- - coord : :class:`numpy.ndarray` - x[, y[, z]] coordinates on the sphere with shape (dim, mode_no) + """ + Uniformly sample directions on the unit sphere (dim=1,2,3). 
+ + :param mode_no: Number of modes + :return: Array of unit vectors (dim, mode_no) """ coord = self._create_empty_k(mode_no) + rng = self._get_random_stream() if self.dim == 1: - rng = self._get_random_stream() ang1 = rng.random_sample(mode_no) coord[0] = 2 * np.around(ang1) - 1 elif self.dim == 2: - rng = self._get_random_stream() ang1 = rng.uniform(0.0, 2 * np.pi, mode_no) coord[0] = np.cos(ang1) coord[1] = np.sin(ang1) elif self.dim == 3: - raise NotImplementedError("For implementation see " - "https://github.com/LSchueler/GSTools/blob/randomization_revisited/gstools/field/rng.py") + raise NotImplementedError("3D implementation see GSTools repo") return coord def gau(self, mode_no=1000): """ - Compute a gaussian spectrum - :param mode_no: int, Number of Fourier modes - :return: numpy.ndarray + Gaussian Fourier spectrum. + + :param mode_no: Number of modes + :return: Array of wave vectors (dim, mode_no) """ len_scale = self.len_scale * np.sqrt(np.pi / 4) if self.dim == 1: k = self._create_empty_k(mode_no) - rng = self._get_random_stream() - k[0] = rng.normal(0., np.pi / 2.0 / len_scale ** 2, mode_no) + k[0] = self._get_random_stream().normal(0., np.pi / 2.0 / len_scale ** 2, mode_no) elif self.dim == 2: coord = self._sample_sphere(mode_no) - rng = self._get_random_stream() - rad_u = rng.random_sample(mode_no) - # weibull distribution sampling + rad_u = self._get_random_stream().random_sample(mode_no) rad = np.sqrt(np.pi) / len_scale * np.sqrt(-np.log(rad_u)) k = rad * coord elif self.dim == 3: - raise NotImplementedError("For implementation see " - "https://github.com/LSchueler/GSTools/blob/randomization_revisited/gstools/field/rng.py") + raise NotImplementedError("3D implementation see GSTools repo") return k def exp(self, mode_no=1000): """ - Compute an exponential spectrum - :param mode_no: int, Number of Fourier modes - :return: numpy.ndarray + Exponential Fourier spectrum. 
+ + :param mode_no: Number of modes + :return: Array of wave vectors (dim, mode_no) """ if self.dim == 1: k = self._create_empty_k(mode_no) - rng = self._get_random_stream() - k_u = rng.rng.uniform(-np.pi / 2.0, np.pi / 2.0, mode_no) + k_u = self._get_random_stream().uniform(-np.pi / 2.0, np.pi / 2.0, mode_no) k[0] = np.tan(k_u) / self.len_scale elif self.dim == 2: coord = self._sample_sphere(mode_no) - rng = self._get_random_stream() - rad_u = rng.random_sample(mode_no) - # sampling with ppf + rad_u = self._get_random_stream().random_sample(mode_no) rad = np.sqrt(1.0 / rad_u ** 2 - 1.0) / self.len_scale k = rad * coord elif self.dim == 3: - raise NotImplementedError("For implementation see " - "https://github.com/LSchueler/GSTools/blob/randomization_revisited/gstools/field/rng.py") + raise NotImplementedError("3D implementation see GSTools repo") return k def _create_empty_k(self, mode_no=None): - """ Create empty mode array with the correct shape. - Parameters - ---------- - mode_no : :class:`int` - number of the fourier modes - Returns - ------- - :class:`numpy.ndarray` - the empty mode array - """ - if mode_no is None: - k = np.empty(self.dim) - else: - k = np.empty((self.dim, mode_no)) + """ + Helper to create empty Fourier mode array. - return k + :param mode_no: Number of modes + :return: Empty array of shape (dim, mode_no) + """ + return np.empty((self.dim, mode_no)) if mode_no is not None else np.empty(self.dim) def _get_random_stream(self, seed=None): + """ + Return a random number generator. + + :param seed: Optional seed + """ return rand.RandomState(rand.RandomState(seed).randint(2 ** 16 - 1)) def random_field(self): """ - Calculates the random modes for the randomization method. + Generate a random field using Fourier series. 
+ + :return: Field values at points """ - y, z = None, None + # Prepare coordinates if self.dim == 1: - x = self.points - x.reshape(len(x), 1) + x = self.points.reshape(len(self.points), 1) elif self.dim == 2: x, y = self.points.T x = x.reshape(len(x), 1) @@ -698,60 +595,20 @@ def random_field(self): z = z.reshape(len(z), 1) normal_distr_values = self.get_normal_distr() + k = self.gau(self.mode_no) if self.correlation_exponent == 2 else self.exp(self.mode_no) - if self.correlation_exponent == 2: - k = self.gau(self.mode_no) - else: - k = self.exp(self.mode_no) + summed_modes = np.zeros(len(self.points)) + # Fourier summation (memory safe chunks could be implemented here) + for i in range(self.mode_no): + phase = np.sum(k[:, i] * self.points.T, axis=0) + summed_modes += normal_distr_values[0, i] * np.cos(2*np.pi*phase) + normal_distr_values[1, i] * np.sin(2*np.pi*phase) - # reshape for unstructured grid - for dim_i in range(self.dim): - k[dim_i] = np.squeeze(k[dim_i]) - k[dim_i] = np.reshape(k[dim_i], (1, len(k[dim_i]))) - - summed_modes = np.broadcast(x, y, z) - summed_modes = np.squeeze(np.zeros(summed_modes.shape)) - # Test to see if enough memory is available. 
- # In case there isn't, divide Fourier modes into smaller chunks - chunk_no = 1 - chunk_no_exp = 0 - - while True: - try: - chunk_len = int(np.ceil(self.mode_no / chunk_no)) - - for chunk in range(chunk_no): - a = chunk * chunk_len - # In case k[d,a:e] with e >= len(k[d,:]) causes errors in - # numpy, use the commented min-function below - # e = min((chunk + 1) * chunk_len, self.mode_no-1) - e = (chunk + 1) * chunk_len - - if self.dim == 1: - phase = k[0, a:e]*x - elif self.dim == 2: - phase = k[0, a:e]*x + k[1, a:e]*y - else: - phase = (k[0, a:e]*x + k[1, a:e]*y + - k[2, a:e]*z) - - summed_modes += np.squeeze( - np.sum(normal_distr_values[0, a:e] * np.cos(2.*np.pi*phase) + - normal_distr_values[1, a:e] * np.sin(2.*np.pi*phase), - axis=-1)) - except MemoryError: - chunk_no += 2**chunk_no_exp - chunk_no_exp += 1 - print('Not enough memory. Dividing Fourier modes into {} ' - 'chunks.'.format(chunk_no)) - else: - break - - field = np.sqrt(1.0 / self.mode_no) * summed_modes - return field + return np.sqrt(1.0 / self.mode_no) * summed_modes def _sample(self): """ - :return: Random field evaluated in points given by 'set_points'. + Generate a Fourier-based random field realization. + + :return: Field values """ return self.random_field() diff --git a/mlmc/random/frac_geom.py b/mlmc/random/frac_geom.py deleted file mode 100644 index 0d872646..00000000 --- a/mlmc/random/frac_geom.py +++ /dev/null @@ -1,140 +0,0 @@ -import numpy as np -import geomop.polygons as poly -import geomop.merge as merge -import geomop.polygons_io as poly_io -import geomop.format_last as lg -import geomop.layers_io -import geomop.geometry -#from geomop.plot_polygons import plot_polygon_decomposition - - - - - - - - - -def make_frac_mesh(box, mesh_step, fractures, frac_step): - """ - Make geometry and mesh for given 2d box and set of fractures. 
- :param box: [min_point, max_point]; points are np.arrays - :param fractures: Array Nx2x2, one row for every fracture given by endpoints: [p0, p1] - :return: GmshIO object with physical groups: - box: 1, - fractures: 1000 + i, i = 0, ... , N-1 - """ - regions = make_regions(mesh_step, fractures, frac_step) - decomp, reg_map = make_decomposition(box, fractures, regions) - geom = fill_lg(decomp, reg_map, regions) - return make_mesh(geom) - - -def add_reg(regions, name, dim, step=0.0, bc=False, not_used =False): - reg = lg.Region(dict(name=name, dim=dim, mesh_step=step, boundary=bc, not_used=not_used)) - reg._id = len(regions) - regions.append(reg) - -def make_regions(mesh_step, fractures, frac_step): - regions = [] - add_reg(regions, "NONE", -1, not_used=True) - add_reg(regions, "bulk_0", 2, mesh_step) - add_reg(regions, ".bc_inflow", 1, bc=True) - add_reg(regions, ".bc_outflow", 1, bc=True) - for f_id in range(len(fractures)): - add_reg(regions, "frac_{}".format(f_id), 1, frac_step) - return regions - - -def make_decomposition(box, fractures, regions): - box_pd = poly.PolygonDecomposition() - p00, p11 = box - p01 = np.array([p00[0], p11[1]]) - p10 = np.array([p11[0], p00[1]]) - box_pd.add_line(p00, p01) - seg_outflow, = box_pd.add_line(p01, p11) - box_pd.add_line(p11, p10) - seg_inflow, = box_pd.add_line(p10, p00) - - decompositions = [box_pd] - for p0, p1 in fractures: - pd = poly.PolygonDecomposition() - pd.add_line(p0, p1) - decompositions.append(pd) - - common_decomp, maps = merge.intersect_decompositions(decompositions) - #plot_polygon_decomposition(common_decomp) - #print(maps) - - # Map common_decomp objects to regions. 
- none_region_id = 0 - box_reg_id = 1 - bc_inflow_id = 2 - bc_outflow_id = 3 - frac_id_shift = 4 - decomp_shapes = [common_decomp.points, common_decomp.segments, common_decomp.polygons] - reg_map = [{key: regions[none_region_id] for key in decomp_shapes[d].keys()} for d in range(3)] - for i_frac, f_map in enumerate(maps[1:]): - for id, orig_seg_id in f_map[1].items(): - reg_map[1][id] = regions[frac_id_shift + i_frac] - - for id, orig_poly_id in maps[0][2].items(): - if orig_poly_id == 0: - continue - reg_map[2][id] = regions[box_reg_id] - - for id, orig_seg_id in maps[0][1].items(): - if orig_seg_id == seg_inflow.id: - reg_map[1][id] = regions[bc_inflow_id] - if orig_seg_id == seg_outflow.id: - reg_map[1][id] = regions[bc_outflow_id] - - - return common_decomp, reg_map - - -def fill_lg(decomp, reg_map, regions): - """ - Create LayerGeometry object. - """ - nodes, topology = poly_io.serialize(decomp) - - geom = lg.LayerGeometry() - geom.version - geom.regions = regions - - - - iface_ns = lg.InterfaceNodeSet(dict( - nodeset_id = 0, - interface_id = 0 - )) - layer = lg.FractureLayer(dict( - name = "layer", - top = iface_ns, - polygon_region_ids = [ reg_map[2][poly.id]._id for poly in decomp.polygons.values() ], - segment_region_ids = [ reg_map[1][seg.id]._id for seg in decomp.segments.values() ], - node_region_ids = [ reg_map[0][node.id]._id for node in decomp.points.values() ] - )) - geom.layers = [ layer ] - #geom.surfaces = [ClassFactory(Surface)] - - iface = lg.Interface(dict( - surface_id = None, - elevation = 0.0 - )) - geom.interfaces = [ iface ] - #geom.curves = [ClassFactory(Curve)] - geom.topologies = [ topology ] - - nodeset = lg.NodeSet(dict( - topology_id = 0, - nodes = nodes - )) - geom.node_sets = [ nodeset ] - geomop.layers_io.write_geometry("fractured_2d.json", geom) - return geom - - -def make_mesh(geometry): - return geomop.geometry.make_geometry(geometry=geometry, layers_file="fractured_2d.json", mesh_step=1.0) \ No newline at end of file diff 
--git a/mlmc/sample_storage.py b/mlmc/sample_storage.py index ec05080f..623bd9b3 100644 --- a/mlmc/sample_storage.py +++ b/mlmc/sample_storage.py @@ -1,134 +1,142 @@ import itertools import numpy as np -from abc import ABCMeta -from abc import abstractmethod -from typing import List, Dict +from abc import ABCMeta, abstractmethod +from typing import List, Dict, Any, Generator, Optional, Tuple from mlmc.quantity.quantity_spec import QuantitySpec, ChunkSpec class SampleStorage(metaclass=ABCMeta): """ - Provides methods to store and retrieve sample's data + Provides methods to store and retrieve sample data. + Abstract base class for all storage backends. """ @abstractmethod def save_samples(self, successful_samples, failed_samples): """ - Write results to storage + Write simulation results to storage. + :param successful_samples: Dict[level_id, List[Tuple[sample_id, (fine, coarse)]]] + :param failed_samples: Dict[level_id, List[Tuple[sample_id, error_message]]] """ @abstractmethod def save_result_format(self, res_spec: List[QuantitySpec]): """ - Save result format + Save result format. + :param res_spec: List of quantity specifications describing result structure. """ @abstractmethod def load_result_format(self) -> List[QuantitySpec]: """ - Load result format + Load stored result format. + :return: List[QuantitySpec] """ @abstractmethod def save_global_data(self, result_format: List[QuantitySpec], level_parameters=None): """ - Save global data, at the moment: _result_format, level_parameters + Save global metadata such as result format and level parameters. + :param result_format: List[QuantitySpec] + :param level_parameters: Optional metadata per level """ @abstractmethod def save_scheduled_samples(self, level_id, samples): """ - Save scheduled samples ids + Save scheduled sample identifiers. 
+ :param level_id: int + :param samples: List[str] """ @abstractmethod - def load_scheduled_samples(self): + def load_scheduled_samples(self) -> Dict[int, List[str]]: """ - Load scheduled samples - :return: Dict[_level_id, List[sample_id: str]] + Load scheduled sample IDs. + :return: Dict[level_id, List[sample_id]] """ @abstractmethod def sample_pairs(self): """ - Get results from storage - :return: List[Array[M, N, 2]] + Retrieve all stored fine–coarse result pairs. + :return: List[np.ndarray[M, N, 2]] """ - def chunks(self, level_id=None, n_samples=None): + def chunks(self, level_id: Optional[int] = None, n_samples: Optional[int] = None) -> Generator[ChunkSpec, None, None]: """ - Create chunks generator - :param level_id: int, if not None return chunks for a given level - :param n_samples: int, number of samples to retrieve - :return: generator + Create a generator yielding chunk specifications for collected data. + :param level_id: int, if provided, return chunks only for the given level. + :param n_samples: int, maximum number of samples to retrieve. + :return: generator of ChunkSpec objects. """ - assert isinstance(n_samples, (type(None), int)), "n_samples param must be int" - level_ids = self.get_level_ids() - if level_id is not None: - level_ids = [level_id] - return itertools.chain(*[self._level_chunks(level_id, n_samples) for level_id in level_ids]) # concatenate generators + assert isinstance(n_samples, (type(None), int)), "n_samples must be int or None" + level_ids = [level_id] if level_id is not None else self.get_level_ids() + return itertools.chain(*[self._level_chunks(lid, n_samples) for lid in level_ids]) @abstractmethod def _level_chunks(self, level_id, n_samples=None): """ - Info about chunks of level's collected data + Get chunk information for data collected at a given level. 
+ :param level_id: int + :param n_samples: int :return: generator of ChunkSpec objects """ @abstractmethod def n_finished(self): """ - Number of finished samples - :return: List + Get number of finished samples on each level. + :return: List[int] """ @abstractmethod - def save_n_ops(self, n_ops: Dict[int, List[float]]): + def save_n_ops(self, n_ops: Dict[int, Tuple[float, int]]): """ - Save number of operations (time) - :param n_ops: Dict[_level_id, List[overall time, number of valid samples]] + Save number of operations (time). + :param n_ops: Dict[level_id, Tuple[total_time, n_valid_samples]] """ @abstractmethod def get_n_ops(self): """ - Number of operations (time) per sample for each level + Get number of operations per sample for each level. :return: List[float] """ @abstractmethod def unfinished_ids(self): """ - Get unfinished sample's ids - :return: list + Get IDs of unfinished samples. + :return: List[str] """ @abstractmethod def get_level_ids(self): """ - Get number of levels - :return: int + Get list of available level IDs. + :return: List[int] """ @abstractmethod def get_n_levels(self): """ - Get number of levels + Get total number of levels. :return: int """ @abstractmethod def get_level_parameters(self): """ - Get level parameters - :return: list + Get stored level parameters. + :return: List[Any] """ @abstractmethod def get_n_collected(self): """ - Get number of collected results at each evel - :return: list + Get number of collected results at each level. + :return: List[int] """ diff --git a/mlmc/sample_storage_hdf.py b/mlmc/sample_storage_hdf.py index a53a8c64..5b7e4dbe 100644 --- a/mlmc/sample_storage_hdf.py +++ b/mlmc/sample_storage_hdf.py @@ -8,33 +8,39 @@ class SampleStorageHDF(SampleStorage): """ - Sample's data are stored in a HDF5 file + Store and manage sample data in an HDF5 file. + + This implementation of the SampleStorage interface provides efficient + persistent storage for MLMC simulation results using HDF5. 
""" def __init__(self, file_path): """ - HDF5 storage, provide method to interact with storage - :param file_path: absolute path to hdf file (which not exists at the moment) + Initialize the HDF5 storage and create or load the file structure. + + :param file_path: Absolute path to the HDF5 file. + If the file exists, it will be loaded instead of created. """ super().__init__() - # If file exists load not create new file - load_from_file = True if os.path.exists(file_path) else False + load_from_file = os.path.exists(file_path) # HDF5 interface self._hdf_object = hdf.HDF5(file_path=file_path, load_from_file=load_from_file) self._level_groups = [] - # 'Load' level groups + # Load existing level groups if file already contains data if load_from_file: - # Create level group for each level if len(self._level_groups) != len(self._hdf_object.level_parameters): for i_level in range(len(self._hdf_object.level_parameters)): self._level_groups.append(self._hdf_object.add_level_group(str(i_level))) def _hdf_result_format(self, locations, times): """ - QuantitySpec data type, necessary for hdf storage - :return: + Construct an appropriate dtype for QuantitySpec data representation in HDF5. + + :param locations: List of spatial locations (as coordinates or identifiers). + :param times: List of time steps. + :return: Numpy dtype describing the QuantitySpec data structure. 
""" if len(locations[0]) == 3: tuple_dtype = np.dtype((float, (3,))) @@ -42,41 +48,42 @@ def _hdf_result_format(self, locations, times): else: loc_dtype = np.dtype(('S50', (len(locations),))) - result_dtype = {'names': ('name', 'unit', 'shape', 'times', 'locations'), - 'formats': ('S50', - 'S50', - np.dtype((np.int32, (2,))), - np.dtype((float, (len(times),))), - loc_dtype - ) - } + result_dtype = { + 'names': ('name', 'unit', 'shape', 'times', 'locations'), + 'formats': ( + 'S50', + 'S50', + np.dtype((np.int32, (2,))), + np.dtype((float, (len(times),))), + loc_dtype + ) + } return result_dtype def save_global_data(self, level_parameters: List[float], result_format: List[QuantitySpec]): """ - Save hdf5 file global attributes - :param level_parameters: list of simulation steps - :param result_format: simulation result format + Save HDF5 global attributes including simulation parameters and result format. + + :param level_parameters: List of simulation level parameters (e.g., mesh sizes). + :param result_format: List of QuantitySpec objects describing result quantities. :return: None """ res_dtype = self._hdf_result_format(result_format[0].locations, result_format[0].times) - - # Create file structure self._hdf_object.create_file_structure(level_parameters) - # Create group for each level + # Create HDF5 groups for each simulation level if len(self._level_groups) != len(level_parameters): for i_level in range(len(level_parameters)): self._level_groups.append(self._hdf_object.add_level_group(str(i_level))) - # Save result format (QuantitySpec) self.save_result_format(result_format, res_dtype) def load_scheduled_samples(self): """ - Get scheduled samples for each level - :return: Dict[level_id, List[sample_id: str]] + Load scheduled samples from storage. 
+ + :return: Dict[level_id, List[sample_id: str]] """ scheduled = {} for level in self._level_groups: @@ -85,79 +92,116 @@ def load_scheduled_samples(self): def save_result_format(self, result_format: List[QuantitySpec], res_dtype): """ - Save result format to hdf - :param result_format: List[QuantitySpec] + Save result format metadata to HDF5. + + :param result_format: List of QuantitySpec objects defining stored quantities. + :param res_dtype: Numpy dtype for structured storage. :return: None """ try: if self.load_result_format() != result_format: - raise ValueError('You are setting a new different result format for an existing sample storage') + raise ValueError( + "Attempting to overwrite an existing result format with a new incompatible one." + ) except AttributeError: pass + self._hdf_object.save_result_format(result_format, res_dtype) def load_result_format(self) -> List[QuantitySpec]: """ - Load result format + Load and reconstruct the result format from HDF5. + + :return: List of QuantitySpec objects. """ results_format = self._hdf_object.load_result_format() quantities = [] for res_format in results_format: - spec = QuantitySpec(res_format[0].decode(), res_format[1].decode(), res_format[2], res_format[3], - [loc.decode() for loc in res_format[4]]) - + spec = QuantitySpec( + res_format[0].decode(), + res_format[1].decode(), + res_format[2], + res_format[3], + [loc.decode() for loc in res_format[4]] + ) quantities.append(spec) - return quantities def save_samples(self, successful, failed): """ - Save successful and failed samples - :param successful: List[Tuple[sample_id: str, Tuple[ndarray, ndarray]]] - :param failed: List[Tuple[sample_id: str, error_message: str]] + Save successful and failed samples to the HDF5 storage. 
+ + :param successful: Dict[level_id, List[Tuple[sample_id: str, (fine, coarse)]]] + :param failed: Dict[level_id, List[Tuple[sample_id: str, error_message: str]]] :return: None """ - self._save_succesful(successful) + self._save_successful(successful) self._save_failed(failed) - def _save_succesful(self, successful_samples): + def _save_successful(self, successful_samples): + """ + Append successful sample results to the appropriate level group. + + :param successful_samples: Dict[level_id, List[Tuple[sample_id, (fine, coarse)]]] + :return: None + """ for level, samples in successful_samples.items(): if len(samples) > 0: self._level_groups[level].append_successful(np.array(samples, dtype=object)) def _save_failed(self, failed_samples): + """ + Append failed sample identifiers and messages. + + :param failed_samples: Dict[level_id, List[Tuple[sample_id, error_message]]] + :return: None + """ for level, samples in failed_samples.items(): if len(samples) > 0: self._level_groups[level].append_failed(samples) def save_scheduled_samples(self, level_id, samples: List[str]): """ - Append scheduled samples - :param level_id: int - :param samples: list of sample identifiers + Append scheduled sample identifiers for a specific level. + + :param level_id: Integer level identifier. + :param samples: List of sample identifiers. :return: None """ self._level_groups[level_id].append_scheduled(samples) def _level_chunks(self, level_id, n_samples=None): + """ + Generate chunk specifications for a given level. + + :param level_id: Level identifier. + :param n_samples: Optional number of samples to include per chunk. + :return: Generator of ChunkSpec objects. + """ return self._level_groups[level_id].chunks(n_samples) def sample_pairs(self): """ - Load results from hdf file - :return: List[Array[M, N, 2]] + Retrieve all sample pairs from storage. + + :return: List[np.ndarray[M, N, 2]] where M = number of results, N = number of samples. 
""" if len(self._level_groups) == 0: - raise Exception("self._level_groups shouldn't be empty, save_global_data() method should have set it, " - "that method is always called from mlmc.sampler.Sampler constructor." - " In other cases, call save_global_data() directly") + raise Exception( + "Level groups are not initialized. " + "Ensure save_global_data() is called before using SampleStorageHDF." + ) levels_results = list(np.empty(len(self._level_groups))) for level in self._level_groups: - chunk_spec = next(self.chunks(level_id=int(level.level_id), - n_samples=self.get_n_collected()[int(level.level_id)])) - results = self.sample_pairs_level(chunk_spec) # return all samples no chunks + chunk_spec = next( + self.chunks( + level_id=int(level.level_id), + n_samples=self.get_n_collected()[int(level.level_id)] + ) + ) + results = self.sample_pairs_level(chunk_spec) if results is None or len(results) == 0: levels_results[int(level.level_id)] = [] continue @@ -166,13 +210,12 @@ def sample_pairs(self): def sample_pairs_level(self, chunk_spec): """ - Get result for particular level and chunk - :param chunk_spec: object containing chunk identifier level identifier and chunk_slice - slice() object - :return: np.ndarray + Retrieve samples for a specific level and chunk. + + :param chunk_spec: ChunkSpec containing level ID and slice information. + :return: np.ndarray of shape [M, chunk size, 2]. """ - level_id = chunk_spec.level_id - if chunk_spec.level_id is None: - level_id = 0 + level_id = chunk_spec.level_id or 0 chunk = self._level_groups[int(level_id)].collected(chunk_spec.chunk_slice) # Remove auxiliary zeros from level zero sample pairs @@ -183,31 +226,31 @@ def sample_pairs_level(self, chunk_spec): def n_finished(self): """ - Number of finished samples on each level - :return: List[int] + Count the number of finished samples for each level. + + :return: np.ndarray[int] containing finished sample counts per level. 
""" n_finished = np.zeros(len(self._level_groups)) for level in self._level_groups: n_finished[int(level.level_id)] += len(level.get_finished_ids()) - return n_finished def unfinished_ids(self): """ - List of unfinished ids - :return: list + Return identifiers of all unfinished samples. + + :return: List[str] """ unfinished = [] - for level in self._level_groups: unfinished.extend(level.get_unfinished_ids()) - return unfinished def failed_samples(self): """ - Dictionary of failed samples - :return: dict + Return dictionary of failed samples for each level. + + :return: Dict[str, List[str]] """ failed_samples = {} for level in self._level_groups: @@ -215,13 +258,17 @@ def failed_samples(self): return failed_samples def clear_failed(self): + """ + Clear all failed sample records from storage. + """ for level in self._level_groups: level.clear_failed_dataset() def save_n_ops(self, n_ops): """ - Save number of operations (time) of samples - :param n_ops: Dict[level_id, List[overall time, number of successful samples]] + Save the estimated number of operations (e.g., runtime) for each level. + + :param n_ops: Dict[level_id, List[total_time, num_successful_samples]] :return: None """ for level_id, (time, n_samples) in n_ops: @@ -236,8 +283,9 @@ def save_n_ops(self, n_ops): def get_n_ops(self): """ - Get number of estimated operations on each level - :return: List + Get the average number of operations per sample for each level. + + :return: List[float] """ n_ops = list(np.zeros(len(self._level_groups))) for level in self._level_groups: @@ -248,15 +296,26 @@ def get_n_ops(self): return n_ops def get_level_ids(self): + """ + Get identifiers of all levels stored in HDF5. + + :return: List[int] + """ return [int(level.level_id) for level in self._level_groups] def get_level_parameters(self): + """ + Load stored level parameters (e.g., step sizes or resolutions). 
+ + :return: List[float] + """ return self._hdf_object.load_level_parameters() def get_n_collected(self): """ - Get number of collected samples at each level - :return: List + Get the number of collected (stored) samples for each level. + + :return: List[int] """ n_collected = list(np.zeros(len(self._level_groups))) for level in self._level_groups: @@ -265,7 +324,8 @@ def get_n_collected(self): def get_n_levels(self): """ - Get number of levels + Get total number of levels present in storage. + :return: int """ return len(self._level_groups) diff --git a/mlmc/sampler.py b/mlmc/sampler.py index 17b7dce0..d1a39f29 100644 --- a/mlmc/sampler.py +++ b/mlmc/sampler.py @@ -8,7 +8,12 @@ class Sampler: """ - Manages samples scheduling, results collection, and result storage. + Manages sample scheduling, result collection, and persistent storage. + + Coordinates the sampling pool, simulation factory, and sample storage: + - schedules new samples according to target counts, + - collects finished samples and writes them to storage, + - handles failed samples and runtime (n_ops) bookkeeping. """ ADDING_SAMPLES_TIMEOUT = 1e-15 @@ -16,64 +21,74 @@ class Sampler: def __init__(self, sample_storage: SampleStorage, sampling_pool: SamplingPool, sim_factory: Simulation, level_parameters: List[List[float]], seed=1234): """ + Initialize sampler and prepare per-level simulation objects. + :param sample_storage: store scheduled samples, results and result structure - :param sampling_pool: calculate samples - :param sim_factory: generate samples - :param level_parameters: List of e.g. simulation steps, ... - :param seed: global random seed + :param sampling_pool: sampling pool responsible for executing simulations + :param sim_factory: factory that creates level Simulation instances and provides result_format() + :param level_parameters: List of per-level parameters (e.g. 
simulation steps) + :param seed: global RNG seed used to seed NumPy's RNG """ np.random.seed(seed) self.sample_storage = sample_storage self._sampling_pool = sampling_pool + # Target number of samples per level (may be updated later) self._n_target_samples = np.zeros(len(level_parameters)) - # Number of target samples + + # Create LevelSimulation objects for each level using the provided factory self._level_sim_objects = [] self._create_level_sim_objects(level_parameters, sim_factory) + # Persist global data (level parameters and result format) into storage sample_storage.save_global_data(level_parameters=level_parameters, result_format=sim_factory.result_format()) + # Load already scheduled samples (if any) from storage self._n_scheduled_samples = [len(level_scheduled) for level_id, level_scheduled in sample_storage.load_scheduled_samples().items()] - # Number of created samples + # If there are no scheduled samples yet, initialize to zeros if not self._n_scheduled_samples: self._n_scheduled_samples = np.zeros(len(level_parameters)) - # Are there any unfinished samples which have already finished? + # Check for unfinished samples and inform the sampling pool self._check_failed_samples() - # @TODO: get unfinished samples from sampler and call have permanent samples -> add results to pool's queues, - # before scheduled samples call, call get_finished - we need to know how many samples is finished + # @TODO: If sampler is restarted, collect any samples finished while offline: + # - add permanent samples into pool queues, + # - before scheduling new samples, call get_finished to know how many are already done. @property def n_levels(self): + """Return number of MLMC levels managed by this sampler.""" return len(self._level_sim_objects) @property def n_finished_samples(self): """ - Retrieve number of all finished samples - :return: + Retrieve numbers of finished samples for all levels. 
+ + :return: array-like containing finished counts per level """ return self.sample_storage.n_finished() def _create_level_sim_objects(self, level_parameters, sim_factory): """ - Create LevelSimulation object for each level, use simulation factory - :param: level_parameters: List, simulation steps, ... - :param: sim_factory: Simulation instance + Create LevelSimulation object for each level via the simulation factory. + + :param level_parameters: List of per-level parameters + :param sim_factory: Simulation factory providing level_instance and calculate methods :return: None """ n_levels = len(level_parameters) for level_id in range(n_levels): if level_id == 0: level_sim = sim_factory.level_instance(level_parameters[level_id], [0]) - else: level_sim = sim_factory.level_instance(level_parameters[level_id], level_parameters[level_id - 1]) + # Attach factory methods and metadata to the LevelSimulation level_sim._calculate = sim_factory.calculate level_sim._result_format = sim_factory.result_format level_sim._level_id = level_id @@ -81,30 +96,37 @@ def _create_level_sim_objects(self, level_parameters, sim_factory): def sample_range(self, n0, nL): """ - Geometric sequence of L elements decreasing from n0 to nL. - Useful to set number of samples explicitly. - :param n0: int - :param nL: int - :return: np.array of length L = n_levels. + Generate a geometric sequence of length L decreasing from n0 to nL. + + Useful to generate a set of target sample counts across levels. + + :param n0: int, number of samples at finest level + :param nL: int, number of samples at coarsest level + :return: np.ndarray of length self.n_levels with integer sample counts """ return np.round(np.exp2(np.linspace(np.log2(n0), np.log2(nL), self.n_levels))).astype(int) def set_initial_n_samples(self, n_samples=None): """ - Set target number of samples for each level - :param n_samples: array of number of samples + Set initial target number of samples for each level. 
+ + Accepts: + - None (defaults to [100, 10]), + - single integer (interpreted as n0, with default nL=10), + - two-element list [n0, nL] (geometric interpolation across levels). + + :param n_samples: scalar, length-2 list, or array specifying target counts :return: None """ if n_samples is None: n_samples = [100, 10] - # Num of samples to ndarray n_samples = np.atleast_1d(n_samples) - # Just maximal number of samples is set + # Single value -> treat as n0 with default nL if len(n_samples) == 1: n_samples = np.array([n_samples[0], 10]) - # Create number of samples for all levels + # Two values -> create geometric progression across levels if len(n_samples) == 2: n0, nL = n_samples n_samples = self.sample_range(n0, nL) @@ -113,75 +135,81 @@ def set_initial_n_samples(self, n_samples=None): def _get_sample_tag(self, level_id): """ - Create sample tag + Create a unique sample tag for a given level. + :param level_id: identifier of current level - :return: str + :return: str unique sample tag (e.g. 'L00_S0000123') """ return "L{:02d}_S{:07d}".format(level_id, int(self._n_scheduled_samples[level_id])) def schedule_samples(self, timeout=None, level_id=None, n_samples=None): """ - Create simulation samples, loop through "levels" and its samples (given the number of target samples): - 1) generate sample tag (same for fine and coarse simulation) - 2) get LevelSimulation instance by simulation factory - 3) schedule sample via sampling pool - 4) store scheduled samples in sample storage, separately for each level - :param timeout: int, get_finished - while break timeout in seconds + Schedule new simulation samples in the sampling pool and record them in storage. + + For each scheduled sample: + 1) generate a unique sample id shared by fine and coarse tasks, + 2) obtain the LevelSimulation instance for the level, + 3) schedule the sample with SamplingPool, + 4) store scheduled sample ids in SampleStorage. 
+ + :param timeout: float or None, passed to ask_sampling_pool_for_samples() before scheduling + :param level_id: int or None, if provided schedule only for this level (default: highest level) + :param n_samples: int or None, if provided schedule exactly this many samples for the specified level :return: None """ + # First, collect any finished samples self.ask_sampling_pool_for_samples(timeout=timeout) plan_samples = self._n_target_samples - self._n_scheduled_samples + # Default to the coarsest level if not specified if level_id is None: level_id = len(plan_samples) - 1 + + # If a specific number of samples for one level is requested if n_samples is not None: samples = [] for _ in range(int(n_samples)): - # Unique sample id sample_id = self._get_sample_tag(level_id) level_sim = self._level_sim_objects[level_id] - # Schedule current sample self._sampling_pool.schedule_sample(sample_id, level_sim) - # Increment number of created samples at current level self._n_scheduled_samples[level_id] += 1 samples.append(sample_id) - # Store scheduled samples self.sample_storage.save_scheduled_samples(level_id, samples) else: + # Iterate levels from coarsest to finest and schedule required samples for n_samples in np.flip(plan_samples): samples = [] for _ in range(int(n_samples)): - # Unique sample id sample_id = self._get_sample_tag(level_id) level_sim = self._level_sim_objects[level_id] - # Schedule current sample self._sampling_pool.schedule_sample(sample_id, level_sim) - # Increment number of created samples at current level self._n_scheduled_samples[level_id] += 1 - samples.append(sample_id) - # Store scheduled samples self.sample_storage.save_scheduled_samples(level_id, samples) level_id -= 1 def _check_failed_samples(self): """ - Get unfinished samples and check if failed samples have saved results then collect them - :return: + Query storage for unfinished sample IDs and inform the sampling pool. 
+
+        This allows the sampling pool to reattach or handle 'permanent' samples
+        that may have been started previously.
+        :return: None
        """
        unfinished_sample_ids = self.sample_storage.unfinished_ids()
        self._sampling_pool.have_permanent_samples(unfinished_sample_ids)

    def ask_sampling_pool_for_samples(self, sleep=0, timeout=None):
        """
-        Waiting for running simulations
-        :param sleep: time for doing nothing
-        :param timeout: maximum time for waiting on running simulations
-        :return: int, number of running simulations
+        Poll the sampling pool for finished simulations and store their results.
+
+        :param sleep: float, time to sleep between polls (seconds)
+        :param timeout: float or None, maximum time to wait; None or <= 0 means no time limit (wait until nothing is running)
+        :return: int, number of running simulations remaining after the call
        """
        if timeout is None:
            timeout = 0
@@ -192,7 +220,7 @@ def ask_sampling_pool_for_samples(self, sleep=0, timeout=None):
        t0 = time.perf_counter()
        while n_running > 0:
            successful_samples, failed_samples, n_running, n_ops = self._sampling_pool.get_finished()
-            # Store finished samples
+            # Persist finished samples and operation counts
            self._store_samples(successful_samples, failed_samples, n_ops)
            time.sleep(sleep)
            if 0 < timeout < (time.perf_counter() - t0):
@@ -202,10 +230,11 @@ def ask_sampling_pool_for_samples(self, sleep=0, timeout=None):

    def _store_samples(self, successful_samples, failed_samples, n_ops):
        """
-        Store finished samples
-        :param successful_samples: Dict[level_id, List[Tuple[sample_id:str, Tuple[ndarray, ndarray]]]]
-        :param failed_samples: Dict[level_id, List[Tuple[sample_id: str, error message: str]]]
-        :param n_ops: Dict[level_id: int, List[total time: float, number of success samples: int]]
+        Persist finished samples and operation time estimates to storage.
+ + :param successful_samples: Dict[level_id, List[Tuple[sample_id:str, (fine, coarse)]]] + :param failed_samples: Dict[level_id, List[Tuple[sample_id:str, error_message:str]]] + :param n_ops: Dict[level_id, Tuple[total_time:float, n_success_samples:int]] :return: None """ self.sample_storage.save_samples(successful_samples, failed_samples) @@ -213,24 +242,24 @@ def _store_samples(self, successful_samples, failed_samples, n_ops): def process_adding_samples(self, n_estimated, sleep=0, add_coeff=0.1, timeout=ADDING_SAMPLES_TIMEOUT): """ - Process adding samples - Note: n_estimated are wrong if n_ops is similar through all levels - :param n_estimated: Number of estimated samples on each level, list - :param sleep: Sample waiting time - :param add_coeff: default value 0.1, The number of scheduled samples would be 'add_coef' fraction of difference - between current number of target samples and new estimated number of target samples - :param timeout: ask sampling pool for finished samples timeout - :return: bool, if True adding samples is complete + Add newly estimated samples in batches, scheduling a fraction of the difference + between current scheduled and newly estimated targets. + + Note: n_estimated may be unreliable if per-level n_ops are similar across levels. 
+ + :param n_estimated: array-like, estimated target samples per level + :param sleep: float, time to sleep while waiting for results + :param add_coeff: float in (0,1], fraction of the difference to schedule each iteration (default 0.1) + :param timeout: float, timeout passed to ask_sampling_pool_for_samples() + :return: bool, True if scheduled counts reached the estimates for all levels """ + # Ensure storage reflects any finished work self.ask_sampling_pool_for_samples(timeout=timeout) - # Get default scheduled samples + # Currently scheduled samples per level n_scheduled = self.l_scheduled_samples() - # New scheduled sample will be 10 percent of difference - # between current number of target samples and new estimated one - # If 10 percent of estimated samples is greater than difference between estimated and scheduled samples, - # set scheduled samples to estimated samples + # Compute new scheduled values (add_coeff fraction of the remaining difference) new_scheduled = np.where((n_estimated * add_coeff) > (n_estimated - n_scheduled), n_estimated, n_scheduled + (n_estimated - n_scheduled) * add_coeff) @@ -239,41 +268,43 @@ def process_adding_samples(self, n_estimated, sleep=0, add_coeff=0.1, timeout=AD n_scheduled, new_scheduled)) - # Levels where estimated are greater than scheduled + # Levels where estimated > scheduled greater_items = np.where(np.greater(n_estimated, n_scheduled))[0] - # Scheduled samples and wait until at least half of the samples are done + # Schedule and wait until at least a fraction of newly scheduled samples finish self.set_scheduled_and_wait(n_scheduled, greater_items, sleep, timeout=timeout) return np.all(n_estimated[greater_items] == n_scheduled[greater_items]) def set_scheduled_and_wait(self, n_scheduled, greater_items, sleep, fin_sample_coef=0.5, timeout=1e-7): """ - Scheduled samples on each level and wait until at least half of the samples is done - :param n_scheduled: ndarray, number of scheduled samples on each level - :param 
greater_items: Items where n_estimated is greater than n_scheduled - :param sleep: Time waiting for samples - :param fin_sample_coef: The proportion of samples to finished for further estimate + Set scheduled sample targets and wait until a proportion of those samples finish. + + :param n_scheduled: ndarray, target number of scheduled samples per level + :param greater_items: iterable of indices where targets were increased + :param sleep: float, time to sleep between polls + :param fin_sample_coef: float in (0,1], fraction of scheduled samples that should finish before continuing + :param timeout: float, timeout passed to ask_sampling_pool_for_samples() :return: None """ - # Set scheduled samples and run simulations + # Update internal targets and schedule required samples self.set_level_target_n_samples(n_scheduled) self.schedule_samples(timeout=timeout) - # Finished level samples + # Current finished counts n_finished = self.n_finished_samples - # Wait until at least half of the scheduled samples are done on each level + # Wait until at least fin_sample_coef fraction of scheduled samples are finished for affected levels while np.any(n_finished[greater_items] < fin_sample_coef * n_scheduled[greater_items]): - # Wait a while time.sleep(sleep) self.ask_sampling_pool_for_samples(timeout=timeout) n_finished = self.n_finished_samples def set_level_target_n_samples(self, n_samples): """ - Set level number of target samples - :param n_samples: list, each level target samples + Update the per-level target sample counts to at least the provided values. + + :param n_samples: iterable of new target samples per level :return: None """ for level, n in enumerate(n_samples): @@ -281,14 +312,18 @@ def set_level_target_n_samples(self, n_samples): def l_scheduled_samples(self): """ - Get all levels number of scheduled samples - :return: list + Return the currently scheduled sample counts per level. 
+ + :return: list or array-like of scheduled sample counts """ return self._n_scheduled_samples def renew_failed_samples(self): """ - Resurrect failed samples + Reschedule previously failed samples. + + Retrieves failed sample IDs from storage, re-schedules them in the sampling pool, + and clears failed records from storage. :return: None """ failed_samples = self.sample_storage.failed_samples() @@ -298,8 +333,8 @@ def renew_failed_samples(self): level_id = int(level_id) for sample_id in sample_ids: level_sim = self._level_sim_objects[level_id] - # Schedule current sample self._sampling_pool.schedule_sample(sample_id, level_sim) samples.append(sample_id) + # Clear failed sample records after rescheduling self.sample_storage.clear_failed() diff --git a/mlmc/sampling_pool.py b/mlmc/sampling_pool.py index cbbbb360..9044246d 100644 --- a/mlmc/sampling_pool.py +++ b/mlmc/sampling_pool.py @@ -5,7 +5,7 @@ import time import hashlib import numpy as np -from typing import List +from typing import List, Tuple, Dict, Optional, Any import traceback from abc import ABC, abstractmethod from multiprocessing import Pool as ProcPool @@ -15,18 +15,22 @@ class SamplingPool(ABC): """ - Determining the runtime environment of samples, eg single process, multiple processes, running PBS, ... + Abstract base class defining the runtime environment for sample simulations. + It manages sample execution across different backends (single process, + multiprocessing, PBS, etc.). """ FAILED_DIR = 'failed' SEVERAL_SUCCESSFUL_DIR = 'several_successful' N_SUCCESSFUL = 5 - # Number of successful samples to store + # Number of successful samples to store. - def __init__(self, work_dir=None, debug=False): + def __init__(self, work_dir: Optional[str] = None, debug: bool = False): """ - :param work_dir: Path to working directory - :param debug: bool, if True keep sample directories + Initialize the sampling pool environment. + + :param work_dir: Path to the working directory where outputs are stored. 
+ :param debug: If True, keep sample directories for debugging. """ self._output_dir = None if work_dir is not None: @@ -34,14 +38,16 @@ def __init__(self, work_dir=None, debug=False): self._output_dir = os.path.join(work_dir, "output") self._debug = debug - self._create_dir() # prepare output dir - self._create_dir(SamplingPool.FAILED_DIR) # prepare failed dir - self._successful_dir = self._create_dir(SamplingPool.SEVERAL_SUCCESSFUL_DIR) # prepare several successful dir + # Prepare main output, failed, and successful directories. + self._create_dir() + self._create_dir(SamplingPool.FAILED_DIR) + self._successful_dir = self._create_dir(SamplingPool.SEVERAL_SUCCESSFUL_DIR) - def _create_dir(self, directory=""): + def _create_dir(self, directory: str = "") -> Optional[str]: """ - Create output directory, in 'debug' mode not remove existing output_dir - :return: None + Create the output directory if it does not exist. + + In debug mode, existing directories are preserved. """ if self._output_dir is not None: directory = os.path.join(self._output_dir, directory) @@ -49,289 +55,446 @@ def _create_dir(self, directory=""): shutil.rmtree(directory) os.makedirs(directory, mode=0o775, exist_ok=True) return directory + return None + + # --- Abstract methods to be implemented by subclasses --- @abstractmethod - def schedule_sample(self, sample_id, level_sim: LevelSimulation): + def schedule_sample(self, sample_id: str, level_sim: LevelSimulation): """ - Method for calculating simulation samples - :param sample_id: str - :param level_sim: level_simulation.LevelSimulation instance + Schedule a simulation sample for execution. + + :param sample_id: Unique sample identifier. + :param level_sim: LevelSimulation instance. 
:return: Tuple[str, List] """ @abstractmethod - def have_permanent_samples(self, sample_ids): + def have_permanent_samples(self, sample_ids: List[str]) -> bool: """ - Informs the Pool about sample_ids that have been scheduled but not yet finished + Inform the pool about samples that have been scheduled but not yet finished. """ @abstractmethod def get_finished(self): """ - Return finished samples - :return: list of results, number of running samples + Retrieve finished sample results. + + :return: Tuple containing (successful samples, failed samples, number of running samples) """ + # --- Utility methods shared across subclasses --- + @staticmethod - def compute_seed(sample_id): + def compute_seed(sample_id: str) -> int: """ - Calculate seed for given sample id - :param sample_id: str - :return: int + Compute a deterministic seed for a given sample ID. + + :param sample_id: Unique sample identifier. + :return: Integer seed value. """ - hash = hashlib.md5(sample_id.encode('ascii')) - seed = np.frombuffer(hash.digest(), dtype='uint32')[0] - return seed + hash_val = hashlib.md5(sample_id.encode('ascii')) + seed = np.frombuffer(hash_val.digest(), dtype='uint32')[0] + return int(seed) @staticmethod - def calculate_sample(sample_id, level_sim, work_dir=None, seed=None): + def calculate_sample(sample_id: str, level_sim: LevelSimulation, + work_dir: Optional[str] = None, + seed: Optional[int] = None) -> Tuple[str, Any, str, float]: """ - Method for calculating results - :param sample_id: str - :param level_sim: LevelSimulation - :param work_dir: working directory - :param seed: random seed - :return: sample id, sample result, error message with traceback, running time + Execute a single simulation sample. + + :param sample_id: Sample identifier. + :param level_sim: LevelSimulation instance. + :param work_dir: Working directory for the sample. + :param seed: Optional random seed (generated if not provided). 
+ :return: Tuple(sample_id, result, error_message, running_time) """ if seed is None: seed = SamplingPool.compute_seed(sample_id) + res = (None, None) err_msg = "" - running_time = 0 + running_time = 0.0 if level_sim.need_sample_workspace: SamplingPool.handle_sim_files(work_dir, sample_id, level_sim) + try: start = time.time() res = level_sim._calculate(level_sim.config_dict, seed) running_time = time.time() - start - # Check result format - if type(res[0]) is np.ndarray and type(res[1]) is np.ndarray: + # Validate result format. + if isinstance(res[0], np.ndarray) and isinstance(res[1], np.ndarray): flatten_fine_res = res[0].flatten() flatten_coarse_res = res[1].flatten() - res_expected_len = np.sum( - [np.prod(quantity_spec.shape) * len(quantity_spec.times) * len(quantity_spec.locations) - for quantity_spec in level_sim._result_format()]) + expected_len = np.sum([ + np.prod(q.shape) * len(q.times) * len(q.locations) + for q in level_sim._result_format() + ]) - assert len(flatten_fine_res) == len(flatten_coarse_res) == res_expected_len, \ - "Unexpected result format, expected length: {}, resultf length: {}".format(res_expected_len, - len(flatten_fine_res)) + assert len(flatten_fine_res) == len(flatten_coarse_res) == expected_len, \ + f"Unexpected result format. Expected length: {expected_len}, got: {len(flatten_fine_res)}" except Exception: - str_list = traceback.format_exception(*sys.exc_info()) - err_msg = "".join(str_list) - print("Error msg: ", err_msg) + err_msg = "".join(traceback.format_exception(*sys.exc_info())) + print("Error msg:", err_msg) return sample_id, res, err_msg, running_time + # --- File handling helpers --- + @staticmethod - def change_to_sample_directory(work_dir, path: str): + def change_to_sample_directory(work_dir: str, path: str) -> str: """ - Create sample directory and change working directory - :param path: str - :return: None + Create and switch to the sample-specific directory. + + :param work_dir: Base working directory. 
+ :param path: Sample subdirectory name. + :return: Absolute path to the created sample directory. """ sample_dir = os.path.join(work_dir, path) - if not os.path.isdir(sample_dir): - os.makedirs(sample_dir, mode=0o775, exist_ok=True) + os.makedirs(sample_dir, mode=0o775, exist_ok=True) return sample_dir @staticmethod - def copy_sim_files(files: List[str], sample_dir): + def copy_sim_files(files: List[str], sample_dir: str): """ - Copy simulation common files to current simulation sample directory - :param files: List of files - :return: None + Copy shared simulation files to the sample directory. + + :param files: List of file paths to copy. + :param sample_dir: Destination sample directory. """ for file in files: shutil.copy(file, sample_dir) @staticmethod - def handle_sim_files(work_dir, sample_id, level_sim): + def handle_sim_files(work_dir: str, sample_id: str, level_sim: LevelSimulation): """ - Change working directory to sample dir and copy common files - :param sample_id: str - :param level_sim: LevelSimulation - :return: None + Prepare the sample workspace (create directory, copy common files, set cwd). + + :param work_dir: Base working directory. + :param sample_id: Sample identifier. + :param level_sim: LevelSimulation instance. """ if level_sim.need_sample_workspace: sample_dir = SamplingPool.change_to_sample_directory(work_dir, sample_id) - if level_sim.common_files is not None: SamplingPool.copy_sim_files(level_sim.common_files, sample_dir) os.chdir(sample_dir) @staticmethod - def move_successful_rm(sample_id, level_sim, output_dir, dest_dir): + def move_successful_rm(sample_id: str, level_sim: LevelSimulation, + output_dir: str, dest_dir: str): + """ + Move successful sample directories and remove originals. 
+ """ if int(sample_id[-7:]) < SamplingPool.N_SUCCESSFUL: - SamplingPool.move_dir(sample_id, level_sim.need_sample_workspace, output_dir, dest_dir=dest_dir) + SamplingPool.move_dir(sample_id, level_sim.need_sample_workspace, output_dir, dest_dir) SamplingPool.remove_sample_dir(sample_id, level_sim.need_sample_workspace, output_dir) @staticmethod - def move_failed_rm(sample_id, level_sim, output_dir, dest_dir): - SamplingPool.move_dir(sample_id, level_sim.need_sample_workspace, output_dir, dest_dir=dest_dir) + def move_failed_rm(sample_id: str, level_sim: LevelSimulation, + output_dir: str, dest_dir: str): + """ + Move failed sample directories and remove originals. + """ + SamplingPool.move_dir(sample_id, level_sim.need_sample_workspace, output_dir, dest_dir) SamplingPool.remove_sample_dir(sample_id, level_sim.need_sample_workspace, output_dir) @staticmethod - def move_dir(sample_id, sample_workspace, work_dir, dest_dir): + def move_dir(sample_id: str, sample_workspace: bool, + work_dir: str, dest_dir: str): """ - Move failed sample dir to failed directory - :param sample_id: str - :param sample_workspace: bool, simulation needs workspace - :param work_dir: str - :param dest_dir: destination - :return: None + Move a sample directory to another location (e.g., failed or successful). + + :param sample_id: Sample identifier. + :param sample_workspace: Whether the sample uses its own workspace. + :param work_dir: Base working directory. + :param dest_dir: Destination subdirectory name. 
""" - if sample_workspace and work_dir is not None and dest_dir is not None: + if sample_workspace and work_dir and dest_dir: destination_dir = os.path.join(work_dir, dest_dir) sample_dir = SamplingPool.change_to_sample_directory(work_dir, sample_id) - if os.path.exists(os.path.join(destination_dir, sample_id)): - shutil.rmtree(os.path.join(destination_dir, sample_id), ignore_errors=True) - shutil.copytree(sample_dir, os.path.join(destination_dir, sample_id)) + target_dir = os.path.join(destination_dir, sample_id) + if os.path.exists(target_dir): + shutil.rmtree(target_dir, ignore_errors=True) + shutil.copytree(sample_dir, target_dir) @staticmethod - def remove_sample_dir(sample_id, sample_workspace, work_dir): + def remove_sample_dir(sample_id: str, sample_workspace: bool, work_dir: str): """ - Remove sample directory - :param sample_id: str - :param sample_workspace: bool, simulation needs workspace - :param work_dir: str - :return: None + Remove the directory for a completed or failed sample. + + :param sample_id: Sample identifier. + :param sample_workspace: Whether the sample uses its own workspace. + :param work_dir: Base working directory. """ - if sample_workspace and work_dir is not None: + if sample_workspace and work_dir: sample_dir = SamplingPool.change_to_sample_directory(work_dir, sample_id) shutil.rmtree(sample_dir, ignore_errors=True) class OneProcessPool(SamplingPool): + """ + Sampling pool implementation that executes all samples sequentially in a single process. + Used primarily for debugging or lightweight simulations. + """ def __init__(self, work_dir=None, debug=False): """ - Everything is running in one process + Initialize the one-process pool. + + Parameters + ---------- + work_dir : str, optional + Working directory for storing sample outputs. + debug : bool, default=False + If True, disables moving/removing files after successful execution. 
""" super().__init__(work_dir=work_dir, debug=debug) - self._failed_queues = {} - self._queues = {} - self._n_running = 0 - self.times = {} + self._failed_queues = {} # Stores failed sample queues per level + self._queues = {} # Stores successful sample queues per level + self._n_running = 0 # Tracks number of currently running samples + self.times = {} # Stores total runtime and count per level def schedule_sample(self, sample_id, level_sim): - self._n_running += 1 + """ + Execute a single sample synchronously (in the current process). + + Parameters + ---------- + sample_id : int + Identifier of the sample. + level_sim : LevelSimulation + Simulation instance containing configuration for the sample. + """ + self._n_running += 1 # Increment running sample counter + # Set output directory if required by simulation if self._output_dir is None and level_sim.need_sample_workspace: self._output_dir = os.getcwd() - sample_id, result, err_msg, running_time = SamplingPool.calculate_sample(sample_id, level_sim, - work_dir=self._output_dir) + # Run the sample and collect result, error message, and runtime + sample_id, result, err_msg, running_time = SamplingPool.calculate_sample( + sample_id, level_sim, work_dir=self._output_dir + ) + # Process result (successful or failed) self._process_result(sample_id, result, err_msg, running_time, level_sim) def _process_result(self, sample_id, result, err_msg, running_time, level_sim): """ - Save sample result - :param sample_id: sample identifier from calculate_sample() - :param result: sample result from calculate_sample() - :param err_msg: sample error message from calculate_sample() - :param running_time: running time for sample from calculate_sample() - :param level_sim: level_simulation instance - :return: None - """ - # Save running time for n_ops + Process result from a sample execution and store it in the appropriate queue. + + Parameters + ---------- + sample_id : int + Identifier of the executed sample. 
+ result : tuple + Pair of fine and coarse results (numpy arrays). + err_msg : str + Error message if the sample failed, empty string otherwise. + running_time : float + Runtime of the sample execution in seconds. + level_sim : LevelSimulation + Simulation instance used to produce the sample. + """ + # Record runtime for this level self._save_running_time(level_sim._level_id, running_time) + # If no error occurred, store successful result if not err_msg: - self._queues.setdefault(level_sim._level_id, queue.Queue()).put((sample_id, (result[0], result[1]))) + self._queues.setdefault(level_sim._level_id, queue.Queue()).put( + (sample_id, (result[0], result[1])) + ) + # Move successful sample to its permanent directory unless debugging if not self._debug: - SamplingPool.move_successful_rm(sample_id, level_sim, output_dir=self._output_dir, dest_dir=self._successful_dir) + SamplingPool.move_successful_rm( + sample_id, level_sim, output_dir=self._output_dir, dest_dir=self._successful_dir + ) else: + # If the simulation failed if not level_sim.need_sample_workspace: - print("Sample {} error: {}".format(sample_id, err_msg)) + print(f"Sample {sample_id} error: {err_msg}") else: - SamplingPool.move_failed_rm(sample_id, level_sim, output_dir=self._output_dir, dest_dir=SamplingPool.FAILED_DIR) + SamplingPool.move_failed_rm( + sample_id, level_sim, output_dir=self._output_dir, dest_dir=SamplingPool.FAILED_DIR + ) self._failed_queues.setdefault(level_sim._level_id, queue.Queue()).put((sample_id, err_msg)) def _save_running_time(self, level_id, running_time): """ - Save running time to dictionary, store total time and number of samples - :param level_id: int - :param running_time: float - :return: None + Save sample execution time in the tracking dictionary. + + Parameters + ---------- + level_id : int + Identifier of the simulation level. + running_time : float + Execution time of the sample. 
""" - # Save sample times [total time, number of samples] + # Initialize level entry if missing if level_id not in self.times: self.times[level_id] = [0, 0] - # Failed samples have running time equal 0 by default + # Only count successful samples with nonzero runtime if running_time != 0: - self.times[level_id][0] += running_time - self.times[level_id][1] += 1 + self.times[level_id][0] += running_time # Accumulate total runtime + self.times[level_id][1] += 1 # Increment sample count def have_permanent_samples(self, sample_ids): + """ + Return False, indicating that no samples are stored permanently. + + Parameters + ---------- + sample_ids : list + List of sample identifiers (ignored). + + Returns + ------- + bool + Always False. + """ return False def get_finished(self): """ - return results from queue - list of (sample_id, pair_of_result_vectors, error_message) + Retrieve all completed (successful and failed) samples. + + Returns + ------- + successful : dict + Dictionary of successful samples by level. + failed : dict + Dictionary of failed samples by level. + n_running : int + Number of currently running samples. + times : list + List of (level_id, [total_time, n_samples]) pairs. """ successful = self._queues_to_list(list(self._queues.items())) failed = self._queues_to_list(list(self._failed_queues.items())) - return successful, failed, self._n_running, list(self.times.items()) def _queues_to_list(self, queue_dict_list): + """ + Convert queues to lists and clear them safely. + + Parameters + ---------- + queue_dict_list : list + List of (level_id, queue.Queue) pairs. + + Returns + ------- + results : dict + Dictionary mapping level_id to list of queue entries. 
+ """ results = {} for level_id, q in queue_dict_list: queue_list = list(q.queue) if not queue_list: continue results[level_id] = queue_list - # Thread safe clear + + # Thread-safe queue clearing with q.mutex: q.queue.clear() + # Update running sample counter self._n_running -= len(results[level_id]) - return results +# ============================================================================== + class ProcessPool(OneProcessPool): """ - Suitable for local parallel sampling for simulations WITHOUT external program call + Sampling pool using multiprocessing for parallel sample execution. + Suitable for simulations without external program calls. """ def __init__(self, n_processes, work_dir=None, debug=False): - self._pool = ProcPool(n_processes) + """ + Initialize process-based parallel sampling pool. + + Parameters + ---------- + n_processes : int + Number of worker processes to use. + work_dir : str, optional + Working directory for samples. + debug : bool, default=False + If True, disables moving/removing sample outputs. + """ + self._pool = ProcPool(n_processes) # Multiprocessing pool super().__init__(work_dir=work_dir, debug=debug) def res_callback(self, result, level_sim): """ - Process simulation results - :param result: tuple - :param level_sim: LevelSimulation instance - :return: None + Callback for handling results from asynchronous execution. + + Parameters + ---------- + result : tuple + Returned result from SamplingPool.calculate_sample(). + level_sim : LevelSimulation + Simulation level instance. """ self._process_result(*result, level_sim) def schedule_sample(self, sample_id, level_sim): + """ + Schedule a sample for parallel execution in a separate process. + + Parameters + ---------- + sample_id : int + Sample identifier. + level_sim : LevelSimulation + Simulation configuration instance. 
+ """ self._n_running += 1 + # Set working directory for output files if self._output_dir is None and level_sim.need_sample_workspace: self._output_dir = os.getcwd() - self._pool.apply_async(SamplingPool.calculate_sample, args=(sample_id, level_sim, self._output_dir), - callback=lambda res: self.res_callback(res, level_sim), - error_callback=lambda res: self.res_callback(res, level_sim)) + # Submit task asynchronously to process pool + self._pool.apply_async( + SamplingPool.calculate_sample, + args=(sample_id, level_sim, self._output_dir), + callback=lambda res: self.res_callback(res, level_sim), + error_callback=lambda res: self.res_callback(res, level_sim) + ) + +# ============================================================================== class ThreadPool(ProcessPool): """ - Suitable local parallel sampling for simulations WITH external program call + Sampling pool using threading for local parallel sampling. + Suitable for simulations with external program calls (I/O-bound). """ def __init__(self, n_thread, work_dir=None, debug=False): + """ + Initialize thread-based parallel sampling pool. + + Parameters + ---------- + n_thread : int + Number of threads to use. + work_dir : str, optional + Working directory for samples. + debug : bool, default=False + If True, disables moving/removing sample outputs. + """ super().__init__(n_thread, work_dir=work_dir, debug=debug) - self._pool = pool.ThreadPool(n_thread) + self._pool = pool.ThreadPool(n_thread) # Thread-based pool instead of process-based self._failed_queues = {} self._queues = {} self._n_running = 0 diff --git a/mlmc/sim/simulation.py b/mlmc/sim/simulation.py index bd9a04d0..e5d5dd33 100644 --- a/mlmc/sim/simulation.py +++ b/mlmc/sim/simulation.py @@ -5,29 +5,49 @@ class Simulation(ABC): + """ + Abstract base class for multi-level Monte Carlo (MLMC) simulations. + + Defines the interface that all concrete simulation classes must implement. 
+ Provides methods for creating level simulations, specifying result formats, and running calculations. + """ @abstractmethod def level_instance(self, fine_level_params: List[float], coarse_level_params: List[float]) -> LevelSimulation: """ - Create LevelSimulation object which is farther used for calculation etc. - :param fine_level_params: - :param coarse_level_params: - :return: LevelSimulation + Create a LevelSimulation object for a given level. + + The LevelSimulation instance is used for sample generation and result extraction + at both the fine and coarse levels in MLMC. + + :param fine_level_params: List of floats defining parameters for the fine simulation level. + :param coarse_level_params: List of floats defining parameters for the coarse simulation level. + :return: LevelSimulation instance configured for the given level parameters. """ @abstractmethod def result_format(self) -> List[QuantitySpec]: """ - Define simulation result format - :return: List[QuantitySpec, ...] + Define the format of the simulation results. + + This method should return a list of QuantitySpec objects, which describe the + type, shape, and units of each quantity produced by the simulation. + + :return: List of QuantitySpec objects defining the simulation output format. """ @staticmethod @abstractmethod - def calculate(config_dict, seed): + def calculate(config_dict, seed: int): """ - Method that actually run the calculation, calculate fine and coarse sample and also extract their results - :param config_dict: dictionary containing simulation configuration, LevelSimulation.config_dict (set in level_instance) - :param seed: random seed, int - :return: List[fine result, coarse result], both flatten arrays (see mlmc.sim.synth_simulation._calculate()) + Execute a single simulation calculation. + + This method runs the simulation for both fine and coarse levels, computes + the results, and returns them in a flattened form suitable for MLMC analysis. 
+ + :param config_dict: Dictionary containing simulation configuration parameters + (usually LevelSimulation.config_dict from level_instance). + :param seed: Random seed (int) to ensure reproducibility of the stochastic simulation. + :return: List containing two elements: + [fine_result, coarse_result], both as flattened arrays. """ diff --git a/mlmc/sim/synth_simulation.py b/mlmc/sim/synth_simulation.py index 0a5176c6..2dd69701 100644 --- a/mlmc/sim/synth_simulation.py +++ b/mlmc/sim/synth_simulation.py @@ -1,3 +1,228 @@ +# import os +# import ruamel.yaml as ruyaml +# import numpy as np +# from typing import List +# import scipy.stats as stats +# from mlmc.sim.simulation import Simulation +# from mlmc.quantity.quantity_spec import QuantitySpec +# from mlmc.level_simulation import LevelSimulation +# +# +# class SynthSimulation(Simulation): +# """ +# Synthetic simulation for testing MLMC workflows. +# +# Produces artificial fine and coarse samples based on a distribution and a +# numerical step size. Can introduce NaNs to simulate failed simulations. +# """ +# +# # Class-level counters for failed samples and result bookkeeping +# n_nans = 0 +# nan_fraction = 0 +# len_results = 0 +# result_dict = {} +# +# def __init__(self, config=None): +# """ +# Initialize the synthetic simulation. 
+# +# :param config: Dictionary with keys: +# - distr: scipy.stats distribution (default: normal) +# - complexity: exponent for operation estimate (default: 2) +# - nan_fraction: fraction of failed samples to simulate +# - sim_method: unused here, placeholder for method type +# """ +# super().__init__() +# if config is None: +# config = dict(distr=stats.norm(), complexity=2) +# self.config = config +# +# # Reset class-level counters +# SynthSimulation.n_nans = 0 +# SynthSimulation.nan_fraction = config.get('nan_fraction', 0.0) +# SynthSimulation.len_results = 0 +# +# # Indicates that this simulation does not require a workspace +# self.need_workspace: bool = False +# +# @staticmethod +# def sample_fn(x, h): +# """ +# Synthetic sample function introducing a small numerical error. +# +# :param x: Random sample from distribution +# :param h: Simulation step size +# :return: Synthetic simulation output +# """ +# return x + h * np.sqrt(1e-4 + np.abs(x)) +# +# @staticmethod +# def sample_fn_no_error(x, h): +# """ +# Sample function without added error. +# +# :param x: Random sample +# :param h: Simulation step (unused) +# :return: Original sample +# """ +# return x +# +# def level_instance(self, fine_level_params: List[float], coarse_level_params: List[float]) -> LevelSimulation: +# """ +# Create a LevelSimulation object for the fine and coarse levels. 
+# +# :param fine_level_params: List of fine-level parameters (step size) +# :param coarse_level_params: List of coarse-level parameters (step size) +# :return: LevelSimulation object configured for MLMC +# """ +# config = dict() +# config["fine"] = {"step": fine_level_params[0]} +# config["coarse"] = {"step": coarse_level_params[0]} +# config["distr"] = self.config["distr"] +# config["res_format"] = self.result_format() +# +# return LevelSimulation(config_dict=config, task_size=self.n_ops_estimate(fine_level_params[0])) +# +# @staticmethod +# def generate_random_samples(distr, seed, size): +# """ +# Generate fine and coarse random samples from a given distribution. +# +# Optionally simulates a fraction of NaN results to represent failed simulations. +# +# :param distr: scipy.stats distribution object +# :param seed: random seed +# :param size: number of samples to generate +# :return: Tuple (fine_samples, coarse_samples) +# """ +# SynthSimulation.len_results += 1 +# distr.random_state = np.random.RandomState(seed) +# y = distr.rvs(size=size) +# +# # Simulate failed samples +# if SynthSimulation.n_nans / (1e-10 + SynthSimulation.len_results) < SynthSimulation.nan_fraction: +# SynthSimulation.n_nans += 1 +# y = [np.nan] +# +# return y, y +# +# @staticmethod +# def calculate(config, seed): +# """ +# Compute fine and coarse simulation results. 
+# +# :param config: Dictionary with LevelSimulation configuration +# :param seed: Random seed +# :return: Tuple (fine_result_flat, coarse_result_flat) +# """ +# quantity_format = config["res_format"] +# +# # Generate random samples for fine and coarse levels +# fine_random, coarse_random = SynthSimulation.generate_random_samples( +# config["distr"], seed, np.prod(quantity_format[0].shape) +# ) +# +# fine_step = config["fine"]["step"] +# coarse_step = config["coarse"]["step"] +# +# fine_result = SynthSimulation.sample_fn(fine_random, fine_step) +# coarse_result = np.zeros(len(fine_result)) if coarse_step == 0 else SynthSimulation.sample_fn(coarse_random, coarse_step) +# +# if np.any(np.isnan(fine_result)) or np.any(np.isnan(coarse_result)): +# raise Exception("Simulation produced NaN result") +# +# # Map results to quantity specifications +# results = [] +# for result in [fine_result, coarse_result]: +# quantities = [] +# for quantity in quantity_format: +# locations = np.array([result + i for i in range(len(quantity.locations))]) if coarse_step != 0 else np.array([result for _ in range(len(quantity.locations))]) +# times = np.array([locations for _ in range(len(quantity.times))]) +# quantities.append(times) +# results.append(np.array(quantities)) +# +# return results[0].flatten(), results[1].flatten() +# +# def n_ops_estimate(self, step): +# """ +# Estimate the computational cost for a given step size. +# +# :param step: simulation step size +# :return: estimated number of operations +# """ +# return (1 / step) ** self.config['complexity'] * np.log(max(1 / step, 2.0)) +# +# def result_format(self) -> List[QuantitySpec]: +# """ +# Define the result format for this synthetic simulation. 
+# +# :return: List of QuantitySpec objects +# """ +# spec1 = QuantitySpec(name="length", unit="m", shape=(2, 1), times=[1, 2, 3], locations=['10', '20']) +# spec2 = QuantitySpec(name="width", unit="mm", shape=(2, 1), times=[1, 2, 3], locations=['30', '40']) +# return [spec1, spec2] +# +# +# class SynthSimulationWorkspace(SynthSimulation): +# """ +# Synthetic simulation that requires a workspace. +# +# Extends SynthSimulation by supporting workspace-based execution and configuration +# read from a YAML file. +# """ +# +# CONFIG_FILE = 'synth_sim_config.yaml' +# +# def __init__(self, config): +# """ +# :param config: Dictionary containing workspace configuration: +# - config_yaml: path to YAML configuration file +# - nan_fraction: fraction of failed samples +# """ +# self.config_yaml = config["config_yaml"] +# +# # Reset counters +# SynthSimulationWorkspace.n_nans = 0 +# SynthSimulationWorkspace.nan_fraction = config.get('nan_fraction', 0.0) +# SynthSimulationWorkspace.len_results = 0 +# +# # Indicates that this simulation needs a workspace +# self.need_workspace: bool = True +# +# def level_instance(self, fine_level_params: List[float], coarse_level_params: List[float]) -> LevelSimulation: +# """ +# Create a LevelSimulation object using workspace configuration. +# +# :param fine_level_params: List of fine-level parameters (step size) +# :param coarse_level_params: List of coarse-level parameters (step size) +# :return: LevelSimulation object +# """ +# config = dict() +# config["fine"] = {"step": fine_level_params[0]} +# config["coarse"] = {"step": coarse_level_params[0]} +# config["res_format"] = self.result_format() +# +# job_weight = 20000 +# +# return LevelSimulation( +# config_dict=config, +# common_files=[self.config_yaml], +# task_size=1.0 / job_weight, +# need_sample_workspace=self.need_workspace +# ) +# +# @staticmethod +# def _read_config(): +# """ +# Read workspace configuration from YAML file. 
+# +# :return: Dictionary with configuration values +# """ +# with open(os.path.join(os.getcwd(), SynthSimulationWorkspace.CONFIG_FILE)) as file: +# yaml = ruyaml.YAML(typ='rt') +# config = yaml.load(file) +# return config + import os import ruamel.yaml as ruyaml import numpy as np diff --git a/mlmc/tool/context_statprof.py b/mlmc/tool/context_statprof.py deleted file mode 100644 index faf3afc4..00000000 --- a/mlmc/tool/context_statprof.py +++ /dev/null @@ -1,13 +0,0 @@ -import statprof -from contextlib import contextmanager - - - - -@contextmanager -def stat_profiler(): - statprof.start() - yield statprof - statprof.stop() - statprof.display() - diff --git a/mlmc/tool/distribution.py b/mlmc/tool/distribution.py index e377607b..1427ef2b 100644 --- a/mlmc/tool/distribution.py +++ b/mlmc/tool/distribution.py @@ -48,39 +48,6 @@ def __init__(self, moments_obj, moment_data, domain=None, force_decay=(True, Tru # Flag for monitoring convergence on stdout. self.monitor = monitor - # def choose_parameters_from_samples(self, samples): - # """ - # Determine model hyperparameters, in particular domain of the density function, - # from given samples. - # :param samples: np array of samples from the distribution or its approximation. - # :return: None - # """ - # self.domain = (np.min(samples), np.max(samples)) - # - # @staticmethod - # def choose_parameters_from_moments(mean, variance, quantile=0.9999, log=False): - # """ - # Determine model hyperparameters, in particular domain of the density function, - # from given samples. - # :param samples: np array of samples from the distribution or its approximation. 
- # :return: None - # """ - # if log: - # # approximate by log normal - # # compute mu, sigma parameters from observed mean and variance - # sigma_sq = np.log(np.exp(np.log(variance) - 2.0 * np.log(mean)) + 1.0) - # mu = np.log(mean) - sigma_sq / 2.0 - # sigma = np.sqrt(sigma_sq) - # domain = tuple(sc.stats.lognorm.ppf([1.0 - quantile, quantile], s=sigma, scale=np.exp(mu))) - # assert np.isclose(mean, sc.stats.lognorm.mean(s=sigma, scale=np.exp(mu))) - # assert np.isclose(variance, sc.stats.lognorm.var(s=sigma, scale=np.exp(mu))) - # else: - # domain = tuple(sc.stats.norm.ppf([1.0 - quantile, quantile], loc=mean, scale=np.sqrt(variance))) - # return domain - # - # def choose_parameters_from_approximation(self): - # pass - def estimate_density_minimize(self, tol=1e-5, reg_param =0.01): """ @@ -411,15 +378,9 @@ def _calculate_jacobian_matrix(self, multipliers): jacobian_matrix[np.diag_indices_from(jacobian_matrix)] += self._stab_penalty - - #e_vals = np.linalg.eigvalsh(jacobian_matrix) - - #print(multipliers) - #print("jac spectra: ", e_vals[0], e_vals[-1], e_vals[-1]/e_vals[0]) return jacobian_matrix - def compute_exact_moments(moments_fn, density, tol=1e-4): """ Compute approximation of moments using exact density. diff --git a/mlmc/tool/flow_mc.py b/mlmc/tool/flow_mc.py index 09fc12da..cdddb9c1 100644 --- a/mlmc/tool/flow_mc.py +++ b/mlmc/tool/flow_mc.py @@ -15,14 +15,21 @@ def create_corr_field(model='gauss', corr_length=0.125, dim=2, log=True, sigma=1, mode_no=1000): """ - Create random fields - :return: + Create correlated random-field provider (cf.Fields) according to selected backend. + + :param model: One of 'fourier', 'svd', 'exp', 'TPLgauss', 'TPLexp', 'TPLStable', or others (defaults to 'gauss'). + :param corr_length: Correlation length (used by GSTools or SVD implementations). + :param dim: Spatial dimension of the field (1, 2 or 3). + :param log: If True, generate log-normal field (exponentiate underlying Gaussian field). 
+ :param sigma: Standard deviation for the generated field. + :param mode_no: Number of Fourier modes + :return: cf.Fields instance that can generate random field samples. """ if model == 'fourier': return cf.Fields([ cf.Field('conductivity', cf.FourierSpatialCorrelatedField('gauss', dim=dim, - corr_length=corr_length, - log=log, sigma=sigma)), + corr_length=corr_length, + log=log, sigma=sigma)), ]) elif model == 'svd': @@ -52,13 +59,14 @@ def create_corr_field(model='gauss', corr_length=0.125, dim=2, log=True, sigma=1 ]) - def substitute_placeholders(file_in, file_out, params): """ - Substitute for placeholders of format '<name>' from the dict 'params'. - :param file_in: Template file. - :param file_out: Values substituted. - :param params: { 'name': value, ...} + Replace placeholders of form '<name>' in a template file with corresponding values. + + :param file_in: Path to the template file containing placeholders. + :param file_out: Path where the substituted output will be written. + :param params: Dictionary mapping placeholder names to replacement values, e.g. {'mesh_file': 'mesh.msh'}. + :return: List of parameter names that were actually used (replaced) in the template. """ used_params = [] with open(file_in, 'r') as src: @@ -76,10 +84,10 @@ def substitute_placeholders(file_in, file_out, params): def force_mkdir(path, force=False): """ - Make directory 'path' with all parents, - remove the leaf dir recursively if it already exists. - :param path: path to directory - :param force: if dir already exists then remove it and create new one + Create directory tree; optionally remove existing leaf directory first. + + :param path: Directory path to create (parents created as needed). + :param force: If True and the directory already exists, remove it (recursively) before creating. 
:return: None """ if force: @@ -92,55 +100,36 @@ class FlowSim(Simulation): # placeholders in YAML total_sim_id = 0 MESH_FILE_VAR = 'mesh_file' - # Timestep placeholder given as O(h), h = mesh step - TIMESTEP_H1_VAR = 'timestep_h1' - # Timestep placeholder given as O(h^2), h = mesh step - TIMESTEP_H2_VAR = 'timestep_h2' + TIMESTEP_H1_VAR = 'timestep_h1' # O(h) + TIMESTEP_H2_VAR = 'timestep_h2' # O(h^2) - # files + # filenames used in workspace and job directories GEO_FILE = 'mesh.geo' MESH_FILE = 'mesh.msh' YAML_TEMPLATE = 'flow_input.yaml.tmpl' YAML_FILE = 'flow_input.yaml' FIELDS_FILE = 'fields_sample.msh' - """ - Gather data for single flow call (coarse/fine) - - Usage: - mlmc.sampler.Sampler uses instance of FlowSim, it calls once level_instance() for each level step (The level_instance() method - is called as many times as the number of levels), it takes place in main process - - mlmc.tool.pbs_job.PbsJob uses static methods in FlowSim, it calls calculate(). That's where the calculation actually runs, - it takes place in PBS process - It also extracts results and passes them back to PbsJob, which handles the rest - - """ - def __init__(self, config=None, clean=None): """ - Simple simulation using flow123d - :param config: configuration of the simulation, processed keys: - env - Environment object. - fields - FieldSet object - yaml_file: Template for main input file. Placeholders: - - replaced by generated mesh - - for FIELD be name of any of `fields`, replaced by the FieldElementwise field with generated - field input file and the field name for the component. - geo_file: Path to the geometry file. - :param clean: bool, if True remove existing simulation files - mesh files, ... + Initialize FlowSim instance that runs flow123d simulations using generated random fields. + + :param config: Dict with keys: + - env: dict of environment executables (flow123d, gmsh, gmsh_version, etc.) 
+ - fields_params: parameters forwarded to create_corr_field + - yaml_file: base YAML template path + - geo_file: geometry (.geo) file path + - work_dir: base working directory for generated level common files + - field_template: optional template string for field definition in YAML + - time_factor: optional multiplier for timestep selection (default 1.0) + :param clean: If True, regenerate common files (mesh, yaml) for the given level. """ - self.need_workspace = True - # This simulation requires workspace + self.need_workspace = True # this simulation needs per-sample work directories self.env = config['env'] - # Environment variables, flow123d, gmsh, ... self._fields_params = config['fields_params'] self._fields = create_corr_field(**config['fields_params']) self._fields_used_params = None - # Random fields instance self.time_factor = config.get('time_factor', 1.0) - # It is used for minimal element from mesh determination (see level_instance method) - self.base_yaml_file = config['yaml_file'] self.base_geo_file = config['geo_file'] self.field_template = config.get('field_template', @@ -148,54 +137,55 @@ def __init__(self, config=None, clean=None): self.work_dir = config['work_dir'] self.clean = clean - super(Simulation, self).__init__() + super(Simulation, self).__init__() # keep compatibility with parent initialization + def level_instance(self, fine_level_params: List[float], coarse_level_params: List[float]) -> LevelSimulation: """ - Called from mlmc.Sampler, it creates single instance of LevelSimulation (mlmc.) - :param fine_level_params: in this version, it is just fine simulation step - :param coarse_level_params: in this version, it is just coarse simulation step - :return: mlmc.LevelSimulation object, this object is serialized in SamplingPoolPbs and deserialized in PbsJob, - so it allows pass simulation data from main process to PBS process + Create a LevelSimulation object for given fine/coarse steps. 
+ + This method is called in the main process (Sampler) and must prepare + common files (mesh, YAML) for that level. The returned LevelSimulation + is serialized and sent to PBS jobs (PbsJob) for actual execution. + + :param fine_level_params: list with single element [fine_step] (mesh step) + :param coarse_level_params: list with single element [coarse_step] (mesh step) or [0] for one-level MC + :return: LevelSimulation configured with task size and calculate method """ fine_step = fine_level_params[0] coarse_step = coarse_level_params[0] - # TODO: determine minimal element from mesh + # Set time steps used in YAML substitution (O(h) and O(h^2) placeholders) self.time_step_h1 = self.time_factor * fine_step self.time_step_h2 = self.time_factor * fine_step * fine_step - # Set fine simulation common files directory - # Files in the directory are used by each simulation at that level + # Directory to store files common to all samples at this fine level common_files_dir = os.path.join(self.work_dir, "l_step_{}_common_files".format(fine_step)) force_mkdir(common_files_dir, force=self.clean) self.mesh_file = os.path.join(common_files_dir, self.MESH_FILE) if self.clean: - # Prepare mesh + # Create computational mesh from geometry template geo_file = os.path.join(common_files_dir, self.GEO_FILE) shutil.copyfile(self.base_geo_file, geo_file) - self._make_mesh(geo_file, self.mesh_file, fine_step) # Common computational mesh for all samples. 
+ self._make_mesh(geo_file, self.mesh_file, fine_step) - # Prepare main input YAML + # Prepare main YAML by substituting placeholders yaml_template = os.path.join(common_files_dir, self.YAML_TEMPLATE) shutil.copyfile(self.base_yaml_file, yaml_template) yaml_file = os.path.join(common_files_dir, self.YAML_FILE) self._substitute_yaml(yaml_template, yaml_file) - # Mesh is extracted because we need number of mesh points to determine task_size parameter (see return value) + # Extract mesh metadata to determine task_size (number of points affects job weight) fine_mesh_data = self.extract_mesh(self.mesh_file) - # Set coarse simulation common files directory - # Files in the directory are used by each simulation at that level + # Set coarse sim common files dir if coarse level exists coarse_sim_common_files_dir = None if coarse_step != 0: coarse_sim_common_files_dir = os.path.join(self.work_dir, "l_step_{}_common_files".format(coarse_step)) - # Simulation config - # Configuration is used in mlmc.tool.pbs_job.PbsJob instance which is run from PBS process - # It is part of LevelSimulation which is serialized and then deserialized in mlmc.tool.pbs_job.PbsJob + # Prepare configuration dict that will be serialized in LevelSimulation config = dict() config["fine"] = {} config["coarse"] = {} @@ -204,71 +194,66 @@ def level_instance(self, fine_level_params: List[float], coarse_level_params: Li config["fine"]["common_files_dir"] = common_files_dir config["coarse"]["common_files_dir"] = coarse_sim_common_files_dir - config[ - "fields_used_params"] = self._fields_used_params # Params for Fields instance, which is createed in PbsJob + config["fields_used_params"] = self._fields_used_params config["gmsh"] = self.env['gmsh'] config["flow123d"] = self.env['flow123d'] config['fields_params'] = self._fields_params - # Auxiliary parameter which I use to determine task_size (should be from 0 to 1, if task_size is above 1 then pbs job is scheduled) - job_weight = 17000000 # 4000000 - 20 min, 
2000000 - cca 10 min + # job_weight is used to convert mesh size into a normalized task_size + job_weight = 17000000 return LevelSimulation(config_dict=config, task_size=len(fine_mesh_data['points']) / job_weight, calculate=FlowSim.calculate, - # method which carries out the calculation, will be called from PBS processs - need_sample_workspace=True # If True, a sample directory is created + need_sample_workspace=True ) @staticmethod def calculate(config, seed): """ - Method that actually run the calculation, it's called from mlmc.tool.pbs_job.PbsJob.calculate_samples() - Calculate fine and coarse sample and also extract their results - :param config: dictionary containing simulation configuration, LevelSimulation.config_dict (set in level_instance) - :param seed: random seed, int - :return: List[fine result, coarse result], both flatten arrays (see mlmc.sim.synth_simulation.calculate()) + Execute one MLMC sample calculation (fine and optional coarse) inside PBS job. + + :param config: Configuration dict from LevelSimulation.config_dict (contains common_files dirs, steps, fields params) + :param seed: Random seed for the sample generation (derived from sample id) + :return: Tuple (fine_result_array, coarse_result_array), both numpy arrays (coarse may be zeros for one-level MC) """ - # Init correlation field objects - fields = create_corr_field(**config['fields_params']) # correlated_field.Fields instance + # Initialize fields object in the worker process + fields = create_corr_field(**config['fields_params']) fields.set_outer_fields(config["fields_used_params"]) - coarse_step = config["coarse"]["step"] # Coarse simulation step, zero if one level MC - flow123d = config["flow123d"] # Flow123d command + coarse_step = config["coarse"]["step"] + flow123d = config["flow123d"] - # Extract fine mesh - fine_common_files_dir = config["fine"]["common_files_dir"] # Directory with fine simulation common files + # Extract fine mesh structure and optionally coarse mesh structure + 
fine_common_files_dir = config["fine"]["common_files_dir"] fine_mesh_data = FlowSim.extract_mesh(os.path.join(fine_common_files_dir, FlowSim.MESH_FILE)) - # Extract coarse mesh coarse_mesh_data = None coarse_common_files_dir = None if coarse_step != 0: - coarse_common_files_dir = config["coarse"][ - "common_files_dir"] # Directory with coarse simulation common files + coarse_common_files_dir = config["coarse"]["common_files_dir"] coarse_mesh_data = FlowSim.extract_mesh(os.path.join(coarse_common_files_dir, FlowSim.MESH_FILE)) - # Create fields both fine and coarse + # Prepare combined fields object that has points for both fine and coarse meshes fields = FlowSim.make_fields(fields, fine_mesh_data, coarse_mesh_data) - # Set random seed, seed is calculated from sample id, so it is not user defined + # Sample random field realizations reproducibly np.random.seed(seed) - # Generate random samples - fine_input_sample, coarse_input_sample = FlowSim.generate_random_sample(fields, coarse_step=coarse_step, - n_fine_elements=len( - fine_mesh_data['points'])) + fine_input_sample, coarse_input_sample = FlowSim.generate_random_sample( + fields, coarse_step=coarse_step, n_fine_elements=len(fine_mesh_data['points']) + ) - # Run fine sample + # Run fine-level simulation fields_file = os.path.join(os.getcwd(), FlowSim.FIELDS_FILE) fine_res = FlowSim._run_sample(fields_file, fine_mesh_data['ele_ids'], fine_input_sample, flow123d, fine_common_files_dir) - # Rename fields_sample.msh to fine_fields_sample.msh, we might remove it + # Move generated files to have 'fine_' prefix so they don't collide for filename in os.listdir(os.getcwd()): if not filename.startswith("fine"): shutil.move(os.path.join(os.getcwd(), filename), os.path.join(os.getcwd(), "fine_" + filename)) - # Run coarse sample + # Run coarse-level simulation if coarse sample exists coarse_res = np.zeros(len(fine_res)) if coarse_input_sample: coarse_res = FlowSim._run_sample(fields_file, coarse_mesh_data['ele_ids'], 
coarse_input_sample, flow123d, @@ -279,17 +264,19 @@ def calculate(config, seed): @staticmethod def make_fields(fields, fine_mesh_data, coarse_mesh_data): """ - Create random fields that are used by both coarse and fine simulation - :param fields: correlated_field.Fields instance - :param fine_mesh_data: Dict contains data extracted from fine mesh file (points, point_region_ids, region_map) - :param coarse_mesh_data: Dict contains data extracted from coarse mesh file (points, point_region_ids, region_map) - :return: correlated_field.Fields + Assign evaluation points to fields and return the Fields object prepared for sampling. + + :param fields: correlated_field.Fields instance (with local field definitions) + :param fine_mesh_data: Dict returned by extract_mesh() for the fine mesh + :param coarse_mesh_data: Dict returned by extract_mesh() for the coarse mesh (or None for one-level) + :return: the same cf.Fields object with points set for sampling """ - # One level MC has no coarse_mesh_data + # If no coarse mesh, just register fine mesh points if coarse_mesh_data is None: fields.set_points(fine_mesh_data['points'], fine_mesh_data['point_region_ids'], fine_mesh_data['region_map']) else: + # Concatenate fine and coarse points to compute joint fields (ensures consistent sampling) coarse_centers = coarse_mesh_data['points'] both_centers = np.concatenate((fine_mesh_data['points'], coarse_centers), axis=0) both_regions_ids = np.concatenate( @@ -302,13 +289,14 @@ def make_fields(fields, fine_mesh_data, coarse_mesh_data): @staticmethod def _run_sample(fields_file, ele_ids, fine_input_sample, flow123d, common_files_dir): """ - Create random fields file, call Flow123d and extract results - :param fields_file: Path to file with random fields - :param ele_ids: Element IDs in computational mesh - :param fine_input_sample: fields: {'field_name' : values_array, ..} - :param flow123d: Flow123d command - :param common_files_dir: Directory with simulations common files 
(flow_input.yaml, ) - :return: simulation result, ndarray + Write random fields to Gmsh file, call flow123d, and extract sample results. + + :param fields_file: Path where fields will be written (in current working directory) + :param ele_ids: Array of element ids for which field values are provided + :param fine_input_sample: Dict mapping field names to arrays of shape (n_elements, 1) + :param flow123d: Path/command to flow123d executable + :param common_files_dir: Directory containing common YAML and other input files for the level + :return: numpy.ndarray with extracted simulation result (e.g., water balance) """ gmsh_io.GmshIO().write_fields(fields_file, ele_ids, fine_input_sample) @@ -321,11 +309,16 @@ def _run_sample(fields_file, ele_ids, fine_input_sample, flow123d, common_files_ @staticmethod def generate_random_sample(fields, coarse_step, n_fine_elements): """ - Generate random field, both fine and coarse part. - Store them separeted. - :return: Dict, Dict + Generate random field samples for the fine and (optionally) coarse meshes. + + :param fields: cf.Fields object (already configured with points) + :param coarse_step: coarse-level step (0 for no coarse sample) + :param n_fine_elements: Number of elements that belong to fine mesh (used to split combined sample) + :return: Tuple (fine_input_sample: dict, coarse_input_sample: dict) + Each dict maps field name -> array shaped (n_elements, 1). """ fields_sample = fields.sample() + # Fine inputs are first n_fine_elements rows; coarse are the remainder (if any) fine_input_sample = {name: values[:n_fine_elements, None] for name, values in fields_sample.items()} coarse_input_sample = {} if coarse_step != 0: @@ -336,10 +329,12 @@ def generate_random_sample(fields, coarse_step, n_fine_elements): def _make_mesh(self, geo_file, mesh_file, fine_step): """ - Make the mesh, mesh_file: _step.msh. - Make substituted yaml: _step.yaml, - using common fields_step.msh file for generated fields. 
- :return: + Invoke Gmsh to produce a mesh with the requested geometric scale (clscale). + + :param geo_file: Path to the .geo file used to generate the mesh + :param mesh_file: Path where the .msh output will be written + :param fine_step: Mesh step (controls element size via -clscale) + :return: None """ if self.env['gmsh_version'] == 2: subprocess.call( @@ -350,9 +345,14 @@ def _make_mesh(self, geo_file, mesh_file, fine_step): @staticmethod def extract_mesh(mesh_file): """ - Extract mesh from file - :param mesh_file: Mesh file path - :return: Dict + Parse a Gmsh mesh file and extract points (element centers), element ids and region mapping. + + :param mesh_file: Path to .msh file to parse (Gmsh 2/4 depending on GmshIO implementation) + :return: Dict with keys: + - 'points': np.ndarray of shape (n_elements, dim) with element center coordinates + - 'point_region_ids': np.ndarray of region id per element + - 'ele_ids': np.ndarray of original element ids + - 'region_map': dict mapping region name -> region id """ mesh = gmsh_io.GmshIO(mesh_file) is_bc_region = {} @@ -386,7 +386,7 @@ def extract_mesh(mesh_file): diff = max_pt - min_pt min_axis = np.argmin(diff) non_zero_axes = [0, 1, 2] - # TODO: be able to use this mesh_dimension in fields + # If mesh is effectively 2D (one axis collapsed), remove that axis from point coordinates if diff[min_axis] < 1e-10: non_zero_axes.pop(min_axis) points = centers[:, non_zero_axes] @@ -395,8 +395,11 @@ def _substitute_yaml(self, yaml_tmpl, yaml_out): """ - Create substituted YAML file from the tamplate. - :return: + Build YAML input file for flow123d by substituting placeholders for mesh and fields. + + :param yaml_tmpl: Path to YAML template with placeholders like '<mesh_file>' and '<FIELD>'. + :param yaml_out: Path to output YAML file that will be used by flow123d. 
+ :return: None (also populates self._fields_used_params with names of substituted fields) """ param_dict = {} field_tmpl = self.field_template @@ -412,11 +415,12 @@ def _substitute_yaml(self, yaml_tmpl, yaml_out): @staticmethod def _extract_result(sample_dir): """ - Extract the observed value from the Flow123d output. - :param sample_dir: str, path to sample directory - :return: None, inf or water balance result (float) and overall sample time + Extract the observed quantity (e.g., water balance flux) from a flow123d run directory. + + :param sample_dir: Directory where flow123d output (water_balance.yaml) is located. + :return: numpy.ndarray with a single value [-total_flux] representing outflow (negative sign). + Raises Exception if expected data is not found or inflow at outlet is positive. """ - # extract the flux balance_file = os.path.join(sample_dir, "water_balance.yaml") with open(balance_file, "r") as f: @@ -434,44 +438,19 @@ def _extract_result(sample_dir): flux_in = float(flux_item['data'][1]) if flux_in > 1e-10: raise Exception("Possitive inflow at outlet region.") - total_flux += flux # flux field + total_flux += flux found = True - # Get flow123d computing time - # run_time = FlowSim.get_run_time(sample_dir) - if not found: - raise Exception + raise Exception("No outlet flux found in water_balance.yaml") return np.array([-total_flux]) @staticmethod def result_format() -> List[QuantitySpec]: """ - Define simulation result format - :return: List[QuantitySpec, ...] + Describe the simulation output format as a list of QuantitySpec objects. 
+ + :return: List[QuantitySpec] describing each output quantity (name, unit, shape, times, locations) """ spec1 = QuantitySpec(name="conductivity", unit="m", shape=(1, 1), times=[1], locations=['0']) - # spec2 = QuantitySpec(name="width", unit="mm", shape=(2, 1), times=[1, 2, 3], locations=['30', '40']) return [spec1] - - # @staticmethod - # def get_run_time(sample_dir): - # """ - # Get flow123d sample running time from profiler - # :param sample_dir: Sample directory - # :return: float - # """ - # profiler_file = os.path.join(sample_dir, "profiler_info_*.json") - # profiler = glob.glob(profiler_file)[0] - # - # try: - # with open(profiler, "r") as f: - # prof_content = json.load(f) - # - # run_time = float(prof_content['children'][0]['cumul-time-sum']) - # except: - # print("Extract run time failed") - # - # return run_time - - diff --git a/mlmc/tool/gmsh_io.py b/mlmc/tool/gmsh_io.py index c5a3ad36..0d059918 100644 --- a/mlmc/tool/gmsh_io.py +++ b/mlmc/tool/gmsh_io.py @@ -3,21 +3,8 @@ import struct import numpy as np -import enum -# class ElementType(enum.IntEnum): -# simplex_1d = 1 -# simplex_2d = 2 -# simplex_3d = 4 -# -# element_sizes = { -# 1: 1, -# 2: 2, -# 4: 3 -# } -# - class GmshIO: """This is a class for storing nodes and elements. Based on Gmsh.py @@ -28,25 +15,60 @@ class GmshIO: Methods: read([file]) -- Parse a Gmsh version 1.0 or 2.0 mesh file - write([file]) -- Output a Gmsh version 2.0 mesh file + write_ascii([file]) -- Output a Gmsh version 2.0 mesh file (ASCII) + write_binary([file]) -- Output a Gmsh version 2.0 mesh file (binary) + write_element_data(f, ele_ids, name, values) -- write $ElementData block + write_fields(msh_file, ele_ids, fields) -- convenience to write several ElementData blocks """ def __init__(self, filename=None): - """Initialise Gmsh data structure""" + """ + Initialise Gmsh data structure. + + :param filename: Optional path to a .msh file. If provided, the file is read on construction. 
+ :return: None + """ self.reset() self.filename = filename if self.filename: self.read() def reset(self): - """Reinitialise Gmsh data structure""" + """ + Reinitialise internal storage. + + Clears nodes, elements, physical names and element_data dictionaries. + + :return: None + """ self.nodes = {} self.elements = {} self.physical = {} self.element_data = {} def read_element_data_head(self, mshfile): - + """ + Read header of a $ElementData block from an open mshfile. + + The method expects the lines after '$ElementData' to match the conventional + Gmsh textual ElementData header layout: + + "" + +