From 8a876f76fc8ee9364dc6b36b81ff65228dc99e7a Mon Sep 17 00:00:00 2001
From: carlagru
Date: Thu, 15 May 2025 22:39:06 +0200
Subject: [PATCH] Added h5py loader to load in CHAOS MAT-file.

Function load_matfile_h5py added to chaosmagpy/data_utils.py, option to
switch to this loader added to chaosmagpy/chaos.py.
---
 chaosmagpy/chaos.py      |  26 +++++++-
 chaosmagpy/data_utils.py | 109 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+), 4 deletions(-)

diff --git a/chaosmagpy/chaos.py b/chaosmagpy/chaos.py
index a602014..022fa46 100644
--- a/chaosmagpy/chaos.py
+++ b/chaosmagpy/chaos.py
@@ -2349,7 +2349,7 @@ def save_matfile(self, filepath):
                 os.path.join(os.getcwd(), filepath)))
 
     @classmethod
-    def from_mat(self, filepath, name=None, satellites=None):
+    def from_mat(self, filepath, name=None, satellites=None, loader=None):
         """
         Alternative constructor for creating a :class:`CHAOS` class instance.
 
@@ -2366,6 +2366,11 @@ def from_mat(self, filepath, name=None, satellites=None):
             information is not given in the standard CHAOS MAT-file format
             (defaults to ``['oersted', 'champ', 'sac_c', 'swarm_a', 'swarm_b',
             'swarm_c', 'cryosat-2_1', 'cryosat-2_2', 'cryosat-2_3']``.)
+        loader : {'hdf5', 'h5py'}, optional
+            Loader used for reading the MAT-file. ``'hdf5'`` (default, also
+            used if ``None``) calls :func:`data_utils.load_matfile` and
+            ``'h5py'`` calls :func:`data_utils.load_matfile_h5py`. Any other
+            value raises a :class:`ValueError`.
 
         Returns
         -------
@@ -2387,7 +2392,8 @@ def from_mat(self, filepath, name=None, satellites=None):
 
         """
 
-        return load_CHAOS_matfile(filepath, name=name, satellites=satellites)
+        return load_CHAOS_matfile(filepath, name=name, satellites=satellites,
+                                  loader=loader)
 
     @classmethod
     def from_shc(self, filepath, *, name=None, leap_year=None):
@@ -2434,7 +2440,7 @@ def from_shc(self, filepath, *, name=None, leap_year=None):
         return load_CHAOS_shcfile(filepath, name=name, leap_year=leap_year)
 
 
-def load_CHAOS_matfile(filepath, name=None, satellites=None):
+def load_CHAOS_matfile(filepath, name=None, satellites=None, loader=None):
     """
     Load CHAOS model from MAT-file.
 
@@ -2451,6 +2457,11 @@ def load_CHAOS_matfile(filepath, name=None, satellites=None):
         given in the standard CHAOS MAT-file format (defaults to
         ``['oersted', 'champ', 'sac_c', 'swarm_a', 'swarm_b', 'swarm_c',
         'cryosat-2_1', 'cryosat-2_2', 'cryosat-2_3']``.)
+    loader : {'hdf5', 'h5py'}, optional
+        Loader used for reading the MAT-file. ``'hdf5'`` (default, also
+        used if ``None``) calls :func:`data_utils.load_matfile` and
+        ``'h5py'`` calls :func:`data_utils.load_matfile_h5py`. Any other
+        value raises a :class:`ValueError`.
 
     Returns
     -------
@@ -2520,7 +2531,14 @@ def load_CHAOS_matfile(filepath, name=None, satellites=None):
         satellites = ['oersted', 'champ', 'sac_c', 'swarm_a', 'swarm_b',
                       'swarm_c', 'cryosat-2_1', 'cryosat-2_2', 'cryosat-2_3']
 
-    mat_contents = du.load_matfile(filepath)
+    if loader is None or loader == "hdf5":
+        mat_contents = du.load_matfile(filepath)
+    elif loader == "h5py":
+        # Use h5py matfile loader to load in the data
+        mat_contents = du.load_matfile_h5py(filepath)
+    else:
+        raise ValueError(
+            'Unknown loader {!r}, expected "hdf5" or "h5py".'.format(loader))
 
     pp = mat_contents['pp']
 
diff --git a/chaosmagpy/data_utils.py b/chaosmagpy/data_utils.py
index cb076d9..658e45f 100644
--- a/chaosmagpy/data_utils.py
+++ b/chaosmagpy/data_utils.py
@@ -14,6 +14,7 @@
     :toctree: functions
 
     load_matfile
+    load_matfile_h5py
     load_RC_datfile
     save_RC_h5file
     load_shcfile
@@ -109,5 +110,113 @@ def traverse_struct(struct):
     return output
 
 
+def load_matfile_h5py(filepath, **kwargs):
+    """
+    Load CHAOS MAT-file with h5py and return dictionary.
+
+    Alternative to :func:`load_matfile` that reads the file directly with the
+    :mod:`h5py` package. The opened file is traversed and stored in a nested
+    dictionary of numpy arrays; entries that were saved as MATLAB "cell"
+    objects are then resolved manually. Arrays are squeezed if possible.
+
+    Note that the manual overrides are specific to CHAOS model files
+    (``model_Euler``, ``params`` and ``pp_CAL``), so this function does not
+    work for general MAT-files.
+
+    Parameters
+    ----------
+    filepath : str
+        Filepath and name of MAT-file.
+    **kwargs : keywords
+        Additional keyword arguments are passed to :class:`h5py.File`. The
+        file is opened read-only unless ``mode`` is given.
+
+    Returns
+    -------
+    data : dict
+        Dictionary containing the data as dictionaries or numpy arrays.
+
+    """
+
+    # join the character codes stored in a dataset into a string
+    def decode_chars(dataset):
+        return ''.join(chr(int(code)) for code in np.ravel(dataset[()]))
+
+    # define a recursively called function to traverse structure
+    def traverse_struct(struct):
+
+        # for h5py group objects, iterate through keys
+        if isinstance(struct, h5py.Group):
+            return {key: traverse_struct(value)
+                    for key, value in struct.items()}
+
+        # everything that is not a dataset (e.g. a value read from a
+        # dataset) is returned unchanged
+        if not isinstance(struct, h5py.Dataset):
+            return struct
+
+        # if there are fields, iterate through the dtype names
+        names = struct.dtype.names
+        if names is not None:
+            return {name: traverse_struct(struct[name]) for name in names}
+
+        # unwrap single object entries
+        if struct.dtype == np.dtype('O') and struct.shape == (1, 1):
+            return traverse_struct(struct[0, 0])
+
+        # reverse the conversion of strings to uint16 character codes;
+        # problematic if other uint16 data appears, but presumably not the
+        # case in CHAOS files
+        if struct.dtype == 'uint16':
+            return decode_chars(struct)
+
+        # undo transposition of arrays with at least two non-singleton
+        # leading dimensions
+        shape = struct.shape
+        if len(shape) >= 2 and shape[0] >= 2 and shape[1] >= 2:
+            return np.squeeze(struct).T
+
+        return np.squeeze(struct)
+
+    # open read-only by default, the file is only read from
+    kwargs.setdefault('mode', 'r')
+
+    # the context manager closes the file, also if an error occurs
+    with h5py.File(filepath, **kwargs) as f:
+
+        # go through top-level keys and call traverse_struct
+        output = dict()
+        for key, value in f.items():
+            # skip private keys and the reference key
+            if key.startswith('__') and key.endswith('__'):
+                continue
+            if key == '#refs#':
+                continue
+            output[key] = traverse_struct(value)
+
+        # manually override "cell" objects, begin with "model_Euler"
+        for key in output['model_Euler']:
+            refs = np.ravel(f['model_Euler'][key][()])
+
+            # fill element by element: np.array would merge equally shaped
+            # arrays into one multi-dimensional array instead
+            cell = np.empty(len(refs), dtype=object)
+            for i, ref in enumerate(refs):
+                cell[i] = f[ref][()].T  # read array from reference
+
+            output['model_Euler'][key] = cell
+
+        # decode satellite names in "params" (from unsigned integer to
+        # character) and convert the result to a tuple
+        refs = np.ravel(f['params']['satellites'][()])
+        output['params']['satellites'] = tuple(
+            decode_chars(f[ref]) for ref in refs)
+
+        # unpack referenced "cell" object in pp_CAL
+        output['pp_CAL'] = traverse_struct(f[output['pp_CAL']])
+
+    return output
+
+
 def load_RC_datfile(filepath=None, parse_dates=None):
     """