From c936b8bc3b2af700945e0632f7509af987f1f5ec Mon Sep 17 00:00:00 2001 From: jrudz Date: Mon, 2 Jun 2025 22:22:09 +0200 Subject: [PATCH 1/7] avoid double calc of bead_groups, fix randomization for msd --- .../molecular_dynamics.py | 107 +++++++++++------- 1 file changed, 68 insertions(+), 39 deletions(-) diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index 18f3d6a..66501bb 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -29,6 +29,7 @@ from MDAnalysis.core.universe import Universe import MDAnalysis.analysis.rdf as MDA_RDF from MDAnalysis.core._get_readers import get_reader_for +from functools import cached_property from nomad.datamodel.data import ArchiveSection from nomad.metainfo import ( @@ -581,6 +582,7 @@ def _get_molecular_bead_groups( def calc_molecular_rdf( universe: MDAnalysis.Universe, + bead_groups: dict[str, BeadGroup], n_traj_split: int = 10, n_prune: int = 1, interval_indices=None, @@ -590,8 +592,20 @@ def calc_molecular_rdf( Calculates the radial distribution functions between for each unique pair of molecule types as a function of their center of mass distance. - interval_indices: 2D array specifying the groups of the n_traj_split intervals to be averaged - max_mols: the maximum number of molecules per bead group for calculating the rdf, for efficiency purposes. + Parameters + ---------- + universe : MDAnalysis.Universe + The MDAnalysis universe object. + bead_groups : dict[str, BeadGroup] + Precomputed bead groups for the universe (use MolecularDynamicsResults.molecular_bead_groups). + n_traj_split : int + Number of intervals to split trajectory into for averaging. + n_prune : int + Pruning parameter for frames. + interval_indices : list or None + 2D array specifying the groups of the n_traj_split intervals to be averaged. + max_mols : int + Maximum number of molecules per bead group for calculating the rdf, for efficiency purposes. """ # TODO 5k default for max_mols was set after > 50k was giving problems. Should do further testing to see where the appropriate limit should be set. if ( @@ -623,10 +637,9 @@ def calc_molecular_rdf( if not interval_indices: interval_indices = [[i] for i in range(n_traj_split)] - bead_groups = _get_molecular_bead_groups(universe) if not bead_groups: return bead_groups - moltypes = [moltype for moltype in bead_groups.keys()] + moltypes = list(bead_groups.keys()) del_list = [ i_moltype for i_moltype, moltype in enumerate(moltypes) @@ -868,14 +881,22 @@ def correlate(start_frame): def calc_molecular_mean_squared_displacements( - universe: MDAnalysis.Universe, max_mols: int = 5000 + universe: MDAnalysis.Universe, + bead_groups: dict[str, BeadGroup], + max_mols: int = 5000, ) -> dict[str, Any]: """ Calculates the mean squared displacement for the center of mass of each molecule type. - max_mols: the maximum number of molecules per bead group for calculating the msd, for efficiency purposes. - 50k was tested and is very fast and does not seem to have any memory issues. + Parameters + ---------- + universe : MDAnalysis.Universe + The MDAnalysis universe object. + bead_groups : dict[str, BeadGroup] + Precomputed bead groups for the universe (use MolecularDynamicsResults.molecular_bead_groups). + max_mols : int + Maximum number of molecules per bead group for calculating the msd, for efficiency purposes. """ def parse_jumps( @@ -976,7 +997,6 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): return {} times = np.arange(n_frames) * dt - bead_groups = _get_molecular_bead_groups(universe) if bead_groups is {}: return bead_groups @@ -989,32 +1009,37 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): 'Calculating mean squared displacements for more than 50k molecules.' ' Expect long processing times!', ) - try: - # select max_mols nr. of rnd molecules from this moltype - moltype_indices = np.array( - [atom._ix for atom in bead_groups[moltype]._atoms] - ) - molnums = universe.atoms.molnums[moltype_indices] - molnum_types = np.unique(molnums) - molnum_types_rnd = np.sort( - np.random.choice(molnum_types, size=max_mols) - ) - atom_indices_rnd = np.concatenate( - [np.where(molnums == molnum)[0] for molnum in molnum_types_rnd] - ) - selection = ' '.join([str(i) for i in atom_indices_rnd]) - selection = f'index {selection}' - ags_moltype_rnd = universe.select_atoms(selection) - bead_groups[moltype] = BeadGroup(ags_moltype_rnd, compound='fragments') - LOGGER.warn( - 'Maximum number of molecules for calculating the msd has been reached.' - ' Will make a random selection for calculation.' - ) - except Exception: - LOGGER.warn( - 'Error in selecting random molecules for large group when calculating msd. Skipping this molecule type.' - ) - del_list.append(i_moltype) + try: + # select max_mols nr. of rnd molecules from this moltype + moltype_indices = np.array( + [atom._ix for atom in bead_groups[moltype]._atoms] + ) + molnums = universe.atoms.molnums[moltype_indices] + molnum_types = np.unique(molnums) + molnum_types_rnd = np.sort( + np.random.choice(molnum_types, size=max_mols) + ) + atom_indices_rnd = np.concatenate( + [ + moltype_indices[molnums == molnum] + for molnum in molnum_types_rnd + ] + ) + selection = ' '.join([str(i) for i in atom_indices_rnd]) + selection = f'index {selection}' + ags_moltype_rnd = universe.select_atoms(selection) + bead_groups[moltype] = BeadGroup( + ags_moltype_rnd, compound='fragments' + ) + LOGGER.warn( + 'Maximum number of molecules for calculating the msd has been reached.' + ' Will make a random selection for calculation.' + ) + except Exception: + LOGGER.warn( + 'Error in selecting random molecules for large group when calculating msd. Skipping this molecule type.' + ) + del_list.append(i_moltype) for index in sorted(del_list, reverse=True): del moltypes[index] @@ -2536,10 +2561,6 @@ class MolecularDynamicsResults(ThermodynamicsResults): sub_section=CorrelationFunction.m_def, repeats=True ) - radial_distribution_functions = SubSection( - sub_section=RadialDistributionFunction.m_def, repeats=True - ) - radius_of_gyration = SubSection(sub_section=RadiusOfGyration, repeats=True) mean_squared_displacements = SubSection( @@ -2550,9 +2571,12 @@ class MolecularDynamicsResults(ThermodynamicsResults): sub_section=FreeEnergyCalculations.m_def, repeats=True ) + # Removed @property and getattr-based universe and molecular_bead_groups + def normalize(self, archive, logger): super().normalize(archive, logger) + # Construct universe and bead_groups as local variables try: universe = archive_to_universe(archive) except Exception: @@ -2564,6 +2588,8 @@ def normalize(self, archive, logger): if universe is None: return + bead_groups = _get_molecular_bead_groups(universe) + # calculate molecular radial distribution functions if not self.radial_distribution_functions: n_traj_split = ( @@ -2586,6 +2612,7 @@ def normalize(self, archive, logger): n_prune = int(universe.trajectory.n_frames / len(archive.run[-1].system)) rdf_results = calc_molecular_rdf( universe, + bead_groups, n_traj_split=n_traj_split, n_prune=n_prune, interval_indices=interval_indices, @@ -2598,7 +2625,9 @@ def normalize(self, archive, logger): # calculate the molecular mean squared displacements if not self.mean_squared_displacements: - msd_results = calc_molecular_mean_squared_displacements(universe) + msd_results = calc_molecular_mean_squared_displacements( + universe, bead_groups + ) if msd_results: sec_msds = MeanSquaredDisplacement() sec_msds._msd_results = msd_results From 308be8a4c81b6628cea8406350c7436324df13d9 Mon Sep 17 00:00:00 2001 From: jrudz Date: Mon, 2 Jun 2025 22:30:53 +0200 Subject: [PATCH 2/7] removed unneccessary dep --- simulationworkflowschema/molecular_dynamics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index 66501bb..ac235a2 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -29,7 +29,6 @@ from MDAnalysis.core.universe import Universe import MDAnalysis.analysis.rdf as MDA_RDF from MDAnalysis.core._get_readers import get_reader_for -from functools import cached_property from nomad.datamodel.data import ArchiveSection from nomad.metainfo import ( From e96aeea8dcb4e54bbd276114e1cbdc165fe253c3 Mon Sep 17 00:00:00 2001 From: jrudz Date: Mon, 2 Jun 2025 22:33:45 +0200 Subject: [PATCH 3/7] remove comments and update docstrs --- simulationworkflowschema/molecular_dynamics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index ac235a2..db3202c 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -596,7 +596,7 @@ def calc_molecular_rdf( universe : MDAnalysis.Universe The MDAnalysis universe object. bead_groups : dict[str, BeadGroup] - Precomputed bead groups for the universe (use MolecularDynamicsResults.molecular_bead_groups). + Precomputed bead groups for the universe. n_traj_split : int Number of intervals to split trajectory into for averaging. n_prune : int @@ -893,7 +893,7 @@ def calc_molecular_mean_squared_displacements( universe : MDAnalysis.Universe The MDAnalysis universe object. bead_groups : dict[str, BeadGroup] - Precomputed bead groups for the universe (use MolecularDynamicsResults.molecular_bead_groups). + Precomputed bead groups for the universe. max_mols : int Maximum number of molecules per bead group for calculating the msd, for efficiency purposes. """ @@ -2570,12 +2570,9 @@ class MolecularDynamicsResults(ThermodynamicsResults): sub_section=FreeEnergyCalculations.m_def, repeats=True ) - # Removed @property and getattr-based universe and molecular_bead_groups - def normalize(self, archive, logger): super().normalize(archive, logger) - # Construct universe and bead_groups as local variables try: universe = archive_to_universe(archive) except Exception: From 3403807f229bf6d8c5273732537d72a01d4da3bb Mon Sep 17 00:00:00 2001 From: jrudz Date: Tue, 3 Jun 2025 09:36:24 +0200 Subject: [PATCH 4/7] updated logging --- simulationworkflowschema/molecular_dynamics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index db3202c..4ef6d75 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -208,13 +208,13 @@ def create_empty_universe( n_segments = 0 if atom_resindex is None: - LOGGER.warn( + LOGGER.warning( 'Residues specified but no atom_resindex given. ' 'All atoms will be placed in first Residue.', ) if residue_segindex is None: - LOGGER.warn( + LOGGER.warning( 'Segments specified but no segment_resindex given. ' 'All residues will be placed in first Segment', ) @@ -842,7 +842,7 @@ def shifted_correlation_average( >>> indices, data = shifted_correlation(msd, coords) """ if window + skip >= 1: - LOGGER.warn( + LOGGER.warning( 'Invalid parameters for shifted_correlation(), resetting to defaults.', ) window = 0.5 @@ -981,7 +981,7 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): n_frames = universe.trajectory.n_frames if n_frames < 50: - LOGGER.warn( + LOGGER.warning( 'At least 50 frames required to calculate molecular' ' mean squared displacements, skipping.', ) @@ -989,7 +989,7 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): dt = getattr(universe.trajectory, 'dt') if dt is None: - LOGGER.warn( + LOGGER.warning( 'Universe is missing time step, cannot calculate molecular' ' mean squared displacements, skipping.', ) @@ -1004,7 +1004,7 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): for i_moltype, moltype in enumerate(moltypes): if len(bead_groups[moltype].positions) > max_mols: if max_mols > 50000: - LOGGER.warn( + LOGGER.warning( 'Calculating mean squared displacements for more than 50k molecules.' ' Expect long processing times!', ) @@ -1030,12 +1030,12 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): bead_groups[moltype] = BeadGroup( ags_moltype_rnd, compound='fragments' ) - LOGGER.warn( + LOGGER.warning( 'Maximum number of molecules for calculating the msd has been reached.' ' Will make a random selection for calculation.' ) except Exception: - LOGGER.warn( + LOGGER.warning( 'Error in selecting random molecules for large group when calculating msd. Skipping this molecule type.' ) del_list.append(i_moltype) @@ -2652,7 +2652,7 @@ def normalize(self, archive, logger): for rg in rg_results: n_frames = rg.get('n_frames') if len(sec_systems) != n_frames: - self.logger.warning( + logger.warning( 'Mismatch in length of system references in calculation and calculated Rg values.' 'Will not store Rg values under calculation section' ) From d5696c9fc77a9c2ed0c61f3466f9d81622727f60 Mon Sep 17 00:00:00 2001 From: jrudz Date: Tue, 3 Jun 2025 09:56:48 +0200 Subject: [PATCH 5/7] input validation --- .../molecular_dynamics.py | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index 4ef6d75..b406a39 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -561,6 +561,11 @@ def _get_molecular_bead_groups( """ Creates bead groups based on the molecular types as defined by the MDAnalysis universe. """ + # Input validation + if universe is None: + LOGGER.warning('universe is None. Cannot create bead groups.') + return {} + if not moltypes: atoms_moltypes = getattr(universe.atoms, 'moltypes', []) moltypes = np.unique(atoms_moltypes) @@ -607,11 +612,16 @@ def calc_molecular_rdf( Maximum number of molecules per bead group for calculating the rdf, for efficiency purposes. """ # TODO 5k default for max_mols was set after > 50k was giving problems. Should do further testing to see where the appropriate limit should be set. + if bead_groups is None or not bead_groups: + LOGGER.warning('bead_groups is None or empty. Cannot calculate RDF.') + return {} + if ( not universe or not universe.trajectory or universe.trajectory[0].dimensions is None ): + LOGGER.warning('universe is None. Cannot calculate RDF.') return {} n_frames = universe.trajectory.n_frames @@ -972,11 +982,16 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): vec = start - current return (vec**2).sum(axis=1).mean() + if bead_groups is None or not bead_groups: + LOGGER.warning('bead_groups is None or empty. Cannot calculate MSD.') + return {} + if ( not universe or not universe.trajectory or universe.trajectory[0].dimensions is None ): + LOGGER.warning('universe is None. Cannot calculate MSD.') return {} n_frames = universe.trajectory.n_frames @@ -1081,11 +1096,18 @@ def calc_radius_of_gyration( """ Calculates the radius of gyration as a function of time for the atoms 'molecule_atom_indices'. """ + if molecule_atom_indices is None or len(molecule_atom_indices) == 0: + LOGGER.warning( + 'molecule_atom_indices is None or empty. Cannot calculate radius of gyration.' + ) + return {} + if ( not universe or not universe.trajectory or universe.trajectory[0].dimensions is None ): + LOGGER.warning('universe is None. Cannot calculate radius of gyration.') return {} selection = ' '.join([str(i) for i in molecule_atom_indices]) selection = f'index {selection}' @@ -1119,7 +1141,15 @@ def calc_molecular_radius_of_gyration( """ Calculates the radius of gyration as a function of time for each polymer in the system. """ - if not system_topology: + if universe is None: + LOGGER.warning( + 'universe is None. Cannot calculate molecular radius of gyration.' + ) + return [] + if system_topology is None or not system_topology: + LOGGER.warning( + 'system_topology is None or empty. Cannot calculate molecular radius of gyration.' + ) return [] rg_results = [] From 29afa1b0b597acc577ed2921872c027c7e729341 Mon Sep 17 00:00:00 2001 From: jrudz Date: Tue, 3 Jun 2025 12:03:36 +0200 Subject: [PATCH 6/7] update bandstructure dependency version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 55cacf9..667cb61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ homepage = "https://github.com/nomad-coe/nomad-schema-plugin-simulation-workflow [tool.uv] dev-dependencies = [ - "nomad-normalizer-plugin-bandstructure", + "nomad-normalizer-plugin-bandstructure>=1.0.3", "nomad-normalizer-plugin-system", 'mypy>=1.15', 'pytest>= 5.3.0, <8', From bc80212d8575499f73e1003fcbca915e34b1c8a3 Mon Sep 17 00:00:00 2001 From: jrudz Date: Tue, 3 Jun 2025 12:16:38 +0200 Subject: [PATCH 7/7] review changes --- pyproject.toml | 2 +- .../molecular_dynamics.py | 78 +++++++++---------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 667cb61..b2947c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ homepage = "https://github.com/nomad-coe/nomad-schema-plugin-simulation-workflow [tool.uv] dev-dependencies = [ "nomad-normalizer-plugin-bandstructure>=1.0.3", - "nomad-normalizer-plugin-system", + "nomad-normalizer-plugin-system>=1.0.3", 'mypy>=1.15', 'pytest>= 5.3.0, <8', 'pytest-timeout>=1.4.2', diff --git a/simulationworkflowschema/molecular_dynamics.py b/simulationworkflowschema/molecular_dynamics.py index b406a39..627613c 100644 --- a/simulationworkflowschema/molecular_dynamics.py +++ b/simulationworkflowschema/molecular_dynamics.py @@ -563,7 +563,7 @@ def _get_molecular_bead_groups( """ # Input validation if universe is None: - LOGGER.warning('universe is None. Cannot create bead groups.') + LOGGER.warning('Universe required to create beads.') return {} if not moltypes: @@ -613,7 +613,7 @@ def calc_molecular_rdf( """ # TODO 5k default for max_mols was set after > 50k was giving problems. Should do further testing to see where the appropriate limit should be set. if bead_groups is None or not bead_groups: - LOGGER.warning('bead_groups is None or empty. Cannot calculate RDF.') + LOGGER.warning('bead_groups required to calculate RDF.') return {} if ( @@ -621,7 +621,7 @@ def calc_molecular_rdf( or not universe.trajectory or universe.trajectory[0].dimensions is None ): - LOGGER.warning('universe is None. Cannot calculate RDF.') + LOGGER.warning('universe required to calculate RDF.') return {} n_frames = universe.trajectory.n_frames @@ -983,7 +983,7 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): return (vec**2).sum(axis=1).mean() if bead_groups is None or not bead_groups: - LOGGER.warning('bead_groups is None or empty. Cannot calculate MSD.') + LOGGER.warning('bead_groups required to calculate MSD.') return {} if ( @@ -991,7 +991,7 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): or not universe.trajectory or universe.trajectory[0].dimensions is None ): - LOGGER.warning('universe is None. Cannot calculate MSD.') + LOGGER.warning('universe required to calculate MSD.') return {} n_frames = universe.trajectory.n_frames @@ -1023,37 +1023,32 @@ def mean_squared_displacement(start: np.ndarray, current: np.ndarray): 'Calculating mean squared displacements for more than 50k molecules.' ' Expect long processing times!', ) - try: - # select max_mols nr. of rnd molecules from this moltype - moltype_indices = np.array( - [atom._ix for atom in bead_groups[moltype]._atoms] - ) - molnums = universe.atoms.molnums[moltype_indices] - molnum_types = np.unique(molnums) - molnum_types_rnd = np.sort( - np.random.choice(molnum_types, size=max_mols) - ) - atom_indices_rnd = np.concatenate( - [ - moltype_indices[molnums == molnum] - for molnum in molnum_types_rnd - ] - ) - selection = ' '.join([str(i) for i in atom_indices_rnd]) - selection = f'index {selection}' - ags_moltype_rnd = universe.select_atoms(selection) - bead_groups[moltype] = BeadGroup( - ags_moltype_rnd, compound='fragments' - ) - LOGGER.warning( - 'Maximum number of molecules for calculating the msd has been reached.' - ' Will make a random selection for calculation.' - ) - except Exception: - LOGGER.warning( - 'Error in selecting random molecules for large group when calculating msd. Skipping this molecule type.' - ) - del_list.append(i_moltype) + try: + # select max_mols nr. of rnd molecules from this moltype + moltype_indices = np.array( + [atom._ix for atom in bead_groups[moltype]._atoms] + ) + molnums = universe.atoms.molnums[moltype_indices] + molnum_types = np.unique(molnums) + molnum_types_rnd = np.sort( + np.random.choice(molnum_types, size=max_mols) + ) + atom_indices_rnd = np.concatenate( + [moltype_indices[molnums == molnum] for molnum in molnum_types_rnd] + ) + selection = ' '.join([str(i) for i in atom_indices_rnd]) + selection = f'index {selection}' + ags_moltype_rnd = universe.select_atoms(selection) + bead_groups[moltype] = BeadGroup(ags_moltype_rnd, compound='fragments') + LOGGER.warning( + 'Maximum number of molecules for calculating the msd has been reached.' + ' Will make a random selection for calculation.' + ) + except Exception: + LOGGER.warning( + 'Error in selecting random molecules for large group when calculating msd. Skipping this molecule type.' + ) + del_list.append(i_moltype) for index in sorted(del_list, reverse=True): del moltypes[index] @@ -1095,10 +1090,13 @@ def calc_radius_of_gyration( ) -> dict[str, Any]: """ Calculates the radius of gyration as a function of time for the atoms 'molecule_atom_indices'. + + molecule_atom_indices : np.ndarray + The indices of the atoms corresponding to a single molecule for which the Rg will be calculated. """ if molecule_atom_indices is None or len(molecule_atom_indices) == 0: LOGGER.warning( - 'molecule_atom_indices is None or empty. Cannot calculate radius of gyration.' + 'molecule_atom_indices is required to calculate radius of gyration' ) return {} @@ -1142,13 +1140,11 @@ def calc_molecular_radius_of_gyration( Calculates the radius of gyration as a function of time for each polymer in the system. """ if universe is None: - LOGGER.warning( - 'universe is None. Cannot calculate molecular radius of gyration.' - ) + LOGGER.warning('universe required to calculate molecular radius of gyration') return [] if system_topology is None or not system_topology: LOGGER.warning( - 'system_topology is None or empty. Cannot calculate molecular radius of gyration.' + 'system_topology require to calculate molecular radius of gyration.' ) return []