From b6ddf53e625a502f558f7d2be80677c4b368ac26 Mon Sep 17 00:00:00 2001 From: Deniz Dogan Date: Mon, 10 Apr 2023 16:20:06 +0300 Subject: [PATCH 1/2] add dccm and lmi analyzers --- pyinteraph/core/__init__.py | 11 ++++ .../core/validate_parser_file_extension.py | 16 +++++ pyinteraph/correlation/__init__.py | 0 .../correlation/base_correlation_analyzer.py | 42 +++++++++++++ pyinteraph/correlation/dccm.py | 8 +++ pyinteraph/correlation/lmi.py | 12 ++++ .../correlation/residue_coordinate_matrix.py | 53 +++++++++++++++++ pyinteraph/correlation_analysis.py | 59 +++++++++++++++++++ setup.py | 11 ++-- 9 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 pyinteraph/core/__init__.py create mode 100644 pyinteraph/core/validate_parser_file_extension.py create mode 100644 pyinteraph/correlation/__init__.py create mode 100644 pyinteraph/correlation/base_correlation_analyzer.py create mode 100644 pyinteraph/correlation/dccm.py create mode 100644 pyinteraph/correlation/lmi.py create mode 100644 pyinteraph/correlation/residue_coordinate_matrix.py create mode 100644 pyinteraph/correlation_analysis.py diff --git a/pyinteraph/core/__init__.py b/pyinteraph/core/__init__.py new file mode 100644 index 0000000..bff5650 --- /dev/null +++ b/pyinteraph/core/__init__.py @@ -0,0 +1,11 @@ +import logging +import sys + +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="[%(asctime)s] %(levelname)s | %(message)s", + datefmt="%Y/%m/%d %H:%M:%S" +) + +logger = logging.getLogger(__name__) diff --git a/pyinteraph/core/validate_parser_file_extension.py b/pyinteraph/core/validate_parser_file_extension.py new file mode 100644 index 0000000..fe3c695 --- /dev/null +++ b/pyinteraph/core/validate_parser_file_extension.py @@ -0,0 +1,16 @@ +import argparse +import os + + +class ArgumentParserFileExtensionValidation(argparse.FileType): + parser = argparse.ArgumentParser() + + def __init__(self, valid_extensions, file_name): + self.valid_extensions = valid_extensions + self.file_name = file_name + + def validate_file_extension(self): + given_extension = os.path.splitext(self.file_name)[1][1:] + if given_extension not in self.valid_extensions: + self.parser.error(f"Invalid file extension. Please provide a {self.valid_extensions} file") + return self.file_name diff --git a/pyinteraph/correlation/__init__.py b/pyinteraph/correlation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyinteraph/correlation/base_correlation_analyzer.py b/pyinteraph/correlation/base_correlation_analyzer.py new file mode 100644 index 0000000..22c6bfe --- /dev/null +++ b/pyinteraph/correlation/base_correlation_analyzer.py @@ -0,0 +1,42 @@ +import abc + +import numpy as np +import itertools + +from pyinteraph.core import logger +from pyinteraph.correlation.residue_coordinate_matrix import ResidueCoordinateMatrix + + +class BaseCorrelationAnalyzer(abc.ABC): + + def __init__(self, residue_coordinate_matrix: ResidueCoordinateMatrix, threshold: float = 0) -> None: + self.threshold = threshold + self.residue_coordinate_matrix = residue_coordinate_matrix + self.n_residues = self.residue_coordinate_matrix.n_residues + self.coordinates_by_residue = self.residue_coordinate_matrix.coordinates_by_residue + self.residues_with_atom_number = self.residue_coordinate_matrix.residues_with_atom_number + + def run(self) -> np.ndarray: + residue_number_combinations = itertools.combinations_with_replacement(range(0, self.n_residues), 2) + correlation_matrix = np.zeros(shape=(self.n_residues, self.n_residues)) + for i, j in residue_number_combinations: + correlation_matrix[i, j] = correlation_matrix[j, i] = self._compute_analyzer( + self.coordinates_by_residue[i, :], self.coordinates_by_residue[j, :] + ) + return correlation_matrix + + @abc.abstractmethod + def _compute_analyzer(self, residue_i_coords: np.ndarray, residue_j_coords: np.ndarray) -> float: + raise NotImplementedError() + + @property + def filtered_correlation_matrix(self): + correlation_matrix = self.run() + + if self.threshold == 0: + return correlation_matrix + return np.where(correlation_matrix > self.threshold, correlation_matrix, 0) + + def to_csv(self, file_name="correlation") -> None: + np.savetxt(f"{file_name}.csv", self.filtered_correlation_matrix, comments='', delimiter=',', + header=','.join(self.residues_with_atom_number.keys())) diff --git a/pyinteraph/correlation/dccm.py b/pyinteraph/correlation/dccm.py new file mode 100644 index 0000000..54aee62 --- /dev/null +++ b/pyinteraph/correlation/dccm.py @@ -0,0 +1,8 @@ +import numpy as np + +from pyinteraph.correlation.base_correlation_analyzer import BaseCorrelationAnalyzer + + +class DCCMAnalyzer(BaseCorrelationAnalyzer): + def _compute_analyzer(self, residue_i_coords, residue_j_coords): + return np.corrcoef(residue_i_coords, residue_j_coords)[1][0] diff --git a/pyinteraph/correlation/lmi.py b/pyinteraph/correlation/lmi.py new file mode 100644 index 0000000..c52c09f --- /dev/null +++ b/pyinteraph/correlation/lmi.py @@ -0,0 +1,12 @@ +import numpy as np + +from pyinteraph.core import logger +from pyinteraph.correlation.base_correlation_analyzer import BaseCorrelationAnalyzer + + +class LMIAnalyzer(BaseCorrelationAnalyzer): + def _compute_analyzer(self, residue_i_coords, residue_j_coords): + pearson_coefficient = np.corrcoef(residue_i_coords, residue_j_coords)[1][0] + mutual_info = - 3/2 * np.log(1 - pearson_coefficient**2) + linear_mutual_info = 1 - (np.exp(-2 * (mutual_info/3))) + return linear_mutual_info diff --git a/pyinteraph/correlation/residue_coordinate_matrix.py b/pyinteraph/correlation/residue_coordinate_matrix.py new file mode 100644 index 0000000..0e17a21 --- /dev/null +++ b/pyinteraph/correlation/residue_coordinate_matrix.py @@ -0,0 +1,53 @@ +from collections import OrderedDict +import functools + +import numpy as np +import MDAnalysis as mda + +from pyinteraph.core import logger + + +class ResidueCoordinateMatrix: + def __init__(self, ref: str, traj: str, atoms: str, backbone: bool, keep_3d_coordinates: bool) -> None: + self.ref = ref + self.traj = traj + self.selected_atoms = "backbone" if backbone else f"name {atoms.replace(',', ' ')}" + self.keep_3d_coordinates = keep_3d_coordinates + self.coordinates_by_residue_matrix_shape = (self.n_residues, self.n_frames, 3) if keep_3d_coordinates else (self.n_residues, self.n_frames) + + @property + @functools.lru_cache() + def trajectory(self): + return mda.Universe(self.ref, self.traj) + + @property + @functools.lru_cache() + def n_frames(self): + return self.trajectory.trajectory.n_frames + + @property + @functools.lru_cache() + def n_residues(self): + return self.trajectory.residues.residues.n_residues + + @property + @functools.lru_cache() + def residues_with_atom_number(self): + residues = OrderedDict() + for res in self.trajectory.residues: + residues[f"{res.resnum}{res.resname}"] = res.atoms.select_atoms(self.selected_atoms).atoms.ix_array + return residues + + @property + @functools.lru_cache() + def coordinates_by_residue(self) -> np.ndarray: + traj_by_res = np.zeros(shape=self.coordinates_by_residue_matrix_shape) + for i, traj in enumerate(self.trajectory.trajectory): + for res_num in range(0, self.n_residues): + traj_by_res[res_num][traj.frame] = self.get_atomic_positions_by_geometric_center(traj, res_num) + return traj_by_res + + def get_atomic_positions_by_geometric_center(self, traj, res_num): + if not self.keep_3d_coordinates: + return np.mean(traj.positions[list(self.residues_with_atom_number.values())[res_num]]) + return np.mean(traj.positions[list(self.residues_with_atom_number.values())[res_num]], axis=0) diff --git a/pyinteraph/correlation_analysis.py b/pyinteraph/correlation_analysis.py new file mode 100644 index 0000000..1f75f15 --- /dev/null +++ b/pyinteraph/correlation_analysis.py @@ -0,0 +1,59 @@ +import argparse +import warnings + +from pyinteraph.core import logger +from pyinteraph.correlation.dccm import DCCMAnalyzer +from pyinteraph.correlation.lmi import LMIAnalyzer +from pyinteraph.correlation.residue_coordinate_matrix import ResidueCoordinateMatrix +from pyinteraph.core.validate_parser_file_extension import ArgumentParserFileExtensionValidation + +warnings.filterwarnings("ignore") + +def run_dccm(args, backbone): + residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone, keep_3d_coordinates=False) + DCCMAnalyzer(residue_coordinate_matrix, threshold=args.threshold).to_csv(file_name="dccm") + + +def run_lmi(args, backbone): + residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone, keep_3d_coordinates=False) + LMIAnalyzer(residue_coordinate_matrix, threshold=args.threshold).to_csv(file_name="lmi") + + +def main(): + parser = argparse.ArgumentParser(description="Script to run correlation analysis using DCCM or LMI") + subparsers = parser.add_subparsers() + + common_args = argparse.ArgumentParser(add_help=False) + common_args.add_argument('--atoms', type=str, required=False) + common_args.add_argument('--backbone', action='store_true', required=False) + common_args.add_argument("--ref", help="Reference file", + type=lambda file_name: ArgumentParserFileExtensionValidation( + (".pdb, .gro, .psf, .top, .crd"), file_name).validate_file_extension(), + required=True) + common_args.add_argument("--traj", help="a trajectory file", + type=lambda file_name: ArgumentParserFileExtensionValidation( + (".trj, .pdb, .xtc, .dcd"), file_name).validate_file_extension(), + required=True) + common_args.add_argument("--threshold", type=float, default=0, help="Threshold for the correlation analysis") + + dccm_parser = subparsers.add_parser("dccm", help="Run DCCM correlation analysis", parents=[common_args]) + dccm_parser.set_defaults(func=run_dccm) + + lmi_parser = subparsers.add_parser("lmi", help="Run LMI correlation analysis", parents=[common_args]) + lmi_parser.set_defaults(func=run_lmi) + + args = parser.parse_args() + + if not hasattr(args, 'func'): + parser.print_help() + return + + backbone = False + if (args.atoms and args.backbone) or not args.atoms: + logger.warning(f"Backbone atoms will be utilized to compute dccm.") + backbone = True + args.func(args, backbone) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index c11d09b..14dd3b8 100644 --- a/setup.py +++ b/setup.py @@ -35,18 +35,21 @@ "ff_masses/*"]} package_dir = {"libinteract" : "libinteract", - "pyinteraph" : "pyinteraph"} + "pyinteraph" : "pyinteraph", + "core": "pyinteraph.core", + "correlation": "pyinteraph.correlation"} -packages = ["libinteract", "pyinteraph"] +packages = ["libinteract", "pyinteraph", "pyinteraph.core", "pyinteraph.correlation"] -entry_points = {"console_scripts" : [\ +entry_points = {"console_scripts" : [ "pyinteraph = pyinteraph.main:main", "graph_analysis = pyinteraph.graph_analysis:main", "filter_graph = pyinteraph.filter_graph:main", "parse_masses = pyinteraph.parse_masses:main", "path_analysis = pyinteraph.path_analysis:main", "centrality_analysis = pyinteraph.centrality_analysis:main", - "dat2graphml = pyinteraph.dat2graphml:main"]} + "dat2graphml = pyinteraph.dat2graphml:main", + "correlation = pyinteraph.correlation_analysis:main"]} install_requires = ["cython", "biopython", From 7d6ebcd172a762903db6e1b76019f632cae00681 Mon Sep 17 00:00:00 2001 From: Deniz Dogan Date: Thu, 13 Apr 2023 20:37:27 +0300 Subject: [PATCH 2/2] rm keep_3d_coordinates --- pyinteraph/correlation/residue_coordinate_matrix.py | 10 +++------- pyinteraph/correlation_analysis.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pyinteraph/correlation/residue_coordinate_matrix.py b/pyinteraph/correlation/residue_coordinate_matrix.py index 0e17a21..21aefed 100644 --- a/pyinteraph/correlation/residue_coordinate_matrix.py +++ b/pyinteraph/correlation/residue_coordinate_matrix.py @@ -8,12 +8,10 @@ class ResidueCoordinateMatrix: - def __init__(self, ref: str, traj: str, atoms: str, backbone: bool, keep_3d_coordinates: bool) -> None: + def __init__(self, ref: str, traj: str, atoms: str, backbone: bool) -> None: self.ref = ref self.traj = traj self.selected_atoms = "backbone" if backbone else f"name {atoms.replace(',', ' ')}" - self.keep_3d_coordinates = keep_3d_coordinates - self.coordinates_by_residue_matrix_shape = (self.n_residues, self.n_frames, 3) if keep_3d_coordinates else (self.n_residues, self.n_frames) @property @functools.lru_cache() @@ -41,13 +39,11 @@ def residues_with_atom_number(self): @property @functools.lru_cache() def coordinates_by_residue(self) -> np.ndarray: - traj_by_res = np.zeros(shape=self.coordinates_by_residue_matrix_shape) + traj_by_res = np.zeros(shape=(self.n_residues, self.n_frames)) for i, traj in enumerate(self.trajectory.trajectory): for res_num in range(0, self.n_residues): traj_by_res[res_num][traj.frame] = self.get_atomic_positions_by_geometric_center(traj, res_num) return traj_by_res def get_atomic_positions_by_geometric_center(self, traj, res_num): - if not self.keep_3d_coordinates: - return np.mean(traj.positions[list(self.residues_with_atom_number.values())[res_num]]) - return np.mean(traj.positions[list(self.residues_with_atom_number.values())[res_num]], axis=0) + return np.mean(traj.positions[list(self.residues_with_atom_number.values())[res_num]]) diff --git a/pyinteraph/correlation_analysis.py b/pyinteraph/correlation_analysis.py index 1f75f15..6b0ecfb 100644 --- a/pyinteraph/correlation_analysis.py +++ b/pyinteraph/correlation_analysis.py @@ -10,12 +10,12 @@ warnings.filterwarnings("ignore") def run_dccm(args, backbone): - residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone, keep_3d_coordinates=False) + residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone) DCCMAnalyzer(residue_coordinate_matrix, threshold=args.threshold).to_csv(file_name="dccm") def run_lmi(args, backbone): - residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone, keep_3d_coordinates=False) + residue_coordinate_matrix = ResidueCoordinateMatrix(args.ref, args.traj, args.atoms, backbone) LMIAnalyzer(residue_coordinate_matrix, threshold=args.threshold).to_csv(file_name="lmi")