From acd487e25d2b2db8a835122f95c1ad2507dd4e01 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 23 Feb 2025 20:04:27 +0100 Subject: [PATCH 01/17] datamodule: store num_of_labels to hparms --- chebai/preprocessing/datasets/base.py | 17 +++++++++++++++++ chebai/preprocessing/datasets/tox21.py | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 6158b9dc..bcba6b99 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -401,6 +401,21 @@ def setup(self, **kwargs): if not ("keep_reader" in kwargs and kwargs["keep_reader"]): self.reader.on_finish() + self._add_num_of_labels_to_hparams() + + def _add_num_of_labels_to_hparams(self): + num_of_labels = len( + torch.load( + os.path.join( + self.processed_dir, self.processed_file_names_dict["data"] + ), + weights_only=False, + )[0]["labels"] + ) + + print(f"Number of labels for loaded data: {num_of_labels}") + self.hparams.num_of_labels = num_of_labels + def setup_processed(self): """ Setup the processed data. @@ -541,6 +556,8 @@ def setup(self, **kwargs): for s in self.subsets: s.setup(**kwargs) + self._add_num_of_labels_to_hparams() + def dataloader(self, kind: str, **kwargs) -> DataLoader: """ Creates a DataLoader for a specific subset. diff --git a/chebai/preprocessing/datasets/tox21.py b/chebai/preprocessing/datasets/tox21.py index 4bdfbdee..9a1397cd 100644 --- a/chebai/preprocessing/datasets/tox21.py +++ b/chebai/preprocessing/datasets/tox21.py @@ -129,6 +129,8 @@ def setup(self, **kwargs) -> None: ): self.setup_processed() + self._add_num_of_labels_to_hparams() + def _load_data_from_file(self, input_file_path: str) -> List[Dict]: """Loads data from a CSV file. 
@@ -311,6 +313,8 @@ def setup(self, **kwargs) -> None: ): self.setup_processed() + self._add_num_of_labels_to_hparams() + def _load_dict(self, input_file_path: str) -> Generator[Dict, None, None]: """Loads data from a CSV file as a generator. From 9a48fdf11aab7ef7afae65ab1704619e673cb306 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 23 Feb 2025 20:06:45 +0100 Subject: [PATCH 02/17] lightning module: retrieve num_of_labels from data module --- chebai/cli.py | 18 +++++++++--------- chebai/models/base.py | 11 +++++++++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/chebai/cli.py b/chebai/cli.py index f2ad1072..155b79e9 100644 --- a/chebai/cli.py +++ b/chebai/cli.py @@ -38,15 +38,15 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser): Args: parser (LightningArgumentParser): Argument parser instance. """ - for kind in ("train", "val", "test"): - for average in ("micro-f1", "macro-f1", "balanced-accuracy"): - parser.link_arguments( - "model.init_args.out_dim", - f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels", - ) - parser.link_arguments( - "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels" - ) + # for kind in ("train", "val", "test"): + # for average in ("micro-f1", "macro-f1", "balanced-accuracy"): + # parser.link_arguments( + # "model.init_args.out_dim", + # f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels", + # ) + # parser.link_arguments( + # "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels" + # ) parser.link_arguments( "data", "model.init_args.criterion.init_args.data_extractor" ) diff --git a/chebai/models/base.py b/chebai/models/base.py index 362731df..2ca52f67 100644 --- a/chebai/models/base.py +++ b/chebai/models/base.py @@ -35,7 +35,6 @@ class ChebaiBaseNet(LightningModule): def __init__( self, criterion: torch.nn.Module = None, - out_dim: Optional[int] = None, train_metrics: Optional[torch.nn.Module] = None, 
val_metrics: Optional[torch.nn.Module] = None, test_metrics: Optional[torch.nn.Module] = None, @@ -48,7 +47,7 @@ def __init__( self.save_hyperparameters( ignore=["criterion", "train_metrics", "val_metrics", "test_metrics"] ) - self.out_dim = out_dim + self.out_dim = None if optimizer_kwargs: self.optimizer_kwargs = optimizer_kwargs else: @@ -70,6 +69,14 @@ def __init_subclass__(cls, **kwargs): else: _MODEL_REGISTRY[cls.NAME] = cls + def setup(self, stage: str) -> None: + if self.trainer and hasattr(self.trainer, "datamodule"): + self.out_dim = int(self.trainer.datamodule.hparams.num_of_labels) + else: + raise ValueError("Trainer has no data module") + assert self.out_dim is not None, "Model output dimension is None" + print(f"Output Dimension for the model: {self.out_dim}") + def _get_prediction_and_labels( self, data: Dict[str, Any], labels: torch.Tensor, output: torch.Tensor ) -> (torch.Tensor, torch.Tensor): From 381452ecec6a94590f4ca85dc8f61906bde09d61 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 23 Feb 2025 20:07:11 +0100 Subject: [PATCH 03/17] fnn: move layer initializaton to setup method --- chebai/models/ffn.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/chebai/models/ffn.py b/chebai/models/ffn.py index cd32086e..4fd23d95 100644 --- a/chebai/models/ffn.py +++ b/chebai/models/ffn.py @@ -20,10 +20,15 @@ def __init__( **kwargs ): super().__init__(**kwargs) + self.input_size = input_size + self.hidden_layers = hidden_layers + + def setup(self, stage: str) -> None: + super().setup(stage) layers = [] - current_layer_input_size = input_size - for hidden_dim in hidden_layers: + current_layer_input_size = self.input_size + for hidden_dim in self.hidden_layers: layers.append(MLPBlock(current_layer_input_size, hidden_dim)) layers.append(Residual(MLPBlock(hidden_dim, hidden_dim))) current_layer_input_size = hidden_dim From 9e209609c6c02350f98c7525f2463bd2e194e512 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 27 Feb 2025 
00:02:19 +0100 Subject: [PATCH 04/17] modify logic to call prepare and setup during instantiation of data module - https://github.com/Lightning-AI/pytorch-lightning/discussions/20602#discussioncomment-12316606 --- chebai/cli.py | 29 ++++++--- chebai/models/base.py | 17 +++-- chebai/models/ffn.py | 10 +-- chebai/preprocessing/datasets/base.py | 62 +++++++++++++++---- chebai/preprocessing/datasets/chebi.py | 1 + .../datasets/deepGO/go_uniprot.py | 5 ++ .../datasets/deepGO/protein_pretraining.py | 5 ++ chebai/preprocessing/datasets/pubchem.py | 1 + chebai/preprocessing/datasets/tox21.py | 12 +++- 9 files changed, 101 insertions(+), 41 deletions(-) diff --git a/chebai/cli.py b/chebai/cli.py index 155b79e9..dd7a7c6a 100644 --- a/chebai/cli.py +++ b/chebai/cli.py @@ -38,15 +38,26 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser): Args: parser (LightningArgumentParser): Argument parser instance. """ - # for kind in ("train", "val", "test"): - # for average in ("micro-f1", "macro-f1", "balanced-accuracy"): - # parser.link_arguments( - # "model.init_args.out_dim", - # f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels", - # ) - # parser.link_arguments( - # "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels" - # ) + + parser.link_arguments( + "data.num_of_labels", "model.init_args.out_dim", apply_on="instantiate" + ) + parser.link_arguments( + "data.feature_vector_size", + "model.init_args.input_dim", + apply_on="instantiate", + ) + + for kind in ("train", "val", "test"): + for average in ("micro-f1", "macro-f1", "balanced-accuracy"): + parser.link_arguments( + "data.num_of_labels", + f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels", + apply_on="instantiate", + ) + parser.link_arguments( + "data.num_of_labels", "trainer.callbacks.init_args.num_labels" + ) parser.link_arguments( "data", "model.init_args.criterion.init_args.data_extractor" ) diff --git a/chebai/models/base.py 
b/chebai/models/base.py index 2ca52f67..3bbaaef5 100644 --- a/chebai/models/base.py +++ b/chebai/models/base.py @@ -35,6 +35,8 @@ class ChebaiBaseNet(LightningModule): def __init__( self, criterion: torch.nn.Module = None, + out_dim: Optional[int] = None, + input_dim: Optional[int] = None, train_metrics: Optional[torch.nn.Module] = None, val_metrics: Optional[torch.nn.Module] = None, test_metrics: Optional[torch.nn.Module] = None, @@ -47,7 +49,12 @@ def __init__( self.save_hyperparameters( ignore=["criterion", "train_metrics", "val_metrics", "test_metrics"] ) - self.out_dim = None + + self.out_dim = out_dim + self.input_dim = input_dim + assert out_dim is not None, "out_dim must be specified" + assert input_dim is not None, "input_dim must be specified" + if optimizer_kwargs: self.optimizer_kwargs = optimizer_kwargs else: @@ -69,14 +76,6 @@ def __init_subclass__(cls, **kwargs): else: _MODEL_REGISTRY[cls.NAME] = cls - def setup(self, stage: str) -> None: - if self.trainer and hasattr(self.trainer, "datamodule"): - self.out_dim = int(self.trainer.datamodule.hparams.num_of_labels) - else: - raise ValueError("Trainer has no data module") - assert self.out_dim is not None, "Model output dimension is None" - print(f"Output Dimension for the model: {self.out_dim}") - def _get_prediction_and_labels( self, data: Dict[str, Any], labels: torch.Tensor, output: torch.Tensor ) -> (torch.Tensor, torch.Tensor): diff --git a/chebai/models/ffn.py b/chebai/models/ffn.py index 4fd23d95..78df31f9 100644 --- a/chebai/models/ffn.py +++ b/chebai/models/ffn.py @@ -20,15 +20,9 @@ def __init__( **kwargs ): super().__init__(**kwargs) - self.input_size = input_size - self.hidden_layers = hidden_layers - - def setup(self, stage: str) -> None: - super().setup(stage) - layers = [] - current_layer_input_size = self.input_size - for hidden_dim in self.hidden_layers: + current_layer_input_size = input_size + for hidden_dim in hidden_layers: layers.append(MLPBlock(current_layer_input_size, 
hidden_dim)) layers.append(Residual(MLPBlock(hidden_dim, hidden_dim))) current_layer_input_size = hidden_dim diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index bcba6b99..f2b65d1a 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -117,6 +117,25 @@ def __init__( os.makedirs(os.path.join(self.processed_dir, self.fold_dir), exist_ok=True) self.save_hyperparameters() + self._num_of_labels = None + self._feature_vector_size = None + self._prepare_data_flag = 1 + self._setup_data_flag = 1 + self.prepare_data() + self.setup() + + @property + def num_of_labels(self): + assert self._num_of_labels is not None, "num of labels must be set" + return self._num_of_labels + + @property + def feature_vector_size(self): + assert ( + self._feature_vector_size is not None + ), "size of feature vector must be set" + return self._feature_vector_size + @property def identifier(self) -> tuple: """Identifier for the dataset.""" @@ -381,6 +400,12 @@ def predict_dataloader( """ return self.dataloader(self.prediction_kind, shuffle=False, **kwargs) + def prepare_data(self) -> None: + if self._prepare_data_flag != 1: + return + + self._prepare_data_flag += 1 + def setup(self, **kwargs): """ Setup the data module. @@ -390,6 +415,11 @@ def setup(self, **kwargs): Args: **kwargs: Additional keyword arguments. 
""" + if self._setup_data_flag != 1: + return + + self._setup_data_flag += 1 + rank_zero_info(f"Check for processed data in {self.processed_dir}") rank_zero_info(f"Cross-validation enabled: {self.use_inner_cross_validation}") if any( @@ -401,20 +431,20 @@ def setup(self, **kwargs): if not ("keep_reader" in kwargs and kwargs["keep_reader"]): self.reader.on_finish() - self._add_num_of_labels_to_hparams() + self._set_processed_data_props() - def _add_num_of_labels_to_hparams(self): - num_of_labels = len( - torch.load( - os.path.join( - self.processed_dir, self.processed_file_names_dict["data"] - ), - weights_only=False, - )[0]["labels"] - ) + def _set_processed_data_props(self): - print(f"Number of labels for loaded data: {num_of_labels}") - self.hparams.num_of_labels = num_of_labels + single_data_instance = torch.load( + os.path.join(self.processed_dir, self.processed_file_names_dict["data"]), + weights_only=False, + )[0] + + self._num_of_labels = len(single_data_instance["labels"]) + self._feature_vector_size = len(single_data_instance["features"]) + + print(f"Number of labels for loaded data: {self._num_of_labels}") + print(f"Feature vector size: {self._feature_vector_size}") def setup_processed(self): """ @@ -541,6 +571,7 @@ def prepare_data(self): """ Placeholder for data preparation logic. """ + super().prepare_data() for s in self.subsets: s.prepare_data() @@ -553,10 +584,14 @@ def setup(self, **kwargs): Args: **kwargs: Additional keyword arguments. 
""" + if self._setup_data_flag != 1: + return + + self._setup_data_flag += 1 for s in self.subsets: s.setup(**kwargs) - self._add_num_of_labels_to_hparams() + self._set_processed_data_props() def dataloader(self, kind: str, **kwargs) -> DataLoader: """ @@ -752,6 +787,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: Returns: None """ + super().prepare_data() print("Checking for processed data in", self.processed_dir_main) processed_name = self.processed_main_file_names_dict["data"] diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index d927a44c..d1c82a32 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -60,6 +60,7 @@ def raw_file_names(self): return ["test.pkl", "train.pkl", "validation.pkl"] def prepare_data(self, *args, **kwargs): + super().prepare_data() print("Check for raw data in", self.raw_dir) if any( not os.path.isfile(os.path.join(self.raw_dir, f)) diff --git a/chebai/preprocessing/datasets/deepGO/go_uniprot.py b/chebai/preprocessing/datasets/deepGO/go_uniprot.py index 1b0eb2aa..6a62d517 100644 --- a/chebai/preprocessing/datasets/deepGO/go_uniprot.py +++ b/chebai/preprocessing/datasets/deepGO/go_uniprot.py @@ -783,6 +783,11 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: Raises: FileNotFoundError: If the processed data file does not exist. 
""" + if self._prepare_data_flag != 1: + return + + self._prepare_data_flag += 1 + print("Checking for processed data in", self.processed_dir_main) processed_name = self.processed_main_file_names_dict["data"] diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py index 8f7e9c4d..a44d21bb 100644 --- a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py +++ b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py @@ -64,6 +64,11 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: *args: Additional positional arguments. **kwargs: Additional keyword arguments. """ + if self._prepare_data_flag != 1: + return + + self._prepare_data_flag += 1 + processed_name = self.processed_main_file_names_dict["data"] if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)): print("Missing processed data file (`data.pkl` file)") diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py index c82ea42f..09352326 100644 --- a/chebai/preprocessing/datasets/pubchem.py +++ b/chebai/preprocessing/datasets/pubchem.py @@ -183,6 +183,7 @@ def prepare_data(self, *args, **kwargs): """ Checks for raw data and downloads if necessary. 
""" + super().prepare_data() print("Check for raw data in", self.raw_dir) if any( not os.path.isfile(os.path.join(self.raw_dir, f)) diff --git a/chebai/preprocessing/datasets/tox21.py b/chebai/preprocessing/datasets/tox21.py index 9a1397cd..99c0b5bb 100644 --- a/chebai/preprocessing/datasets/tox21.py +++ b/chebai/preprocessing/datasets/tox21.py @@ -118,6 +118,10 @@ def setup_processed(self) -> None: def setup(self, **kwargs) -> None: """Sets up the dataset by downloading and processing if necessary.""" + if self._setup_data_flag != 1: + return + + self._setup_data_flag += 1 if any( not os.path.isfile(os.path.join(self.raw_dir, f)) for f in self.raw_file_names @@ -129,7 +133,7 @@ def setup(self, **kwargs) -> None: ): self.setup_processed() - self._add_num_of_labels_to_hparams() + self._set_processed_data_props() def _load_data_from_file(self, input_file_path: str) -> List[Dict]: """Loads data from a CSV file. @@ -302,6 +306,10 @@ def setup_processed(self) -> None: def setup(self, **kwargs) -> None: """Sets up the dataset by downloading and processing if necessary.""" + if self._setup_data_flag != 1: + return + + self._setup_data_flag += 1 if any( not os.path.isfile(os.path.join(self.raw_dir, f)) for f in self.raw_file_names @@ -313,7 +321,7 @@ def setup(self, **kwargs) -> None: ): self.setup_processed() - self._add_num_of_labels_to_hparams() + self._set_processed_data_props() def _load_dict(self, input_file_path: str) -> Generator[Dict, None, None]: """Loads data from a CSV file as a generator. 
From ed50c8d39f6fb9a8359118f8fac332826146eb31 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 27 Feb 2025 17:07:30 +0100 Subject: [PATCH 05/17] feature size to max of all features --- chebai/preprocessing/datasets/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index f2b65d1a..ab2b4e00 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -435,13 +435,13 @@ def setup(self, **kwargs): def _set_processed_data_props(self): - single_data_instance = torch.load( + data_pt = torch.load( os.path.join(self.processed_dir, self.processed_file_names_dict["data"]), weights_only=False, - )[0] + ) - self._num_of_labels = len(single_data_instance["labels"]) - self._feature_vector_size = len(single_data_instance["features"]) + self._num_of_labels = len(data_pt[0]["labels"]) + self._feature_vector_size = max(len(d["features"]) for d in data_pt) print(f"Number of labels for loaded data: {self._num_of_labels}") print(f"Feature vector size: {self._feature_vector_size}") From f7264da45adbb27f2876f30935f427c1522a2a9a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 29 Mar 2025 15:45:25 +0100 Subject: [PATCH 06/17] add skip param to skip data methods during class init - Skips data setup in the constructor; methods will be called later according to the CLI workflow. 
- This change enables to skip these methods during initialization for unit-testing GitHub CI/CD --- chebai/preprocessing/datasets/base.py | 6 ++++++ chebai/preprocessing/datasets/deepGO/protein_pretraining.py | 2 +- tests/unit/dataset_classes/testChEBIOverX.py | 4 +++- tests/unit/dataset_classes/testChebiDataExtractor.py | 2 +- tests/unit/dataset_classes/testChebiOverXPartial.py | 4 +++- tests/unit/dataset_classes/testDynamicDataset.py | 2 +- tests/unit/dataset_classes/testGOUniProDataExtractor.py | 2 +- tests/unit/dataset_classes/testGoUniProtOverX.py | 2 +- tests/unit/dataset_classes/testProteinPretrainingData.py | 2 +- tests/unit/dataset_classes/testTox21Challenge.py | 2 +- tests/unit/dataset_classes/testXYBaseDataModule.py | 1 + 11 files changed, 20 insertions(+), 9 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index edacfcc3..b4c95a1e 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -121,6 +121,12 @@ def __init__( self._feature_vector_size = None self._prepare_data_flag = 1 self._setup_data_flag = 1 + + # Skips data setup in the constructor; methods will be called later according to the CLI workflow. + if kwargs.get("_skip_data_methods_on_init", False): + # This change enables to skip these methods during initialization for unit-testing GitHub CI/CD + return + self.prepare_data() self.setup() diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py index a44d21bb..22ae3bc4 100644 --- a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py +++ b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py @@ -38,7 +38,7 @@ def __init__(self, **kwargs): Args: **kwargs: Additional arguments for the superclass initialization. 
""" - self._go_uniprot_extractor = GOUniProtOver250() + self._go_uniprot_extractor = GOUniProtOver250(_skip_data_methods_on_init=True) assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002)) diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py index 270b868c..22b327a7 100644 --- a/tests/unit/dataset_classes/testChEBIOverX.py +++ b/tests/unit/dataset_classes/testChEBIOverX.py @@ -19,7 +19,9 @@ def setUpClass(cls, mock_makedirs, mock_processed_dir_main: PropertyMock) -> Non mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path. """ mock_processed_dir_main.return_value = "/mock/processed_dir" - cls.chebi_extractor = ChEBIOverX(chebi_version=231) + cls.chebi_extractor = ChEBIOverX( + chebi_version=231, _skip_data_methods_on_init=True + ) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() @patch("builtins.open", new_callable=mock_open) diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py index 8da900da..2926d6cd 100644 --- a/tests/unit/dataset_classes/testChebiDataExtractor.py +++ b/tests/unit/dataset_classes/testChebiDataExtractor.py @@ -35,7 +35,7 @@ def setUpClass( # Create an instance of the dataset cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor( - chebi_version=231, chebi_version_train=200 + chebi_version=231, chebi_version_train=200, _skip_data_methods_on_init=True ) # Mock instance for _chebi_version_train_obj diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index 76584ebf..da329193 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -16,7 +16,9 @@ def setUpClass(cls, mock_makedirs) -> None: """ Set up the ChEBIOverXPartial instance with a mock 
processed directory path and a test graph. """ - cls.chebi_extractor = ChEBIOverXPartial(top_class_id=11111, chebi_version=231) + cls.chebi_extractor = ChEBIOverXPartial( + top_class_id=11111, chebi_version=231, _skip_data_methods_on_init=True + ) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() @patch( diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index c8846273..25467973 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -38,7 +38,7 @@ def setUpClass( _DynamicDataset.READER = ReaderMock # Creating an instance of the dataset - cls.dataset: _DynamicDataset = _DynamicDataset() + cls.dataset: _DynamicDataset = _DynamicDataset(_skip_data_methods_on_init=True) # Dataset with a balanced distribution of labels X = [ diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py index 96ff9a3a..6c37d17d 100644 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -35,7 +35,7 @@ def setUpClass( _GOUniProtDataExtractor.READER = ProteinDataReader - cls.extractor = _GOUniProtDataExtractor() + cls.extractor = _GOUniProtDataExtractor(_skip_data_methods_on_init=True) def test_term_callback(self) -> None: """ diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py index 3f329c56..fc4e59eb 100644 --- a/tests/unit/dataset_classes/testGoUniProtOverX.py +++ b/tests/unit/dataset_classes/testGoUniProtOverX.py @@ -17,7 +17,7 @@ def setUpClass(cls, mock_makedirs) -> None: """ Set up the class for tests by initializing the extractor, graph, and input DataFrame. 
""" - cls.extractor = _GOUniProtOverX() + cls.extractor = _GOUniProtOverX(_skip_data_methods_on_init=True) cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ :, :4 diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py index caac3eac..d5e22c52 100644 --- a/tests/unit/dataset_classes/testProteinPretrainingData.py +++ b/tests/unit/dataset_classes/testProteinPretrainingData.py @@ -37,7 +37,7 @@ def setUpClass( _ProteinPretrainingData.READER = ProteinDataReader # Initialize the extractor instance - cls.extractor = _ProteinPretrainingData() + cls.extractor = _ProteinPretrainingData(_skip_data_methods_on_init=True) @patch( "builtins.open", diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index 9ad2af21..aa22fb9a 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -24,7 +24,7 @@ def setUpClass(cls, mock_makedirs) -> None: This is run once for the test class. 
""" Tox21Challenge.READER = ChemDataReader - cls.tox21 = Tox21Challenge() + cls.tox21 = Tox21Challenge(_skip_data_methods_on_init=True) @patch("rdkit.Chem.SDMolSupplier") def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None: diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py index 64dfbe40..b9bed2d3 100644 --- a/tests/unit/dataset_classes/testXYBaseDataModule.py +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -30,6 +30,7 @@ def setUpClass(cls, mock_makedirs, mock_name_property: PropertyMock) -> None: cls.module = XYBaseDataModule( label_filter=1, # Provide a label_filter balance_after_filter=1.0, # Balance ratio + _skip_data_methods_on_init=True, ) def test_filter_labels_valid_index(self) -> None: From c73c62ab3b9086e994b740523ff369deda58c9a5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 30 Mar 2025 20:41:13 +0200 Subject: [PATCH 07/17] remove redundant `label_number` property - https://github.com/ChEB-AI/python-chebai/issues/47#issuecomment-2764257866 --- chebai/preprocessing/datasets/base.py | 19 ----------- chebai/preprocessing/datasets/chebi.py | 40 ------------------------ chebai/preprocessing/datasets/pubchem.py | 28 ----------------- chebai/preprocessing/datasets/tox21.py | 10 ------ 4 files changed, 97 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index b4c95a1e..7daabd57 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -525,18 +525,6 @@ def raw_file_names_dict(self) -> dict: """ raise NotImplementedError - @property - def label_number(self) -> int: - """ - Returns the number of labels. - - This property should be implemented by subclasses to provide the number of labels. - - Returns: - int: The number of labels. Returns -1 for seq2seq encoding. 
- """ - raise NotImplementedError - class MergedDataset(XYBaseDataModule): MERGED = [] @@ -673,13 +661,6 @@ def processed_file_names(self) -> List[str]: """ return ["test.pt", "train.pt", "validation.pt"] - @property - def label_number(self) -> int: - """ - Returns the number of labels from the first subset. - """ - return self.subsets[0].label_number - @property def limits(self): """ diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index d1c82a32..59b4f704 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -89,10 +89,6 @@ def setup_processed(self): os.path.join(self.processed_dir, f"{k}.pt"), ) - @property - def label_number(self): - return 500 - class JCIData(JCIBase): READER = dr.OrdReader @@ -546,10 +542,6 @@ def raw_file_names_dict(self) -> dict: class JCIExtendedBase(_ChEBIDataExtractor): - @property - def label_number(self): - return 500 - @property def _name(self): return "JCI_extended" @@ -574,16 +566,6 @@ class ChEBIOverX(_ChEBIDataExtractor): READER: dr.ChemDataReader = dr.ChemDataReader THRESHOLD: int = None - @property - def label_number(self) -> int: - """ - Returns the number of labels in the dataset. - - Returns: - int: The number of labels. - """ - return 854 - @property def _name(self) -> str: """ @@ -676,17 +658,6 @@ class ChEBIOver100(ChEBIOverX): THRESHOLD: int = 100 - def label_number(self) -> int: - """ - Returns the number of labels in the dataset. - - Overrides the base class method to return the correct number of labels for this threshold. - - Returns: - int: The number of labels. - """ - return 854 - class ChEBIOver50(ChEBIOverX): """ @@ -700,17 +671,6 @@ class ChEBIOver50(ChEBIOverX): THRESHOLD: int = 50 - def label_number(self) -> int: - """ - Returns the number of labels in the dataset. - - Overrides the base class method to return the correct number of labels for this threshold. - - Returns: - int: The number of labels. 
- """ - return 1332 - class ChEBIOver100DeepSMILES(ChEBIOverXDeepSMILES, ChEBIOver100): """ diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py index 721c6b30..74ae8f44 100644 --- a/chebai/preprocessing/datasets/pubchem.py +++ b/chebai/preprocessing/datasets/pubchem.py @@ -693,13 +693,6 @@ class PubchemChem(PubChem): READER: Type[dr.ChemDataReader] = dr.ChemDataReader - @property - def label_number(self) -> int: - """ - Returns the label number. - """ - return -1 - class PubchemBPE(PubChem): """ @@ -713,13 +706,6 @@ class PubchemBPE(PubChem): READER: Type[dr.ChemBPEReader] = dr.ChemBPEReader - @property - def label_number(self) -> int: - """ - Returns the label number. - """ - return -1 - class SWJChem(SWJPreChem): """ @@ -733,13 +719,6 @@ class SWJChem(SWJPreChem): READER: Type[dr.ChemDataUnlabeledReader] = dr.ChemDataUnlabeledReader - @property - def label_number(self) -> int: - """ - Returns the label number. - """ - return -1 - class SWJBPE(SWJPreChem): """ @@ -753,13 +732,6 @@ class SWJBPE(SWJPreChem): READER: Type[dr.ChemBPEReader] = dr.ChemBPEReader - @property - def label_number(self) -> int: - """ - Returns the label number. 
- """ - return -1 - class PubChemTokens(PubChem): """ diff --git a/chebai/preprocessing/datasets/tox21.py b/chebai/preprocessing/datasets/tox21.py index 99c0b5bb..95c60cdd 100644 --- a/chebai/preprocessing/datasets/tox21.py +++ b/chebai/preprocessing/datasets/tox21.py @@ -39,11 +39,6 @@ def _name(self) -> str: """Returns the name of the dataset.""" return "Tox21MN" - @property - def label_number(self) -> int: - """Returns the number of labels.""" - return 12 - @property def raw_file_names(self) -> List[str]: """Returns a list of raw file names.""" @@ -177,11 +172,6 @@ def _name(self) -> str: """Returns the name of the dataset.""" return "Tox21Chal" - @property - def label_number(self) -> int: - """Returns the number of labels.""" - return 12 - @property def raw_file_names(self) -> List[str]: """Returns a list of raw file names.""" From 335adbf99960581523ea5a24acf7ae81cdb5df36 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 3 Apr 2025 11:16:30 +0200 Subject: [PATCH 08/17] data_methods should only be called after complete instantiation - prepare_data and setup method should be called only after the most derived based class is full instantiated --- chebai/preprocessing/datasets/base.py | 56 +++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 7daabd57..643466a8 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import os import random from abc import ABC, abstractmethod -from typing import Any, Dict, Generator, List, Optional, Tuple, Union +from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union import lightning as pl import networkx as nx @@ -20,7 +22,40 @@ from chebai.preprocessing import reader as dr -class XYBaseDataModule(LightningDataModule): +class _InitMeta(type): + """ + Metaclass that ensures a specific method 
(`_call_data_processing_methods`) + is called after an instance (meaning the most derived class instance) is fully initialized. + + Purpose: + - Automatically calls `_call_data_processing_methods(**kwargs)` on instances + of classes that use this metaclass, if the method is defined. + - Ensures additional processing logic is executed immediately after object instantiation. + - Useful in cases where post-initialization processing is required across multiple subclasses. + """ + + def __call__( + cls: Type[XYBaseDataModule], *args: Any, **kwargs: Any + ) -> XYBaseDataModule: + """ + Overrides the instance creation process to call `_after_init` after the most derived class instance + is initialized. + + Args: + cls (Type[XYBaseDataModule]): The class being instantiated. + *args (Any): Positional arguments for the class constructor. + **kwargs (Any): Keyword arguments for the class constructor. + + Returns: + XYBaseDataModule: The initialized instance of the class. + """ + instance = super().__call__(*args, **kwargs) # Create the instance + if hasattr(instance, "_after_init"): + instance._after_init(**kwargs) # Call the method if defined + return instance + + +class XYBaseDataModule(LightningDataModule, metaclass=_InitMeta): """ Base class for data modules. @@ -122,9 +157,22 @@ def __init__( self._prepare_data_flag = 1 self._setup_data_flag = 1 - # Skips data setup in the constructor; methods will be called later according to the CLI workflow. + def _after_init(self, **kwargs): + """ + This method is called after the instantiation of most derived class is completed. + Refer the `_InitMeta` metaclass for more details. + """ + self._call_data_processing_methods(**kwargs) + + def _call_data_processing_methods(self, **kwargs) -> None: + """ + Calls data processing methods unless explicitly skipped. + + - Skips execution if `_skip_data_methods_on_init` is `True` (e.g., for unit tests). + - Otherwise, calls `prepare_data()` and `setup()` for data preparation. 
+ + """ if kwargs.get("_skip_data_methods_on_init", False): - # This change enables to skip these methods during initialization for unit-testing GitHub CI/CD return self.prepare_data() From bfe137bbbdffd4397ab9e671e30e614c4a2ee585 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 3 Apr 2025 13:47:54 +0200 Subject: [PATCH 09/17] __init_subclass to call data methods at end of initialization - @dataclass to avoid conflicts with stack frame of save_hyperparameters() --- chebai/preprocessing/datasets/base.py | 80 +++++++++++--------------- chebai/preprocessing/datasets/chebi.py | 1 + 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 643466a8..137dbc7d 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -1,9 +1,8 @@ -from __future__ import annotations - import os import random from abc import ABC, abstractmethod -from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union +from dataclasses import dataclass +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import lightning as pl import networkx as nx @@ -22,40 +21,8 @@ from chebai.preprocessing import reader as dr -class _InitMeta(type): - """ - Metaclass that ensures a specific method (`_call_data_processing_methods`) - is called after an instance (meaning the most derived class instance) is fully initialized. - - Purpose: - - Automatically calls `_call_data_processing_methods(**kwargs)` on instances - of classes that use this metaclass, if the method is defined. - - Ensures additional processing logic is executed immediately after object instantiation. - - Useful in cases where post-initialization processing is required across multiple subclasses. 
- """ - - def __call__( - cls: Type[XYBaseDataModule], *args: Any, **kwargs: Any - ) -> XYBaseDataModule: - """ - Overrides the instance creation process to call `_after_init` after the most derived class instance - is initialized. - - Args: - cls (Type[XYBaseDataModule]): The class being instantiated. - *args (Any): Positional arguments for the class constructor. - **kwargs (Any): Keyword arguments for the class constructor. - - Returns: - XYBaseDataModule: The initialized instance of the class. - """ - instance = super().__call__(*args, **kwargs) # Create the instance - if hasattr(instance, "_after_init"): - instance._after_init(**kwargs) # Call the method if defined - return instance - - -class XYBaseDataModule(LightningDataModule, metaclass=_InitMeta): +@dataclass +class XYBaseDataModule(LightningDataModule): """ Base class for data modules. @@ -157,26 +124,47 @@ def __init__( self._prepare_data_flag = 1 self._setup_data_flag = 1 - def _after_init(self, **kwargs): + def __init_subclass__(cls, *args, **kwargs): """ - This method is called after the instantiation of most derived class is completed. - Refer the `_InitMeta` metaclass for more details. + This method ensures that the '_call_data_processing_methods' is called only for the final subclass + in the class hierarchy. It overrides the default __init__ behavior to add custom initialization logic. + + - The method saves the original `__init__` method of the class and then defines a new `__init__` method. + - This new `__init__` method calls the original `__init__` method of the class and then checks if the + current class is the final subclass (i.e., not a subclass of a subclass). + - If it's the final class, it invokes the `_call_data_processing_methods` method to perform any necessary + data processing tasks. 
""" - self._call_data_processing_methods(**kwargs) + super().__init_subclass__(*args, **kwargs) + original_init = cls.__init__ + + def new_init(self, *args, **kwargs): + original_init(self, *args, **kwargs) # Call the original __init__ + if type(self) == cls: # Only run __post_init__ if it's the final class + self._call_data_processing_methods(*args, **kwargs) + + cls.__init__ = new_init - def _call_data_processing_methods(self, **kwargs) -> None: + def _call_data_processing_methods(self, *args, **kwargs) -> None: """ Calls data processing methods unless explicitly skipped. - Skips execution if `_skip_data_methods_on_init` is `True` (e.g., for unit tests). - Otherwise, calls `prepare_data()` and `setup()` for data preparation. + Note: This method is called after the instantiation of most derived class is completed. """ if kwargs.get("_skip_data_methods_on_init", False): + print( + f"Skipping data methods of class '{os.path.join(self.base_dir, self._name)}' during initialization" + ) return - self.prepare_data() - self.setup() + print( + f"Calling data method of class {os.path.join(self.base_dir, self._name)} during initialization" + ) + self.prepare_data(*args, **kwargs) + self.setup(*args, **kwargs) @property def num_of_labels(self): @@ -455,13 +443,13 @@ def predict_dataloader( """ return self.dataloader(self.prediction_kind, shuffle=False, **kwargs) - def prepare_data(self) -> None: + def prepare_data(self, *args, **kwargs) -> None: if self._prepare_data_flag != 1: return self._prepare_data_flag += 1 - def setup(self, **kwargs): + def setup(self, *args, **kwargs) -> None: """ Setup the data module. 
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 59b4f704..3d3297e9 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -149,6 +149,7 @@ def __init__( # This is to get the data from respective directory related to "chebi_version_train" _init_kwargs = kwargs _init_kwargs["chebi_version"] = self.chebi_version_train + _init_kwargs["_skip_data_methods_on_init"] = True self._chebi_version_train_obj = self.__class__( single_class=self.single_class, **_init_kwargs, From 97846fafdc5adfdd2946e5918e34ef364a6a3b6e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 3 Apr 2025 15:22:54 +0200 Subject: [PATCH 10/17] add prepare_data method to base class - after doing certain common processing the `prepare_data` method calls the _perform_data_preparation method which contain core logic for data preparation --- chebai/preprocessing/datasets/base.py | 13 ++++++++----- chebai/preprocessing/datasets/chebi.py | 7 ++----- chebai/preprocessing/datasets/deepGO/go_uniprot.py | 7 +------ .../datasets/deepGO/protein_pretraining.py | 7 +------ chebai/preprocessing/datasets/pubchem.py | 3 +-- 5 files changed, 13 insertions(+), 24 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 137dbc7d..94584ee8 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -138,9 +138,10 @@ def __init_subclass__(cls, *args, **kwargs): super().__init_subclass__(*args, **kwargs) original_init = cls.__init__ + # Creates updated definition for init method def new_init(self, *args, **kwargs): original_init(self, *args, **kwargs) # Call the original __init__ - if type(self) == cls: # Only run __post_init__ if it's the final class + if type(self) == cls: # Only run method if it's the final class self._call_data_processing_methods(*args, **kwargs) cls.__init__ = new_init @@ -448,6 +449,10 @@ def prepare_data(self, *args, 
**kwargs) -> None: return self._prepare_data_flag += 1 + self._perform_data_preparation(*args, **kwargs) + + def _perform_data_preparation(self, *args, **kwargs) -> None: + raise NotImplementedError def setup(self, *args, **kwargs) -> None: """ @@ -598,11 +603,10 @@ def __init__( os.makedirs(self.processed_dir, exist_ok=True) super(pl.LightningDataModule, self).__init__(**kwargs) - def prepare_data(self): + def _perform_data_preparation(self): """ Placeholder for data preparation logic. """ - super().prepare_data() for s in self.subsets: s.prepare_data() @@ -792,7 +796,7 @@ def _validate_splits_file_path(splits_file_path: Optional[str]) -> Optional[str] return splits_file_path # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: + def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None: """ Prepares the data for the dataset. @@ -811,7 +815,6 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: Returns: None """ - super().prepare_data() print("Checking for processed data in", self.processed_dir_main) processed_name = self.processed_main_file_names_dict["data"] diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 3d3297e9..eef80904 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -59,8 +59,7 @@ def download(self): def raw_file_names(self): return ["test.pkl", "train.pkl", "validation.pkl"] - def prepare_data(self, *args, **kwargs): - super().prepare_data() + def _perform_data_preparation(self, *args, **kwargs): print("Check for raw data in", self.raw_dir) if any( not os.path.isfile(os.path.join(self.raw_dir, f)) @@ -156,7 +155,7 @@ def __init__( ) # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: + def _perform_data_preparation(self, *args: Any, **kwargs: 
Any) -> None: """ Prepares the data for the Chebi dataset. @@ -177,8 +176,6 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: Returns: None """ - super().prepare_data(args, kwargs) - if self.chebi_version_train is not None: if not os.path.isfile( os.path.join( diff --git a/chebai/preprocessing/datasets/deepGO/go_uniprot.py b/chebai/preprocessing/datasets/deepGO/go_uniprot.py index 6a62d517..9c5d5c08 100644 --- a/chebai/preprocessing/datasets/deepGO/go_uniprot.py +++ b/chebai/preprocessing/datasets/deepGO/go_uniprot.py @@ -770,7 +770,7 @@ def _name(self) -> str: return f"{threshold_part}{self.max_sequence_length}" # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: + def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None: """ Checks for the existence of migrated DeepGO data in the specified directory. Raises an error if the required data file is not found, prompting @@ -783,11 +783,6 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: Raises: FileNotFoundError: If the processed data file does not exist. 
""" - if self._prepare_data_flag != 1: - return - - self._prepare_data_flag += 1 - print("Checking for processed data in", self.processed_dir_main) processed_name = self.processed_main_file_names_dict["data"] diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py index 22ae3bc4..92ff7a69 100644 --- a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py +++ b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py @@ -55,7 +55,7 @@ def __init__(self, **kwargs): ) # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: + def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None: """ Prepares the data by downloading and parsing Swiss-Prot data if not already available. Saves the processed data for further use. @@ -64,11 +64,6 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: *args: Additional positional arguments. **kwargs: Additional keyword arguments. """ - if self._prepare_data_flag != 1: - return - - self._prepare_data_flag += 1 - processed_name = self.processed_main_file_names_dict["data"] if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)): print("Missing processed data file (`data.pkl` file)") diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py index 74ae8f44..f6f2cdb3 100644 --- a/chebai/preprocessing/datasets/pubchem.py +++ b/chebai/preprocessing/datasets/pubchem.py @@ -179,11 +179,10 @@ def processed_file_names(self) -> List[str]: """ return ["test.pt", "train.pt", "validation.pt"] - def prepare_data(self, *args, **kwargs): + def _perform_data_preparation(self, *args, **kwargs): """ Checks for raw data and downloads if necessary. 
""" - super().prepare_data() print("Check for raw data in", self.raw_dir) if any( not os.path.isfile(os.path.join(self.raw_dir, f)) From d3d86c6b034d9663eaf64018c9ca28752c83bf77 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 9 Apr 2025 21:11:22 +0200 Subject: [PATCH 11/17] revert _skip_data_methods_on_init --- chebai/preprocessing/datasets/deepGO/protein_pretraining.py | 2 +- tests/unit/dataset_classes/testChEBIOverX.py | 4 +--- tests/unit/dataset_classes/testChebiDataExtractor.py | 2 +- tests/unit/dataset_classes/testChebiOverXPartial.py | 4 +--- tests/unit/dataset_classes/testDynamicDataset.py | 2 +- tests/unit/dataset_classes/testGOUniProDataExtractor.py | 2 +- tests/unit/dataset_classes/testGoUniProtOverX.py | 2 +- tests/unit/dataset_classes/testProteinPretrainingData.py | 2 +- tests/unit/dataset_classes/testTox21Challenge.py | 2 +- tests/unit/dataset_classes/testXYBaseDataModule.py | 1 - 10 files changed, 9 insertions(+), 14 deletions(-) diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py index 92ff7a69..4be053a5 100644 --- a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py +++ b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py @@ -38,7 +38,7 @@ def __init__(self, **kwargs): Args: **kwargs: Additional arguments for the superclass initialization. 
""" - self._go_uniprot_extractor = GOUniProtOver250(_skip_data_methods_on_init=True) + self._go_uniprot_extractor = GOUniProtOver250() assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002)) diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py index 22b327a7..270b868c 100644 --- a/tests/unit/dataset_classes/testChEBIOverX.py +++ b/tests/unit/dataset_classes/testChEBIOverX.py @@ -19,9 +19,7 @@ def setUpClass(cls, mock_makedirs, mock_processed_dir_main: PropertyMock) -> Non mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path. """ mock_processed_dir_main.return_value = "/mock/processed_dir" - cls.chebi_extractor = ChEBIOverX( - chebi_version=231, _skip_data_methods_on_init=True - ) + cls.chebi_extractor = ChEBIOverX(chebi_version=231) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() @patch("builtins.open", new_callable=mock_open) diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py index 2926d6cd..8da900da 100644 --- a/tests/unit/dataset_classes/testChebiDataExtractor.py +++ b/tests/unit/dataset_classes/testChebiDataExtractor.py @@ -35,7 +35,7 @@ def setUpClass( # Create an instance of the dataset cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor( - chebi_version=231, chebi_version_train=200, _skip_data_methods_on_init=True + chebi_version=231, chebi_version_train=200 ) # Mock instance for _chebi_version_train_obj diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index da329193..76584ebf 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -16,9 +16,7 @@ def setUpClass(cls, mock_makedirs) -> None: """ Set up the ChEBIOverXPartial instance with a mock 
processed directory path and a test graph. """ - cls.chebi_extractor = ChEBIOverXPartial( - top_class_id=11111, chebi_version=231, _skip_data_methods_on_init=True - ) + cls.chebi_extractor = ChEBIOverXPartial(top_class_id=11111, chebi_version=231) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() @patch( diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index 25467973..c8846273 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -38,7 +38,7 @@ def setUpClass( _DynamicDataset.READER = ReaderMock # Creating an instance of the dataset - cls.dataset: _DynamicDataset = _DynamicDataset(_skip_data_methods_on_init=True) + cls.dataset: _DynamicDataset = _DynamicDataset() # Dataset with a balanced distribution of labels X = [ diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py index 6c37d17d..96ff9a3a 100644 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -35,7 +35,7 @@ def setUpClass( _GOUniProtDataExtractor.READER = ProteinDataReader - cls.extractor = _GOUniProtDataExtractor(_skip_data_methods_on_init=True) + cls.extractor = _GOUniProtDataExtractor() def test_term_callback(self) -> None: """ diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py index fc4e59eb..3f329c56 100644 --- a/tests/unit/dataset_classes/testGoUniProtOverX.py +++ b/tests/unit/dataset_classes/testGoUniProtOverX.py @@ -17,7 +17,7 @@ def setUpClass(cls, mock_makedirs) -> None: """ Set up the class for tests by initializing the extractor, graph, and input DataFrame. 
""" - cls.extractor = _GOUniProtOverX(_skip_data_methods_on_init=True) + cls.extractor = _GOUniProtOverX() cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ :, :4 diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py index d5e22c52..caac3eac 100644 --- a/tests/unit/dataset_classes/testProteinPretrainingData.py +++ b/tests/unit/dataset_classes/testProteinPretrainingData.py @@ -37,7 +37,7 @@ def setUpClass( _ProteinPretrainingData.READER = ProteinDataReader # Initialize the extractor instance - cls.extractor = _ProteinPretrainingData(_skip_data_methods_on_init=True) + cls.extractor = _ProteinPretrainingData() @patch( "builtins.open", diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index aa22fb9a..9ad2af21 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -24,7 +24,7 @@ def setUpClass(cls, mock_makedirs) -> None: This is run once for the test class. 
""" Tox21Challenge.READER = ChemDataReader - cls.tox21 = Tox21Challenge(_skip_data_methods_on_init=True) + cls.tox21 = Tox21Challenge() @patch("rdkit.Chem.SDMolSupplier") def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None: diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py index b9bed2d3..64dfbe40 100644 --- a/tests/unit/dataset_classes/testXYBaseDataModule.py +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -30,7 +30,6 @@ def setUpClass(cls, mock_makedirs, mock_name_property: PropertyMock) -> None: cls.module = XYBaseDataModule( label_filter=1, # Provide a label_filter balance_after_filter=1.0, # Balance ratio - _skip_data_methods_on_init=True, ) def test_filter_labels_valid_index(self) -> None: From f253446ff2cff3331110237b083bd8b6138a6331 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 9 Apr 2025 21:15:49 +0200 Subject: [PATCH 12/17] revert __init_subclass --- chebai/preprocessing/datasets/base.py | 44 --------------------------- 1 file changed, 44 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 94584ee8..25917404 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -21,7 +21,6 @@ from chebai.preprocessing import reader as dr -@dataclass class XYBaseDataModule(LightningDataModule): """ Base class for data modules. @@ -124,49 +123,6 @@ def __init__( self._prepare_data_flag = 1 self._setup_data_flag = 1 - def __init_subclass__(cls, *args, **kwargs): - """ - This method ensures that the '_call_data_processing_methods' is called only for the final subclass - in the class hierarchy. It overrides the default __init__ behavior to add custom initialization logic. - - - The method saves the original `__init__` method of the class and then defines a new `__init__` method. 
- - This new `__init__` method calls the original `__init__` method of the class and then checks if the - current class is the final subclass (i.e., not a subclass of a subclass). - - If it's the final class, it invokes the `_call_data_processing_methods` method to perform any necessary - data processing tasks. - """ - super().__init_subclass__(*args, **kwargs) - original_init = cls.__init__ - - # Creates updated definition for init method - def new_init(self, *args, **kwargs): - original_init(self, *args, **kwargs) # Call the original __init__ - if type(self) == cls: # Only run method if it's the final class - self._call_data_processing_methods(*args, **kwargs) - - cls.__init__ = new_init - - def _call_data_processing_methods(self, *args, **kwargs) -> None: - """ - Calls data processing methods unless explicitly skipped. - - - Skips execution if `_skip_data_methods_on_init` is `True` (e.g., for unit tests). - - Otherwise, calls `prepare_data()` and `setup()` for data preparation. - - Note: This method is called after the instantiation of most derived class is completed. 
- """ - if kwargs.get("_skip_data_methods_on_init", False): - print( - f"Skipping data methods of class '{os.path.join(self.base_dir, self._name)}' during initialization" - ) - return - - print( - f"Calling data method of class {os.path.join(self.base_dir, self._name)} during initialization" - ) - self.prepare_data(*args, **kwargs) - self.setup(*args, **kwargs) - @property def num_of_labels(self): assert self._num_of_labels is not None, "num of labels must be set" From 992b48c768b188179a52114aa9fab5cde2ba4394 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 9 Apr 2025 21:17:17 +0200 Subject: [PATCH 13/17] call data methods from cli through compute_fn --- chebai/cli.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/chebai/cli.py b/chebai/cli.py index 114fdaf5..080f5436 100644 --- a/chebai/cli.py +++ b/chebai/cli.py @@ -39,9 +39,19 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser): parser (LightningArgumentParser): Argument parser instance. 
""" + def call_data_methods(data): + if data._num_of_labels is None: + data.prepare_data() + data.setup() + return data.num_of_labels + parser.link_arguments( - "data.num_of_labels", "model.init_args.out_dim", apply_on="instantiate" + "data", + "model.init_args.out_dim", + apply_on="instantiate", + compute_fn=call_data_methods, ) + parser.link_arguments( "data.feature_vector_size", "model.init_args.input_dim", From 914941b20018d5279439e577712798f2b4eca7f8 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 9 Apr 2025 21:20:48 +0200 Subject: [PATCH 14/17] minor changes --- chebai/models/ffn.py | 1 + chebai/preprocessing/datasets/base.py | 1 - chebai/preprocessing/datasets/chebi.py | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/chebai/models/ffn.py b/chebai/models/ffn.py index ada31725..c9c6f912 100644 --- a/chebai/models/ffn.py +++ b/chebai/models/ffn.py @@ -20,6 +20,7 @@ def __init__( **kwargs ): super().__init__(**kwargs) + layers = [] current_layer_input_size = input_size for hidden_dim in hidden_layers: diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 25917404..e8698731 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -1,7 +1,6 @@ import os import random from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import Any, Dict, Generator, List, Optional, Tuple, Union import lightning as pl diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index eef80904..84927378 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -148,7 +148,6 @@ def __init__( # This is to get the data from respective directory related to "chebi_version_train" _init_kwargs = kwargs _init_kwargs["chebi_version"] = self.chebi_version_train - _init_kwargs["_skip_data_methods_on_init"] = True self._chebi_version_train_obj = self.__class__( 
single_class=self.single_class, **_init_kwargs, From 8136a4d94356b6b94f217d3ee4fdecea10f84731 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 12 Apr 2025 01:18:20 +0200 Subject: [PATCH 15/17] chebi: fix for prepare data --- chebai/preprocessing/datasets/chebi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 84927378..d3387a05 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -175,6 +175,8 @@ def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None: Returns: None """ + super()._perform_data_preparation(args, kwargs) + if self.chebi_version_train is not None: if not os.path.isfile( os.path.join( From 80205bc57d0693f878fcaf77d6bf08018fe32355 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 12 Apr 2025 11:08:16 +0200 Subject: [PATCH 16/17] cli: data typehint for data param --- chebai/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/chebai/cli.py b/chebai/cli.py index 080f5436..61a1da8a 100644 --- a/chebai/cli.py +++ b/chebai/cli.py @@ -1,7 +1,8 @@ -from typing import Dict, Set +from typing import Dict, Set, Type from lightning.pytorch.cli import LightningArgumentParser, LightningCLI +from chebai.preprocessing.datasets import XYBaseDataModule from chebai.trainer.CustomTrainer import CustomTrainer @@ -39,7 +40,7 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser): parser (LightningArgumentParser): Argument parser instance. 
""" - def call_data_methods(data): + def call_data_methods(data: Type[XYBaseDataModule]): if data._num_of_labels is None: data.prepare_data() data.setup() From 500243763ae7f424a11f62aaa1c0c8e0e9d7b176 Mon Sep 17 00:00:00 2001 From: sfluegel Date: Wed, 16 Apr 2025 10:52:05 +0200 Subject: [PATCH 17/17] merge dev into out_dim_dynamic --- .github/workflows/token_consistency.yaml | 6 - .../bin/protein_token/tokens.txt | 21 - .../bin/protein_token_3_gram/tokens.txt | 8359 ----------------- chebai/preprocessing/datasets/base.py | 14 +- .../preprocessing/datasets/deepGO/__init__.py | 0 .../datasets/deepGO/go_uniprot.py | 0 .../datasets/deepGO/protein_pretraining.py | 0 .../preprocessing/datasets/scope/__init__.py | 0 chebai/preprocessing/datasets/scope/scope.py | 972 -- .../migration/deep_go/__init__.py | 0 .../deep_go/migrate_deep_go_1_data.py | 316 - .../deep_go/migrate_deep_go_2_data.py | 366 - chebai/preprocessing/reader.py | 397 - configs/data/deepGO/deepgo2_esm2.yml | 5 - .../data/deepGO/deepgo_1_migrated_data.yml | 4 - .../data/deepGO/deepgo_2_migrated_data.yml | 5 - configs/data/deepGO/go250.yml | 3 - configs/data/deepGO/go50.yml | 1 - configs/data/scope/scope2000.yml | 3 - configs/data/scope/scope50.yml | 3 - setup.py | 2 - .../testGOUniProDataExtractor.py | 229 - .../dataset_classes/testGoUniProtOverX.py | 140 - .../testProteinPretrainingData.py | 76 - tests/unit/mock_data/ontology_mock_data.py | 407 - tests/unit/readers/testProteinDataReader.py | 139 - tutorials/data_exploration_go.ipynb | 1341 --- tutorials/data_exploration_scope.ipynb | 1182 --- 28 files changed, 12 insertions(+), 13979 deletions(-) delete mode 100644 chebai/preprocessing/bin/protein_token/tokens.txt delete mode 100644 chebai/preprocessing/bin/protein_token_3_gram/tokens.txt delete mode 100644 chebai/preprocessing/datasets/deepGO/__init__.py delete mode 100644 chebai/preprocessing/datasets/deepGO/go_uniprot.py delete mode 100644 chebai/preprocessing/datasets/deepGO/protein_pretraining.py 
delete mode 100644 chebai/preprocessing/datasets/scope/__init__.py delete mode 100644 chebai/preprocessing/datasets/scope/scope.py delete mode 100644 chebai/preprocessing/migration/deep_go/__init__.py delete mode 100644 chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py delete mode 100644 chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py delete mode 100644 configs/data/deepGO/deepgo2_esm2.yml delete mode 100644 configs/data/deepGO/deepgo_1_migrated_data.yml delete mode 100644 configs/data/deepGO/deepgo_2_migrated_data.yml delete mode 100644 configs/data/deepGO/go250.yml delete mode 100644 configs/data/deepGO/go50.yml delete mode 100644 configs/data/scope/scope2000.yml delete mode 100644 configs/data/scope/scope50.yml delete mode 100644 tests/unit/dataset_classes/testGOUniProDataExtractor.py delete mode 100644 tests/unit/dataset_classes/testGoUniProtOverX.py delete mode 100644 tests/unit/dataset_classes/testProteinPretrainingData.py delete mode 100644 tests/unit/readers/testProteinDataReader.py delete mode 100644 tutorials/data_exploration_go.ipynb delete mode 100644 tutorials/data_exploration_scope.ipynb diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml index 06c3a42e..5261bf52 100644 --- a/.github/workflows/token_consistency.yaml +++ b/.github/workflows/token_consistency.yaml @@ -13,21 +13,17 @@ on: - "chebai/preprocessing/bin/smiles_token/tokens.txt" - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" - "chebai/preprocessing/bin/selfies/tokens.txt" - - "chebai/preprocessing/bin/protein_token/tokens.txt" - "chebai/preprocessing/bin/graph_properties/tokens.txt" - "chebai/preprocessing/bin/graph/tokens.txt" - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" pull_request: paths: - "chebai/preprocessing/bin/smiles_token/tokens.txt" - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" - 
"chebai/preprocessing/bin/selfies/tokens.txt" - - "chebai/preprocessing/bin/protein_token/tokens.txt" - "chebai/preprocessing/bin/graph_properties/tokens.txt" - "chebai/preprocessing/bin/graph/tokens.txt" - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" jobs: check_tokens: @@ -58,11 +54,9 @@ jobs: "chebai/preprocessing/bin/smiles_token/tokens.txt" "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" "chebai/preprocessing/bin/selfies/tokens.txt" - "chebai/preprocessing/bin/protein_token/tokens.txt" "chebai/preprocessing/bin/graph_properties/tokens.txt" "chebai/preprocessing/bin/graph/tokens.txt" "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" ) echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV diff --git a/chebai/preprocessing/bin/protein_token/tokens.txt b/chebai/preprocessing/bin/protein_token/tokens.txt deleted file mode 100644 index c31c5b72..00000000 --- a/chebai/preprocessing/bin/protein_token/tokens.txt +++ /dev/null @@ -1,21 +0,0 @@ -M -S -I -G -A -T -R -L -Q -N -D -K -Y -P -C -F -W -E -V -H -X diff --git a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt deleted file mode 100644 index 534e5db1..00000000 --- a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt +++ /dev/null @@ -1,8359 +0,0 @@ -MAT -ATP -TPG -PGA -GAS -ASS -SSA -SAR -ARD -RDE -DEF -EFV -FVY -VYM -YMA -MAK -AKL -KLA -LAE -AEQ -EQA -QAE -AER -ERY -RYE -YEE -EEM -EMV -MVE -VEF -EFM -FME -MEK -EKV -KVA -VAK -AKA -KAV -AVD -VDK -DKD -KDE -DEL -ELT -LTV -TVE -VEE -EER -ERN -RNL -NLL -LLS -LSV -SVA -VAY -AYK -YKN -KNV -NVI -VIG -IGA -GAR -ARR -RRA -RAS -ASW -SWR -WRI -RII -IIS -ISS -SSI -SIE -IEQ -EQK -QKE -KEE -EES -ESR -SRG -RGN -GND -NDD -DDH -DHV -HVS -VSL -SLI -LIR -IRD -RDY -DYR -YRS -RSK -SKI -KIE -IET -ETE -TEL -ELS -LSD -SDI -DIC -ICD -CDG -DGI -GIL -ILK -LKL -KLL -LLD 
-LDT -DTI -TIL -ILV -LVP -VPA -PAA -AAA -AAS -ASG -SGD -GDS -DSK -SKV -KVF -VFY -FYL -YLK -LKM -KMK -MKG -KGD -GDY -DYH -YHR -HRY -RYL -YLA -AEF -EFK -FKS -KSG -SGQ -GQE -QER -ERK -RKD -KDA -DAA -AAE -AEH -EHT -HTL -TLT -LTA -TAY -YKA -KAA -AAQ -AQD -QDI -DIA -IAN -ANS -NSE -SEL -ELA -LAP -APT -PTH -THP -HPI -PIR -IRL -RLG -LGL -GLA -LAL -ALN -LNF -NFS -FSV -SVF -FYY -YYE -YEI -EIL -ILN -LNS -NSP -SPD -PDR -DRA -RAC -ACN -CNL -NLA -LAK -AKQ -KQA -QAF -AFD -FDE -DEA -EAI -AIA -IAE -AEL -ELD -DTL -TLG -LGE -GEE -ESY -SYK -YKD -KDS -DST -STL -TLI -LIM -IMQ -MQL -QLL -LLR -LRD -RDN -DNL -NLT -LTL -TLW -LWT -WTS -TSD -SDM -DMQ -MQD -QDD -DDV -DVA -VAD -ADD -DDI -DIK -IKE -KEA -EAA -AAP -APA -AAK -AKP -KPA -PAD -ADE -DEQ -EQQ -QQS -MSD -SDT -DTV -EEL -ELV -LVQ -VQR -QRA -RAK -RYD -YDD -DDM -DMA -MAA -AAM -AMK -MKK -KKV -KVT -VTE -TEQ -EQG -QGQ -QEL -LSN -SNE -NEE -NVV -VVG -VGA -RRS -RSS -SSW -WRV -RVI -VIS -QKT -KTE -TEG -EGS -GSE -SEK -EKK -KKQ -KQQ -QQL -QLA -AKE -KEY -EYR -YRV -RVK -VKV -KVE -VEQ -EQE -ELN -LND -NDI -ICQ -CQD -QDV -DVL -VLK -LDE -EFL -FLI -LIV -IVK -VKA -KAG -AGA -GAA -AES -ESK -DYY -YYR -YRY -AEV -EVA -VAS -ASE -SED -EDR -RAA -AAV -AVV -VVE -VEK -EKS -KSQ -SQK -QKA -KAY -AYQ -YQE -QEA -EAL -ALD -LDI -IAK -AKD -KDK -DKM -KMQ -MQP -QPT -LNT -NTP -TPE -PEH -EHA -HAC -ACQ -CQL -FDD -DDA -DAI -TLN -LNE -NED -EDS -DSY -SDV -DVG -GAE -AED -EDQ -DQE -QEQ -QEG -EGN -GNQ -NQE -EAG -AGN -MAS -ASA -SAE -LSR -SRE -REE -EEN -ENV -NVY -AKT -KTV -TVD -VDS -DSE -SEE -EEG -EGR -GRG -GNE -DRV -RVT -VTL -LIK -IKD -KDY -YRG -RGK -GKI -LTK -TKI -KIC -LLE -LET -ETH -THL -HLV -VPS -PSS -SST -STA -TAP -APE -PES -FKT -KTG -TGA -AEN -ENT -NTM -TMV -MVA -IAL -ALA -ACS -CSL -SLA -AIS -ISE -TLS -LSE -DIS -EDP -DPA -PAE -AEE -EEI -EIR -IRE -REA -EAP -APK -PKR -KRD -RDS -DSS -SSE -SEG -EGQ -LES -ESH -SHL -LLH -LHD -HDN -PKH -KHD -HDL -DLS -MST -STR -TRE -VDV -DVE -SVE -SKG -KGN -EDH -HVA -VAI -AII -IIK -IES -ESE -LSK -LNV -NVL -VLE -LEA -EAH -AHL -HLI -LIP -IPS -PSA -SAS -ASP -SPA 
-FKA -RKE -EST -TLV -LVA -YKS -KSA -ASD -IAT -ATA -TAE -DMT -MTD -TDE -AGD -GDE -DEI -EIK -EAS -ASK -SKP -KPD -PDG -DGA -MAE -RED -EDC -DCV -CVF -VFL -FLS -SKL -EQS -QSE -SER -YDE -DEM -MVQ -VQY -QYM -YMK -MKQ -KQV -QVA -VAA -AAL -NTE -IGS -GSR -SRR -IIT -ITS -TSL -SLE -LEQ -KEQ -QAK -AKG -NDK -DKH -KHV -HVE -VEI -EII -IKG -KGY -GYR -YRA -AKI -IED -EDE -AKY -KYC -YCD -CDD -LKV -KVI -VIK -KEN -ENL -LLP -LPN -PNA -NAS -AST -STS -TSE -SES -FYK -YKK -KKM -KME -MEG -EGD -RYY -YYA -YAE -EFT -FTV -VDE -DEK -EKR -KRQ -RQE -QEV -ADK -DKS -KSL -LAA -AAY -AYT -YTE -TEA -EAT -ATE -TEI -EIS -ISN -SNA -NAD -ADL -DLA -EIM -IMN -MND -NDA -DAD -DKA -KAC -DDS -DSI -SIA -KLD -DEV -EVP -VPE -ESS -SSY -DTA -TAD -DEE -AAT -ATL -LGR -GRD -RDQ -DQY -QYV -YVY -VQF -QFM -MEQ -EQL -QLV -LVT -VTG -GAT -TPA -GSL -SLR -LRA -AAW -AWR -RIV -IVS -VSS -SRK -RKN -KND -NDE -DEH -EHV -SLV -LVK -VKD -VES -LSS -SSV -SVC -VCS -CSG -SGI -LDS -DSH -SAG -RYM -DER -RKT -KTA -TAA -EDT -DTM -TML -MLA -LAY -IAA -AAD -ADM -MAP -NSS -SSD -SDK -CNM -NMA -AFE -FEE -EEA -MQE -EQM -QMD -MDE -ATT -TTL -SRD -LVS -VSG -SGA -PAG -AGE -GEL -KNE -EEH -VET -SIC -ICS -ILR -LRL -RLL -SAT -TAS -TMI -MIA -IAY -VAV -AVA -EKA -CSM -SMA -MTM -TMD -MDK -KSE -VQK -KAK -MKA -AVT -QGH -GHE -HEL -TER -RNE -NEK -QQM -QMG -MGK -GKE -YRE -REK -EKI -IEA -EAE -ELQ -LQD -ICN -CND -NDV -LEL -ELL -LDK -DKY -KYL -YLI -IPN -NAT -ATQ -TQP -QPE -DYF -YFR -FRY -YLS -SEV -GDN -DNK -NKQ -KQT -QTT -TTV -TVS -VSN -SNS -NSQ -SQQ -QQA -QAY -EAF -FEI -ISK -SKK -KKE -KEM -EMQ -SPE -PEK -TAF -SEN -ENQ -NQG -QGD -DEG -GDA -DAG -GEG -EGE -GEN -LIL -LNA -TQA -SGE -ENK -CSD -ATH -THA -HAE -MTE -ERE -REN -ENN -NNV -VYK -VEA -EAM -ASM -SMD -MDV -VEL -TSI -NKG -KGA -EEK -EKL -KLE -LEM -EMI -MIK -IKT -KTY -TYR -RGQ -GQV -QVE -EKE -KEL -ELR -RDI -DIL -LEK -EKH -KHL -IPC -PCA -CAT -ATS -TSG -GES -YYK -YKM -EFA -FAT -ATG -TGS -GSD -SDR -DRK -ENS -NSL -LIA -IAM -AMN -NDL -DLP -LPP -PPT -ACR -CRL -RLA -AAF -MQA -EEV -EVD -VDP -DPN -NAG -GDG -DGE -GEP -EPK -PKE -EQI -QIQ 
-IQD -VED -DQD -DVS -MDD -DDR -DRE -EDL -DLV -LVY -VYQ -YQA -ESM -SMK -VAG -AGM -GMD -KGG -GGE -GED -EDK -DKL -KLK -KMI -MIR -REY -YRQ -RQM -QMV -ELK -KLI -LIC -ICC -CCD -CDI -ILD -LDV -VLD -IPA -AAN -ANT -NTG -TGE -TGN -NDR -AMT -ELP -MQG -EEQ -EQN -QNK -NKE -ALQ -DEN -MGD -GDR -REQ -LLQ -LQR -RAR -ARL -SAM -NEP -EPL -PLS -DRN -KTM -TMA -MAD -ADG -DGN -KKL -KVK -AYR -IEK -ELE -ETV -TVC -VCN -VLS -LSL -SLL -DKF -KFL -IKN -KNC -NCN -NDF -DFQ -FQY -QYE -YES -GEK -KKN -KNS -NSV -SVV -SEA -YKE -SKE -QMQ -EIQ -IQN -QNA -NAP -PEQ -QAC -ACL -CLL -LLA -SDQ -DQQ -QQD -QDE -VLA -ALL -KEH -EHM -HMQ -MVD -VDR -KAR -MKN -NVT -KTS -TSA -SAD -KKI -IEM -MVR -VRA -RAY -EAV -AVC -VCQ -LDN -DNY -NYL -NCS -CSE -SET -ETQ -TQY -VAT -KRA -RAT -ATV -TVV -AYS -YSE -AHE -HEI -LNY -NYS -YSV -ACH -CHL -HLA -DDD -DDG -DGG -GNN -MER -ERA -ASL -LIQ -IQK -YED -EDM -AFM -FMK -MKS -SAV -AVE -EKG -KGE -LSC -SCE -CEE -VGG -GGQ -GQR -RVL -QKS -KSN -KGP -GPE -PEV -EVK -VKE -LRG -RGV -GVC -VCD -CDT -TVL -VLG -GLL -GAG -DAE -SRV -RVF -TGD -GDD -DDK -DKK -KKR -KRI -IID -IDS -DSA -ARS -RSA -SAY -AMD -MDI -EMP -MPP -PTN -TNP -NPI -VFH -FHY -HYE -EIA -PEE -ISL -KTT -TTF -TFD -AMA -DLH -LHT -WTA -ADS -EGG -GEA -EEP -EPQ -PQS -EKT -ELI -ATC -TCM -CMK -QGA -GGR -GRR -SAW -KTD -TDT -DTS -KLQ -LQL -QLI -LRS -RSI -ICT -CTT -ANA -ATN -NPE -VAC -ACG -CGD -RKQ -QTI -TID -IDN -DNS -SQG -GAY -FDI -LNN -NNP -PEL -LAC -ACT -CTL -TLA -SDS -EEC -ECD -CDA -AEG -EGA -TIE -IEN -STV -DKE -MAQ -AQA -QAM -KSV -SVT -TET -ETG -TGV -GVE -ARK -LAR -ARE -RER -ERV -RVE -LRE -REI -EIC -ICY -CYE -YEV -EVL -IPK -PKA -KAS -ASN -SNP -DAR -ARN -RNT -NTV -VVD -VDD -DSQ -SQT -QTA -YQD -QDA -DAF -KGK -GKM -PDK -DTQ -TQG -AEP -PQE -GGD -DKN -NEL -AAC -ACM -RVV -VVS -AEK -QMA -MAR -EKF -ASQ -SQA -AAG -KKG -KGI -GIV -IVD -VDQ -DQS -QSQ -AEA -SQP -MPA -PAS -ASR -DSV -SVY -VYL -VEN -ENM -NMK -SSG -EAK -NES -ESQ -SQV -VAL -ALI -ICE -CED -EDI -ILS -SVL -SDH -DHL -LIT -SAQ -AQT -QTG -FAI -KRK -EAY -DAV -DLE -ETL -WTD -TDL -TEE -QQQ -QSS -SSQ -QAP -AQP 
-PTE -EGK -GKA -KAD -ADQ -MTR -VAE -NEN -ENH -NHV -HVK -VKK -KIK -EYK -YKC -KCK -CKV -LTD -TDI -ILE -LEV -GNP -NPR -PRK -SSL -IAV -DVH -VHN -HNM -NME -EKN -KNQ -NQD -QDG -DGD -DDQ -DQN -QNE -EPG -PGM -AFT -FTR -EDY -DYV -YVF -VFM -FMA -AQL -QLN -ENA -NAE -ETM -TMR -MRK -RKI -KIS -ISG -SGM -GME -KER -IGP -GPR -PRR -KEK -KGR -GRQ -RQK -QKP -KPN -NAK -AKR -RIE -QIR -IRV -RVY -VYR -QKI -LQE -EQF -QFV -FVP -VPR -PRS -RST -STN -TNA -ADA -DAK -AKV -AEY -EYS -YSS -KIA -IAG -AGS -GSA -SAL -NAY -AYN -YNS -NSA -SAF -ISQ -QLP -ILA -LAS -ACE -CEL -RKA -KAF -FDA -AAI -AIT -ITD -DLD -KLT -LTE -NLN -LNL -NLW -LWV -WVT -VTD -TDS -DDN -DNA -NEA -ALS -VLN -DNF -NFL -NCG -CGE -GET -TQH -QHE -HES -KSY -SYS -DDE -MVS -VSQ -QVV -VVA -EKP -KPQ -PQL -KKA -AGC -GCN -CNS -NSH -SHG -HGQ -GQD -QDS -SYF -YFL -FLG -LGW -GWQ -WQE -QEY -EYE -YEK -KNP -NPF -PFD -FDP -DPV -PVS -NPS -PSG -GII -IIQ -IQM -MGL -NQL -QLS -LSF -SFD -FDL -DLL -LEE -EEW -EWL -WLE -NPH -PHA -HAL -ALG -GLR -LRR -RRE -REG -GGG -GGA -ASV -VFR -FRE -REL -ALF -LFQ -FQD -QDY -YHG -HGL -GLP -LPA -PAF -AFK -FKN -KNA -NAL -ARF -RFM -FMS -MSE -SEQ -EQR -QRG -RGY -GYK -YKV -KVV -VVF -VFD -DPS -PSN -SNI -NIV -IVL -VLT -TAG -SAN -ANE -ALM -LMF -MFC -FCL -CLA -LAD -ADH -DHG -HGD -AFL -IPT -PTP -TPY -PYY -YYP -YPG -PGF -GFD -FDR -DRD -RDL -DLK -LKW -KWR -WRT -RTG -AEI -EIV -IVP -VPV -PVH -VHC -HCA -CAS -ANG -NGF -GFR -FRV -VTR -TRP -RPA -PAL -LDD -DAY -YRR -RAQ -AQK -QKR -KRR -RRL -RLR -LRV -VKG -KGV -GVL -VLI -ITN -NPL -PLG -LGT -GTA -SPR -PRA -RAD -ETI -TIV -VDF -DFV -FVA -GIH -IHL -LIS -ISD -SDE -EIY -IYA -YAG -AGT -AFA -FAE -EPP -PPA -AGF -GFV -FVS -VSA -ALE -EVV -AGR -RDG -GAD -ADV -VSD -RVH -VHV -HVV -VVY -VYS -YSL -SLS -SKD -KDL -DLG -LPG -RVG -GAI -AIY -IYS -YSA -NAA -SAA -ATK -TKM -KMS -MSS -SSF -SFG -FGL -GLV -QTQ -QYL -YLL -LLG -LGD -RDF -DFT -TRS -RSY -SYV -YVA -NKR -RRI -RIK -ERH -RHD -HDQ -DQL -LVD -VDG -DGL -EIG -IGI -GIG -IGC -GCL -CLP -LPS -AGL -GLF -LFC -FCW -CWV -WVD -VDM -DMS -MSH -HLM -LMR -MRS -RSR -SRS -RSF -SFA -FAG 
-GEM -EME -MEL -ELW -LWK -WKK -VFE -FEV -EVG -VGL -GLN -LNI -NIS -ISP -SPG -PGS -GSS -SSC -SCH -CHC -HCR -CRE -REP -PGW -GWF -WFR -RVC -VCF -CFA -FAN -ANM -NMS -MSA -SAK -KTL -TLD -VAM -AMQ -MQR -QRL -SFV -FVD -TGG -ALR -AVP -PVR -VRS -RSV -SVS -VSC -SCP -CPL -PLA -LAI -AIK -IKW -KWA -WAL -RLT -LTP -TPS -PSI -IAD -ADR -KAE -MAY -YQG -QGI -GID -IDL -LST -STK -TKA -HGE -YFD -FDG -DGW -GWK -WKA -AYD -YDT -DTN -DLR -LRH -RHN -HNR -NRG -RGG -GGV -GVI -VIQ -SLD -LDL -DLI -LIE -IEE -EWS -WSK -SKN -KNH -NHP -HPE -PEA -ASI -CTP -PEG -EGV -GVS -SQF -QFK -FKR -RIA -ANF -NFQ -LPE -PEF -EFR -FRK -KAM -AQF -FMG -MGQ -QVR -VRG -GGK -KAT -ATF -DPD -VVM -VMS -MSG -SGG -GAQ -AQE -QET -LAF -AFC -LAN -ANP -NPG -PGE -FLV -VPT -YPA -RDC -DCC -CCW -CWR -WRS -RSG -GIK -IKL -LPI -PIE -IEC -ECH -CHS -HSF -SFN -FND -DFR -FRL -TKE -ALV -YDG -RRQ -RQG -GIS -ISV -SVK -ILI -GTI -TIT -TDR -RDT -LAM -AML -LAT -TFA -TEH -EHR -HRV -VHL -LVC -CDE -GSV -VFA -PEY -EYV -YVS -VSI -EVI -VIE -IER -ERD -RDV -DVP -VPW -PWC -WCN -CNR -NRD -LIH -IHV -KDF -DFG -VGI -IIY -YSY -SYN -YND -AAR -RRM -RMS -QYF -FLA -ARM -RML -MLS -EEF -EFI -FIG -IGR -GRF -RFL -FLQ -QES -SKC -KCR -RLV -VAR -ARH -RHE -HER -ERF -RFT -FTS -SGL -REV -CLR -GNA -LFS -FSW -SWM -WMD -MDL -MLR -LWR -VIV -IVH -VHQ -HQV -QVK -VKL -KLN -NVS -VSP -PGT -GTS -TSF -SFH -FHC -VCH -CHA -HAN -NMD -DET -TME -MEV -GRI -RIH -IHD -HDF -FVR -VRQ -RQH -QHQ -HQQ -QQR -QRR -RRV -ERW -RWA -WAA -ANR -NRQ -RQL -QLR -RLS -SLP -LPH -PHH -HHH -HHL -HLS -LSP -PAH -SSP -SPL -SPQ -QSP -SPM -PMV -KQL -TKV -VTS -TSN -SNG -NGH -GHG -GWE -WEE -EEY -NPY -PYD -NPN -PNG -NGM -GMI -MIQ -QLC -LCF -CFD -ESW -SWL -WLT -TKN -NPD -PDA -SLK -LKR -KRN -RNG -NGQ -GQS -QSI -SIF -IFR -HGM -GMP -MPE -FKK -MEE -IRG -GNR -NRV -VTF -DPK -PKK -KIV -GST -NET -TLM -PGD -FLL -LPT -VPI -PIH -IHC -HCS -CSS -SSS -SSN -GFQ -FQI -QIT -ITE -TES -ESA -LQQ -YQQ -QAQ -QKL -VLV -VTN -TAL -ALT -LTR -TRR -LLV -DFI -FIT -TSK -KNI -NIH -YSG -SGT -GTM -TMF -MFG -FGF -GFE -FEQ -QFI -FIS -SVM -VMD -LKD -LED -DTE 
-TEV -EVS -VSK -SKR -KRV -YSN -SND -MIV -LSA -KKF -KFT -TSQ -SQY -YLE -NQK -KRL -RLK -LKS -KSR -SRQ -RQR -GLE -AGI -GIT -ITC -TCL -RSN -DMR -MRH -RHL -HLL -TNT -NTF -TFE -FEA -DLW -IVY -VYN -YNV -NVK -HCT -CTE -TEP -ALK -LKT -KTF -TFV -FVE -STD -TDC -DCG -CGR -GRM -RMI -MIS -ISR -SSH -SHE -ERL -LRK -RKK -KKT -SNW -NWV -WVF -RVS -VSW -SWT -RVP -VPD -PDE -VAF -TEK -KQD -QDL -DLN -IAS -DGH -AYE -ENP -PFH -FHP -PID -IDR -DRP -RPD -DGV -LCG -GDL -DLM -RKW -KWV -WVL -LKH -KHP -CTS -GVN -VNQ -NQF -QFS -FSD -IAI -AIF -IFQ -FRQ -RQA -QAV -AKF -KFM -KTR -TRN -RNN -NNK -NKV -VKF -KFD -DRI -IVM -GAH -HET -TVA -DGF -GFL -LRW -RWR -VNL -NLV -PVT -VTC -TCH -HSS -GFK -FKI -KIT -ITV -YEN -NAR -RKS -NIP -IPV -PVK -KGL -GTT -LDR -REC -ECL -CLK -LVN -VNF -NFT -FTN -TND -DKG -YAA -TFG -FGQ -SEF -EIE -DCN -IHI -HIV -KDM -DMG -PGL -VVQ -VQI -QIA -IAR -RKM -QHL -AKM -KML -FIR -RES -KLR -RHA -EIT -ITT -TTG -TGL -GLD -LDG -GLG -LGI -IGW -GWL -WLK -LKA -LFL -FLW -LWM -LRN -LLK -TAT -FDS -PGG -GGS -GSF -HCH -CHE -HEP -MDH -DHK -HKT -MET -ETA -LER -ERI -RIR -VFT -SQL -QLE -EEE -EET -ETK -TKP -KPM -PMA -TTM -TMM -MMA -AKK -KKK -KKC -KCW -CWQ -WQS -QSN -SNL -NLR -SFS -DTR -RRF -RFD -GFF -FFS -FSP -SPH -PHS -HSP -SPV -PVP -VPP -PPS -PSP -PLV -LVR -RKV -NAH -AHG -NGI -ETW -TWL -WLA -AKN -GLK -LKK -KKD -KDG -DGQ -IFK -FKE -KAL -PSK -MLT -GTV -TVF -VFG -VSV -KNL -NLE -LEN -VHI -MVV -TST -STY -TYL -YLD -LKI -KIR -IRQ -QKK -KLV -VYD -YDV -DVK -MKR -LKE -YVE -DSR -SKS -KSS -SHD -HDR -IKS -RKR -KRT -RTV -MHG -HGS -GSG -SGH -GHS -HSL -SLT -LTG -GAP -APH -PHQ -HQI -QIP -IPP -PPP -PPR -PRT -RTQ -GQQ -TAN -ANQ -DKI -KID -IDP -DPF -FHN -HNK -KRG -RGT -TSR -LRI -RIN -INN -NNS -SSR -SRY -RYN -NVD -VQL -KDT -NEQ -EQP -QPA -LVI -VQC -QCQ -CQH -QHV -HVF -FDF -DFY -FYD -YDP -PVA -VAQ -QLK -LKC -CKE -KEI -IKR -LID -IDH -DHI -HIT -TKG -AIV -IVE -TIY -IYP -PAV -AVI -IKM -KMV -NIF -VLP -PSE -ENC -NCE -CEF -EFD -DPE -EED -DEP -EPT -PTL -TLE -SWP -WPH -PHL -HLQ -VYE -YEL -ELF -FLR -LRF -FLE -ESP -PDF -FQA -QAS -SIG -IGK 
-GKK -KKY -KYI -YID -IDQ -DQR -QRF -RFV -FVL -DLF -LFD -DPR -PRE -DFL -FLK -VLH -LHR -HRI -RIY -IYG -YGK -GKF -RAF -AFI -IRK -RKH -KHI -HIN -NNM -NMF -MFL -YET -ETD -DSF -FNG -NGV -GVG -VGE -LEI -ILG -LGS -GSI -SII -IIN -ING -GFA -FAL -ALP -LPL -PLK -LKQ -KQE -QEH -EHK -HKV -KVL -VLL -PLH -LHK -HKP -KPK -PKC -KCL -CLS -SLY -LYH -YHA -HAQ -AYC -YCV -CVV -FIE -EKD -TPQ -PQV -QVF -LKF -KFW -FWP -WPR -RTC -TCS -SSK -KEV -EVM -VMF -GEV -EVE -DII -IIE -IEP -EPE -KII -DPL -PLF -LFR -AKC -KCV -CVS -PHF -HFQ -FQV -RAL -ALY -LYF -YFW -FWN -WNN -NNE -NEY -EYI -YIL -TSS -LVM -VMP -MPI -PIM -IMF -MFP -FPA -LYR -YRI -RIS -EHW -HWN -WNQ -NQT -IVA -TFM -MEM -EMN -MNG -NGK -GKL -KLF -LTS -TYK -YKG -GER -EKQ -KQR -QRE -KDR -RDA -AFW -FWK -MEA -LNP -NPP -EVT -VTP -PSL -SLF -LFP -FPE -TDY -DYL -DGP -GPN -PNM -NMT -MTP -TPL -PLP -LPV -AGG -GDK -KSP -SPS -PSV -VVK -KKS -STG -ETT -TTT -TTP -PAK -TKL -KLP -STP -TPT -PTS -TSP -GLS -PPD -DKV -KVD -GFS -FSR -RSL -ARP -RPR -RSH -SHS -QFR -RYQ -YQS -SNQ -NQQ -QQE -PLL -KDV -ELH -LHE -RKL -LAQ -AQC -QCG -CGV -GVM -MFD -FLD -LDC -CVA -LKG -VKR -LVE -VEC -ECV -CVG -VGS -TRG -EPV -PVY -VYP -YPD -PDI -IIR -IRM -SVN -VNI -FRT -RTL -TLP -EPN -PNL -LEP -EPS -PSW -YEF -EFF -FFL -FQP -QPS -KRY -RYV -YVD -DQK -QKF -KFV -VLM -LML -MLL -EYL -KTI -ILH -VYG -AYI -YIR -KQC -QCN -CNH -NHI -HIF -IFL -RFI -FIY -IYE -LEH -EHF -HFN -GVA -HKQ -KQF -QFL -VRV -IPL -LHS -HSV -VKS -FHA -DAT -HVI -VIR -RGL -LKY -KYW -YWP -WPK -PKT -KTC -TCT -CTQ -TQK -DVI -PSQ -FVK -VKI -KIQ -IQE -QEP -LFK -FKQ -ARC -RCV -EDN -DNC -NCH -CHT -HTV -AVF -FGT -GTL -TLY -LYQ -YQV -QVS -LIY -IYN -ASY -YKL -QQK -KAQ -ERQ -WRG -RLQ -LQG -QGT -GTQ -GAK -APV -PRP -RPT -MPY -PYK -KEP -PPK -PKV -KCT -CTA -TAK -KPS -SGK -GKD -EAQ -QPQ -PQP -PQA -AQS -QPP -SNK -KRP -RPS -NST -TPP -PTQ -TQL -IKY -KYS -GGP -GPQ -PQI -QIV -ERR -RQS -SRF -RFN -FNL -NLS -KNR -NRE -LQK -DSP -SPT -TQE -LFI -FIQ -LRQ -RQC -QCC -CCV -CVL -VLF -SDP -SDL -KFK -RAG -NEM -VEY -YIT -ITH -THS -HSR -DVV -VVT -YPE -VTM -MFS -NLF -NPT 
-PTG -AWP -QPN -PNI -NIA -IRR -RQI -QIN -INH -IFY -FYR -YRF -EHH -HHN -HNG -GIA -HKM -KMF -VYH -YHP -HPQ -KES -PVI -IVG -KTH -SPK -FLN -EFS -FSK -KVM -VME -MEP -LYY -YYW -YWN -YIM -IMS -MSL -SDN -ARV -YRN -RNS -NSK -KSH -SHW -WNK -NKT -TIH -IHG -GLI -YNA -LFM -MNQ -DDC -DCT -TQQ -QQY -QYK -KQK -QKG -RFR -FRM -RMK -MKE -EMW -MWQ -WQK -RLN -NPQ -PQY -QYP -YPM -PMF -MFR -FRA -RAP -APP -PPL -PPV -YSM -SME -ETP -PTA -DIQ -IQL -AVQ -VQM -QML -MLK -KDI -IKK -RRK -LPQ -PQD -DVY -VYT -YTI -TIK -IKA -AHK -HKR -RAE -FLT -SQE -MMR -MRG -RGF -RLI -STT -TTS -KKP -HGT -TTH -GSK -KST -TTE -GKQ -KQS -QSG -SGS -SVP -QGK -GKH -KHH -HHS -SKT -KTK -TKT -VSR -TKK -RKG -KGQ -QSK -SKQ -QQP -SQS -QKQ -KQG -QGS -AIM -MNP -TPV -PVL -TVT -VTK -TKD -KDD -DHA -HAH -AHP -HPT -TLL -LGA -GAV -AVS -SPI -PIS -TAV -ENG -NGN -GNS -NSN -SNN -NNN -NMN -MNI -NIN -INT -NTS -SNT -NTQ -TQD -DAN -ANH -NHA -HAS -SID -IDI -DIP -IPR -SFE -FER -RLP -PTK -PDT -DTD -KTP -PQR -QRH -RHS -RFE -FEP -PSR -RYT -YTP -PLT -PNF -NFN -FNE -NEV -RIP -FIA -DQC -CNT -DFN -NDP -PSF -IQG -KRS -IEF -TNR -NRF -FTY -TYT -YTN -TNE -EMY -MYA -YAH -AHV -VVN -VNM -MFK -KIN -INL -FRP -RPI -PIP -PVN -VNP -NPV -PVG -VGD -GDI -DIY -IYD -DED -VNE -LAW -PHM -AVY -FNH -NHQ -KQY -QYI -QDF -FIL -DIR -DCL -TLH -SFI -RSM -SMN -MNN -NNI -LQF -KFN -VRI -RIL -KVR -VRC -RCL -YCI -CIV -IVQ -KDP -LLT -VMG -LRY -RYW -PKI -INS -NEI -DIF -IFE -PLE -LEF -FIK -IKV -VEV -VPL -LFV -FVQ -KCI -CIS -LSY -SYW -EYF -NLC -LCI -CIE -VIL -ILP -PII -IIF -IFP -LYE -NGE -SIS -DPY -PYM -YML -MLV -QAI -AIN -NSG -GSW -SWN -WNR -NRA -RAI -AIH -IHA -HAM -MAF -KIF -ETN -VLY -CNA -LYL -KET -QRK -KVQ -ENW -NWS -YVK -VKN -NND -KDQ -QYT -NSF -FNT -NTA -NNT -NTL -ENE -END -NDC -DCD -CDS -SEI -IKQ -KQI -QIF -IFG -FGK -LPR -RKP -SHN -HND -NDS -DSN -VNS -NSY -SYY -YYI -YIP -PNS -NGA -GAN -NGT -TVI -VIA -IAP -APS -SNR -NRT -RTN -TNQ -NQV -QVN -VNG -GVY -YEA -SFR -FRD -KLS -LSM -SMC -MCC -RQT -QTL -VDY -DYI -YIA -VST -SDA -QEI -RTF -TFP -FPS -NHE -KIL -DVD -EPA -PAW -LQV -LLL -PMT -TDA 
-RYI -DHS -FMV -MVH -VHR -HRP -RPF -PFI -KAI -FIF -FET -KHN -HKL -IRA -RPK -KCA -AYH -YHQ -SYC -DFK -FKL -ADT -WPV -TNS -QAA -EFQ -FQR -QRC -RCM -CMV -MVP -CLN -SHF -LWN -NDH -HIR -IRN -NLI -ITQ -TQN -QNH -NHK -VIM -IMP -PIV -IVF -VFP -PAM -AME -NTR -RGH -GHW -NQA -VQS -QSL -NVR -VRK -VMA -AET -TDQ -DQI -QIL -ILF -DEC -KFQ -FQE -QED -EAN -KRE -ATW -TWK -WKL -AVL -PRF -RFS -FSS -TGK -GKT -LTC -TCN -CNK -NKA -SRM -RMV -VDA -NGP -GPF -PFQ -QPV -PVV -VVL -LHI -QEK -KWK -WKE -SEM -THN -NRN -RNV -VIT -EPI -PIY -VVH -VHM -HMF -MFA -FAV -AVN -VLQ -HKI -MAL -KIM -IME -THW -QQF -EAW -AWV -WVK -KAN -YTV -TVY -YSQ -STM -TMS -MSI -SIP -TDG -GPL -LFE -FED -EDV -DVQ -TVK -AHQ -HQA -QKD -RPL -QDP -DPH -PHT -HTK -AHC -CRA -SQD -DGR -MSV -ATD -TDD -DAL -LYP -YPI -PIA -IDE -DVT -TLR -NSI -SIR -STI -TIA -LGV -VER -ERT -RTR -IQF -LVL -QLG -LGN -GNF -FTP -LVG -GPD -PDH -HVH -HCL -VVR -VRD -RDK -ESL -KHS -HFV -VPM -PML -GDW -DWF -WFT -SRT -RTS -SAC -CGL -YPR -PRV -PAI -KSM -SMF -TLC -LCR -CRD -RDD -DDT -DTP -TPM -VRR -KLG -GEF -FAK -FEK -IEG -EGL -GLH -LHV -HVD -EQD -SVR -VRL -SAI -IAF -AFG -ANK -NKK -PIL -IEL -KSW -RVR -VRY -YMV -IEI -QNV -DMD -MDT -DTT -NMY -MYT -TNL -EVR -RCA -CAA -TQR -QEF -NLP -PED -DKR -RQN -QNI -NII -IIC -LLN -NVA -LAG -AGV -IMG -APL -PLI -LIG -EQT -QTV -VSE -IYM -YMQ -NDQ -DQT -QTP -KVN -EDG -DGK -GKW -FMP -MPL -LGQ -FFD -PLC -LCL -LNW -NWL -TDH -VFS -FSI -IMK -LTQ -KFG -FGG -GQW -QWA -WAS -TNI -VPK -PKM -MQK -TNY -YLQ -QRM -RMT -MTC -CLF -MTQ -EDD -VPN -PNV -VRF -FNA -AKS -RIG -GKN -PST -VKP -KPL -LGK -DSD -SDF -DFD -FDV -DVR -RYF -YFS -FSE -SLG -SVD -DSL -LKN -SIK -RSE -IPF -PFL -FAM -AMY -MYL -LRT -EHS -HSA -EIH -VVP -TLQ -VCY -CYP -VTQ -RAN -NFR -KLC -LCQ -NKL -TEY -KSD -NFV -LAV -EAC -ACV -IAQ -VEH -EHL -QCA -VDL -DLQ -AVG -VGP -PEI -ITR -TRV -RVD -AFQ -DFC -FCA -CAN -ANL -NLD -QVQ -QII -IIL -SIL -LPY -PYV -YVR -PNP -PHV -SVI -MLG -YQT -ECP -CPE -CVN -VND -GIQ -IQQ -LSQ -SKW -IEY -EYM -YMP -AGQ -GQL -FDQ -GLC -LCM -CMG -MGW -WLN -HVY -VYA -YAI -AIR -LNM -QFG 
-FGA -APW -PWA -WAE -IIP -IPM -PMI -MIL -MSR -SRN -RNK -NKN -KNY -YLH -HRM -EVC -VCG -CGT -GTD -DIT -TTK -PTV -ADP -VAN -ANV -FNV -SPF -VID -IDA -DAQ -AQV -KPT -NTD -TDV -VKH -KHF -HFA -FAA -LPF -GTF -TFT -FTT -YVH -ISH -HEH -PSD -AHF -AVK -RQY -FRN -LCS -SDD -DNV -FSN -MPT -FTE -ITK -FQN -QNL -NLM -LMK -MKD -KDC -DCE -CEA -ASH -SHK -KEF -EFC -FCE -CEN -ADC -DCR -MSQ -SQI -LPC -PCI -CIK -NQH -KDN -DNT -NTI -IEH -GIR -EDA -AKW -SLC -CMA -MAW -AWL -WLV -VDH -NLK -KEW -EWA -WAH -AHA -HAT -ATI -TII -AMS -GDP -PNY -MTT -TLF -FCI -CIN -INV -CGQ -TKH -KHM -HML -MLP -VLR -LRM -RMA -MAG -SLQ -KIG -GPI -LQS -KPI -QDQ -VKY -KYF -YFA -FAQ -TTA -YPL -LLM -LMD -HDD -LGP -PER -EVF -VPY -PYI -YIG -IGG -QYA -YAT -ILL -VRE -SLN -QLF -ADW -WFS -KVS -IVR -NIL -MVK -RAV -VGK -NLG -EDW -DWD -WDY -YIS -FQK -IND -NDN -DNQ -VDC -CLI -ISI -KFF -FFN -DES -SHT -HTQ -IGD -DRF -VQP -QPF -LCE -DNE -NEG -GDV -SGF -LNK -NKI -VQN -TVR -NKD -DQV -QVI -VIN -NNF -FLP -NML -EFP -FPD -PDV -IIA -GIE -DVN -VNW -NWR -VRM -MAI -IPI -LGM -GMQ -MQF -QFF -DLC -LSW -WLW -LWD -WDT -YSI -VNN -NNL -EIF -FGS -SDW -DWC -WCR -SRL -ENF -FTI -LTT -GVP -NIR -IRF -SYA -YAV -KYD -YDA -KNT -LQT -AEC -ECQ -CQE -MVM -SQN -QNQ -NQP -AND -FDM -EGP -ETF -PVD -INW -NWK -WKF -FNQ -GNI -NID -VHT -HTE -EAD -ISC -SCV -CVE -FSH -HDG -GEY -GRV -VVI -VIF -QRD -GKY -KYV -GVR -EYN -YST -STF -TFQ -FQS -QSH -FDY -EID -INQ -NQI -IRW -RWL -NFI -DKT -KLW -WKI -DAW -AWN -WNL -NRI -FRG -RGR -GRL -LQI -SIV -PME -YGN -AHT -HTY -TYH -YHV -HVN -NSD -TFL -DDL -RVN -ESF -FNI -VDI -IKP -PAN -ITA -EFH -TQC -CNW -NWF -WFV -KGS -RLC -LCD -CDM -MRD -RDR -ALC -AYA -YAK -DPQ -QSR -SFF -KFS -NGR -GRY -TRD -YLT -KVW -VWD -WDL -MES -PVE -ETY -TYP -YPV -HNY -YLR -RTK -LCA -CAL -IFD -FDK -KFE -FEC -CDW -DWS -WSG -HIL -ILT -GSY -SYH -YHN -HNL -FRS -YAR -ARG -NNQ -KTW -TWE -WEA -EAR -RPQ -EPH -HSQ -FVV -QLQ -QFD -HTA -TAW -AWH -WHP -HPK -PKD -DNI -TNN -NLY -LYI -YIF -IFS -MGR -GRW -RWG -WGR -PDP -PQM -MQT -FMR -MRQ -SIT -IGN -GNM -MLN -TAI -INI -SWC -WCF -CFS -FSQ 
-QIK -GAL -ADI -EFN -NHD -RDP -SKA -RRG -RGE -INK -WLQ -QKN -VHF -HFL -WKV -KSF -GGY -GYN -YNT -NTK -NGL -PQN -VTA -VKQ -RRT -YHI -LWH -WHL -HLE -NQS -QSY -YNI -TNM -TEC -ECN -CNV -NVF -VFV -KGT -TIR -CDR -DRH -HSK -QFE -PEN -NRS -SGR -YMI -LSI -LHM -HME -VHE -HEY -DCI -CIF -ECC -CWN -WNG -SIM -IMT -MTG -YNN -NFF -FFR -LKP -KPR -KVC -VCT -CTG -GKR -CLD -LDF -FNK -ENI -QDK -DID -IDT -TRK -SFL -RDH -HSY -IST -NHT -HTG -QVH -HRR -WLP -PQQ -QQN -AYF -RPE -EGY -YNL -PAT -LRP -RPM -PMD -LMV -TPR -SDY -DYE -TYM -YMS -WNF -NFE -QSF -HPH -HHC -HCN -MRA -RHT -TKF -FFE -HSG -MEN -ENR -NRP -RPV -TYQ -VHD -HDY -CVW -VWN -NGS -RMF -TKR -AIL -VCV -DFS -HPS -MRF -RFC -FCV -AWF -WFF -FFP -FPN -NTT -TTR -VFW -FWD -WDA -AFS -SNF -FTG -TGC -GCH -CHH -HHG -GQN -GLY -YFQ -RFG -FGY -GYI -IPE -PET -TFS -FSG -SGN -FTD -DDF -ELY -QTN -TNF -LDA -LTI -TIQ -IQH -QHI -IVI -VIP -PRC -RCG -CGN -SLM -LMH -HGG -EVN -RTH -HLH -LHA -HAV -YTL -FPG -EPR -PRW -RWP -PRN -RNR -NRR -RRD -DLT -LTY -TYA -YAF -PKN -SRA -FGR -RWS -WSD -FTL -FST -ITI -TIG -IGF -GFY -FYT -YTG -GDH -EPF -LAH -HAF -SPP -KFH -FHL -HLD -WVV -ESV -AVH -IGH -GHL -LGH -ESI -IMY -MYP -YPT -PTI -LTN -VEG -EGI -IQY -YLY -LYG -YGA -KHQ -HQR -DTG -GGF -FSA -RID -IDG -DGS -TVG -VLW -LWF -WFL -MGS -PLR -KPG -TSW -WNS -VRT -TQV -EYG -YGC -GCF -CFE -KGH -LNG -GNK -NKP -KPE -EYD -GFT -EGM -GMG -MGV -VGR -RIT -LMW -MWP -WPE -CET -SYG -KRM -KMM -MMV -MVF -FES -FGM -HFD -SFC -CES -LHF -HFM -MRY -QPG -PGK -GRS -RSP -SLH -HKD -KSI -IVN -NQN -QND -EFE -GEW -EWI -WIL -ADN -DNH -GDC -DCF -CFM -AWS -WSN -RLH -QAR -FSF -SFP -FPK -EHP -HPL -LLF -LFN -FNP -PFE -YCF -CFT -FTK -KEG -CDL -PAQ -PFR -FRI -QGP -ERP -RQQ -QQC -QCS -CSQ -SQR -QRI -RIQ -QGE -NQC -QCR -CRS -RSQ -SQM -QSC -SCC -CCQ -LQN -NVE -EQC -CQC -MPG -GWS -WSC -SCL -CLV -FVG -VGQ -VQE -QTK -MLE -LEG -AQY -CQG -VIH -IHT -IDV -VSH -SHV -HVL -PRQ -IYC -YCS -CST -AGP -HEE -HHE -STW -TWS -AYP -YPY -PYS -YSK -KNG -NGG -GGT -HTC -TCA -PMY -MYI -YIY -YGE -ERS -VMI -KNK -VYV -YVG -VGN -GNV -VAW -AWA -AHI 
-NVQ -VQG -GQF -QFY -TPH -HQS -SYD -LNC -NCT -EWG -WGL -RLD -SWS -WSL -LLY -LYW -YWL -VSF -PFY -FYN -YNY -NYR -YRP -RPP -PPF -PFN -FNC -SKF -FTF -FSY -AQR -LGY -GYV -YVP -SWE -SEW -WIG -IGT -EQH -QHR -HRE -RET -DTK -TKS -GGL -AFR -QNR -TAC -ACI -CII -DVF -FGV -GVT -VTH -THR -MNV -NVN -VNV -CVQ -VQA -PVF -VFI -IYT -YTS -IEV -QNG -NTW -TWP -WPT -PYP -NGW -GWN -NGD -GDT -LYT -YTC -PTY -TYI -SIN -INE -NNG -SVG -TVN -KAP -YDN -NYI -EFG -SRW -LMY -MYW -YWI -SYQ -YQP -FNR -NRH -YKP -PLY -LYS -YSW -VEW -EWV -WVG -RHK -HKE -TLK -KSK -KTQ -YRT -KHK -VTV -RGD -DIV -QGM -GMS -VII -IIH -DAC -TFH -FHT -MVN -VNR -KNN -KRH -SIQ -NYT -WGF -GFC -MVT -VTI -TIS -ISY -GYE -YEP -QVP -YLV -GGC -GCG -CGF -GEH -EHI -LEW -EWE -WEP -PRL -LHL -TGP -GPV -PVQ -VQV -QVT -AIQ -QAH -HEV -GSH -IHK -VQT -TGT -GTR -TRL -SSM -GHP -HPF -PYE -IHR -HRH -RHP -HPY -YPC -PCS -CSK -GRK -RLF -AIP -EHG -HGR -AWM -WMH -MHI -LMG -MGG -QVY -VYF -YFC -FCY -CYD -YDK -SPY -SYE -EDF -FNM -MEF -SPC -PCG -GTH -PYW -WLL -LQW -QWL -PYT -TNK -RHF -HFG -ART -RTI -IHW -HWV -WVQ -RMG -DAS -ELG -VTT -DRG -WVR -DVC -VCA -TIF -IFH -ELM -DEY -QRS -NVG -GTE -TEN -HAG -GVQ -YTD -DLY -AQN -GVD -DGM -GML -CAI -IRP -GIW -IWG -WGN -GNG -GDQ -QTM -GHV -HGF -GFI -AAH -DGT -APG -PGQ -GQA -YFI -FIN -PIN -INM -MFE -FEF -FAR -QRW -KMR -MRI -SGP -GPA -AVR -VRW -RWV -WVM -VMT -TGW -WQR -HFR -FRF -GFP -PAP -RLY -NYF -LFT -TTQ -QAL -YYV -QMK -ARA -MMK -QLH -RMR -GRT -RTP -RLE -AHN -HNI -LQA -CLQ -PLM -LMA -SFK -LDP -PDS -SMG -EMS -MSC -SCA -ARI -FEM -EMT -MTL -LQP -QPL -HKK -DWN -WNT -QAT -QGL -LGG -GSP -HSH -HTT -MAN -YHF -FVT -KED -YAN -ANY -IQA -QAD -ADY -NHG -PSM -SMT -MTA -THF -HFP -FPR -YGV -GRE -CVM -VMM -MML -GMK -FCS -SYL -PEP -LMT -MTF -LYD -DDW -DWM -WMR -CSR -PPE -YLM -MKF -VNK -NKM -KMT -LLW -LWP -WPP -DQA -QLD -IQV -VGV -GVV -IQS -QSA -DIN -INF -QDT -DRL -RTE -PAR -PTM -TMP -PPQ -PPG -GTP -TVP -PGP -NPA -QVD -SGV -QPR -HNV -NVH -VHK -TAM -PLN -LNR -NRL -HTH -THM -HMA -QCK -CKD -HFS -YFT -FTH -HRK -NHS -APF -PFS -QEE -MTS -ALH -HDV 
-QEN -FNN -GIF -APQ -QQV -MTV -LPK -PKP -PTD -VGT -PCP -CPA -SNM -NMP -DQG -TED -GGH -HPP -PRG -EMH -MHW -HWP -PMK -AIG -LTM -AGY -GYL -KWP -WPL -FVI -KRC -CVY -VYY -YYF -YFK -PQG -GAF -FSL -LSG -SGY -YNR -RVM -VMR -FPF -PFK -HIS -KKH -KHR -HRT -RTW -TWF -WMA -GHF -HFH -FHE -HEK -PLD -SFY -FYG -TDN -YEH -EHD -EPD -PGR -MHP -PAY -YPP -DMP -MPR -RAH -AHS -SFT -GPG -KHG -LPD -LCP -CPR -EPC -DPP -KPP -PPC -PCF -CFR -EPW -PWT -WTP -PGH -HGA -GAC -IMA -RNC -NCD -CDK -RGP -GPP -SEP -PKF -AMP -VAP -APR -RQP -KVP -FVN -VNT -ESC -CEV -LYC -CIR -GKV -LVV -VVW -WDE -ETS -VRN -RNY -RIF -KFY -GSM -SMV -EHY -HYH -YHT -THV -PSH -SHQ -PYG -YGY -GYT -IQI -QIE -EIN -TFR -GNC -NCI -RPY -AQI -CQK -HAA -MSN -HEW -EWQ -WQF -FDN -NAW -AWQ -QEM -EML -LNH -QKV -MDA -DCH -EHQ -FRR -NKS -SRP -PYF -YFE -QVC -TYS -DIH -HRQ -GDF -DFP -FPT -PGV -FQL -EKC -KCD -CDY -DYP -YPS -GSQ -QMS -ACD -DYD -VRP -DVW -VWE -WEH -EHE -LDH -LMM -QQT -STE -QRP -RHC -HCD -CDV -TSC -HHQ -HQL -NHL -TPI -PIK -VSM -SMR -MRE -DRS -RRR -PRI -LNQ -QST -INR -ARQ -KFR -KPY -YWE -RVA -RQF -QRV -LVH -ARY -AMG -FEL -KYY -YVQ -KMA -IHE -MGP -RGC -TSV -DSC -SCS -CSN -TQS -QSV -GPT -MPD -PDQ -DQF -QFP -RPG -GMM -MMF -FPV -SEC -ECS -PEC -ECE -ERG -ANN -NNR -NRM -LQC -QIG -ISA -REH -HKA -LQM -GKS -TRM -GCD -GVK -YHS -HSN -WDD -YGD -HAD -IGE -IFN -FNS -QLW -WMV -VDN -FQT -QTE -YWS -WSE -LGF -LHG -HGY -FEH -HFK -FKD -DQM -QFT -FTA -NDT -QTR -VFN -AFP -KFA -AYL -YRW -RWH -WHS -SYI -TPD -FHS -QCL -CLW -WRW -RWW -WWK -WGC -GCP -LTF -TFI -IRH -RHR -EFY -IDM -DMV -VKT -DMY -MYD -DTF -KRW -RWD -WDP -MVL -EMA -QGR -AEW -WIA -TGY -PTF -FEN -GHR -QPI -PFP -FPH -HHI -ILQ -IDF -NDY -DYA -YAC -CSI -TRC -RCY -CYK -ASC -SCT -SCY -CYM -STQ -MIE -NWE -WEF -PDN -DNN -NNA -API -KHA -AFN -LHH -HHF -HFY -YRD -DGY -GYS -LDY -QFA -SVQ -VQQ -CVK -AQW -QWI -SCI -DNP -DMI -YMR -LIN -CLG -GSC -SCN -DFA -CGY -GYA -IVC -CFW -HSD -GQK -III -GGI -RGA -YER -GLQ -GPH -PHG -HGW -GWR -WRM -SWG -LDQ -IVV -YLP -FQQ -QQH -QHY -HYG -YGG -HRS -RSD -KLH -LHN -DIE -IHS -DAP 
-AEM -EMK -IGY -HFI -QRY -RTA -DWG -YNH -NHC -CDP -QDR -WRN -NNW -NWW -WWQ -WQM -HAP -PLQ -LQY -AVM -MAM -MED -LFA -GNL -LDW -DWE -RRP -RCS -SRI -IQT -RFW -FWG -WGE -WHV -EGT -TAR -WFI -YAD -DWL -LWG -WGY -GYD -HIA -MPQ -EWR -WRY -RYA -YAL -NWQ -WQP -PPY -YDW -WSW -WML -IPD -CNP -PGC -GCV -CVD -QGV -QLY -YIC -ICF -CFP -LPM -MTI -TIP -IPG -MKT -QTF -PGI -RWT -RGW -WQA -PDD -DDY -RFP -GMT -RRY -RWK -WKP -KPW -PWR -HIW -IWY -WYT -EGW -QPD -RIC -ICV -LFF -FFA -FAP -RNA -NPW -PWN -AGK -LYM -FQH -QHF -NAV -VEM -MYQ -YQR -QRN -RNF -TMH -MHS -RFH -KHY -HYS -YSF -TRW -RWE -FYS -GPM -PMR -MRT -TGH -NWI -WIV -IRT -TGR -TTD -DSG -SDG -QYY -FWI -WII -FLY -YDL -ACW -CWA -WAP -LFG -IWI -WIP -NYD -YDQ -GYM -CVR -RGM -GMA -AYV -SKM -GIP -IPY -PYR -RAM -KYA -YPH -PHI -HIE -RTM -MDP -MRP -PGN -HSM -SML -GIM -IML -YPW -DRR -MWC -VQD -QRQ -QQI -INA -RNQ -EMR -YLN -PTR -NPC -QYG -DAH -AHR -HRA -QAW -GRA -AHH -HGC -GCS -SRH -GVH -VHG -AWI -ASF -QNP -NPM -PMG -LMP -VYW -YWK -WKG -RRW -KIW -IWR -WRA -EYA -GGN -DRY -YYG -FYA -YAM -AMR -MRL -RLW -WPG -GEI -GTK -FAF -MVG -GKP -MFY -FYM -YMT -TGQ -VVV -GMV -HQG -PHY -GVW -VWI -PNN -RKY -HAI -IIG -DTY -PEM -LCW -WVP -VPG -PGY -YSD -VEP -KPF -PDL -PMN -MNM -NMV -VMQ -MQQ -HPR -KVG -TWG -WGK -VGM -IGL -LYV -GIY -IYV -RHG -HGV -EHN -HNE -QMR -MRV -KYQ -PIT -TEW -EWT -WTV -LME -AWW -WWG -WGP -PWF -WFA -IIV -KRF -FMN -MNE -SMP -HHM -HMY -MYG -GQY -YGQ -GQG -WLI -LIF -QYR -IFA -KWL -ESG -DFH -FHR -HRG -YDR -DPT -IKH -HGP -RTD -LYA -PVM -MGH -GHT -TVQ -RTY -HGI -KHT -HTP -KMC -MCW -GRP -AYG -MKV -TMW -MWA -WAK -HEA -CGG -LVF -RYR -WLD -NAF -VGH -SAP -QAG -QDW -DWT -YTA -AQG -GLT -TTI -SIW -IWL -RQD -NIE -PDY -RMD -INP -DIG -GRC -CTK -DRM -MIG -QNF -NFA -PRY -MHA -FEG -AIW -IWS -WSM -GPS -ATR -RRN -VPQ -TSH -CSP -DNG -SFM -FMI -MIF -DCP -CPP -AQH -QHC -CRK -RCR -AFF -FFC -FCP -PPN -AIE -AID -GNT -FYP -AMV -SYR -QDM -MIC -CYN -YNQ -PTT -GQC -QCY -DHR -GCA -CAC -ACP -CPN -CCS -KCN -YKT -TCP -LCY -MFM -GCI -CID -CPK -YVC -VCC -CCN -DRC -RCN -VCL -KCY -CYV 
-TQT -QTC -CEK -EKY -VSY -YFH -FHD -YEC -ECT -CHR -GPY -PYN -NVC -LCN -MGE -THT -HTI -HTS -HLN -KFI -ITY -EIP -NAN -LII -DFF -FCN -TSM -TYF -LLC -LCT -CTF -FLH -HHP -LHQ -HQT -FPL -PMS -LFY -YRK -KTN -TNV -YKH -NMR -YGP -LSH -PHD -HDT -HEC -FLC -CFG -AQQ -SGC -GCR -CRF -LWL -EMD -EGF -VGF -TWV -PQK -HDA -THC -HCG -CGW -WSS -GWP -MPM -IYI -HLP -RPC -PCL -NNH -HIY -YTY -TIM -IMI -FVF -MGA -YLG -ACF -CFV -VIC -ICI -EGC -CIH -IHF -HDI -QSD -PKG -VML -LTH -THK -HKG -YMH -HSE -LMC -MCV -LFH -FHI -QFC -KYK -PFV -PPI -TVM -IKF -KFP -QGY -GYG -YGM -AMC -MCL -MKI -QIM -TRT -IDK -WLH -DND -FIV -KGF -HPN -AFV -YKR -VFF -FFV -PKS -TKQ -PNH -QIY -NSR -IQP -QPK -IVT -CHV -CLH -QAN -NEH -YIH -DVM -MLC -LCV -IQR -RYK -WLY -IDD -TFY -FID -SPN -VVC -TTN -EMM -TGM -MSK -SHR -HRN -GEQ -FIC -CTV -MFH -YGL -YGS -MHE -HEM -MMS -SMH -MHT -VLC -KYP -TGI -RYG -YGT -GQI -GPK -PKQ -GYF -WLR -CYI -IFV -GYQ -FPM -YVV -APY -IKI -MDS -HAR -PNT -HVT -VNH -HPD -NIK -ESD -FHV -TFW -WPD -PDM -DMK -KYN -TWY -IHQ -SHP -EYP -YPK -PKL -IRS -RSC -CSA -LMS -PHK -KPV -VCI -FGW -WFH -FDT -KYG -INC -NCA -CAV -FCK -CKK -FKV -DYS -TRI -RKF -FLM -MEC -ECR -CRN -PRD -PPM -HLR -GHQ -HQP -DYC -YCT -PCH -MIT -DPI -PIQ -QMP -EVY -RGS -SNV -NVP -PSC -TPF -RKC -CVP -QFQ -MDR -KCP -CPH -PHR -YTK -YDS -EKW -KWH -WHA -KDH -HRL -REF -FGD -FGE -RND -SYT -PEW -EWF -TGF -CNG -NEF -VPC -SMI -RWF -PHE -QNS -GNY -MLQ -PFM -GDM -TMK -MSP -GQP -GLM -VFQ -TRA -PIG -FQG -GMR -TAQ -AQM -NFY -FYQ -GFG -DRT -KMY -MYE -NRY -VPH -HVP -LHP -HPG -VHP -PQH -SHA -HMH -KWF -WFG -LEY -DYK -APM -PMH -NPK -QTS -THQ -MPH -SHC -HCV -SDC -CVT -KQP -QPM -MNA -GWV -LFW -FWL -WLG -QKW -KWW -WWH -WHT -HKN -QTD -QID -NFG -TPN -NSW -SWF -VDT -EFW -WQN -NIT -LLI -GTN -ESN -NRW -RWC -WCS -CSW -YQL -QLM -MLF -MLW -DPG -RHW -HWD -WDQ -NER -HEG -FPY -PYA -QMN -MNL -KLY -FAD -TKC -KCH -QKH -YKI -NDM -MVI -SHI -HIQ -ECK -CKY -KYE -RQV -KLM -MKL -YVT -VKM -DHY -HYA -DME -VFC -CIT -PIF -IFF -FFF -KIP -WFK -KSC -SCK -CKG -CAY -CKS -LQH -QHP -PWV -WVE -MRM -MLH -SHM 
-NSM -QGN -GYY -YYD -KGW -RYP -YSP -PND -ITP -IFC -NAC -QVL -NKW -KWT -WTL -TCD -LCC -CCT -HLC -YWA -WAI -TDP -IDY -YVN -LTW -CTI -AFY -FYI -YGR -TRH -RNW -WRL -EVH -TPC -CAP -IIM -MGT -ILC -CWL -PFF -FFI -PFC -CHM -HMP -VIY -YAY -YFN -IKC -CKF -KFC -FCR -CRQ -WNI -WRR -RRC -RCP -CPV -YQI -FGN -CVI -IFI -ITW -TWI -CRI -ILM -DTC -VHH -HHY -HYV -LHC -HCK -CKP -ETC -IQC -HNC -IYQ -LPW -PWK -ITL -TMY -CDF -DFW -WLS -TCC -IMH -MHL -TPK -LVW -VWV -FFW -FWR -WRQ -PNK -VCW -FII -PIC -ICK -CWF -FHM -FNW -YTM -AFH -FHK -RFK -FKC -PNQ -GAW -AWD -YTT -TWN -DIW -IWV -WVS -AGH -GHA -AMI -AVW -TAH -QIS -STC -TCG -CGA -ILY -ITG -ICW -ICR -SCW -CWI -WIH -IHP -HPA -FFT -FTW -TNC -EKM -MLI -ICM -CMT -YIV -DRW -EVW -VWL -CTC -NAI -LMI -TVW -VWT -WTI -ISM -SQC -QCT -QHD -HDH -IYH -YQK -FAS -CKL -TFC -TEF -IRI -DHP -SIY -ADF -IRC -MPS -NWT -CEG -KNW -WSA -MAV -DML -MPV -WIY -IYL -IHH -VFK -FIP -IMV -SIH -TMQ -MQS -ACK -FLF -VMW -WCP -CPF -NIM -CNE -FVW -YIQ -CQY -KQH -QHS -LMQ -DCS -MEI -PTC -NRK -QVG -CTD -FVH -LHW -HWA -FVM -AMW -WLF -NQY -QYN -TCV -DFM -FML -VTY -MLD -MRR -CNQ -CNY -KIY -IYF -RNP -FFK -GIN -EQV -SEH -CDH -TVH -VHW -IWP -PHN -TCE -HRD -NDG -RTT -VNA -VGY -NYQ -KCS -YFG -MFQ -NWA -VMV -DWP -CPI -PIW -CIA -NYP -PHP -ITF -SNH -IMM -MMI -MII -KVY -SYP -TMG -SMQ -CGP -GPC -PCD -ANI -RLM -SWV -TRY -THI -LGC -NCK -CKQ -VHS -VWQ -FKF -QNW -NWP -WPA -HNA -YDY -YVW -VWP -YLC -PVW -WIS -IVW -VWA -IGV -TTC -TYC -YCL -CLT -YVL -HGH -KCC -CCK -CKR -IMW -IYR -SNY -LRC -NYK -NIY -YRH -HTN -MQV -TFN -FQF -NCC -CCC -APN -CGK -SKY -RCD -GKG -HNS -RDW -DWR -WRK -TTY -YIW -WYR -QFW -FWT -QWN -WNP -VRH -RHQ -EVQ -QNY -YNF -NFP -QNC -SLW -WEL -FYV -YFV -VCM -PLW -QLT -MGN -GNH -MCG -ETR -LWS -WSV -SVW -VWH -WHY -QYW -YWT -VYI -FSM -NYY -IAW -CSH -HMG -DFE -WSI -IWQ -WQY -CIP -IPQ -HST -QWT -GVF -PDW -MAH -HVG -IWH -LWA -WAC -CIL -DTH -THH -YHL -QKY -KYH -YHK -YNW -WTK -VHA -VWY -WYQ -WND -IWA -APD -ENY -TFK -HDK -QFN -RRH -PNC -NCR -HFF -TIC -HDE -VWS -WSQ -YLF -ALW -WGG -TRQ -QYH -CVH -SMW 
-MWY -RNH -QTY -WQL -GCT -APC -CAE -FWF -WFQ -LCH -CHF -WSR -GGW -TIN -DQH -QHG -PFT -ISF -WDN -NWN -KEC -DKP -FYF -YFP -DSM -WEI -MSM -YII -MMN -PMP -RCC -CCP -CPT -SGW -GWT -NCP -CPG -GQH -IWN -WNC -NCY -CYS -YSR -HTF -HHA -RPW -PWH -WHN -HNQ -VQW -ECG -SMS -ANW -RAW -SFQ -QIH -HHR -REW -YEQ -CRP -FKM -GQM -WTR -ICL -MVW -ISW -IRY -LDM -RNI -QTH -THG -YVI -VPF -DAM -CWD -WDR -DRQ -MPF -PFG -CCI -AIC -CQP -GCW -WVI -QGW -NIG -CSV -AYY -STH -MGC -CFC -CLC -DYT -QVW -YIN -GQT -QWE -WES -QCH -CHP -VCR -WAY -EMF -MFI -TWC -WCV -FCF -SRC -RCH -HLT -TFF -IWF -WFY -NHN -PQT -QDN -TNH -FAW -RWQ -LWI -IAC -WNV -RHM -MEY -EYT -NVM -TWA -GWG -CQV -PSY -IYK -DTW -TWR -WRE -CSC -SCD -IWK -WKS -NYN -KNF -PNR -SQH -WFP -SWD -WDI -AWG -WVA -VMC -GWH -WHE -YCR -GMF -MFF -VTW -CDC -GYC -YCN -TMN -PCV -HCP -CCL -TQI -FCC -CLE -PRH -WST -MMD -YGH -SWA -PTW -TWD -INY -NYG -NCL -KWI -WIF -FGH -GKC -KCM -GWA -WAQ -FMY -MYY -YYQ -AKH -HKF -ECA -KHE -ICG -GMH -CTR -DHH -HNW -MIH -HPV -WMP -RVQ -DHC -GHD -AWE -WET -CHI -HIG -NTY -QNN -GAM -HLY -GWM -WMI -TCI -RYS -HNT -NQW -QWS -DPW -YSC -MPK -EWK -YAQ -AYW -MLY -CAR -KYM -YME -EYQ -MTN -NHY -ATM -FFQ -RPH -QMF -YGI -IFT -MMP -TDF -CHN -EWY -HSI -NFM -FMD -MDF -FKG -NKH -YMD -YDI -MCP -QYQ -TIW -VAH -HEF -FPI -YPF -HGN -SCR -NMG -MIN -HTD -MFW -WNH -MCI -WEN -SVH -CRV -KMD -FQM -QMI -HDS -CHG -WAV -PKW -PQW -QWP -TSY -QWR -TCR -SEY -SHY -HYQ -SFW -FWA -WAM -GFN -VMK -DYW -YWY -WYS -HWQ -WQT -VMH -MHF -PKY -VKW -WWS -HMM -MMT -MTY -KWD -TKW -KWS -HPM -MKP -KPH -HNH -QHA -MVY -YYT -EYY -WDF -GYH -WVY -YGF -GFH -HDP -NEW -WYE -YKY -YIE -EWN -GTG -YAS -CSY -GMN -RIW -IMD -TYE -DHF -HIK -PCC -CCE -CEW -GHY -DIM -TDK -KWN -QGF -GFM -ATY -TYG -YGW -MWR -NKF -WTG -PYL -YQN -QNT -RGI -GHI -HID -REM -YRC -QYS -DYG -CRY -GWD -GMC -MCA -CAF -EMC -MCK -DMM -MID -MFV -VGC -FGP -KAW -HLK -DWV -KYR -WTC -TCF -DSW -GTW -WVH -TNW -ICA -CAK -HIM -YCA -MLM -FRW -WGI -QFH -FHH -HHT -SCF -CFL -NHH -IDW -DWA -WAR -ARW -WHF -FWV -NTN -KKW -SAH -CGS -VWF -LWW -WWR 
-WRP -HRF -SHH -HGK -NYE -YEG -FSC -NAM -MKM -KMG -IGM -TWT -WTM -SWQ -WQD -PMM -FPP -FIH -HHD -RFA -NQM -CIG -IGQ -YSH -IIW -IPW -PWY -WYL -HKY -LWE -WEG -YDF -FGI -KGM -HSC -GTC -WPF -WWL -HQD -WGD -FKP -NYV -YMM -WGH -HQF -GGM -YDM -QGG -QDC -CDN -FDC -EQY -MQM -GYP -WLM -FEW -FAY -HLG -DGC -HLF -AHD -GFW -FWW -YTH -THY -HYK -TQF -FMH -DMF -YFF -VWG -WGV -TEM -RMP -FNF -RHY -HYC -YCE -MQI -FFG -YCK -CKH -YYN -PPH -CPS -GHH -KQN -FTC -GCK -CRG -QHH -QYC -MGM -DPM -MGI -EPY -QCM -CMQ -NIC -EYC -CRT -YPN -TRF -RDM -WPY -LYK -MNC -NCV -SCQ -CQA -GHN -HNN -AMF -LYN -NIW -ECF -FAC -ACA -RHV -EWM -WME -FEY -DDP -TMT -MTW -IPH -CNI -NIQ -NVW -VWK -DHE -RWN -FCT -PCW -DMW -YWF -FHG -YMG -DHW -HWK -VCK -SWK -RFF -CAG -MFT -VMY -MYN -PRM -RMC -CPM -THD -CKI -FWQ -WQV -MDY -RME -WGA -AYM -YYL -FFM -KNM -YCP -CPD -WDG -HNF -MGY -LEC -CLY -SWI -NLQ -GSN -PQC -QCV -PNW -IYY -NMI -EGH -HIP -WQG -MWS -HII -YNG -RMH -WKR -RHI -YNK -HQE -RWM -DHT -WEV -KCE -IWT -YRL -KCG -WSP -KGC -CKA -HDM -MPN -DKC -DNR -LCK -YRM -RFQ -FMM -WMS -QIW -HVC -NWD -MDG -FIW -DPC -HED -KWQ -WQQ -PVC -HLW -TYN -GHK -EWD -WDS -PHC -HCI -CIQ -IWD -WDV -AWC -IDC -DCA -CRR -WPM -QYD -HYR -HCW -CWS -WHI -WRD -KMP -VIW -AWK -WKH -HYP -FFY -QCI -CIY -FAH -NWG -FYE -TDW -DWK -CEP -HMV -DKW -YFY -VDW -HKS -MQH -FDH -CCM -MMM -DYN -WKQ -WCL -IHN -NNY -NYH -VNY -HTR -MNR -DMH -MHY -GTY -IFM -QRT -GCE -CEI -YEM -RMM -YTR -YAP -YMN -WGQ -WNM -GHM -WQI -NFD -WEY -FKY -HYD -HVW -AMM -RMY -QWV -MMQ -MSF -HFE -WER -HQM -VQH -YPQ -PQF -GHC -HMN -MNT -FFH -MMH -FIM -MIY -IYW -YWH -FMC -MCS -DWY -WYA -MWK -CMR -HYN -GWI -WIW -KWC -PWQ -RYC -THE -YQF -ERM -EWP -SWY -WYN -WKY -WEC -ECM -CME -RVW -VWC -WCK -RFY -NHM -KHC -GWC -HRW -RWI -WIK -FRC -HIH -RCW -CWP -CGI -FVC -VMN -KDW -AMH -MHQ -NQR -TCY -CYT -YTQ -HHV -AHY -QSM -LMN -FMT -MTH -GRN -NMQ -NGY -AWT -FCG -NMH -MHM -YCQ -NWC -CKT -VCE -HWH -NLH -VWW -PCR -RWY -WYF -YCM -QVM -QHT -HVR -RMN -QPW -FRH -HQK -YKF -MQN -KWE -TYV -HMR -ICH -KYT -TDM -CEY -CVC -PAC -NFK -KCF 
-YNC -QSW -WEW -WPW -YQH -NFH -MSY -YNP -DQP -HKH -MTK -KAH -VKC -YKW -GWW -WWP -MWG -VYC -YCG -HSW -WNE -CFI -CLM -CHK -RCQ -TCQ -PFA -NNC -QGC -MNY -NYM -KQM -QME -NCF -PDC -WAN -RPN -VCP -WIN -PPW -PWL -CRH -PWD -SYM -FGC -YIK -VNC -YTF -SNC -QHM -MEH -CQT -ITM -EYH -CQF -DYM -SMM -QMH -CYA -MAC -WVN -WAT -FWM -WMT -CCG -CYG -WAF -EPM -MVC -HWG -ELC -RCI -WQH -FWH -QWQ -AGW -NWY -WYC -CRW -CQS -LIW -CAQ -QMW -MWT -CER -ERC -VGW -IAH -NAQ -WIM -MKC -FQC -MWE -TQM -YHW -HWS -NYA -WMM -MMW -MWN -WNW -NWM -YEY -PCQ -HFW -FNY -NHR -NSC -TNG -HVM -HQW -EYW -IWE -HCE -PYH -YHD -YKQ -SWH -HAY -QMY -KIH -WFN -CSF -RCE -YCH -GRH -YNE -HQN -QPH -HYL -MHV -WIT -SCG -SPW -FHF -CIW -WAG -CTW -YAW -RHH -NFW -MNK -GEC -AHM -CYY -HEQ -MWV -IMR -FCD -HQC -CYF -MHC -PMC -HQY -WTH -QKC -HRC -HYF -CYL -HKC -WPS -WDC -FMQ -QHK -CFK -NEC -DNM -CQM -QMT -MDN -DCK -WDW -LHY -TKY -FPC -MDM -QWF -MDW -DWW -WWE -GLW -TWM -MSW -WEQ -WKN -PMQ -WAW -WMQ -DCY -CYR -CFH -HMS -IWW -WWI -PFW -WVC -ACY -MNS -CGC -GCM -TYY -YYS -MIM -MKW -HMI -FWE -MKH -MEW -SMY -MYH -HYI -CKN -NMM -RIM -SKH -YEW -CQR -RYH -HTM -WKT -KMN -FKH -TCK -WYI -HNP -NGC -MRN -FHW -EIW -KVH -WFE -YCY -AHW -TYW -YWR -WNA -EMG -CFF -HYT -FHQ -NKY -HHK -PCE -FCM -CMY -DHM -QQW -QWY -WYM -MRW -FPQ -MME -MYR -LWQ -GWY -WYD -HPW -YWD -CAH -EQW -QWK -WSH -NMC -PNE -FYH -QKM -HWE -WHD -RQW -SWW -WWA -MYS -KQW -WWT -CPQ -WIE -ACC -CCH -WEK -GMY -HFT -WTY -MMG -WTN -YYM -NTH -YCC -CCF -DYQ -WEM -WGT -NHF -CMS -WGS -MIW -YQM -IHM -QDH -TWQ -CAD -GNW -NWH -YYH -YYY -YFM -TPW -WED -MCR -YNM -WWD -MYV -YWM -SCM -CMM -NRC -RCT -CTN -YHM -QWC -WCT -TTW -TWW -WWY -WMG -YYC -WID -YVM -WIR -FYC -FWS -FYW -WTW -RCF -QQG -HMD -HEN -CKM -MKY -HCF -SQW -TYD -GIC -FQW -IFW -YQY -CCY -WAD -WSF -MYK -NDW -MIP -QWG -TCW -CWW -YLW -TQW -IHY -MQC -QCD -WTQ -MWW -VWM -WMK -GMW -MQW -NCQ -CQI -MRC -PWP -WTF -HVQ -HMC -DWQ -ILW -PWS -YHH -CPC -YHE -HAK -RNM -CEH -CMF -QHN -QCE -MDQ -DHQ -YTW -WLC -MCF -WFC -CFQ -YCW -CWE -MPW -WYK -MGF -FTM -CWK -HWF -PCT 
-MHN -HKW -WYV -DCW -CYQ -CAW -HWC -HWR -RSW -PYC -FKW -WFW -FMF -YMY -DCM -YDH -LWY -WKD -WRF -DKQ -QEC -WTE -CEM -GCY -MNH -CEQ -HYY -PYQ -QIC -GPW -PWW -MCD -WHR -NYW -QWM -CQQ -YHC -FCH -CHQ -QCF -NFC -PCN -PWG -CMI -CTM -QCP -WWN -TMC -CYW -EHC -CCR -FTQ -CNF -FDW -DWI -PWM -YWG -KMH -PWE -KWG -WGM -WHM -WPQ -CHY -VWR -WRH -CYC -AWY -DHN -CIC -CPW -ICP -QWD -CQW -CTY -WRC -WYW -MWL -CGH -HPC -PCY -EWH -QNM -PCM -QMM -WMY -WPN -WCE -HQH -CNN -CMW -PCK -QWH -NTC -HIC -CMC -MCQ -KHW -KCQ -MHK -CWG -HMT -WFM -IWC -CML -HWT -MHR -DQW -IQW -WVW -WPC -WHG -WYH -IEW -VHY -YQW -WDH -CHD -QPY -WKC -YDC -NHW -WDM -QPC -CKW -KWY -NCM -CQN -MYF -YMW -MMC -KMW -MWI -MHD -ECI -CMD -WCI -CGM -GCQ -MCE -WWF -WTT -HDC -FCQ -DMN -PWI -RMQ -WGW -WYP -MYM -HCC -CDQ -MNW -CMP -RCK -MWD -FPW -QTW -WNY -MCT -MHH -IWM -CFY -HYW -PHW -HWW -CFN -MWF -HCM -MWH -GYW -HAW -DWH -YWV -NMW -QEW -CNC -WDK -NKC -GCC -MPC -MCN -CCA -KWM -MCM -HWL -WSY -CKC -WMF -CWY -HCQ -WCA -HMK -DHD -YHY -DNW -WCD -WPI -WFD -WHW -WHC -HCY -WHQ -IMC -KPC -YMC -CRC -MCY -ECY -MCH -HWI -DCQ -PMW -LWC -CRM -DMC -MNF -HWY -YWW -YWC -WYY -EWC -FWC -FWY -WMN -WWV -EWW -WCM -CAM -WKM -WHH -YMF -WCQ -WIQ -MFN -ANC -ECW -WCG -CIM -WQC -CMH -MYC -CTH -HHW -QWW -WIC -CPY -MDC -NYC -CMN -WHK -MMY -DEW -QHW -WQW -CEC -TWH -HFC -WKW -HWM -MQY -HDW -WYG -CWM -CYH -HYM -QMC -QCW -NCW -YQC -FMW -WMC -WWW -HMW -RMW -CHW -WCW -HTW -CWC -WCY -YWQ -WMW -CWT -CWH -MWM -WWC -WCC -WCH -WWM -TAX -AXD -XDR -IEX -EXV -QAX -AXX -XXE -XES -MXN -XNF -NRX -RXX -XXX -XXR -XRI -SAX -AXG -XGG -PRX -RXR -XRX -RXE -XEF -QEX -EXQ -XQR -REX -EXR -RXQ -XQQ -DRX -RXP -XPG -QMX -MXT -XTX -TXR -XRM -APX -PXX -XXG -XGI -NLX -LXX -XXM -XMA -LNX -NXE -XEA -GTX -TXN -XND -LIX -IXI -XIM -MVX -VXX -XXK -XKT -GLX -LXP -XPP -QGX -GXD -XDL -XAP -QNX -NXM -XMN -VAX -XGV -IKX -KXY -KEX -EXL -XLY -GQX -QXE -XEP -PLX -XKC -PVX -XKE -RXI -XIR -AXL -XLN -LLX -LXD -XDA -AXE -XEL -GGX -GXG -KAX -XXA -XAG -XWS -SPX -PXC -XCD -GWX -WXH -XHF -MPX -ESX -SXN -XNK -DLX -LXN 
-XNS -QXG -XGD -ITX -XRG -NEX -EXA -XAL -LDX -DXI -XII -TPX -PXM -XMR -NXG -XGY -ASX -SXV -XVE -TKX -KXA -KRX -XXT -XTL -IDX -DXX -XXL -XLV -AKX -KXX -QHX -HXV -XVN -NSX -SXX -XKX -XDP -DAX -AXK -XKQ -PIX -IXX -XXF -VLX -XDI -DIX -IXL -XLK -LKX -KXV -XVA -DNX -NXD -ILX -LXK -XKV -VYX -YXE -XEI -RXS -XSH -KGX -XGF -AVX -VXY -XYG -HVX -XXI -XID -TVX -XXS -XSA -ENX -NXX -XMD -IIX -XMQ -AEX -EXX -XME -PGX -GXP -XPR -SKX -KXF -XFT -HRX -XSW -PQX -XGR -QQX -VTX -XRP -PSX -SXP -XPL -VGX -GXY -RSX -SXS -XSL -VSX -XST -AXV -XVL -AGX -GXX -XTK -KLX -LXR -XRV -AHX -HXC -XCS -LVX -VXN -XNR -NGX -GXL -TSX -SXQ -XQN -KXL -XLL -VIX -IXG -XGA -GFX -FXG -XGL -PTX -TXT -XTS -EMX -MXQ -SXY -XYA -IQX -QXY -XYR -TXK -IGX -XPS -PXT -XTG -NXQ -VKX -KXS -XSN -GVX -VXE -GRX -XRE -YKX -KXE -XEE -EEX -EXT -XTI -EHX -HXN -XNL -NDX -DXD -IAX -KSX -SXL -RRX -XRK -DDX -DXE -RXG -VXL -XLS -DTX -TXG -VXF -XFA -XIG -VXT -XTA -ISX -SXR -XRY -VQX -QXP -XPC -LGX -GXS -HGX -XGH -XXD -XDD -KKX -XXV -PKX -XLT -XSP -XLD -RAX -AXS -XSI -IYX -YXX -XXP -XPI -MSX -SXT -GEX -XHP -LFX -FXX -VXI -XIW -QTX -TXX -XXQ -XQA -FLX -DXN -XNC -MXS -XSR -YLX -EQX -QXS -TMX -MXC -XCY -NXA -XAV -EXE -XEQ -HPX -PXP -LMX -MXX -KTX -XKK -XXH -XHS -MKX -XIH -WRX -XKS -EXY -XYQ -QKX diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index e8698731..5cd210be 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -79,6 +79,7 @@ def __init__( inner_k_folds: int = -1, # use inner cross-validation if > 1 fold_index: Optional[int] = None, base_dir: Optional[str] = None, + n_token_limit: Optional[int] = None, **kwargs, ): super().__init__() @@ -110,6 +111,7 @@ def __init__( ), "fold_index can't be larger than the total number of folds" self.fold_index = fold_index self._base_dir = base_dir + self.n_token_limit = n_token_limit os.makedirs(self.raw_dir, exist_ok=True) os.makedirs(self.processed_dir, exist_ok=True) if self.use_inner_cross_validation: @@ 
-328,8 +330,14 @@ def _load_data_from_file(self, path: str) -> List[Dict[str, Any]]: for d in tqdm.tqdm(self._load_dict(path), total=lines) if d["features"] is not None ] - # filter for missing features in resulting data - data = [val for val in data if val["features"] is not None] + # filter for missing features in resulting data, keep features length below token limit + data = [ + val + for val in data + if val["features"] is not None + and self.n_token_limit is None + or len(val["features"]) <= self.n_token_limit + ] return data @@ -1215,4 +1223,6 @@ def processed_file_names_dict(self) -> dict: dict: A dictionary mapping dataset keys to their respective file names. For example, {"data": "data.pt"}. """ + if self.n_token_limit is not None: + return {"data": f"data_maxlen{self.n_token_limit}.pt"} return {"data": "data.pt"} diff --git a/chebai/preprocessing/datasets/deepGO/__init__.py b/chebai/preprocessing/datasets/deepGO/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/deepGO/go_uniprot.py b/chebai/preprocessing/datasets/deepGO/go_uniprot.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/scope/__init__.py b/chebai/preprocessing/datasets/scope/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/scope/scope.py b/chebai/preprocessing/datasets/scope/scope.py deleted file mode 100644 index e9127b25..00000000 --- a/chebai/preprocessing/datasets/scope/scope.py +++ /dev/null @@ -1,972 +0,0 @@ -# References for this file : - -# Reference 1: -# John-Marc Chandonia, Naomi K Fox, Steven E Brenner, SCOPe: classification of large macromolecular structures -# in the structural classification of proteins—extended database, Nucleic Acids 
Research, Volume 47, -# Issue D1, 08 January 2019, Pages D475–D481, https://doi.org/10.1093/nar/gky1134 -# https://scop.berkeley.edu/about/ver=2.08 - -# Reference 2: -# Murzin AG, Brenner SE, Hubbard TJP, Chothia C. 1995. SCOP: a structural classification of proteins database for -# the investigation of sequences and structures. Journal of Molecular Biology 247:536-540 - -import gzip -import os -import re -import shutil -from abc import ABC, abstractmethod -from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple - -import networkx as nx -import pandas as pd -import requests -import torch -from Bio import SeqIO - -from chebai.preprocessing.datasets.base import _DynamicDataset -from chebai.preprocessing.reader import ProteinDataReader - - -class _SCOPeDataExtractor(_DynamicDataset, ABC): - """ - A class for extracting and processing data from the SCOPe (Structural Classification of Proteins - extended) dataset. - - This class is designed to handle the parsing, preprocessing, and hierarchical structure extraction from various - SCOPe dataset files, such as classification (CLA), hierarchy (HIE), and description (DES) files. - Additionally, it supports downloading related data like PDB sequence files. - - Args: - scope_version (str): The SCOPe version to use. - scope_version_train (Optional[str]): The training SCOPe version, if different. - dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42. - splits_file_path (str, optional): Path to the splits CSV file. Defaults to None. - **kwargs: Additional keyword arguments passed to DynamicDataset and XYBaseDataModule. 
- """ - - # -- Index for columns of processed `data.pkl` (derived from `_graph_to_raw_dataset`) - # "id" at row index 0 - # "sids" at row index 1 - # "sequence" at row index 2 - # labels starting from row index 3 - _ID_IDX: int = 0 - _DATA_REPRESENTATION_IDX: int = 2 # here `sequence` column - _LABELS_START_IDX: int = 3 - - _SCOPE_GENERAL_URL = "https://scop.berkeley.edu/downloads/parse/dir.{data_type}.scope.{version_number}-stable.txt" - _PDB_SEQUENCE_DATA_URL = ( - "https://files.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" - ) - - SCOPE_HIERARCHY: Dict[str, str] = { - "cl": "class", - "cf": "fold", - "sf": "superfamily", - "fa": "family", - "dm": "protein", - "sp": "species", - "px": "domain", - } - - def __init__( - self, - scope_version: str, - scope_version_train: Optional[str] = None, - max_sequence_len: int = 1000, - **kwargs, - ): - self.scope_version: str = scope_version - self.scope_version_train: str = scope_version_train - self.max_sequence_len: int = max_sequence_len - - super(_SCOPeDataExtractor, self).__init__(**kwargs) - - if self.scope_version_train is not None: - # Instantiate another same class with "scope_version" as "scope_version_train", if train_version is given - # This is to get the data from respective directory related to "scope_version_train" - _init_kwargs = kwargs - _init_kwargs["scope_version"] = self.scope_version_train - self._scope_version_train_obj = self.__class__( - **_init_kwargs, - ) - - @staticmethod - def _get_scope_url(data_type: str, version_number: str) -> str: - """ - Generates the URL for downloading SCOPe files. - - Args: - data_type (str): The type of data (e.g., 'cla', 'hie', 'des'). - version_number (str): The version of the SCOPe file. - - Returns: - str: The formatted SCOPe file URL. 
- """ - return _SCOPeDataExtractor._SCOPE_GENERAL_URL.format( - data_type=data_type, version_number=version_number - ) - - # ------------------------------ Phase: Prepare data ----------------------------------- - def _download_required_data(self) -> str: - """ - Downloads the required raw data for SCOPe and PDB sequence datasets. - - Returns: - str: Path to the downloaded data. - """ - self._download_pdb_sequence_data() - return self._download_scope_raw_data() - - def _download_pdb_sequence_data(self) -> None: - """ - Downloads and unzips the PDB sequence dataset from the RCSB PDB repository. - - The file is downloaded as a temporary gzip file, which is then extracted to the - specified directory. - """ - pdb_seq_file_path = os.path.join( - self.scope_root_dir, self.raw_file_names_dict["PDB"] - ) - os.makedirs(os.path.dirname(pdb_seq_file_path), exist_ok=True) - - if not os.path.isfile(pdb_seq_file_path): - print(f"Missing PDB raw data, Downloading PDB sequence data....") - - # Create a temporary file - with NamedTemporaryFile(delete=False) as tf: - temp_filename = tf.name - print(f"Downloading to temporary file {temp_filename}") - - # Download the file - response = requests.get(self._PDB_SEQUENCE_DATA_URL, stream=True) - with open(temp_filename, "wb") as temp_file: - shutil.copyfileobj(response.raw, temp_file) - - print(f"Downloaded to {temp_filename}") - - # Unpack the gzipped file - try: - print(f"Unzipping the file....") - with gzip.open(temp_filename, "rb") as f_in: - output_file_path = pdb_seq_file_path - with open(output_file_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - print(f"Unpacked and saved to {output_file_path}") - - except Exception as e: - print(f"Failed to unpack the file: {e}") - finally: - # Clean up the temporary file - os.remove(temp_filename) - print(f"Removed temporary file {temp_filename}") - - def _download_scope_raw_data(self) -> str: - """ - Downloads the raw SCOPe dataset files (CLA, HIE, DES, and COM). 
- - Each file is downloaded from the SCOPe repository and saved to the specified directory. - Files are only downloaded if they do not already exist. - - Returns: - str: A dummy path to indicate completion (can be extended for custom behavior). - """ - os.makedirs(self.raw_dir, exist_ok=True) - for data_type in ["CLA", "HIE", "DES"]: - data_file_name = self.raw_file_names_dict[data_type] - scope_path = os.path.join(self.raw_dir, data_file_name) - if not os.path.isfile(scope_path): - print(f"Missing Scope: {data_file_name} raw data, Downloading...") - r = requests.get( - self._get_scope_url(data_type.lower(), self.scope_version), - allow_redirects=False, - verify=False, # Disable SSL verification - ) - r.raise_for_status() # Check if the request was successful - open(scope_path, "wb").write(r.content) - return "dummy/path" - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - """ - Extracts the class hierarchy from SCOPe data and computes its transitive closure. - - Args: - data_path (str): Path to the processed SCOPe dataset. - - Returns: - nx.DiGraph: A directed acyclic graph representing the SCOPe class hierarchy. 
- """ - print("Extracting class hierarchy...") - df_scope = self._get_scope_data() - pdb_chain_df = self._parse_pdb_sequence_file() - pdb_id_set = set(pdb_chain_df["pdb_id"]) # Search time complexity - O(1) - - # Initialize sets and dictionaries for storing edges and attributes - parent_node_edges, node_child_edges = set(), set() - node_attrs = {} - px_level_nodes = set() - sequence_nodes = dict() - px_to_seq_edges = set() - required_graph_nodes = set() - - # Create a lookup dictionary for PDB chain sequences - lookup_dict = ( - pdb_chain_df.groupby("pdb_id")[["chain_id", "sequence"]] - .apply(lambda x: dict(zip(x["chain_id"], x["sequence"]))) - .to_dict() - ) - - def add_sequence_nodes_edges(chain_sequence, px_sun_id): - """Adds sequence nodes and edges connecting px-level nodes to sequence nodes.""" - if chain_sequence not in sequence_nodes: - sequence_nodes[chain_sequence] = f"seq_{len(sequence_nodes)}" - px_to_seq_edges.add((px_sun_id, sequence_nodes[chain_sequence])) - - # Step 1: Build the graph structure and store node attributes - for row in df_scope.itertuples(index=False): - if row.level == "px": - - pdb_id, chain_id = row.sid[1:5], row.sid[5] - - if pdb_id not in pdb_id_set or chain_id == "_": - # Don't add domain level nodes that don't have pdb_id in pdb_sequences.txt file - # Also chain_id with "_" which corresponds to no chain - continue - px_level_nodes.add(row.sunid) - - # Add edges between px-level nodes and sequence nodes - if chain_id != ".": - if chain_id not in lookup_dict[pdb_id]: - continue - add_sequence_nodes_edges(lookup_dict[pdb_id][chain_id], row.sunid) - else: - # If chain_id is '.', connect all chains of this PDB ID - for chain, chain_sequence in lookup_dict[pdb_id].items(): - add_sequence_nodes_edges(chain_sequence, row.sunid) - else: - required_graph_nodes.add(row.sunid) - - node_attrs[row.sunid] = {"sid": row.sid, "level": row.level} - - if row.parent_sunid != -1: - parent_node_edges.add((row.parent_sunid, row.sunid)) - - for 
child_id in row.children_sunids: - node_child_edges.add((row.sunid, child_id)) - - del df_scope, pdb_chain_df, pdb_id_set - - g = nx.DiGraph() - g.add_nodes_from(node_attrs.items()) - # Note - `add_edges` internally create a node, if a node doesn't exist already - g.add_edges_from({(p, c) for p, c in parent_node_edges if p in node_attrs}) - g.add_edges_from({(p, c) for p, c in node_child_edges if c in node_attrs}) - - seq_nodes = set(sequence_nodes.values()) - g.add_nodes_from([(seq_id, {"level": "sequence"}) for seq_id in seq_nodes]) - g.add_edges_from( - { - (px_node, seq_node) - for px_node, seq_node in px_to_seq_edges - if px_node in node_attrs and seq_node in seq_nodes - } - ) - - # Step 2: Count sequence successors for required graph nodes only - for node in required_graph_nodes: - num_seq_successors = sum( - g.nodes[child]["level"] == "sequence" - for child in nx.descendants(g, node) - ) - g.nodes[node]["num_seq_successors"] = num_seq_successors - - # Step 3: Remove nodes which are not required before computing transitive closure for better efficiency - g.remove_nodes_from(px_level_nodes | seq_nodes) - - print("Computing Transitive Closure.........") - # Transitive closure is not needed in `select_classes` method but is required in _SCOPeOverXPartial - return nx.transitive_closure_dag(g) - - def _get_scope_data(self) -> pd.DataFrame: - """ - Merges and preprocesses the SCOPe classification, hierarchy, and description files into a unified DataFrame. - - Returns: - pd.DataFrame: A DataFrame containing combined SCOPe data with classification and hierarchy details. 
- """ - df_cla = self._get_classification_data() - df_hie = self._get_hierarchy_data() - df_des = self._get_node_description_data() - df_hie_with_cla = pd.merge(df_hie, df_cla, how="left", on="sunid") - df_all = pd.merge( - df_hie_with_cla, - df_des.drop(columns=["sid"], axis=1), - how="left", - on="sunid", - ) - return df_all - - def _get_classification_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe CLA (classification) file. - - Returns: - pd.DataFrame: A DataFrame containing classification details, including hierarchy levels. - """ - df_cla = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["CLA"]), - sep="\t", - header=None, - comment="#", - ) - df_cla.columns = [ - "sid", - "PDB_ID", - "description", - "sccs", - "sunid", - "hie_levels", - ] - - # Convert to dict - {cl:46456, cf:46457, sf:46458, fa:46459, dm:46460, sp:116748, px:113449} - df_cla["hie_levels"] = df_cla["hie_levels"].apply( - lambda x: {k: int(v) for k, v in (item.split("=") for item in x.split(","))} - ) - - # Split ancestor_nodes into separate columns and assign values - for key in self.SCOPE_HIERARCHY.keys(): - df_cla[self.SCOPE_HIERARCHY[key]] = df_cla["hie_levels"].apply( - lambda x: x[key] - ) - - df_cla["sunid"] = df_cla["sunid"].astype("int64") - - return df_cla - - def _get_hierarchy_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe HIE (hierarchy) file. - - Returns: - pd.DataFrame: A DataFrame containing hierarchy details, including parent-child relationships. 
- """ - df_hie = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["HIE"]), - sep="\t", - header=None, - comment="#", - low_memory=False, - ) - df_hie.columns = ["sunid", "parent_sunid", "children_sunids"] - - # if not parent id, then insert -1 - df_hie["parent_sunid"] = df_hie["parent_sunid"].replace("-", -1).astype(int) - # convert children ids to list of ids - df_hie["children_sunids"] = df_hie["children_sunids"].apply( - lambda x: list(map(int, x.split(","))) if x != "-" else [] - ) - - # Ensure the 'sunid' column in both DataFrames has the same type - df_hie["sunid"] = df_hie["sunid"].astype("int64") - return df_hie - - def _get_node_description_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe DES (description) file. - - Returns: - pd.DataFrame: A DataFrame containing node-level descriptions from the SCOPe dataset. - """ - df_des = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["DES"]), - sep="\t", - header=None, - comment="#", - low_memory=False, - ) - df_des.columns = ["sunid", "level", "scss", "sid", "description"] - df_des.loc[len(df_des)] = {"sunid": 0, "level": "root"} - - # Ensure the 'sunid' column in both DataFrames has the same type - df_des["sunid"] = df_des["sunid"].astype("int64") - return df_des - - def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame: - """ - Processes a directed acyclic graph (DAG) to generate a raw dataset in DataFrame format. This dataset includes - chain-level sequences and their corresponding labels based on the hierarchical structure of the associated domains. - - The process: - - Extracts SCOPe domain identifiers (sids) from the graph. - - Retrieves class labels for each domain based on all applicable taxonomy levels. - - Fetches the chain-level sequences from the Protein Data Bank (PDB) for each domain. - - For each sequence, identifies all domains associated with the same chain and assigns their corresponding labels. 
- - Notes: - - SCOPe hierarchy levels are used as labels, with each level represented by a column. The value in each column - indicates whether a PDB chain is associated with that particular hierarchy level. - - PDB chains are treated as samples. The method considers only domains that are mapped to the selected hierarchy levels. - - Data Format: pd.DataFrame - - Column 0 : id (Unique identifier for each sequence entry) - - Column 1 : sids (List of domain identifiers associated with the sequence) - - Column 2 : sequence (Amino acid sequence of the chain) - - Column 3 to Column "n": Each column corresponds to a SCOPe class hierarchy level with a value - of True/False indicating whether the chain is associated with the corresponding level. - - Args: - graph (nx.DiGraph): The class hierarchy graph. - - Returns: - pd.DataFrame: The raw dataset created from the graph. - - Raises: - RuntimeError: If no sunids are selected. - """ - print(f"Process graph") - - selected_sun_ids_per_lvl = self.select_classes(graph) - - if not selected_sun_ids_per_lvl: - raise RuntimeError("No sunid selected.") - - df_cla = self._get_classification_data() - hierarchy_levels = list(self.SCOPE_HIERARCHY.values()) - hierarchy_levels.remove("domain") - - df_cla = df_cla[["sid", "sunid"] + hierarchy_levels] - - # Initialize selected target columns - df_encoded = df_cla[["sid", "sunid"]].copy() - - # Collect all new columns in a dictionary first (avoids fragmentation) - encoded_df_columns = {} - - lvl_to_target_cols_mapping = {} - # Iterate over only the selected sun_ids (nodes) to one-hot encode them - for level, selected_sun_ids in selected_sun_ids_per_lvl.items(): - level_column = self.SCOPE_HIERARCHY[level] - if level_column in df_cla.columns: - # Create binary encoding for only relevant sun_ids - for sun_id in selected_sun_ids: - col_name = f"{level_column}_{sun_id}" - encoded_df_columns[col_name] = ( - df_cla[level_column] == sun_id - ).astype(bool) - - 
lvl_to_target_cols_mapping.setdefault(level_column, []).append( - col_name - ) - - # Convert the dictionary into a DataFrame and concatenate at once (prevents fragmentation) - df_encoded = pd.concat([df_encoded, pd.DataFrame(encoded_df_columns)], axis=1) - - encoded_target_columns = [] - for level in hierarchy_levels: - if level in lvl_to_target_cols_mapping: - encoded_target_columns.extend(lvl_to_target_cols_mapping[level]) - - print( - f"{len(encoded_target_columns)} labels has been selected for specified threshold, " - ) - print("Constructing data.pkl file .....") - - df_encoded = df_encoded[["sid", "sunid"] + encoded_target_columns] - - # Filter to select only domains that atleast map to any one selected sunid in any level - df_encoded = df_encoded[df_encoded.iloc[:, 2:].any(axis=1)] - - df_encoded["pdb_id"] = df_encoded["sid"].str[1:5] - df_encoded["chain_id"] = df_encoded["sid"].str[5] - - # "_" (underscore) means it has no chain - df_encoded = df_encoded[df_encoded["chain_id"] != "_"] - - pdb_chain_df = self._parse_pdb_sequence_file() - - # Handle chain_id == "." 
- Multiple chain case - # Split df_encoded into two: One for specific chains, one for "multiple chains" (".") - df_specific_chains = df_encoded[df_encoded["chain_id"] != "."] - df_multiple_chains = df_encoded[df_encoded["chain_id"] == "."].drop( - columns=["chain_id"] - ) - - # Merge specific chains normally - merged_specific = df_specific_chains.merge( - pdb_chain_df, on=["pdb_id", "chain_id"], how="left" - ) - - # Merge all chains case -> Join by pdb_id (not chain_id) - merged_all_chains = df_multiple_chains.merge( - pdb_chain_df, on="pdb_id", how="left" - ) - - # Combine both cases - sequence_hierarchy_df = pd.concat( - [merged_specific, merged_all_chains], ignore_index=True - ).dropna(subset=["sequence"]) - - # Vectorized Aggregation Instead of Row-wise Updates - sequence_hierarchy_df = ( - sequence_hierarchy_df.groupby("sequence", as_index=False) - .agg( - { - "sid": list, # Collect all SIDs per sequence - **{ - col: "max" for col in encoded_target_columns - }, # Max works as Bitwise OR for labels - } - ) - .rename(columns={"sid": "sids"}) - ) # Rename for clarity - - sequence_hierarchy_df = sequence_hierarchy_df.assign( - id=range(1, len(sequence_hierarchy_df) + 1) - )[["id", "sids", "sequence"] + encoded_target_columns] - - # Ensure atleast one label is true for each protein sequence - sequence_hierarchy_df = sequence_hierarchy_df[ - sequence_hierarchy_df.iloc[:, self._LABELS_START_IDX :].any(axis=1) - ] - - with open(os.path.join(self.processed_dir_main, "classes.txt"), "wt") as fout: - fout.writelines(str(sun_id) + "\n" for sun_id in encoded_target_columns) - - return sequence_hierarchy_df - - def _parse_pdb_sequence_file(self) -> pd.DataFrame: - """ - Parses the PDB sequence file and returns a DataFrame containing PDB IDs, chain IDs, and sequences. - - Returns: - pd.DataFrame: A DataFrame with columns ["pdb_id", "chain_id", "sequence"]. 
- """ - records = [] - valid_amino_acids = "".join(ProteinDataReader.AA_LETTER) - - for record in SeqIO.parse( - os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta" - ): - - if not record.seq or len(record.seq) > self.max_sequence_len: - continue - - pdb_id, chain = record.id.split("_") - sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq)) - - # Store as a dictionary entry (list of dicts -> DataFrame later) - records.append( - { - "pdb_id": pdb_id.lower(), - "chain_id": chain.lower(), - "sequence": sequence, - } - ) - - # Convert list of dictionaries to a DataFrame - pdb_chain_df = pd.DataFrame.from_records(records) - - return pdb_chain_df - - @abstractmethod - def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]: - # Override the return type of the method from superclass - pass - - # ------------------------------ Phase: Setup data ----------------------------------- - def setup_processed(self) -> None: - """ - Transform and prepare processed data for the SCOPe dataset. - - Main function of this method is to transform `data.pkl` into a model input data format (`data.pt`), - ensuring that the data is in a format compatible for input to the model. - The transformed data must contain the following keys: `ident`, `features`, `labels`, and `group`. - This method uses a subclass of Data Reader to perform the transformation. - - It will transform the data related to `scope_version_train`, if specified. 
- """ - super().setup_processed() - - # Transform the data related to "scope_version_train" to encoded data, if it doesn't exist - if self.scope_version_train is not None and not os.path.isfile( - os.path.join( - self._scope_version_train_obj.processed_dir, - self._scope_version_train_obj.processed_file_names_dict["data"], - ) - ): - print( - f"Missing encoded data related to train version: {self.scope_version_train}" - ) - print("Calling the setup method related to it") - self._scope_version_train_obj.setup() - - def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: - """ - Loads data from a pickled file and yields individual dictionaries for each row. - - The pickled file is expected to contain rows with the following structure: - - Data at row index `self._ID_IDX`: ID of go data instance - - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein - - Data from row index `self._LABELS_START_IDX` onwards: Labels - - This method is used by `_load_data_from_file` to generate dictionaries that are then - processed and converted into a list of dictionaries containing the features and labels. - - Args: - input_file_path (str): The path to the pickled input file. - - Yields: - Dict[str, Any]: A dictionary containing: - - `features` (str): The sequence data from the file. - - `labels` (np.ndarray): A boolean array of labels starting from row index 4. - - `ident` (Any): The identifier from row index 0. 
- """ - with open(input_file_path, "rb") as input_file: - df = pd.read_pickle(input_file) - for row in df.values: - labels = row[self._LABELS_START_IDX :].astype(bool) - # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group - # "group" set to None, by default as no such entity for this data - yield dict( - features=row[self._DATA_REPRESENTATION_IDX], - labels=labels, - ident=row[self._ID_IDX], - ) - - # ------------------------------ Phase: Dynamic Splits ----------------------------------- - def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Loads encoded/transformed data and generates training, validation, and test splits. - - This method first loads encoded data from a file named `data.pt`, which is derived from either - `scope_version` or `scope_version_train`. It then splits the data into training, validation, and test sets. - - If `scope_version_train` is provided: - - Loads additional encoded data from `scope_version_train`. - - Splits this data into training and validation sets, while using the test set from `scope_version`. - - Prunes the test set from `scope_version` to include only labels that exist in `scope_version_train`. - - If `scope_version_train` is not provided: - - Splits the data from `scope_version` into training, validation, and test sets without modification. - - Raises: - FileNotFoundError: If the required `data.pt` file(s) do not exist. Ensure that `prepare_data` - and/or `setup` methods have been called to generate the dataset files. - - Returns: - Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames: - - Training set - - Validation set - - Test set - """ - try: - filename = self.processed_file_names_dict["data"] - data_scope_version = torch.load( - os.path.join(self.processed_dir, filename), weights_only=False - ) - except FileNotFoundError: - raise FileNotFoundError( - f"File data.pt doesn't exists. 
" - f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" - ) - - df_scope_version = pd.DataFrame(data_scope_version) - train_df_scope_ver, df_test_scope_ver = self.get_test_split( - df_scope_version, seed=self.dynamic_data_split_seed - ) - - if self.scope_version_train is not None: - # Load encoded data derived from "scope_version_train" - try: - filename_train = ( - self._scope_version_train_obj.processed_file_names_dict["data"] - ) - data_scope_train_version = torch.load( - os.path.join( - self._scope_version_train_obj.processed_dir, filename_train - ), - weights_only=False, - ) - except FileNotFoundError: - raise FileNotFoundError( - f"File data.pt doesn't exists related to scope_version_train {self.scope_version_train}." - f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" - ) - - df_scope_train_version = pd.DataFrame(data_scope_train_version) - # Get train/val split of data based on "scope_version_train", but - # using test set from "scope_version" - df_train, df_val = self.get_train_val_splits_given_test( - df_scope_train_version, - df_test_scope_ver, - seed=self.dynamic_data_split_seed, - ) - # Modify test set from "scope_version" to only include the labels that - # exists in "scope_version_train", all other entries remains same. - df_test = self._setup_pruned_test_set(df_test_scope_ver) - else: - # Get all splits based on "scope_version" - df_train, df_val = self.get_train_val_splits_given_test( - train_df_scope_ver, - df_test_scope_ver, - seed=self.dynamic_data_split_seed, - ) - df_test = df_test_scope_ver - - return df_train, df_val, df_test - - def _setup_pruned_test_set( - self, df_test_scope_version: pd.DataFrame - ) -> pd.DataFrame: - """ - Create a test set with the same leaf nodes, but use only classes that appear in the training set. - - Args: - df_test_scope_version (pd.DataFrame): The test dataset. - - Returns: - pd.DataFrame: The pruned test dataset. 
- """ - # TODO: find a more efficient way to do this - filename_old = "classes.txt" - # filename_new = f"classes_v{self.scope_version_train}.txt" - # dataset = torch.load(os.path.join(self.processed_dir, "test.pt")) - - # Load original classes (from the current SCOPe version - scope_version) - with open(os.path.join(self.processed_dir_main, filename_old), "r") as file: - orig_classes = file.readlines() - - # Load new classes (from the training SCOPe version - scope_version_train) - with open( - os.path.join( - self._scope_version_train_obj.processed_dir_main, filename_old - ), - "r", - ) as file: - new_classes = file.readlines() - - # Create a mapping which give index of a class from scope_version, if the corresponding - # class exists in scope_version_train, Size = Number of classes in scope_version - mapping = [ - None if or_class not in new_classes else new_classes.index(or_class) - for or_class in orig_classes - ] - - # Iterate over each data instance in the test set which is derived from scope_version - for _, row in df_test_scope_version.iterrows(): - # Size = Number of classes in scope_version_train - new_labels = [False for _ in new_classes] - for ind, label in enumerate(row["labels"]): - # If the scope_version class exists in the scope_version_train and has a True label, - # set the corresponding label in new_labels to True - if mapping[ind] is not None and label: - new_labels[mapping[ind]] = label - # Update the labels from test instance from scope_version to the new labels, which are compatible to both versions - row["labels"] = new_labels - - return df_test_scope_version - - # ------------------------------ Phase: Raw Properties ----------------------------------- - @property - def scope_root_dir(self) -> str: - """ - Returns the root directory of scope data. - - Returns: - str: The path to the base directory, which is "data/GO_UniProt". 
- """ - return os.path.join("data", "SCOPe") - - @property - def base_dir(self) -> str: - """ - Returns the base directory path for storing SCOPe data. - - Returns: - str: The path to the base directory, which is "data/GO_UniProt". - """ - return os.path.join(self.scope_root_dir, f"version_{self.scope_version}") - - @property - def raw_file_names_dict(self) -> dict: - """ - Returns a dictionary of raw file names used in data processing. - - Returns: - dict: A dictionary mapping dataset names to their respective file names. - """ - return { - "CLA": "cla.txt", - "DES": "des.txt", - "HIE": "hie.txt", - "PDB": "pdb_sequences.txt", - } - - -class _SCOPeOverX(_SCOPeDataExtractor, ABC): - """ - A class for extracting data from the SCOPe dataset with a threshold for selecting classes/labels based on - the number of subclasses. - - This class is designed to filter SCOPe classes/labels based on a specified threshold, selecting only those classes - which have a certain number of subclasses in the hierarchy. - - Attributes: - READER (dr.ProteinDataReader): The reader used for reading the dataset. - THRESHOLD (int): The threshold for selecting classes/labels based on the number of subclasses. - - """ - - READER = ProteinDataReader - THRESHOLD: int = None - - @property - def _name(self) -> str: - """ - Returns the name of the dataset. - - Returns: - str: The dataset name, formatted with the current threshold. - """ - return f"SCOPe{self.THRESHOLD}" - - def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]: - """ - Selects classes from the SCOPe dataset based on the number of successors meeting a specified threshold. - - This method iterates over the nodes in the graph, counting the number of successors for each node. - Nodes with a number of successors greater than or equal to the defined threshold are selected. - - Note: - The input graph must be transitive closure of a directed acyclic graph. 
- - Args: - g (nx.Graph): The graph representing the dataset. - *args: Additional positional arguments (not used). - **kwargs: Additional keyword arguments (not used). - - Returns: - Dict: A dict containing selected nodes at each hierarchy level. - - Notes: - - The `THRESHOLD` attribute should be defined in the subclass of this class. - """ - selected_sunids_for_level = {} - for node, attr_dict in g.nodes(data=True): - if attr_dict["level"] in {"root", "px", "sequence"}: - # Skip nodes with level "root", "px", or "sequence" - continue - - # Check if the number of "sequence"-level successors meets or exceeds the threshold - if g.nodes[node]["num_seq_successors"] >= self.THRESHOLD: - selected_sunids_for_level.setdefault(attr_dict["level"], []).append( - node - ) - return selected_sunids_for_level - - -class _SCOPeOverXPartial(_SCOPeOverX, ABC): - """ - Dataset that doesn't use the full SCOPe dataset, but extracts a part of SCOPe (subclasses of a given top class) - - Attributes: - top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. - """ - - def __init__(self, top_class_sunid: int, **kwargs): - """ - Initializes the _SCOPeOverXPartial dataset. - - Args: - top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. - **kwargs: Additional keyword arguments passed to the superclass initializer. - """ - if "top_class_sunid" not in kwargs: - kwargs["top_class_sunid"] = top_class_sunid - - self.top_class_sunid: int = top_class_sunid - super().__init__(**kwargs) - - @property - def processed_dir_main(self) -> str: - """ - Returns the main processed data directory specific to the top class. - - Returns: - str: The processed data directory path. - """ - return os.path.join( - self.base_dir, - self._name, - f"partial_{self.top_class_sunid}", - "processed", - ) - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - """ - Extracts a subset of SCOPe based on subclasses of the top class ID. 
- - This method calls the superclass method to extract the full class hierarchy, - then extracts the subgraph containing only the descendants of the top class ID, including itself. - - Args: - data_path (str): The file path to the SCOPe ontology file. - - Returns: - nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the - descendants of the top class ID. - """ - g = super()._extract_class_hierarchy(data_path) - g = g.subgraph( - list(g.successors(self.top_class_sunid)) + [self.top_class_sunid] - ) - return g - - -class SCOPeOver2000(_SCOPeOverX): - """ - A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. - - Inherits from `_SCOPeOverX` and sets the threshold for selecting classes to 2000. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (2000). - """ - - THRESHOLD: int = 2000 - - -class SCOPeOver50(_SCOPeOverX): - - THRESHOLD = 50 - - -class SCOPeOverPartial2000(_SCOPeOverXPartial): - """ - A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. - - Inherits from `_SCOPeOverXPartial` and sets the threshold for selecting classes to 2000. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (2000). 
- """ - - THRESHOLD: int = 2000 - - -if __name__ == "__main__": - scope = SCOPeOver50(scope_version="2.08") - - # g = scope._extract_class_hierarchy("dummy/path") - # # Save graph - # import pickle - # with open("graph.gpickle", "wb") as f: - # pickle.dump(g, f) - - # Load graph - import pickle - - with open("graph.gpickle", "rb") as f: - g = pickle.load(f) - - # print(len([node for node in g.nodes() if g.out_degree(node) > 10000])) - scope._graph_to_raw_dataset(g) diff --git a/chebai/preprocessing/migration/deep_go/__init__.py b/chebai/preprocessing/migration/deep_go/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py deleted file mode 100644 index 7d59c699..00000000 --- a/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py +++ /dev/null @@ -1,316 +0,0 @@ -import os -from collections import OrderedDict -from typing import List, Literal, Optional, Tuple - -import pandas as pd -from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit -from jsonargparse import CLI - -from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO1MigratedData - - -class DeepGo1DataMigration: - """ - A class to handle data migration and processing for the DeepGO project. - It migrates the DeepGO data to our data structure followed for GO-UniProt data. - - This class handles migration of data from the DeepGO paper below: - Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf, - DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier, - Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668 - (https://doi.org/10.1093/bioinformatics/btx624). 
- """ - - # Max sequence length as per DeepGO1 - _MAXLEN = 1002 - _LABELS_START_IDX = DeepGO1MigratedData._LABELS_START_IDX - - def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - """ - Initializes the data migration object with a data directory and GO branch. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use. - """ - valid_go_branches = list(DeepGO1MigratedData.GO_BRANCH_MAPPING.keys()) - if go_branch not in valid_go_branches: - raise ValueError(f"go_branch must be one of {valid_go_branches}") - self._go_branch = go_branch - - self._data_dir: str = rf"{data_dir}" - self._train_df: Optional[pd.DataFrame] = None - self._test_df: Optional[pd.DataFrame] = None - self._validation_df: Optional[pd.DataFrame] = None - self._terms_df: Optional[pd.DataFrame] = None - self._classes: Optional[List[str]] = None - - def migrate(self) -> None: - """ - Executes the data migration by loading, processing, and saving the data. - """ - print("Starting the migration process...") - self._load_data() - if not all( - df is not None - for df in [ - self._train_df, - self._validation_df, - self._test_df, - self._terms_df, - ] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." - ) - splits_df = self._record_splits() - data_with_labels_df = self._extract_required_data_from_splits() - - if not all( - var is not None for var in [data_with_labels_df, splits_df, self._classes] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." - ) - - self.save_migrated_data(data_with_labels_df, splits_df) - - def _load_data(self) -> None: - """ - Loads the test, train, validation, and terms data from the pickled files - in the data directory. 
- """ - try: - print(f"Loading data files from directory: {self._data_dir}") - self._test_df = pd.DataFrame( - pd.read_pickle( - os.path.join(self._data_dir, f"test-{self._go_branch}.pkl") - ) - ) - - # DeepGO 1 lacks a validation split, so we will create one by further splitting the training set. - # Although this reduces the training data slightly compared to the original DeepGO setup, - # given the data size, the impact should be minimal. - train_df = pd.DataFrame( - pd.read_pickle( - os.path.join(self._data_dir, f"train-{self._go_branch}.pkl") - ) - ) - - self._train_df, self._validation_df = self._get_train_val_split(train_df) - - self._terms_df = pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, f"{self._go_branch}.pkl")) - ) - - except FileNotFoundError as e: - raise FileNotFoundError( - f"Data file not found in directory: {e}. " - "Please ensure all required files are available in the specified directory." - ) - - @staticmethod - def _get_train_val_split( - train_df: pd.DataFrame, - ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Splits the training data into a smaller training set and a validation set. - - Args: - train_df (pd.DataFrame): Original training DataFrame. - - Returns: - Tuple[pd.DataFrame, pd.DataFrame]: Training and validation DataFrames. - """ - labels_list_train = train_df["labels"].tolist() - train_split = 0.85 - test_size = ((1 - train_split) ** 2) / train_split - - splitter = MultilabelStratifiedShuffleSplit( - n_splits=1, test_size=test_size, random_state=42 - ) - - train_indices, validation_indices = next( - splitter.split(labels_list_train, labels_list_train) - ) - - df_validation = train_df.iloc[validation_indices] - df_train = train_df.iloc[train_indices] - return df_train, df_validation - - def _record_splits(self) -> pd.DataFrame: - """ - Creates a DataFrame that stores the IDs and their corresponding data splits. - - Returns: - pd.DataFrame: A combined DataFrame containing split assignments. 
- """ - print("Recording data splits for train, validation, and test sets.") - split_assignment_list: List[pd.DataFrame] = [ - pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), - pd.DataFrame( - {"id": self._validation_df["proteins"], "split": "validation"} - ), - pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), - ] - - combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) - return combined_split_assignment - - def _extract_required_data_from_splits(self) -> pd.DataFrame: - """ - Extracts required columns from the combined data splits. - - Returns: - pd.DataFrame: A DataFrame containing the essential columns for processing. - """ - print("Combining data splits into a single DataFrame with required columns.") - required_columns = [ - "proteins", - "accessions", - "sequences", - "gos", - "labels", - ] - - new_df = pd.concat( - [ - self._train_df[required_columns], - self._validation_df[required_columns], - self._test_df[required_columns], - ], - ignore_index=True, - ) - new_df["go_ids"] = new_df.apply( - lambda row: self.extract_go_id(row["gos"]), axis=1 - ) - - labels_df = self._get_labels_columns(new_df) - - data_df = pd.DataFrame( - OrderedDict( - swiss_id=new_df["proteins"], - accession=new_df["accessions"], - go_ids=new_df["go_ids"], - sequence=new_df["sequences"], - ) - ) - - df = pd.concat([data_df, labels_df], axis=1) - - return df - - @staticmethod - def extract_go_id(go_list: List[str]) -> List[int]: - """ - Extracts and parses GO IDs from a list of GO annotations. - - Args: - go_list (List[str]): List of GO annotation strings. - - Returns: - List[int]: List of parsed GO IDs. - """ - return [DeepGO1MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] - - def _get_labels_columns(self, data_df: pd.DataFrame) -> pd.DataFrame: - """ - Generates columns for labels based on provided selected terms. - - Args: - data_df (pd.DataFrame): DataFrame with GO annotations and labels. 
- - Returns: - pd.DataFrame: DataFrame with label columns. - """ - print("Generating label columns from provided selected terms.") - parsed_go_ids: pd.Series = self._terms_df["functions"].apply( - lambda gos: DeepGO1MigratedData._parse_go_id(gos) - ) - all_go_ids_list = parsed_go_ids.values.tolist() - self._classes = all_go_ids_list - - new_label_columns = pd.DataFrame( - data_df["labels"].tolist(), index=data_df.index, columns=all_go_ids_list - ) - - return new_label_columns - - def save_migrated_data( - self, data_df: pd.DataFrame, splits_df: pd.DataFrame - ) -> None: - """ - Saves the processed data and split information. - - Args: - data_df (pd.DataFrame): Data with GO labels. - splits_df (pd.DataFrame): Split assignment DataFrame. - """ - print("Saving transformed data files.") - - deepgo_migr_inst: DeepGO1MigratedData = DeepGO1MigratedData( - go_branch=DeepGO1MigratedData.GO_BRANCH_MAPPING[self._go_branch], - max_sequence_length=self._MAXLEN, - ) - - # Save data file - deepgo_migr_inst.save_processed( - data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] - ) - print( - f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" - ) - - # Save splits file - splits_df.to_csv( - os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go1.csv"), - index=False, - ) - print(f"splits_deep_go1.csv saved to {deepgo_migr_inst.processed_dir_main}") - - # Save classes file - classes = sorted(self._classes) - with open( - os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go1.txt"), - "wt", - ) as fout: - fout.writelines(str(node) + "\n" for node in classes) - print(f"classes_deep_go1.txt saved to {deepgo_migr_inst.processed_dir_main}") - - print("Migration process completed!") - - -class Main: - """ - Main class to handle the migration process for DeepGo1DataMigration. 
- - Methods: - migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - Initiates the migration process for the specified data directory and GO branch. - """ - - @staticmethod - def migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]) -> None: - """ - Initiates the migration process by creating a DeepGoDataMigration instance - and invoking its migrate method. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use - ("cc" for cellular_component, - "mf" for molecular_function, - or "bp" for biological_process). - """ - DeepGo1DataMigration(data_dir, go_branch).migrate() - - -if __name__ == "__main__": - # Example: python script_name.py migrate --data_dir="data/deep_go1" --go_branch="mf" - # --data_dir specifies the directory containing the data files. - # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration. - CLI( - Main, - description="DeepGo1DataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).", - as_positional=False, - ) diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py deleted file mode 100644 index d23247c0..00000000 --- a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py +++ /dev/null @@ -1,366 +0,0 @@ -import os -import re -from collections import OrderedDict -from typing import List, Literal, Optional - -import pandas as pd -from jsonargparse import CLI - -from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO2MigratedData -from chebai.preprocessing.reader import ProteinDataReader - - -class DeepGo2DataMigration: - """ - A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE - data structure to our data structure followed for GO-UniProt data. 
- - This class handles migration of data from the DeepGO paper below: - Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf, - DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier, - Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668 - (https://doi.org/10.1093/bioinformatics/btx624) - """ - - _LABELS_START_IDX = DeepGO2MigratedData._LABELS_START_IDX - - def __init__( - self, data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000 - ): - """ - Initializes the data migration object with a data directory and GO branch. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use. - max_len (int): Used to truncate the sequence to this length. Default is 1000. - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 - """ - valid_go_branches = list(DeepGO2MigratedData.GO_BRANCH_MAPPING.keys()) - if go_branch not in valid_go_branches: - raise ValueError(f"go_branch must be one of {valid_go_branches}") - self._go_branch = go_branch - - self._data_dir: str = os.path.join(rf"{data_dir}", go_branch) - self._max_len: int = max_len - - self._train_df: Optional[pd.DataFrame] = None - self._test_df: Optional[pd.DataFrame] = None - self._validation_df: Optional[pd.DataFrame] = None - self._terms_df: Optional[pd.DataFrame] = None - self._classes: Optional[List[str]] = None - - def migrate(self) -> None: - """ - Executes the data migration by loading, processing, and saving the data. - """ - print("Starting the migration process...") - self._load_data() - if not all( - df is not None - for df in [ - self._train_df, - self._validation_df, - self._test_df, - self._terms_df, - ] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." 
- ) - splits_df = self._record_splits() - - data_df = self._extract_required_data_from_splits() - data_with_labels_df = self._generate_labels(data_df) - - if not all( - var is not None for var in [data_with_labels_df, splits_df, self._classes] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." - ) - - self.save_migrated_data(data_with_labels_df, splits_df) - - def _load_data(self) -> None: - """ - Loads the test, train, validation, and terms data from the pickled files - in the data directory. - """ - - try: - print(f"Loading data from directory: {self._data_dir}......") - - print( - "Pre-processing the data before loading them into instance variables\n" - f"2-Steps preprocessing: \n" - f"\t 1: Truncating every sequence to {self._max_len}\n" - f"\t 2: Replacing every amino acid which is not in {ProteinDataReader.AA_LETTER}" - ) - - self._test_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl")) - ) - ) - self._train_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "train_data.pkl")) - ) - ) - self._validation_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "valid_data.pkl")) - ) - ) - - self._terms_df = pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "terms.pkl")) - ) - - except FileNotFoundError as e: - raise FileNotFoundError( - f"Data file not found in directory: {e}. " - "Please ensure all required files are available in the specified directory." - ) - - def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame: - """ - Pre-processes the input dataframe by truncating sequences to the maximum - length and replacing invalid amino acids with 'X'. - - Args: - df (pd.DataFrame): The dataframe to preprocess. - - Returns: - pd.DataFrame: The processed dataframe. 
- """ - df = self._truncate_sequences(df) - df = self._replace_invalid_amino_acids(df) - return df - - def _truncate_sequences( - self, df: pd.DataFrame, column: str = "sequences" - ) -> pd.DataFrame: - """ - Truncate sequences in a specified column of a dataframe to the maximum length. - - https://github.com/bio-ontology-research-group/deepgo2/blob/main/train_cnn.py#L206-L217 - - Args: - df (pd.DataFrame): The input dataframe containing the data to be processed. - column (str, optional): The column containing sequences to truncate. - Defaults to "sequences". - - Returns: - pd.DataFrame: The dataframe with sequences truncated to `self._max_len`. - """ - df[column] = df[column].apply(lambda x: x[: self._max_len]) - return df - - @staticmethod - def _replace_invalid_amino_acids( - df: pd.DataFrame, column: str = "sequences" - ) -> pd.DataFrame: - """ - Replaces invalid amino acids in a sequence with 'X' using regex. - - https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33 - https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073 - - Args: - df (pd.DataFrame): The dataframe containing the sequences to be processed. - column (str, optional): The column containing the sequences. Defaults to "sequences". - - Returns: - pd.DataFrame: The dataframe with invalid amino acids replaced by 'X'. - """ - valid_amino_acids = "".join(ProteinDataReader.AA_LETTER) - # Replace any character not in the valid set with 'X' - df[column] = df[column].apply( - lambda x: re.sub(f"[^{valid_amino_acids}]", "X", x) - ) - return df - - def _record_splits(self) -> pd.DataFrame: - """ - Creates a DataFrame that stores the IDs and their corresponding data splits. - - Returns: - pd.DataFrame: A combined DataFrame containing split assignments. 
- """ - print("Recording data splits for train, validation, and test sets.") - split_assignment_list: List[pd.DataFrame] = [ - pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), - pd.DataFrame( - {"id": self._validation_df["proteins"], "split": "validation"} - ), - pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), - ] - - combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) - return combined_split_assignment - - def _extract_required_data_from_splits(self) -> pd.DataFrame: - """ - Extracts required columns from the combined data splits. - - Returns: - pd.DataFrame: A DataFrame containing the essential columns for processing. - """ - print("Combining the data splits with required data..... ") - required_columns = [ - "proteins", - "accessions", - "sequences", - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L60-L69 - "prop_annotations", # Direct and Transitively associated GO ids - "esm2", - ] - - new_df = pd.concat( - [ - self._train_df[required_columns], - self._validation_df[required_columns], - self._test_df[required_columns], - ], - ignore_index=True, - ) - new_df["go_ids"] = new_df["prop_annotations"].apply( - lambda x: self.extract_go_id(x) - ) - - data_df = pd.DataFrame( - OrderedDict( - swiss_id=new_df["proteins"], - accession=new_df["accessions"], - go_ids=new_df["go_ids"], - sequence=new_df["sequences"], - esm2_embeddings=new_df["esm2"], - ) - ) - return data_df - - @staticmethod - def extract_go_id(go_list: List[str]) -> List[int]: - """ - Extracts and parses GO IDs from a list of GO annotations. - - Args: - go_list (List[str]): List of GO annotation strings. - - Returns: - List[str]: List of parsed GO IDs. - """ - return [DeepGO2MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] - - def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame: - """ - Generates label columns for each GO term in the dataset. 
- - Args: - data_df (pd.DataFrame): DataFrame containing data with GO IDs. - - Returns: - pd.DataFrame: DataFrame with new label columns. - """ - print("Generating labels based on terms.pkl file.......") - parsed_go_ids: pd.Series = self._terms_df["gos"].apply( - DeepGO2MigratedData._parse_go_id - ) - all_go_ids_list = parsed_go_ids.values.tolist() - self._classes = all_go_ids_list - new_label_columns = pd.DataFrame( - False, index=data_df.index, columns=all_go_ids_list - ) - data_df = pd.concat([data_df, new_label_columns], axis=1) - - for index, row in data_df.iterrows(): - for go_id in row["go_ids"]: - if go_id in data_df.columns: - data_df.at[index, go_id] = True - - data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)] - return data_df - - def save_migrated_data( - self, data_df: pd.DataFrame, splits_df: pd.DataFrame - ) -> None: - """ - Saves the processed data and split information. - - Args: - data_df (pd.DataFrame): Data with GO labels. - splits_df (pd.DataFrame): Split assignment DataFrame. 
- """ - print("Saving transformed data......") - deepgo_migr_inst: DeepGO2MigratedData = DeepGO2MigratedData( - go_branch=DeepGO2MigratedData.GO_BRANCH_MAPPING[self._go_branch], - max_sequence_length=self._max_len, - ) - - # Save data file - deepgo_migr_inst.save_processed( - data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] - ) - print( - f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" - ) - - # Save split file - splits_df.to_csv( - os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go2.csv"), - index=False, - ) - print(f"splits_deep_go2.csv saved to {deepgo_migr_inst.processed_dir_main}") - - # Save classes.txt file - classes = sorted(self._classes) - with open( - os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go2.txt"), - "wt", - ) as fout: - fout.writelines(str(node) + "\n" for node in classes) - print(f"classes_deep_go2.txt saved to {deepgo_migr_inst.processed_dir_main}") - - print("Migration completed!") - - -class Main: - """ - Main class to handle the migration process for DeepGoDataMigration. - - Methods: - migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - Initiates the migration process for the specified data directory and GO branch. - """ - - @staticmethod - def migrate( - data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000 - ) -> None: - """ - Initiates the migration process by creating a DeepGoDataMigration instance - and invoking its migrate method. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use - ("cc" for cellular_component, - "mf" for molecular_function, - or "bp" for biological_process). - max_len (int): Used to truncate the sequence to this length. Default is 1000. 
- # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 - """ - DeepGo2DataMigration(data_dir, go_branch, max_len).migrate() - - -if __name__ == "__main__": - # Example: python script_name.py migrate --data_dir="data/deep_go_se_training_data" --go_branch="bp" - # --data_dir specifies the directory containing the data files. - # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration. - CLI( - Main, - description="DeepGoDataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).", - as_positional=False, - ) diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py index 7e943eb5..345b2567 100644 --- a/chebai/preprocessing/reader.py +++ b/chebai/preprocessing/reader.py @@ -1,18 +1,8 @@ import os -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -from urllib.error import HTTPError import deepsmiles import selfies as sf -import torch -from esm import Alphabet -from esm.model.esm2 import ESM2 -from esm.pretrained import ( - _has_regression_weights, - load_model_and_alphabet_core, - load_model_and_alphabet_local, -) from pysmiles.read_smiles import _tokenize from transformers import RobertaTokenizerFast @@ -340,390 +330,3 @@ def name(cls) -> str: def _read_data(self, raw_data: str) -> List[int]: """Convert characters in raw data to their ordinal values.""" return [ord(s) for s in raw_data] - - -class ProteinDataReader(DataReader): - """ - Data reader for protein sequences using amino acid tokens. This class processes raw protein sequences into a format - suitable for model input by tokenizing them and assigning unique indices to each token. - - Note: - Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure - - Args: - collator_kwargs (Optional[Dict[str, Any]]): Optional dictionary of keyword arguments for configuring the collator. - token_path (Optional[str]): Path to the token file. 
If not provided, it will be created automatically. - kwargs: Additional keyword arguments. - """ - - COLLATOR = RaggedCollator - - # 21 natural amino acid notation - AA_LETTER = [ - "A", - "R", - "N", - "D", - "C", - "Q", - "E", - "G", - "H", - "I", - "L", - "K", - "M", - "F", - "P", - "S", - "T", - "W", - "Y", - "V", - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L3-L5 - "X", # Consider valid in latest paper year 2024 Reference number 3 in go_uniprot.py - ] - - def name(self) -> str: - """ - Returns the name of the data reader. This method identifies the specific type of data reader. - - Returns: - str: The name of the data reader, which is "protein_token". - """ - if self.n_gram is not None: - return f"protein_token_{self.n_gram}_gram" - - return "protein_token" - - def __init__(self, *args, n_gram: Optional[int] = None, **kwargs): - """ - Initializes the ProteinDataReader, loading existing tokens from the specified token file. - - Args: - *args: Additional positional arguments passed to the base class. - **kwargs: Additional keyword arguments passed to the base class. - """ - if n_gram is not None: - assert ( - int(n_gram) >= 2 - ), "Ngrams must be greater than or equal to 2 if provided." - self.n_gram = int(n_gram) - else: - self.n_gram = None - - super().__init__(*args, **kwargs) - - # Load the existing tokens from the token file into a cache - with open(self.token_path, "r") as pk: - self.cache = [x.strip() for x in pk] - - def _get_token_index(self, token: str) -> int: - """ - Returns a unique index for each token (amino acid). If the token is not already in the cache, it is added. - - Args: - token (str): The amino acid token to retrieve or add. - - Returns: - int: The index of the token, offset by the predefined EMBEDDING_OFFSET. 
- """ - error_str = ( - f"Please ensure that the input only contains valid amino acids " - f"20 Valid natural amino acid notation: {self.AA_LETTER}" - f"Refer to the amino acid sequence details here: " - f"https://en.wikipedia.org/wiki/Protein_primary_structure" - ) - - if self.n_gram is None: - # Single-letter amino acid token check - if str(token) not in self.AA_LETTER: - raise KeyError(f"Invalid token '{token}' encountered. " + error_str) - else: - # n-gram token validation, ensure that each component of the n-gram is valid - for aa in token: - if aa not in self.AA_LETTER: - raise KeyError( - f"Invalid token '{token}' encountered as part of n-gram {self.n_gram}. " - + error_str - ) - - if str(token) not in self.cache: - self.cache.append(str(token)) - return self.cache.index(str(token)) + EMBEDDING_OFFSET - - def _read_data(self, raw_data: str) -> List[int]: - """ - Reads and tokenizes raw protein sequence data into a list of token indices. - - Args: - raw_data (str): The raw protein sequence to be tokenized (e.g., "MKTFF..."). - - Returns: - List[int]: A list of integers representing the indices of the amino acid tokens. - """ - if self.n_gram is not None: - # Tokenize the sequence into n-grams - tokens = [ - raw_data[i : i + self.n_gram] - for i in range(len(raw_data) - self.n_gram + 1) - ] - return [self._get_token_index(gram) for gram in tokens] - - # If n_gram is None, tokenize the sequence at the amino acid level (single-letter representation) - return [self._get_token_index(aa) for aa in raw_data] - - def on_finish(self) -> None: - """ - Saves the current cache of tokens to the token file. This method is called after all data processing is complete. 
- """ - with open(self.token_path, "w") as pk: - print(f"Saving {len(self.cache)} tokens to {self.token_path}...") - print(f"First 10 tokens: {self.cache[:10]}") - pk.writelines([f"{c}\n" for c in self.cache]) - - -class ESM2EmbeddingReader(DataReader): - """ - A data reader to process protein sequences using the ESM2 model for embeddings. - - References: - https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py - - Note: - For layer availability by model, Please check below link: - https://github.com/facebookresearch/esm?tab=readme-ov-file#pre-trained-models- - - To test this reader, try lighter models: - esm2_t6_8M_UR50D: 6 layers (valid layers: 1–6), (~28 Mb) - A tiny 8M parameter model. - esm2_t12_35M_UR50D: 12 layers (valid layers: 1–12), (~128 Mb) - A slightly larger, 35M parameter model. - These smaller models are good for testing and debugging purposes. - - """ - - # https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L53 - _MODELS_URL = "https://dl.fbaipublicfiles.com/fair-esm/models/{}.pt" - _REGRESSION_URL = ( - "https://dl.fbaipublicfiles.com/fair-esm/regression/{}-contact-regression.pt" - ) - - def __init__( - self, - save_model_dir: str = os.path.join("data", "esm2_reader"), - model_name: str = "esm2_t36_3B_UR50D", - device: Optional[torch.device] = None, - truncation_length: int = 1022, - toks_per_batch: int = 4096, - return_contacts: bool = False, - repr_layer: int = 36, - *args, - **kwargs, - ): - """ - Initialize the ESM2EmbeddingReader class. - - Args: - save_model_dir (str): Directory to save/load the pretrained ESM model. - model_name (str): Name of the pretrained model. Defaults to "esm2_t36_3B_UR50D". - device (torch.device or str, optional): Device for computation (e.g., 'cpu', 'cuda'). - truncation_length (int): Maximum sequence length for truncation. Defaults to 1022. - toks_per_batch (int): Tokens per batch for data processing. Defaults to 4096. 
- return_contacts (bool): Whether to return contact maps. Defaults to False. - repr_layers (int): Layer number to extract representations from. Defaults to 36. - """ - self.save_model_dir = save_model_dir - if not os.path.exists(self.save_model_dir): - os.makedirs((os.path.dirname(self.save_model_dir)), exist_ok=True) - self.model_name = model_name - self.device = device - self.truncation_length = truncation_length - self.toks_per_batch = toks_per_batch - self.return_contacts = return_contacts - self.repr_layer = repr_layer - - self._model: Optional[ESM2] = None - self._alphabet: Optional[Alphabet] = None - - self._model, self._alphabet = self.load_model_and_alphabet() - self._model.eval() - - if self.device: - self._model = self._model.to(device) - - super().__init__(*args, **kwargs) - - def load_model_and_alphabet(self) -> Tuple[ESM2, Alphabet]: - """ - Load the ESM2 model and its alphabet. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L24-L28 - - Returns: - Tuple[ESM2, Alphabet]: Loaded model and alphabet. - """ - model_location = os.path.join(self.save_model_dir, f"{self.model_name}.pt") - if os.path.exists(model_location): - return load_model_and_alphabet_local(model_location) - else: - return self.load_model_and_alphabet_hub() - - def load_model_and_alphabet_hub(self) -> Tuple[ESM2, Alphabet]: - """ - Load the model and alphabet from the hub URL. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L62-L64 - - Returns: - Tuple[ESM2, Alphabet]: Loaded model and alphabet. 
- """ - model_url = self._MODELS_URL.format(self.model_name) - model_data = self.load_hub_workaround(model_url) - regression_data = None - if _has_regression_weights(self.model_name): - regression_url = self._REGRESSION_URL.format(self.model_name) - regression_data = self.load_hub_workaround(regression_url) - return load_model_and_alphabet_core( - self.model_name, model_data, regression_data - ) - - def load_hub_workaround(self, url) -> torch.Tensor: - """ - Workaround to load models from the PyTorch Hub. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L31-L43 - - Returns: - torch.Tensor: Loaded model state dictionary. - """ - try: - data = torch.hub.load_state_dict_from_url( - url, self.save_model_dir, progress=True, map_location=self.device - ) - - except RuntimeError: - # Handle PyTorch version issues - fn = Path(url).name - data = torch.load( - f"{torch.hub.get_dir()}/checkpoints/{fn}", - map_location="cpu", - ) - except HTTPError as e: - raise Exception( - f"Could not load {url}. Did you specify the correct model name?" - ) - return data - - @staticmethod - def name() -> str: - """ - Returns the name of the data reader. This method identifies the specific type of data reader. - - Returns: - str: The name of the data reader, which is "protein_token". - """ - return "esm2_embedding" - - @property - def token_path(self) -> None: - """ - Not used as no token file is not created for this reader. - - Returns: - str: Empty string since this method is not implemented. - """ - return - - def _read_data(self, raw_data: str) -> List[int]: - """ - Reads protein sequence data and generates embeddings. - - Args: - raw_data (str): The protein sequence. - - Returns: - List[int]: Embeddings generated for the sequence. 
- """ - alp_tokens_idx = self._sequence_to_alphabet_tokens_idx(raw_data) - return self._alphabet_tokens_to_esm_embedding(alp_tokens_idx).tolist() - - def _sequence_to_alphabet_tokens_idx(self, sequence: str) -> torch.Tensor: - """ - Converts a protein sequence into ESM alphabet token indices. - - Args: - sequence (str): Protein sequence. - - References: - https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L249-L250 - https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L262-L297 - - Returns: - torch.Tensor: Tokenized sequence with special tokens (BOS/EOS) included. - """ - seq_encoded = self._alphabet.encode(sequence) - tokens = [] - - # Add BOS token if configured - if self._alphabet.prepend_bos: - tokens.append(self._alphabet.cls_idx) - - # Add the main sequence - tokens.extend(seq_encoded) - - # Add EOS token if configured - if self._alphabet.append_eos: - tokens.append(self._alphabet.eos_idx) - - # Convert to PyTorch tensor and return - return torch.tensor([tokens], dtype=torch.int64) - - def _alphabet_tokens_to_esm_embedding(self, tokens: torch.Tensor) -> torch.Tensor: - """ - Converts alphabet tokens into ESM embeddings. - - Args: - tokens (torch.Tensor): Tokenized protein sequences. - - References: - https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py#L82-L107 - - Returns: - torch.Tensor: Protein embedding from the specified representation layer. 
- """ - if self.device: - tokens = tokens.to(self.device, non_blocking=True) - - with torch.no_grad(): - out = self._model( - tokens, - repr_layers=[ - self.repr_layer, - ], - return_contacts=self.return_contacts, - ) - - # Extract representations and compute the mean embedding for each layer - representations = { - layer: t.to(self.device) for layer, t in out["representations"].items() - } - truncate_len = min(self.truncation_length, tokens.size(1)) - - result = { - "mean_representations": { - layer: t[0, 1 : truncate_len + 1].mean(0).clone() - for layer, t in representations.items() - } - } - return result["mean_representations"][self.repr_layer] - - def on_finish(self) -> None: - """ - Not used here as no token file exists for this reader. - - Returns: - None - """ - pass diff --git a/configs/data/deepGO/deepgo2_esm2.yml b/configs/data/deepGO/deepgo2_esm2.yml deleted file mode 100644 index 5a0436e3..00000000 --- a/configs/data/deepGO/deepgo2_esm2.yml +++ /dev/null @@ -1,5 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1000 - use_esm2_embeddings: True diff --git a/configs/data/deepGO/deepgo_1_migrated_data.yml b/configs/data/deepGO/deepgo_1_migrated_data.yml deleted file mode 100644 index 0924e023..00000000 --- a/configs/data/deepGO/deepgo_1_migrated_data.yml +++ /dev/null @@ -1,4 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO1MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1002 diff --git a/configs/data/deepGO/deepgo_2_migrated_data.yml b/configs/data/deepGO/deepgo_2_migrated_data.yml deleted file mode 100644 index 5a0436e3..00000000 --- a/configs/data/deepGO/deepgo_2_migrated_data.yml +++ /dev/null @@ -1,5 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1000 - use_esm2_embeddings: True diff --git a/configs/data/deepGO/go250.yml 
b/configs/data/deepGO/go250.yml deleted file mode 100644 index 01e34aa4..00000000 --- a/configs/data/deepGO/go250.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.go_uniprot.deepGO.GOUniProtOver250 -init_args: - go_branch: "BP" diff --git a/configs/data/deepGO/go50.yml b/configs/data/deepGO/go50.yml deleted file mode 100644 index bee43773..00000000 --- a/configs/data/deepGO/go50.yml +++ /dev/null @@ -1 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.GOUniProtOver50 diff --git a/configs/data/scope/scope2000.yml b/configs/data/scope/scope2000.yml deleted file mode 100644 index d75c807f..00000000 --- a/configs/data/scope/scope2000.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.scope.scope.SCOPeOver2000 -init_args: - scope_version: "2.08" diff --git a/configs/data/scope/scope50.yml b/configs/data/scope/scope50.yml deleted file mode 100644 index c65028e2..00000000 --- a/configs/data/scope/scope50.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.scope.scope.SCOPeOver50 -init_args: - scope_version: "2.08" \ No newline at end of file diff --git a/setup.py b/setup.py index 8a6d3e0c..21ddfa49 100644 --- a/setup.py +++ b/setup.py @@ -50,8 +50,6 @@ "chardet", "pyyaml", "torchmetrics", - "biopython", - "fair-esm", ], extras_require={"dev": ["black", "isort", "pre-commit"]}, ) diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py deleted file mode 100644 index 96ff9a3a..00000000 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ /dev/null @@ -1,229 +0,0 @@ -import unittest -from collections import OrderedDict -from unittest.mock import PropertyMock, mock_open, patch - -import fastobo -import networkx as nx -import pandas as pd - -from chebai.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtDataExtractor -from chebai.preprocessing.reader import ProteinDataReader -from 
tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestGOUniProtDataExtractor(unittest.TestCase): - """ - Unit tests for the _GOUniProtDataExtractor class. - """ - - @classmethod - @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset()) - @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock) - @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock) - @patch("os.makedirs", return_value=None) - def setUpClass( - cls, - mock_makedirs, - mock_name_property: PropertyMock, - mock_base_dir_property: PropertyMock, - ) -> None: - """ - Class setup for mocking abstract properties of _GOUniProtDataExtractor. - """ - mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor" - mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor" - - _GOUniProtDataExtractor.READER = ProteinDataReader - - cls.extractor = _GOUniProtDataExtractor() - - def test_term_callback(self) -> None: - """ - Test the term_callback method for correct parsing and filtering of GO terms. 
- """ - self.extractor.go_branch = "all" - term_mapping = {} - for term in fastobo.loads(GOUniProtMockData.get_GO_raw_data()): - if isinstance(term, fastobo.typedef.TypedefFrame): - continue - term_mapping[self.extractor._parse_go_id(term.id)] = term - - # Test individual term callback - term_dict = self.extractor.term_callback(term_mapping[4]) - expected_dict = {"go_id": 4, "parents": [3, 2], "name": "GO_4"} - self.assertEqual( - term_dict, - expected_dict, - "The term_callback did not return the expected dictionary.", - ) - - # Test filtering valid terms - valid_terms_docs = set() - for term_id, term_doc in term_mapping.items(): - if self.extractor.term_callback(term_doc): - valid_terms_docs.add(term_id) - - self.assertEqual( - valid_terms_docs, - set(GOUniProtMockData.get_nodes()), - "The valid terms do not match expected nodes.", - ) - - # Test that obsolete terms are filtered out - self.assertFalse( - any( - self.extractor.term_callback(term_mapping[obs_id]) - for obs_id in GOUniProtMockData.get_obsolete_nodes_ids() - ), - "Obsolete terms should not be present.", - ) - - # Test filtering by GO branch (e.g., BP) - self.extractor.go_branch = "BP" - BP_terms = { - term_id - for term_id, term in term_mapping.items() - if self.extractor.term_callback(term) - } - self.assertEqual( - BP_terms, {2, 4}, "The BP terms do not match the expected set." - ) - - @patch( - "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) - ) - def test_extract_class_hierarchy(self, mock_load) -> None: - """ - Test the extraction of the class hierarchy from the ontology. - """ - graph = self.extractor._extract_class_hierarchy("fake_path") - - # Validate the graph structure - self.assertIsInstance( - graph, nx.DiGraph, "The result should be a directed graph." 
- ) - - # Check nodes - actual_nodes = set(graph.nodes) - self.assertEqual( - set(GOUniProtMockData.get_nodes()), - actual_nodes, - "The graph nodes do not match the expected nodes.", - ) - - # Check edges - actual_edges = set(graph.edges) - self.assertEqual( - GOUniProtMockData.get_edges_of_transitive_closure_graph(), - actual_edges, - "The graph edges do not match the expected edges.", - ) - - # Check number of nodes and edges - self.assertEqual( - GOUniProtMockData.get_number_of_nodes(), - len(actual_nodes), - "The number of nodes should match the actual number of nodes in the graph.", - ) - - self.assertEqual( - GOUniProtMockData.get_number_of_transitive_edges(), - len(actual_edges), - "The number of transitive edges should match the actual number of transitive edges in the graph.", - ) - - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - def test_get_swiss_to_go_mapping(self, mock_open) -> None: - """ - Test the extraction of SwissProt to GO term mapping. - """ - mapping_df = self.extractor._get_swiss_to_go_mapping() - expected_df = pd.DataFrame( - OrderedDict( - swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], - accession=["Q6GZX4", "DCGZX4"], - go_ids=[[2, 3, 5], [2, 5]], - sequence=list(GOUniProtMockData.protein_sequences().values()), - ) - ) - - pd.testing.assert_frame_equal( - mapping_df, - expected_df, - obj="The SwissProt to GO mapping DataFrame does not match the expected DataFrame.", - ) - - @patch( - "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) - ) - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - @patch.object( - _GOUniProtDataExtractor, - "select_classes", - return_value=GOUniProtMockData.get_nodes(), - ) - def test_graph_to_raw_dataset( - self, mock_select_classes, mock_open, mock_load - ) -> None: - """ - Test the conversion of the class hierarchy graph to a raw dataset. 
- """ - graph = self.extractor._extract_class_hierarchy("fake_path") - actual_df = self.extractor._graph_to_raw_dataset(graph) - expected_df = GOUniProtMockData.get_data_in_dataframe() - - pd.testing.assert_frame_equal( - actual_df, - expected_df, - obj="The raw dataset DataFrame does not match the expected DataFrame.", - ) - - @patch("builtins.open", new_callable=mock_open, read_data=b"Mocktestdata") - @patch("pandas.read_pickle") - def test_load_dict( - self, mock_read_pickle: PropertyMock, mock_open: mock_open - ) -> None: - """ - Test the loading of the dictionary from a DataFrame. - """ - mock_df = GOUniProtMockData.get_data_in_dataframe() - mock_read_pickle.return_value = mock_df - - generator = self.extractor._load_dict("data/tests") - result = list(generator) - - # Convert NumPy arrays to lists for comparison - for item in result: - item["labels"] = list(item["labels"]) - - # Expected output for comparison - expected_result = [ - { - "features": mock_df["sequence"][0], - "labels": mock_df.iloc[0, 4:].to_list(), - "ident": mock_df["swiss_id"][0], - }, - { - "features": mock_df["sequence"][1], - "labels": mock_df.iloc[1, 4:].to_list(), - "ident": mock_df["swiss_id"][1], - }, - ] - - self.assertEqual( - result, - expected_result, - "The loaded dictionary does not match the expected structure.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py deleted file mode 100644 index 3f329c56..00000000 --- a/tests/unit/dataset_classes/testGoUniProtOverX.py +++ /dev/null @@ -1,140 +0,0 @@ -import unittest -from typing import List -from unittest.mock import mock_open, patch - -import networkx as nx -import pandas as pd - -from chebai.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtOverX -from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestGOUniProtOverX(unittest.TestCase): - @classmethod - 
@patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset()) - @patch("os.makedirs", return_value=None) - def setUpClass(cls, mock_makedirs) -> None: - """ - Set up the class for tests by initializing the extractor, graph, and input DataFrame. - """ - cls.extractor = _GOUniProtOverX() - cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() - cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ - :, :4 - ] - - @patch("builtins.open", new_callable=mock_open) - def test_select_classes(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method to ensure it selects classes based on the threshold. - - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. - """ - # Set threshold for testing - self.extractor.THRESHOLD = 2 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: GO terms 1, 2, and 5 should be selected based on the threshold - expected_selected_classes: List[int] = sorted([1, 2, 5]) - - # Check if the selected classes are as expected - self.assertEqual( - selected_classes, - expected_selected_classes, - msg="The selected classes do not match the expected output for threshold 2.", - ) - - # Expected data as string - expected_lines: str = "\n".join(map(str, expected_selected_classes)) + "\n" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines for the given threshold of 2.", - ) - - @patch("builtins.open", new_callable=mock_open) - def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method when no nodes meet the successor threshold. 
- - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. - """ - self.extractor.THRESHOLD = 5 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: No classes should meet the threshold of 5 - expected_selected_classes: List[int] = [] - - # Check if the selected classes are as expected - self.assertEqual( - selected_classes, - expected_selected_classes, - msg="The selected classes list should be empty when no nodes meet the threshold of 5.", - ) - - # Expected data as string - expected_lines: str = "" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines when no nodes meet the threshold of 5.", - ) - - @patch("builtins.open", new_callable=mock_open) - def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method when all nodes meet the successor threshold. - - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. 
- """ - self.extractor.THRESHOLD = 0 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: All nodes except those not referenced by any protein (4 and 6) should be selected - expected_classes: List[int] = sorted([1, 2, 3, 5]) - - # Check if the returned selected classes match the expected list - self.assertListEqual( - selected_classes, - expected_classes, - msg="The selected classes do not match the expected output when all nodes meet the threshold of 0.", - ) - - # Expected data as string - expected_lines: str = "\n".join(map(str, expected_classes)) + "\n" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines when all nodes meet the threshold of 0.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py deleted file mode 100644 index caac3eac..00000000 --- a/tests/unit/dataset_classes/testProteinPretrainingData.py +++ /dev/null @@ -1,76 +0,0 @@ -import unittest -from unittest.mock import PropertyMock, mock_open, patch - -from chebai.preprocessing.datasets.deepGO.protein_pretraining import ( - _ProteinPretrainingData, -) -from chebai.preprocessing.reader import ProteinDataReader -from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestProteinPretrainingData(unittest.TestCase): - """ - Unit tests for the _ProteinPretrainingData class. - Tests focus on data parsing and validation checks for protein pretraining. 
- """ - - @classmethod - @patch.multiple(_ProteinPretrainingData, __abstractmethods__=frozenset()) - @patch.object(_ProteinPretrainingData, "base_dir", new_callable=PropertyMock) - @patch.object(_ProteinPretrainingData, "_name", new_callable=PropertyMock) - @patch("os.makedirs", return_value=None) - def setUpClass( - cls, - mock_makedirs, - mock_name_property: PropertyMock, - mock_base_dir_property: PropertyMock, - ) -> None: - """ - Class setup for mocking abstract properties of _ProteinPretrainingData. - - Mocks the required abstract properties and sets up the data extractor. - """ - mock_base_dir_property.return_value = "MockedBaseDirPropProteinPretrainingData" - mock_name_property.return_value = "MockedNameProp_ProteinPretrainingData" - - # Set the READER class for the pretraining data - _ProteinPretrainingData.READER = ProteinDataReader - - # Initialize the extractor instance - cls.extractor = _ProteinPretrainingData() - - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - def test_parse_protein_data_for_pretraining( - self, mock_open_file: mock_open - ) -> None: - """ - Tests the _parse_protein_data_for_pretraining method. - - Verifies that: - - The parsed DataFrame contains the expected protein IDs. - - The protein sequences are not empty. 
- """ - # Parse the pretraining data - pretrain_df = self.extractor._parse_protein_data_for_pretraining() - list_of_pretrain_swiss_ids = GOUniProtMockData.proteins_for_pretraining() - - # Assert that all expected Swiss-Prot IDs are present in the DataFrame - self.assertEqual( - set(pretrain_df["swiss_id"]), - set(list_of_pretrain_swiss_ids), - msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining.", - ) - - # Assert that all sequences are not empty - self.assertTrue( - pretrain_df["sequence"].str.len().gt(0).all(), - msg="Some protein sequences in the pretraining DataFrame are empty.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 552d2918..d94a8d94 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -404,410 +404,3 @@ def get_transitively_closed_graph() -> nx.DiGraph: g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph()) return g - - -class GOUniProtMockData(MockOntologyGraphData): - """ - A mock ontology representing a simplified version of the Gene Ontology (GO) structure with nodes and edges - representing GO terms and their relationships in a directed acyclic graph (DAG). - - Nodes: - - GO_1 - - GO_2 - - GO_3 - - GO_4 - - GO_5 - - GO_6 - - Edges (Parent-Child Relationships): - - GO_1 -> GO_2 - - GO_1 -> GO_3 - - GO_2 -> GO_4 - - GO_2 -> GO_5 - - GO_3 -> GO_4 - - GO_4 -> GO_6 - - This mock ontology structure is useful for testing methods related to GO hierarchy, graph extraction, and transitive - closure operations. - - The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. 
- - Visual Representation Graph with Valid Nodes and Edges: - - GO_1 - / \ - GO_2 GO_3 - / \ / - GO_5 GO_4 - \ - GO_6 - - Valid Swiss Proteins with mapping to valid GO ids - Swiss_Prot_1 -> GO_2, GO_3, GO_5 - Swiss_Prot_2 -> GO_2, GO_5 - """ - - @staticmethod - def get_nodes() -> List[int]: - """ - Get a sorted list of node IDs. - - Returns: - List[int]: A sorted list of node IDs in the ontology graph. - """ - return sorted([1, 2, 3, 4, 5, 6]) - - @staticmethod - def get_number_of_nodes() -> int: - """ - Get the total number of nodes in the ontology graph. - - Returns: - int: The number of nodes. - """ - return len(GOUniProtMockData.get_nodes()) - - @staticmethod - def get_edges() -> Set[Tuple[int, int]]: - """ - Get the set of edges in the ontology graph. - - Returns: - Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. - """ - return {(1, 2), (1, 3), (2, 4), (2, 5), (3, 4), (4, 6)} - - @staticmethod - def get_number_of_edges() -> int: - """ - Get the total number of edges in the ontology graph. - - Returns: - int: The number of edges. - """ - return len(GOUniProtMockData.get_edges()) - - @staticmethod - def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: - """ - Get the set of edges in the transitive closure of the ontology graph. - - Returns: - Set[Tuple[int, int]]: A set of tuples representing edges in the transitive closure graph. - """ - return { - (1, 2), - (1, 3), - (1, 4), - (1, 5), - (1, 6), - (2, 4), - (2, 5), - (2, 6), - (3, 4), - (3, 6), - (4, 6), - } - - @staticmethod - def get_number_of_transitive_edges() -> int: - """ - Get the total number of edges in the transitive closure graph. - - Returns: - int: The number of transitive edges. - """ - return len(GOUniProtMockData.get_edges_of_transitive_closure_graph()) - - @staticmethod - def get_obsolete_nodes_ids() -> Set[int]: - """ - Get the set of obsolete node IDs in the ontology graph. 
- - Returns: - Set[int]: A set of node IDs representing obsolete nodes. - """ - return {7, 8} - - @staticmethod - def get_GO_raw_data() -> str: - """ - Get raw data in string format for a basic Gene Ontology (GO) structure. - - This data simulates a basic GO ontology format typically used for testing purposes. - The data will include valid and obsolete GO terms with various relationships between them. - - Scenarios covered: - - Obsolete terms being the parent of valid terms. - - Valid terms being the parent of obsolete terms. - - Both direct and indirect hierarchical relationships between terms. - - The data is designed to help test the proper handling of obsolete and valid GO terms, - ensuring that the ontology parser can correctly manage both cases. - - Returns: - str: The raw GO data in string format, structured as test input. - """ - return """ - [Term] - id: GO:0000001 - name: GO_1 - namespace: molecular_function - def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913] - comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function. - synonym: "ribosomal chaperone activity" EXACT [] - xref: MetaCyc:BETAGALACTOSID-RXN - xref: Reactome:R-HSA-189062 "lactose + H2O => D-glucose + D-galactose" - xref: Reactome:R-HSA-5658001 "Defective LCT does not hydrolyze Lac" - xref: RHEA:10076 - - [Term] - id: GO:0000002 - name: GO_2 - namespace: biological_process - is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds - is_a: GO:0000008 ! hydrolase activity, hydrolyzing O-glycosyl compounds - - [Term] - id: GO:0000003 - name: GO_3 - namespace: cellular_component - is_a: GO:0000001 ! regulation of DNA recombination - - [Term] - id: GO:0000004 - name: GO_4 - namespace: biological_process - is_a: GO:0000003 ! 
regulation of DNA recombination - is_a: GO:0000002 ! hydrolase activity, hydrolyzing O-glycosyl compounds - - [Term] - id: GO:0000005 - name: GO_5 - namespace: molecular_function - is_a: GO:0000002 ! regulation of DNA recombination - - [Term] - id: GO:0000006 - name: GO_6 - namespace: cellular_component - is_a: GO:0000004 ! glucoside transport - - [Term] - id: GO:0000007 - name: GO_7 - namespace: biological_process - is_a: GO:0000003 ! glucoside transport - is_obsolete: true - - [Term] - id: GO:0000008 - name: GO_8 - namespace: molecular_function - is_obsolete: true - - [Typedef] - id: term_tracker_item - name: term tracker item - namespace: external - xref: IAO:0000233 - is_metadata_tag: true - is_class_level: true - """ - - @staticmethod - def protein_sequences() -> Dict[str, str]: - """ - Get the protein sequences for Swiss-Prot proteins. - - Returns: - Dict[str, str]: A dictionary where keys are Swiss-Prot IDs and values are their respective sequences. - """ - return { - "Swiss_Prot_1": "MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK".replace( - " ", "" - ), - "Swiss_Prot_2": "EKGLIVGHFS GIKYKGEKAQ ASEVDVNKMC CWVSKFKDAM RRYQGIQTCK".replace( - " ", "" - ), - } - - @staticmethod - def proteins_for_pretraining() -> List[str]: - """ - Returns a list of protein IDs which will be used for pretraining based on mock UniProt data. - - Proteins include those with: - - No GO classes or invalid GO classes (missing required evidence codes). - - Returns: - List[str]: A list of protein IDs that do not meet validation criteria. - """ - return [ - "Swiss_Prot_5", # No GO classes associated - "Swiss_Prot_6", # GO class with no evidence code - "Swiss_Prot_7", # GO class with invalid evidence code - ] - - @staticmethod - def get_UniProt_raw_data() -> str: - """ - Get raw data in string format for UniProt proteins. 
- - This mock data contains eleven Swiss-Prot proteins with different properties: - - **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class. - - **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid. - - **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002. - - **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'B'. - - **Swiss_Prot_5**: Has a sequence but no GO classes associated. - - **Swiss_Prot_6**: Has GO classes without any associated evidence codes. - - **Swiss_Prot_7**: Has a GO class with an invalid evidence code. - - **Swiss_Prot_8**: Has a sequence length > 1002 and has only invalid GO class. - - **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'B', in its sequence. - - **Swiss_Prot_10**: Has a valid GO class but lacks a sequence. - - **Swiss_Prot_11**: Has only Invalid GO class but lacks a sequence. - - Note: - A valid GO label is the one which has one of the following evidence code specified in - go_uniprot.py->`EXPERIMENTAL_EVIDENCE_CODES`. - Invalid amino acids are specified in go_uniprot.py->`AMBIGUOUS_AMINO_ACIDS`. - - Returns: - str: The raw UniProt data in string format. - """ - protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"] - protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"] - raw_str = ( - # Below protein with 3 valid associated GO class and one invalid GO class - f"ID Swiss_Prot_1 Reviewed; {len(protein_sq_1)} AA. 
\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" - "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n" - "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" - f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_1}\n" - "//\n" - # Below protein with 2 valid associated GO class and one invalid GO class - f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" - "AC DCGZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" - f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_2}\n" - "//\n" - # Below protein with all valid associated GO class but sequence length greater than 1002 - f"ID Swiss_Prot_3 Reviewed; {len(protein_sq_1 * 25)} AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - f"SQ SEQUENCE {len(protein_sq_1 * 25)} AA; 129118 MW; FE2984658CED53A8 CRC64;\n" - f" {protein_sq_1 * 25}\n" - "//\n" - # Below protein has valid go class association but invalid amino acid `X` in its sequence - "ID Swiss_Prot_4 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " BAFSAEDVLK 
EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string but has no GO class - "ID Swiss_Prot_5 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string and with NO `valid` associated GO class (no evidence code) - "ID Swiss_Prot_6 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000023; P:regulation of viral transcription;\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string and with NO `valid` associated GO class (invalid evidence code) - "ID Swiss_Prot_7 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000024; P:regulation of viral transcription; IEA:SGD.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence length greater than 1002 but with `Invalid` associated GO class - f"ID Swiss_Prot_8 Reviewed; {len(protein_sq_2 * 25)} AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n" - f"SQ SEQUENCE {len(protein_sq_2 * 25)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_2 * 25}\n" - "//\n" - # Below protein with sequence string but invalid amino acid `X` in its sequence - "ID Swiss_Prot_9 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with a `valid` associated GO class but without sequence string - "ID Swiss_Prot_10 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n" - "SQ SEQUENCE 60 AA; 29735 MW; 
B4840739BF7D4121 CRC64;\n" - " \n" - "//\n" - # Below protein with a `Invalid` associated GO class but without sequence string - "ID Swiss_Prot_11 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " \n" - "//\n" - ) - - return raw_str - - @staticmethod - def get_data_in_dataframe() -> pd.DataFrame: - """ - Get a mock DataFrame representing UniProt data. - - The DataFrame contains Swiss-Prot protein data, including identifiers, accessions, GO terms, sequences, - and binary label columns representing whether each protein is associated with certain GO classes. - - Returns: - pd.DataFrame: A DataFrame containing mock UniProt data with columns for 'swiss_id', 'accession', 'go_ids', 'sequence', - and binary labels for GO classes. - """ - expected_data = OrderedDict( - swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], - accession=["Q6GZX4", "DCGZX4"], - go_ids=[[1, 2, 3, 5], [1, 2, 5]], - sequence=list(GOUniProtMockData.protein_sequences().values()), - **{ - # SP_1, SP_2 - 1: [True, True], - 2: [True, True], - 3: [True, False], - 4: [False, False], - 5: [True, True], - 6: [False, False], - }, - ) - return pd.DataFrame(expected_data) - - @staticmethod - def get_transitively_closed_graph() -> nx.DiGraph: - """ - Get the transitive closure of the ontology graph. - - Returns: - nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. 
- """ - g = nx.DiGraph() - g.add_nodes_from(node for node in ChebiMockOntology.get_nodes()) - g.add_edges_from(GOUniProtMockData.get_edges_of_transitive_closure_graph()) - return g diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py deleted file mode 100644 index c5bc5e9a..00000000 --- a/tests/unit/readers/testProteinDataReader.py +++ /dev/null @@ -1,139 +0,0 @@ -import unittest -from typing import List -from unittest.mock import mock_open, patch - -from chebai.preprocessing.reader import EMBEDDING_OFFSET, ProteinDataReader - - -class TestProteinDataReader(unittest.TestCase): - """ - Unit tests for the ProteinDataReader class. - """ - - @classmethod - @patch( - "chebai.preprocessing.reader.open", - new_callable=mock_open, - read_data="M\nK\nT\nF\nR\nN", - ) - def setUpClass(cls, mock_file: mock_open) -> None: - """ - Set up the test environment by initializing a ProteinDataReader instance with a mocked token file. - - Args: - mock_file: Mock object for file operations. - """ - cls.reader = ProteinDataReader(token_path="/mock/path") - # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] - assert cls.reader.cache == [ - "M", - "K", - "T", - "F", - "R", - "N", - ], "Cache initialization did not match expected tokens." - - def test_read_data(self) -> None: - """ - Test the _read_data method with a protein sequence to ensure it correctly tokenizes the sequence. 
- """ - raw_data = "MKTFFRN" - - # Expected output based on the cached tokens - expected_output: List[int] = [ - EMBEDDING_OFFSET + 0, # M - EMBEDDING_OFFSET + 1, # K - EMBEDDING_OFFSET + 2, # T - EMBEDDING_OFFSET + 3, # F - EMBEDDING_OFFSET + 3, # F (repeated token) - EMBEDDING_OFFSET + 4, # R - EMBEDDING_OFFSET + 5, # N - ] - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not produce the expected tokenized output.", - ) - - def test_read_data_with_new_token(self) -> None: - """ - Test the _read_data method with a protein sequence that includes a new token. - Ensure that the new token is added to the cache and processed correctly. - """ - raw_data = "MKTFY" - - # 'Y' is not in the initial cache and should be added. - expected_output: List[int] = [ - EMBEDDING_OFFSET + 0, # M - EMBEDDING_OFFSET + 1, # K - EMBEDDING_OFFSET + 2, # T - EMBEDDING_OFFSET + 3, # F - EMBEDDING_OFFSET + len(self.reader.cache), # Y (new token) - ] - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not correctly handle a new token.", - ) - - # Verify that 'Y' was added to the cache - self.assertIn( - "Y", self.reader.cache, "The new token 'Y' was not added to the cache." - ) - # Ensure it's at the correct index - self.assertEqual( - self.reader.cache.index("Y"), - len(self.reader.cache) - 1, - "The new token 'Y' was not added at the correct index in the cache.", - ) - - def test_read_data_with_invalid_token(self) -> None: - """ - Test the _read_data method with an invalid amino acid token to ensure it raises a KeyError. 
- """ - raw_data = "MKTFZ" # 'Z' is not a valid amino acid token - - with self.assertRaises(KeyError) as context: - self.reader._read_data(raw_data) - - self.assertIn( - "Invalid token 'Z' encountered", - str(context.exception), - "The KeyError did not contain the expected message for an invalid token.", - ) - - def test_read_data_with_empty_sequence(self) -> None: - """ - Test the _read_data method with an empty protein sequence to ensure it returns an empty list. - """ - raw_data = "" - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - [], - "The _read_data method did not return an empty list for an empty input sequence.", - ) - - def test_read_data_with_repeated_tokens(self) -> None: - """ - Test the _read_data method with repeated amino acid tokens to ensure it handles them correctly. - """ - raw_data = "MMMMM" - - expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5 # All tokens are 'M' - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not correctly handle repeated tokens.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb deleted file mode 100644 index 6f67c82b..00000000 --- a/tutorials/data_exploration_go.ipynb +++ /dev/null @@ -1,1341 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "da687d32ba48b188", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. 
It covers\n", - "- how to instantiate a data class and generate data\n", - "- how the data is processed and stored\n", - "- and how to work with different molecule encodings.\n", - "\n", - "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd", - "metadata": {}, - "source": [ - "# 1. Instantiation of a Data Class\n", - "\n", - "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a4d590fb-9a83-456e-9cb4-303caa8203e8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Already in the project root directory: G:\\github-aditya0by0\\python-chebai\n" - ] - } - ], - "source": [ - "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n", - "import os\n", - "\n", - "# Root directory name of the project\n", - "expected_root_dir = \"python-chebai\"\n", - "\n", - "# Check if the current directory ends with the expected root directory name\n", - "if not os.getcwd().endswith(expected_root_dir):\n", - " os.chdir(\"..\") # Move up one directory level\n", - " if os.getcwd().endswith(expected_root_dir):\n", - " print(\"Changed to project root directory:\", os.getcwd())\n", - " else:\n", - " print(\"Warning: Directory change unsuccessful. 
Current directory:\", os.getcwd())\n", - "else:\n", - " print(\"Already in the project root directory:\", os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "440f203ceaf7e4b7", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:03.920610Z", - "start_time": "2024-09-30T21:25:03.622407Z" - } - }, - "outputs": [], - "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250" - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a648346d81d0dc5e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:08.863132Z", - "start_time": "2024-09-30T21:25:08.387739Z" - } - }, - "outputs": [], - "source": [ - "go_class = GOUniProtOver250(go_branch=\"BP\")" - ] - }, - { - "cell_type": "markdown", - "id": "64585012b0d7f66f", - "metadata": {}, - "source": [ - "### Inheritance Hierarchy\n", - "\n", - "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", - "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", - "\n", - "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", - "\n", - "In summary, GO_UniProt data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - "\n", - "\n", - "### Configuration Parameters\n", - "\n", - "Data classes related to proteins can be configured using the following main parameters:\n", - "\n", - "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", - " - **`\"BP\"`**: Biological Process branch.\n", - " - **`\"MF\"`**: Molecular Function branch.\n", - " - **`\"CC\"`**: Cellular Component branch.\n", - "\n", - "- **`max_sequence_length (int)`**: Specifies the maximum allowed sequence length for a protein, with a default of `1002`. During data preprocessing, any proteins exceeding this length will be excluded from further processing.\n", - "\n", - "This allows for more specific datasets focused on a particular aspect of gene function.\n", - "\n", - "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", - "\n", - "### Additional Input Parameters\n", - "\n", - "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n", - "\n", - "\n", - "# Available Data Classes\n", - "\n", - "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py).\n", - "\n", - "There is a range of available dataset classes for GOUniProt classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection process is based on the annotations of the GO terms with its ancestors across the dataset. For instance, GOUniProtOver50 will only select labels which have at least 50 samples in the dataset.\n", - "\n", - "Refer `select_classes` method of `_GOUniProtOverX` for more details on selection process.\n", - "\n", - "If you need a different threshold, you can create your own subclass." - ] - }, - { - "cell_type": "markdown", - "id": "651ab5c39833bd2c", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966", - "metadata": {}, - "source": [ - "# 2. Preparation / Setup Methods\n", - "\n", - "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is to generate the actual dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9f77351090560bc4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n", - "Missing processed data file (`data.pkl` file)\n", - "Downloading Swiss UniProt data....\n", - "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Unzipping the file....\n", - "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n", - "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Missing Gene Ontology raw data\n", - "Downloading Gene Ontology data....\n", - "Extracting class hierarchy...\n", - "Compute transitive closure\n", - "Processing graph\n", - "Parsing swiss uniprot raw data....\n", - "Selecting GO terms based on given threshold: 250 ...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\\protein_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing transformed data (`data.pt` file). Transforming data.... 
\n", - "Processing 53604 lines...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████████████████████| 53604/53604 [01:18<00:00, 678.84it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving 20 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n", - "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n" - ] - } - ], - "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "2328e824c4dafb2d", - "metadata": {}, - "source": [ - "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. 
**`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", - "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." - ] - }, - { - "cell_type": "markdown", - "id": "db5b58f2d96823fc", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ee174b61b36c71aa", - "metadata": {}, - "source": [ - "# 3. Overview of the 3 preprocessing stages\n", - "\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", - "\n", - "1. **Raw Data Stage**:\n", - " - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n", - " - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n", - " - **File Paths**:\n", - " - `data/GO_UniProt/raw/go-basic.obo`\n", - " - `data/GO_UniProt/raw/uniprot_sprot.dat`\n", - "\n", - "2. **Processed Data Stage 1**:\n", - " - **File**: `data.pkl`\n", - " - **Description**: This stage includes the data after initial processing. 
It contains sequence strings, class columns, and metadata but lacks data splits.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", - " - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n", - "\n", - "3. **Processed Data Stage 2**:\n", - " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", - "\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n", - "\n", - "### Summary of File Paths\n", - "\n", - "- **Raw Data**: `data/GO_UniProt/raw`\n", - "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n", - "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n", - "\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", - "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" - ] - }, - { - "cell_type": "markdown", - "id": "a927ad484c930960", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "3f92b58e460c08fd", - "metadata": {}, - "source": [ - "# 4. Data Files and their structure\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various protein and ontology data and metadata essential for different tasks. Let’s explore these files and their content.\n" - ] - }, - { - "cell_type": "markdown", - "id": "cca75d881cb8bade", - "metadata": {}, - "source": [ - "## go-basic.obo File\n", - "\n", - "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", - "\n", - "#### Example of a Term Document\n", - "\n", - "```plaintext\n", - "[Term]\n", - "id: GO:0000032\n", - "name: cell wall mannoprotein biosynthetic process\n", - "namespace: biological_process\n", - "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n", - "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n", - "is_a: GO:0006057 ! mannoprotein biosynthetic process\n", - "is_a: GO:0031506 ! 
cell wall glycoprotein biosynthetic process\n", - "```\n", - "\n", - "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n", - "\n", - "### Structure of `go-basic.obo`\n", - "\n", - "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", - "\n", - "\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", - "\n", - "- **`[Term]`**: \n", - " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct biological process, molecular function, or cellular component.\n", - "\n", - "- **`id: GO:0000032`**: \n", - " - **Description**: A unique identifier for the biological term within the GO ontology.\n", - " - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n", - "\n", - "- **`name: cell wall mannoprotein biosynthetic process`**: \n", - " - **Description**: The name of the biological process, molecular function, or cellular component being described.\n", - " - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n", - "\n", - "- **`namespace: biological_process`**: \n", - " - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n", - "\n", - "- **`is_a: GO:0006057`**: \n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. 
The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", - " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" - ] - }, - { - "cell_type": "markdown", - "id": "87c841de7d80beef", - "metadata": {}, - "source": [ - "## uniprot_sprot.dat File\n", - "\n", - "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotations. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", - "\n", - "\n", - "### Example of a Protein Entry\n", - "\n", - "```plaintext\n", - "ID 002L_FRG3G Reviewed; 320 AA.\n", - "AC Q6GZX3;\n", - "DT 28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n", - "DT 19-JUL-2004, sequence version 1.\n", - "DT 08-NOV-2023, entry version 46.\n", - "DE RecName: Full=Uncharacterized protein 002L;\n", - "GN ORFNames=FV3-002L;\n", - "OS Frog virus 3 (isolate Goorha) (FV-3).\n", - "OC Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n", - "OX NCBI_TaxID=654924;\n", - "OH NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n", - "RN [1]\n", - "RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", - "RX PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n", - "RA Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n", - "RT \"Comparative genomic analyses of frog virus 3, type species of the genus\n", - "RT Ranavirus (family Iridoviridae).\";\n", - "RL Virology 323:70-84(2004).\n", - "CC -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n", - "CC protein {ECO:0000305}.\n", - "DR EMBL; AY548484; AAT09661.1; 
-; Genomic_DNA.\n", - "DR RefSeq; YP_031580.1; NC_005946.1.\n", - "DR GeneID; 2947774; -.\n", - "DR KEGG; vg:2947774; -.\n", - "DR Proteomes; UP000008770; Segment.\n", - "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", - "DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n", - "PE 4: Predicted;\n", - "KW Host membrane; Membrane; Reference proteome; Transmembrane;\n", - "KW Transmembrane helix.\n", - "FT CHAIN 1..320\n", - "FT /note=\"Uncharacterized protein 002L\"\n", - "FT /id=\"PRO_0000410509\"\n", - "SQ SEQUENCE 320 AA; 34642 MW; 9E110808B6E328E0 CRC64;\n", - " MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n", - " IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n", - " AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n", - " KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n", - " DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n", - " VMFFVAGAVL VAILISTVRW\n", - "//\n", - "```\n", - "\n", - "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n", - "\n", - "\n", - "## Structure of `uniprot_sprot.dat`\n", - "\n", - "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. 
Here's a breakdown of the key attributes:\n", - "\n", - "- **`ID`**: \n", - " - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n", - " - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n", - "\n", - "- **`AC`**: \n", - " - **Description**: Accession number, a unique identifier for the protein sequence.\n", - " - **Example**: `Q6GZX3` is the accession number for this entry.\n", - "\n", - "- **`DR`**: \n", - " - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n", - " - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n", - "\n", - "- **`GO`**: \n", - " - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n", - " - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n", - "\n", - "- **`SQ`**: \n", - " - **Description**: The amino acid sequence of the protein.\n", - " - **Example**: The sequence consists of 320 amino acids.\n", - "\n", - "__Note__: For more detailed information refer [here](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n", - "). \n", - "\n", - "Consider the below line from above example: \n", - "```plaintext\n", - "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", - "```\n", - "\n", - "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. 
Here's a detailed breakdown:\n", - "\n", - "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", - "\n", - "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n", - "__Note__: For more details on evidence codes check section 5.2" - ] - }, - { - "cell_type": "markdown", - "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e", - "metadata": {}, - "source": [ - "## data.pkl File\n", - "\n", - "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b4da7e73e251e1d1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:08:33.990378Z", - "start_time": "2024-09-30T14:08:33.959459Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b66fbb9b720d053c", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:10:12.796911Z", - "start_time": "2024-09-30T14:10:06.052276Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (53604, 902)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
swiss_idaccessiongo_idssequence4175122165209226...1990778200002620001452000146200014720002412000243200114120012332001234
111S1_CARILB5KVH4[3006, 8150, 9791, 10431, 21700, 22414, 32501,...MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
311S2_SESINQ9XHP0[3006, 8150, 10431, 21700, 22414, 32502, 48609]MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
614310_ARATHP48347,Q9LME5[7165, 8150, 9742, 9755, 9987, 43401, 50789, 5...MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
814331_ARATHP42643,Q945M2,Q9M0S7[8150, 19222, 50789, 65007]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 226, 1708, 6611, 6810, 6886, 6913, 6950,...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseTrue...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 902 columns

\n", - "
" - ], - "text/plain": [ - " swiss_id accession \\\n", - "1 11S1_CARIL B5KVH4 \n", - "3 11S2_SESIN Q9XHP0 \n", - "6 14310_ARATH P48347,Q9LME5 \n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "\n", - " go_ids \\\n", - "1 [3006, 8150, 9791, 10431, 21700, 22414, 32501,... \n", - "3 [3006, 8150, 10431, 21700, 22414, 32502, 48609] \n", - "6 [7165, 8150, 9742, 9755, 9987, 43401, 50789, 5... \n", - "8 [8150, 19222, 50789, 65007] \n", - "9 [132, 226, 1708, 6611, 6810, 6886, 6913, 6950,... \n", - "\n", - " sequence 41 75 122 \\\n", - "1 MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE... False False False \n", - "3 MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL... False False False \n", - "6 MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV... False False False \n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "\n", - " 165 209 226 ... 1990778 2000026 2000145 2000146 2000147 \\\n", - "1 False False False ... False False False False False \n", - "3 False False False ... False False False False False \n", - "6 False False False ... False False False False False \n", - "8 False False False ... False False False False False \n", - "9 False False True ... 
False False False False False \n", - "\n", - " 2000241 2000243 2001141 2001233 2001234 \n", - "1 False False False False False \n", - "3 False False False False False \n", - "6 False False False False False \n", - "8 False False False False False \n", - "9 False False False False False \n", - "\n", - "[5 rows x 902 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(\n", - " pd.read_pickle(\n", - " os.path.join(\n", - " go_class.processed_dir_main,\n", - " go_class.processed_dir_main_file_names_dict[\"data\"],\n", - " )\n", - " )\n", - ")\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "735844f0b2474ad6", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` has the following structure: \n", - "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", - "- **Column 1**: Contains the accession of each Protein data instance.\n", - "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", - "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", - "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ] - }, - { - "cell_type": "markdown", - "id": "2c9b17f6-93bd-4cc3-8967-7ab1d2e06e51", - "metadata": {}, - "source": [ - "## data.pt File\n", - "\n", - "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. 
It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "85b097601fb242d6", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:10:35.034002Z", - "start_time": "2024-09-30T14:10:35.018342Z" - } - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "289a54a71dec20fb", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:11:36.443693Z", - "start_time": "2024-09-30T14:11:34.199285Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n", - "Content of the data file: \n", - " {'features': [10, 14, 21, 23, 12, 17, 17, 11, 12, 22, 17, 24, 17, 12, 12, 28, 14, 17, 25, 19, 13, 24, 17, 14, 18, 11, 13, 13, 16, 18, 18, 29, 21, 25, 13, 18, 24, 18, 17, 19, 16, 17, 20, 14, 17, 27, 23, 15, 19, 16, 12, 27, 14, 27, 14, 13, 28, 12, 27, 11, 26, 20, 23, 19, 29, 18, 18, 17, 18, 24, 14, 13, 28, 14, 28, 28, 16, 16, 15, 12, 27, 23, 19, 13, 17, 17, 17, 23, 29, 22, 11, 19, 14, 23, 18, 17, 28, 22, 12, 14, 16, 13, 16, 13, 12, 15, 13, 28, 17, 25, 23, 13, 24, 23, 27, 15, 25, 27, 27, 11, 18, 16, 18, 11, 18, 18, 13, 18, 16, 16, 27, 25, 18, 18, 20, 16, 29, 18, 21, 12, 16, 29, 25, 16, 27, 13, 20, 12, 12, 14, 25, 23, 14, 13, 28, 14, 29, 26, 24, 22, 19, 20, 13, 11, 11, 23, 28, 28, 14, 12, 25, 17, 17, 20, 15, 29, 19, 19, 14, 19, 18, 17, 20, 18, 19, 23, 16, 19, 25, 22, 17, 14, 13, 19, 23, 20, 20, 27, 25, 16, 23, 18, 13, 18, 18, 27, 22, 27, 18, 29, 16, 16, 18, 18, 18, 29, 18, 18, 16, 16, 13, 27, 29, 13, 27, 18, 18, 16, 20, 17, 13, 19, 19, 28, 25, 11, 13, 25, 20, 14, 27, 25, 17, 14, 20, 14, 25, 19, 28, 20, 15, 27, 15, 14, 16, 16, 17, 18, 11, 27, 19, 20, 29, 16, 13, 11, 12, 28, 16, 28, 27, 13, 16, 18, 17, 18, 28, 12, 16, 23, 16, 26, 11, 16, 27, 27, 18, 27, 29, 27, 27, 16, 21, 27, 16, 27, 16, 27, 16, 27, 11, 27, 11, 27, 16, 16, 18, 11, 16, 16, 
13, 13, 16, 20, 20, 19, 13, 17, 27, 27, 15, 12, 24, 15, 17, 11, 17, 16, 27, 19, 12, 13, 20, 23, 11, 16, 14, 20, 12, 22, 15, 27, 27, 14, 13, 16, 12, 11, 15, 28, 19, 11, 29, 19, 17, 23, 12, 17, 16, 26, 17, 18, 17, 11, 14, 27, 16, 13, 14, 17, 22, 11, 20, 14, 17, 22, 28, 23, 29, 26, 19, 17, 19, 14, 29, 11, 28, 28, 22, 14, 17, 16, 13, 16, 14, 27, 28, 18, 28, 28, 20, 19, 25, 13, 18, 15, 28, 25, 20, 20, 27, 17, 16, 27, 13, 18, 17, 17, 15, 12, 23, 18, 19, 25, 14, 28, 28, 21, 16, 14, 16, 20, 27, 13, 25, 27, 26, 28, 11, 25, 21, 15, 19, 27, 19, 14, 10, 28, 11, 23, 17, 14, 13, 16, 15, 11, 14, 12, 16, 14, 17, 23, 27, 27, 28, 17, 28, 19, 14, 25, 18, 12, 23, 16, 27, 20, 14, 16, 16, 17, 21, 25, 19, 16, 18, 27, 11, 15, 17, 28, 16, 11, 16, 11, 16, 11, 11, 16, 11, 27, 16, 16, 14, 27, 28], 'labels': array([False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, True, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", 
- " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, True, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, True, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, True, False, False, False, False, False, True,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, 
False, False, False, False, True, True, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, True,\n", - " True, False, False, False, False, False, False, False, False,\n", - " True, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, 
False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False]), 'ident': '11S1_CARIL', 'group': None}\n" - ] - } - ], - "source": [ - "data_pt = torch.load(\n", - " 
os.path.join(go_class.processed_dir, go_class.processed_file_names_dict[\"data\"]),\n", - " weights_only=False,\n", - ")\n", - "print(\"Type of loaded data:\", type(data_pt))\n", - "print(\"Content of the data file: \\n\", data_pt[0])" - ] - }, - { - "cell_type": "markdown", - "id": "2c9f23883c66b48d", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", - "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", - "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" - ] - }, - { - "cell_type": "markdown", - "id": "36aed0b8-ab05-428d-8833-2a24deebacc3", - "metadata": {}, - "source": [ - "## classes.txt File\n", - "\n", - "**Description**: This file lists the GO classes that are used as labels. 
It can be used to match labels in `data.pt` with GO classes: For position `i` in the label-tensor, the GO-ID is in line `i` of `classes.txt`" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "19200f7ff9a6ebba", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:30:34.344202Z", - "start_time": "2024-09-30T21:30:34.328318Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "41\n", - "75\n", - "122\n", - "165\n", - "209\n" - ] - } - ], - "source": [ - "with open(os.path.join(go_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n", - " for i in range(5):\n", - " line = file.readline()\n", - " print(line.strip())" - ] - }, - { - "cell_type": "markdown", - "id": "f69012b3540fd1b6", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", - "\n", - "The `classes.txt` file lists selected GO classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID, identifying a specific GO term that is used as a label." - ] - }, - { - "cell_type": "markdown", - "id": "b81ea34f-cfa8-4ffa-8b88-b54ca96afd84", - "metadata": {}, - "source": [ - "## splits.csv File\n", - "\n", - "**Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "88c3ea8f01ba9fac", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:30:41.586616Z", - "start_time": "2024-09-30T21:30:39.318598Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
014331_ARATHtrain
114331_CAEELtrain
214331_MAIZEtrain
314332_MAIZEtrain
414333_ARATHtrain
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 14331_ARATH train\n", - "1 14331_CAEEL train\n", - "2 14331_MAIZE train\n", - "3 14332_MAIZE train\n", - "4 14333_ARATH train" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(os.path.join(go_class.processed_dir_main, \"splits.csv\"))\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "6661dc11247e9753", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", - "\n", - "To reuse an existing split, you can use the `splits_file_path` argument. This way, you can reuse the same datasplit across several runs." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2b02d8b4-c2de-4b8e-b680-ec67b40d9a30", - "metadata": {}, - "outputs": [], - "source": [ - "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined,\n", - "# you can use its existing `splits_file_path` attribute for consistency.\n", - "go_class_with_splits = GOUniProtOver250(\n", - " go_branch=\"BP\",\n", - " # splits_file_path=\"data/GO_UniProt/GO250_BP_1002/processed/splits.csv\", # Literal path option\n", - " splits_file_path=go_class.splits_file_path, # Use path from an existing `go_class` instance\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e6b1f184a5091b83", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "481b8c0271ec9636", - "metadata": {}, - "source": [ - "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", - "\n", - "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). 
\n", - "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", - "\n", - "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", - "- **Sequence Length**: 147\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### Tokenization and Encoding\n", - "\n", - "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n", - "\n", - "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "e0cf4fb6-2ca4-4b85-a4e7-0cfbac5cd6c1", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.reader import ProteinDataReader" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "e8343d83-0be3-44df-9224-bba8d5c32336", - "metadata": {}, - "outputs": [], - "source": [ - "protein_dr_3gram = ProteinDataReader(n_gram=3)\n", - "protein_dr = ProteinDataReader()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8a18dc27-f308-4dde-b1ae-b03a20fb0d45", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]\n", - "[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]\n" - ] - } - ], - "source": [ - "protein = \"MRSLLILVLCFLPLAALGK\"\n", - "print(protein_dr._read_data(protein))\n", - "print(protein_dr_3gram._read_data(protein))" - ] - }, - { - "cell_type": "markdown", - "id": "7e95738a-0b2d-4c56-ac97-f3b24c1de18f", - "metadata": {}, - "source": [ - "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n", - "\n", - "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing." 
- ] - }, - { - "cell_type": "markdown", - "id": "fd54ca4a-743c-496e-9e89-cff2d8226eb2", - "metadata": {}, - "source": [ - "### The 20 Amino Acids and Their One-Letter Notations\n", - "\n", - "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", - "\n", - "| One-Letter Notation | Amino Acid Name | Description |\n", - "|---------------------|----------------------|---------------------------------------------------------|\n", - "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", - "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. |\n", - "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **F** | Phenylalanine | Aromatic, non-polar. |\n", - "| **G** | Glycine | Smallest amino acid, non-polar. |\n", - "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", - "| **I** | Isoleucine | Non-polar, aliphatic. |\n", - "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", - "| **L** | Leucine | Non-polar, aliphatic. |\n", - "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", - "| **N** | Asparagine | Polar, uncharged. |\n", - "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", - "| **Q** | Glutamine | Polar, uncharged. |\n", - "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", - "| **S** | Serine | Polar, can be phosphorylated. |\n", - "| **T** | Threonine | Polar, can be phosphorylated. |\n", - "| **V** | Valine | Non-polar, aliphatic. |\n", - "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", - "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. 
|\n", - "\n", - "### Understanding Protein Sequences\n", - "\n", - "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", - "\n", - "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", - "\n", - "\n", - "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" - ] - }, - { - "cell_type": "markdown", - "id": "db6d7f2cc446e6f9", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "7f42b928364e5cd1", - "metadata": {}, - "source": [ - "## 5.2 More on GO Evidence Codes\n", - "\n", - "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the GO evidence codes with brief descriptions:\n", - "\n", - "| **Evidence Code** | **Description** |\n", - "|-----------------------|-----------------|\n", - "| **EXP** | [Inferred from Experiment (EXP)](http://wiki.geneontology.org/index.php/Inferred_from_Experiment_(EXP)) |\n", - "| **IDA** | [Inferred from Direct Assay (IDA)](http://wiki.geneontology.org/index.php/Inferred_from_Direct_Assay_(IDA)) |\n", - "| **IPI** | [Inferred from Physical Interaction (IPI)](http://wiki.geneontology.org/index.php/Inferred_from_Physical_Interaction_(IPI)) |\n", - "| **IMP** | [Inferred from Mutant Phenotype (IMP)](http://wiki.geneontology.org/index.php/Inferred_from_Mutant_Phenotype_(IMP)) |\n", - "| **IGI** | [Inferred from Genetic Interaction (IGI)](http://wiki.geneontology.org/index.php/Inferred_from_Genetic_Interaction_(IGI)) |\n", - "| **IEP** | [Inferred from Expression Pattern (IEP)](http://wiki.geneontology.org/index.php/Inferred_from_Expression_Pattern_(IEP)) |\n", - "| **HTP** | [Inferred from High Throughput Experiment 
(HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP) ) |\n", - "| **HDA** | [Inferred from High Throughput Direct Assay (HDA)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Direct_Assay_(HDA)) |\n", - "| **HMP** | [Inferred from High Throughput Mutant Phenotype (HMP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Mutant_Phenotype_(HMP)) |\n", - "| **HGI** | [Inferred from High Throughput Genetic Interaction (HGI)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Genetic_Interaction_(HGI)) |\n", - "| **HEP** | [Inferred from High Throughput Expression Pattern (HEP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Expression_Pattern_(HEP)) |\n", - "| **IBA** | [Inferred from Biological aspect of Ancestor (IBA)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Ancestor_(IBA)) |\n", - "| **IBD** | [Inferred from Biological aspect of Descendant (IBD)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Descendant_(IBD)) |\n", - "| **IKR** | [Inferred from Key Residues (IKR)](http://wiki.geneontology.org/index.php/Inferred_from_Key_Residues_(IKR)) |\n", - "| **IRD** | [Inferred from Rapid Divergence (IRD)](http://wiki.geneontology.org/index.php/Inferred_from_Rapid_Divergence(IRD)) |\n", - "| **ISS** | [Inferred from Sequence or Structural Similarity (ISS)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_or_structural_Similarity_(ISS)) |\n", - "| **ISO** | [Inferred from Sequence Orthology (ISO)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Orthology_(ISO)) |\n", - "| **ISA** | [Inferred from Sequence Alignment (ISA)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Alignment_(ISA)) |\n", - "| **ISM** | [Inferred from Sequence Model (ISM)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Model_(ISM)) |\n", - "| **RCA** | [Inferred from Reviewed 
Computational Analysis (RCA)](http://wiki.geneontology.org/index.php/Inferred_from_Reviewed_Computational_Analysis_(RCA)) |\n", - "| **IEA** | [Inferred from Electronic Annotation (IEA)](http://wiki.geneontology.org/index.php/Inferred_from_Electronic_Annotation_(IEA)) |\n", - "| **TAS** | [Traceable Author Statement (TAS)](http://wiki.geneontology.org/index.php/Traceable_Author_Statement_(TAS)) |\n", - "| **NAS** | [Non-traceable Author Statement (NAS)](http://wiki.geneontology.org/index.php/Non-traceable_Author_Statement_(NAS)) |\n", - "| **IC** | [Inferred by Curator (IC)](http://wiki.geneontology.org/index.php/Inferred_by_Curator_(IC)) |\n", - "| **ND** | [No Biological Data Available (ND)](http://wiki.geneontology.org/index.php/No_biological_Data_available_(ND)_evidence_code) |\n", - "| **NR** | Not Recorded |\n", - "\n", - "\n", - "### **Grouping of Codes**:\n", - "\n", - "- **Experimental Evidence Codes**:\n", - " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", - " \n", - "- **High-Throughput Experimental Codes**:\n", - " - **HTP**, **HDA**, **HMP**, **HGI**, **HEP**\n", - "\n", - "- **Phylogenetically-Inferred Codes**:\n", - " - **IBA**, **IBD**, **IKR**, **IRD**\n", - "\n", - "- **Author/Curator Inferred Codes**:\n", - " - **TAS**, **IC**, **NAS**\n", - "\n", - "- **Computational Evidence Codes**:\n", - " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", - "\n", - "- **Others**:\n", - " - **ND** (No Biological Data Available), **NR** (Not Recorded)\n", - "\n", - "\n", - "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation.\n", - "\n", - "__Note__ : For more information on GO evidence codes please check [here](https://geneontology.org/docs/guide-go-evidence-codes/) " - ] - }, - { - "cell_type": "markdown", - "id": "1c11d6f520b02434", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 
(ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/data_exploration_scope.ipynb b/tutorials/data_exploration_scope.ipynb deleted file mode 100644 index c14046ac..00000000 --- a/tutorials/data_exploration_scope.ipynb +++ /dev/null @@ -1,1182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the SCOPe dataset. It covers\n", - "- how to instantiate a data class and generate data\n", - "- how the data is processed and stored\n", - "- and how to work with different molecule encodings.\n", - "\n", - "The `chebai` package simplifies the handling of these datasets by **automatically downloading and processing** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. 
You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "id": "cca637ce-d4ea-4365-acd9-657418e0640f", - "metadata": {}, - "source": [ - "### Overview of SCOPe Data and its Usage in Protein-Related Tasks\n", - "\n", - "#### **What is SCOPe?**\n", - "\n", - "The **Structural Classification of Proteins — extended (SCOPe)** is a comprehensive database that extends the original SCOP (Structural Classification of Proteins) database. SCOPe offers a detailed classification of protein domains based on their structural and evolutionary relationships.\n", - "\n", - "The SCOPe database, like SCOP, organizes proteins into a hierarchy of domains based on structural similarities, which is crucial for understanding evolutionary patterns and functional aspects of proteins. This hierarchical structure is comparable to taxonomy in biology, where species are classified based on shared characteristics.\n", - "\n", - "#### **SCOPe Hierarchy:**\n", - "By analogy with taxonomy, SCOP was created as a hierarchy of several levels where the fundamental unit of classification is a **domain** in the experimentally determined protein structure. Starting at the bottom, the hierarchy of SCOP domains comprises the following levels:\n", - "\n", - "1. **Species**: Representing distinct protein sequences and their naturally occurring or artificially created variants.\n", - "2. **Protein**: Groups together similar sequences with essentially the same functions. These can originate from different biological species or represent isoforms within the same species.\n", - "3. **Family**: Contains proteins with similar sequences but typically distinct functions.\n", - "4. **Superfamily**: Bridges protein families with common functional and structural features, often inferred from a shared evolutionary ancestor.\n", - "5. **Fold**: Groups structurally similar superfamilies. \n", - "6. 
**Class**: Based on secondary structure content and organization. This level classifies proteins based on their secondary structure properties, such as alpha-helices and beta-sheets.\n", - "\n", - "\n", - "\n", - "For more details, you can refer to the [SCOPe documentation](https://scop.berkeley.edu/help/ver=2.08).\n", - "\n", - "---\n", - "\n", - "#### **Why are We Using SCOPe?**\n", - "\n", - "We are integrating the SCOPe data into our pipeline as part of an ontology pretraining task for protein-related models. SCOPe is a great fit for our goal because it is primarily **structure-based**, unlike other protein-related databases like Gene Ontology (GO), which focuses more on functional classes.\n", - "\n", - "Our primary objective is to reproduce **ontology pretraining** on a protein-related task, and SCOPe provides the structural ontology that we need for this. The steps in our pipeline are aligned as follows:\n", - "\n", - "| **Stage** | **Chemistry Task** | **Proteins Task** |\n", - "|--------------------------|-------------------------------------|------------------------------------------------|\n", - "| **Unsupervised Pretraining** | Mask pretraining (ELECTRA) | Mask pretraining (ESM2, optional) |\n", - "| **Ontology Pretraining** | ChEBI | SCOPe |\n", - "| **Finetuning Task** | Toxicity, Solubility, etc. | GO (MF, BP, CC branches) |\n", - "\n", - " \n", - "This integration will allow us to use **SCOPe** for tasks such as **protein classification** and will contribute to the success of **pretraining models** for protein structures. The data will be processed with the same approach as the GO data, with **different labels** corresponding to the SCOPe classification system.\n", - "\n", - "---\n", - "\n", - "#### **Why SCOPe is Suitable for Our Task**\n", - "\n", - "1. **Structure-Based Classification**: SCOPe is primarily concerned with the structural characteristics of proteins, making it ideal for protein structure pretraining tasks. 
This contrasts with other ontology databases like **GO**, which categorize proteins based on more complex functional relationships.\n", - " \n", - "2. **Manageable Size**: SCOPe contains around **140,000 entries**, making it a manageable dataset for training models. This is similar in size to **ChEBI**, which is used in the chemical domain, and ensures we can work with it effectively for pretraining." - ] - }, - { - "cell_type": "markdown", - "id": "338e452f-426c-493d-bec2-5bd51e24e4aa", - "metadata": {}, - "source": [ - "\n", - "### Protein Data Bank (PDB)\n", - "\n", - "The **Protein Data Bank (PDB)** is a global repository that stores 3D structural data of biological macromolecules like proteins and nucleic acids. It contains information obtained through experimental methods such as **X-ray crystallography**, **NMR spectroscopy**, and **cryo-EM**. The data includes atomic coordinates, secondary structure details, and experimental conditions.\n", - "\n", - "The PDB is an essential resource for **structural biology**, **bioinformatics**, and **drug discovery**, enabling scientists to understand protein functions, interactions, and mechanisms at the molecular level.\n", - "\n", - "For more details, visit the [RCSB PDB website](https://www.rcsb.org/).\n" - ] - }, - { - "cell_type": "markdown", - "id": "f6c25706-251c-438c-9915-e8002647eb94", - "metadata": {}, - "source": [ - "### Understanding [SCOPe](https://scop.berkeley.edu/) and [PDB](https://www.rcsb.org/) \n", - "\n", - "\n", - "1. **Protein domains form chains.** \n", - "2. **Chains form complexes** (protein complexes or structures). \n", - "3. These **complexes are the entries in PDB**, represented by unique identifiers like `\"1A3N\"`. \n", - "\n", - "---\n", - "\n", - "#### **Protein Domain** \n", - "A **protein domain** is a **structural and functional unit** of a protein. 
\n", - "\n", - "\n", - "##### Key Characteristics:\n", - "- **Domains are part of a protein chain.** \n", - "- A domain can span: \n", - " 1. **The entire chain** (single-domain protein): \n", - " - In this case, the protein domain is equivalent to the chain itself. \n", - " - Example: \n", - " - All chains of the **PDB structure \"1A3N\"** are single-domain proteins. \n", - " - Each chain has a SCOPe domain identifier. \n", - " - For example, Chain **A**: \n", - " - Domain identifier: `d1a3na_` \n", - " - Breakdown of the identifier: \n", - " - `d`: Denotes domain. \n", - " - `1a3n`: Refers to the PDB protein structure identifier. \n", - " - `a`: Specifies the chain within the structure. (`_` for None and `.` for multiple chains)\n", - " - `_`: Indicates the domain spans the entire chain (single-domain protein). \n", - " - Example: [PDB Structure 1A3N - Chain A](https://www.rcsb.org/sequence/1A3N#A)\n", - " 2. **A specific portion of the chain** (multi-domain protein): \n", - " - Here, a single chain contains multiple domains. \n", - " - Example: Chain **A** of the **PDB structure \"1PKN\"** contains three domains: `d1pkna1`, `d1pkna2`, `d1pkna3`. \n", - " - Example: [PDB Structure 1PKN - Chain A](https://www.rcsb.org/annotations/1PKN). \n", - "\n", - "---\n", - "\n", - "#### **Protein Chain** \n", - "A **protein chain** refers to the entire **polypeptide chain** observed in a protein's 3D structure (as described in PDB files). \n", - "\n", - "##### Key Points:\n", - "- A chain can consist of **one or multiple domains**:\n", - " - **Single-domain chain**: The chain and domain are identical. \n", - " - Example: Myoglobin. \n", - " - **Multi-domain chain**: Contains several domains, each with distinct structural and functional roles. \n", - "- Chains assemble to form **protein complexes** or **structures**. 
\n", - "\n", - "\n", - "---\n", - "\n", - "#### **Key Observations About SCOPe** \n", - "- The **fundamental classification unit** in SCOPe is the **protein domain**, not the entire protein. \n", - "- _**The taxonomy in SCOPe is not for the entire protein (i.e., the full-length amino acid sequence as encoded by a gene) but for protein domains, which are smaller, structurally and functionally distinct regions of the protein.**_\n", - "\n", - "\n", - "--- \n", - "\n", - "**SCOPe 2.08 Data Analysis:**\n", - "\n", - "The current SCOPe version (2.08) includes the following statistics based on analysis for relevant data:\n", - "\n", - "- **Classes**: 12\n", - "- **Folds**: 1485\n", - "- **Superfamilies**: 2368\n", - "- **Families**: 5431\n", - "- **Proteins**: 13,514\n", - "- **Species**: 30,294\n", - "- **Domains**: 344,851\n", - "\n", - "For more detailed statistics, please refer to the official SCOPe website:\n", - "\n", - "- [SCOPe 2.08 Statistics](https://scop.berkeley.edu/statistics/ver=2.08)\n", - "- [SCOPe 2.08 Release](https://scop.berkeley.edu/ver=2.08)\n", - "\n", - "---\n", - "\n", - "## SCOPe Labeling \n", - "\n", - "- Use SCOPe labels for protein domains.\n", - "- Map them back to their **protein-chain** sequences (protein sequence label = sum of all domain labels).\n", - "- Train on protein sequences.\n", - "- This pretraining task would be comparable to GO-based training.\n", - "\n", - "--- " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "990cc6f2-6b4a-4fa7-905f-dda183c3ec4c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Changed to project root directory: G:\\github-aditya0by0\\python-chebai\n" - ] - } - ], - "source": [ - "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n", - "import os\n", - "\n", - "# Root directory name of the project\n", - "expected_root_dir = \"python-chebai\"\n", - "\n", - "# Check if the 
current directory ends with the expected root directory name\n", - "if not os.getcwd().endswith(expected_root_dir):\n", - " os.chdir(\"..\") # Move up one directory level\n", - " if os.getcwd().endswith(expected_root_dir):\n", - " print(\"Changed to project root directory:\", os.getcwd())\n", - " else:\n", - " print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n", - "else:\n", - " print(\"Already in the project root directory:\", os.getcwd())" - ] - }, - { - "cell_type": "markdown", - "id": "4550d01fc7af5ae4", - "metadata": {}, - "source": [ - "# 1. Instantiation of a Data Class\n", - "\n", - "To start working with `chebai`, you first need to instantiate a SCOPe data class. This class is responsible for managing, interacting with, and preprocessing the SCOPe protein data." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.datasets.scope.scope import SCOPeOver50" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:07:26.371796Z", - "start_time": "2024-10-05T21:07:26.058728Z" - } - }, - "outputs": [], - "source": [ - "scope_class = SCOPeOver50(scope_version=\"2.08\")" - ] - }, - { - "cell_type": "markdown", - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", - "metadata": {}, - "source": [ - "\n", - "### Inheritance Hierarchy\n", - "\n", - "SCOPe data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). 
Specifically:\n", - "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", - "\n", - "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", - "\n", - "In summary, SCOPe data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - "\n", - "\n", - "### Input parameters\n", - "A SCOPe data class can be configured with a range of parameters, including:\n", - "\n", - "- **scope_version (str)**: Specifies the version of the SCOPe database to be used. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", - "\n", - "- **scope_version_train (str, optional)**: The version of SCOPe to use specifically for training and validation. If not set, the `scope_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", - "\n", - "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n", - "\n", - "### Additional Input Parameters\n", - "\n", - "To get more control over various aspects of data loading, processing, and splitting, you can refer to the documentation of additional parameters in the docstrings of the respective classes: [`_SCOPeDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py#L31), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", - "metadata": {}, - "source": [ - "# Available SCOPe Data Classes\n", - "\n", - "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py):\n", - "\n", - "There is a range of available dataset classes for SCOPe. Usually, you want to use `SCOPeOver2000` or `SCOPeOver50`. The number indicates the threshold for selecting label classes: SCOPe classes which have at least 2000 / 50 subclasses will be used as labels.\n", - "\n", - "Both inherit from `SCOPeOverX`. If you need a different threshold, you can create your own subclass. By default, `SCOPeOverX` uses the Protein encoding (see Section 5).\n", - "\n", - "Finally, `SCOPeOver2000Partial` extracts a part of SCOPe based on a given top class, with a threshold of 2000 for selecting labels.\n", - "This class inherits from `SCOPEOverXPartial`.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8456b545-88c5-401d-baa5-47e8ae710f04", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ed973fb59df11849", - "metadata": {}, - "source": [ - "# 2. Preparation / Setup Methods\n", - "\n", - "Now we have a SCOPe data class with all the relevant parameters. 
Next, we need to generate the actual dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "11f2208e-fa40-44c9-bfe7-576ca23ad366", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking for processed data in data\\SCOPe\\version_2.08\\SCOPe50\\processed\n", - "Missing processed data file (`data.pkl` file)\n", - "Missing PDB raw data, Downloading PDB sequence data....\n", - "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmpsif7r129\n", - "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmpsif7r129\n", - "Unzipping the file....\n", - "Unpacked and saved to data\\SCOPe\\pdb_sequences.txt\n", - "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmpsif7r129\n", - "Missing Scope: cla.txt raw data, Downloading...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "G:\\anaconda3\\envs\\env_chebai\\lib\\site-packages\\urllib3\\connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'scop.berkeley.edu'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", - "warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing Scope: hie.txt raw data, Downloading...\n", - "Missing Scope: des.txt raw data, Downloading...\n", - "Extracting class hierarchy...\n", - "Computing transitive closure\n", - "Process graph\n", - "101 labels has been selected for specified threshold, \n", - "Constructing data.pkl file .....\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\SCOPe\\version_2.08\\SCOPe50\\processed\\protein_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing transformed data (`data.pt` file). Transforming data.... 
\n", - "Processing 60298 lines...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████| 60298/60298 [00:53<00:00, 1119.10it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving 21 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n", - "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n" - ] - } - ], - "source": [ - "scope_class.prepare_data()\n", - "scope_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "1655d489-25fe-46de-9feb-eeca5d36936f", - "metadata": {}, - "source": [ - "\n", - "### Automatic Execution: \n", - "These methods are executed automatically when using the training command `chebai fit`. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required SCOPe data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. 
This step is independent of input encodings.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the tokenization.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", - "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." - ] - }, - { - "cell_type": "markdown", - "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "bb6e9a81554368f7", - "metadata": {}, - "source": [ - "# 3. Overview of the 3 preprocessing stages\n", - "\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", - "\n", - "1. **Raw Data Stage**:\n", - " - **Files**: `cla.txt`, `des.txt` and `hie.txt`. Please find description of each file [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n", - " - **Description**: This stage contains the raw SCOPe data in txt format, serving as the initial input for further processing.\n", - " - **File Path**: `data/SCOPe/version_${scope_version}/raw/${filename}.txt`\n", - "\n", - "2. 
**Processed Data Stage 1**:\n", - " - **File**: `data.pkl`\n", - " - **Description**: This stage includes the data after initial processing. It contains protein sequence strings, class columns, and metadata but lacks data splits.\n", - " - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n", - " - **Additional File**: `classes.txt` - A file listing the relevant SCOPe classes.\n", - "\n", - "3. **Processed Data Stage 2**:\n", - " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", - " - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", - "\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n", - "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" - ] - }, - { - "cell_type": "markdown", - "id": "7e172c0d1e8bb93f", - "metadata": {}, - "source": [ - "# 4. Data Files and their structure\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. 
Let’s explore these files and their content.\n" - ] - }, - { - "cell_type": "markdown", - "id": "43329709-5134-4ce5-88e7-edd2176bf84d", - "metadata": {}, - "source": [ - "## raw files\n", - "- cla.txt, des.txt and hie.txt\n", - "\n", - "For a detailed description of the raw files and their structures, please refer to the official website [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n" - ] - }, - { - "cell_type": "markdown", - "id": "558295e5a7ded456", - "metadata": {}, - "source": [ - "## data.pkl File\n", - "\n", - "**Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the ids, the sids which are used to label the corresponding sequence, the protein-chain sequence, and columns for each label with boolean values." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fd490270-59b8-4c1c-8b09-204defddf592", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:09:01.622317Z", - "start_time": "2024-10-05T21:09:01.606698Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:11:51.296162Z", - "start_time": "2024-10-05T21:11:44.559304Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (60424, 1035)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsidssequenceclass_46456class_48724class_51349class_53931class_56572class_56835class_56992...species_187294species_56257species_186882species_56690species_161316species_57962species_58067species_267696species_311502species_311501
01[d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ...AAAAAAAAAAFalseTrueFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
12[d7dxhc_]AAAAAAAAAAAAAAAAAAAAAAAFalseFalseFalseFalseFalseTrueFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
23[d1gkub1, d1gkub2, d1gkub3, d1gkub4]AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF...FalseFalseTrueFalseTrueFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
34[d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3]AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV...FalseFalseFalseTrueFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
45[d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2]AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK...FalseFalseTrueFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", - "

5 rows × 1035 columns

\n", - "
" - ], - "text/plain": [ - " id sids \\\n", - "0 1 [d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ... \n", - "1 2 [d7dxhc_] \n", - "2 3 [d1gkub1, d1gkub2, d1gkub3, d1gkub4] \n", - "3 4 [d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3] \n", - "4 5 [d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2] \n", - "\n", - " sequence class_46456 \\\n", - "0 AAAAAAAAAA False \n", - "1 AAAAAAAAAAAAAAAAAAAAAAA False \n", - "2 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF... False \n", - "3 AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV... False \n", - "4 AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK... False \n", - "\n", - " class_48724 class_51349 class_53931 class_56572 class_56835 \\\n", - "0 True False False False False \n", - "1 False False False False True \n", - "2 False True False True False \n", - "3 False False True False False \n", - "4 False True False False False \n", - "\n", - " class_56992 ... species_187294 species_56257 species_186882 \\\n", - "0 False ... False False False \n", - "1 False ... False False False \n", - "2 False ... False False False \n", - "3 False ... False False False \n", - "4 False ... 
False False False \n", - "\n", - " species_56690 species_161316 species_57962 species_58067 \\\n", - "0 False False False False \n", - "1 False False False False \n", - "2 False False False False \n", - "3 False False False False \n", - "4 False False False False \n", - "\n", - " species_267696 species_311502 species_311501 \n", - "0 False False False \n", - "1 False False False \n", - "2 False False True \n", - "3 False False True \n", - "4 False False True \n", - "\n", - "[5 rows x 1035 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(\n", - " pd.read_pickle(\n", - " os.path.join(\n", - " scope_class.processed_dir_main,\n", - " scope_class.processed_main_file_names_dict[\"data\"],\n", - " )\n", - " )\n", - ")\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", - "metadata": {}, - "source": [ - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` has the following structure: \n", - "- **Column 0**: Contains the ID of each data instance.\n", - "- **Column 1**: Contains the `sids` which are associated with the corresponding protein-chain sequence.\n", - "- **Column 2**: Contains the protein-chain sequence.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ] - }, - { - "cell_type": "markdown", - "id": "ba019d2d4324bd0b", - "metadata": {}, - "source": [ - "## data.pt File\n", - "\n", - "\n", - "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries.
Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:12:49.338943Z", - "start_time": "2024-10-05T21:12:49.323319Z" - } - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:14:12.892845Z", - "start_time": "2024-10-05T21:13:59.859953Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n" - ] - } - ], - "source": [ - "data_pt = torch.load(\n", - " os.path.join(\n", - " scope_class.processed_dir, scope_class.processed_file_names_dict[\"data\"]\n", - " ),\n", - " weights_only=False,\n", - ")\n", - "print(\"Type of loaded data:\", type(data_pt))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:14:21.185027Z", - "start_time": "2024-10-05T21:14:21.169358Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'features': [14, 14, 14, 14, 20, 15, 15, 28, 15, 18, 25, 17, 18, 11, 25, 21, 27, 19, 14, 27, 19, 13, 14, 17, 16, 21, 25, 22, 27, 28, 12, 10, 20, 19, 13, 13, 14, 28, 17, 20, 20, 12, 19, 11, 17, 15, 27, 28, 15, 12, 17, 14, 23, 11, 19, 27, 14, 26, 19, 11, 11, 19, 12, 19, 19, 28, 17, 16, 20, 16, 19, 21, 10, 16, 18, 12, 17, 19, 10, 29, 12, 12, 21, 20, 16, 17, 19, 28, 20, 21, 12, 16, 18, 21, 19, 14, 19, 17, 12, 14, 18, 28, 23, 15, 28, 19, 19, 19, 15, 25, 17, 22, 25, 19, 28, 16, 13, 27, 13, 11, 20, 15, 28, 12, 15, 28, 27, 13, 13, 13, 28, 19, 14, 15, 28, 12, 18, 14, 20, 28, 14, 18, 15, 19, 13, 22, 28, 29, 12, 12, 20, 29, 28, 17, 13, 28, 23, 22, 15, 15, 
28, 17, 13, 21, 17, 27, 11, 20, 23, 10, 10, 11, 20, 15, 22, 21, 10, 13, 21, 25, 11, 29, 25, 19, 20, 18, 17, 19, 19, 15, 18, 16, 16, 25, 15, 22, 25, 28, 23, 16, 20, 21, 13, 26, 18, 21, 15, 27, 17, 20, 22, 23, 11, 14, 29, 21, 21, 17, 25, 10, 14, 20, 25, 11, 22, 29, 11, 21, 11, 12, 17, 27, 16, 29, 17, 14, 12, 11, 20, 21, 27, 22, 15, 10, 21, 20, 17, 28, 21, 25, 11, 18, 27, 11, 13, 11, 28, 12, 17, 23, 15, 25, 16, 20, 11, 17, 11, 12, 16, 28, 27, 27, 27, 14, 13, 16, 22, 28, 12, 12, 26, 19, 22, 21, 21, 12, 19, 28, 22, 16, 23, 20, 28, 27, 24, 15, 19, 13, 12, 12, 29, 28, 12, 20, 22, 23, 17, 17, 27, 27, 21, 20, 28, 28, 28, 14, 13, 13, 11, 14, 14, 14, 14, 14], 'labels': array([False, True, False, ..., False, False, False]), 'ident': 6, 'group': None}\n" - ] - } - ], - "source": [ - "for i in range(5, 6):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", - "metadata": {}, - "source": [ - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", - "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", - "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. 
These identifiers help track and reference the individual samples in the dataset.\n" - ] - }, - { - "cell_type": "markdown", - "id": "186ec6f0eed6ecf7", - "metadata": {}, - "source": [ - "## classes.txt File\n", - "\n", - "**Description**: A file containing the list of selected SCOPe **labels** based on the specified threshold. This file is crucial for ensuring that only relevant **labels** are included in the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:15:19.146285Z", - "start_time": "2024-10-05T21:15:18.503284Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class_48724\n", - "class_53931\n", - "class_310555\n", - "fold_48725\n", - "fold_56111\n", - "fold_56234\n", - "fold_310573\n", - "superfamily_48726\n", - "superfamily_56112\n", - "superfamily_56235\n", - "superfamily_310607\n", - "family_48942\n", - "family_56251\n", - "family_191359\n", - "family_191470\n" - ] - } - ], - "source": [ - "with open(os.path.join(scope_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n", - " for i in range(15):\n", - " line = file.readline()\n", - " print(line.strip())" - ] - }, - { - "cell_type": "markdown", - "id": "861da1c3-0401-49f0-a22f-109814ed95d5", - "metadata": {}, - "source": [ - "\n", - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/classes.txt`\n", - "\n", - "The `classes.txt` file lists selected SCOPe classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. 
Each line in the file corresponds to a unique SCOPe class ID, identifying a specific class within the SCOPe ontology along with the hierarchy level.\n", - "\n", - "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" - ] - }, - { - "cell_type": "markdown", - "id": "fb72be449e52b63f", - "metadata": {}, - "source": [ - "## splits.csv File\n", - "\n", - "**Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:15:54.575116Z", - "start_time": "2024-10-05T21:15:53.945139Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
01train
13train
24train
36train
49train
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 1 train\n", - "1 3 train\n", - "2 4 train\n", - "3 6 train\n", - "4 9 train" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(os.path.join(scope_class.processed_dir_main, \"splits.csv\"))\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", - "metadata": {}, - "source": [ - "\n", - "\n", - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "6dc3fd6c-7cf6-47ef-812f-54319a0cdeb9", - "metadata": {}, - "outputs": [], - "source": [ - "# You can specify a literal path for the `splits_file_path`, or if another `scope_class` instance is already defined,\n", - "# you can use its existing `splits_file_path` attribute for consistency.\n", - "scope_class_with_splits = SCOPeOver2000(\n", - " scope_version=\"2.08\",\n", - " # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\", # Literal path option\n", - " splits_file_path=scope_class.splits_file_path, # Use path from an existing `chebi_class` instance\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", - "metadata": {}, - "source": [ - "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", - "\n", 
- "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein-Chain: PDB ID:**1cph** Chain ID:**B** mol:protein length:30 INSULIN (PH 10)\n", - "
Refer - [1cph_B](https://www.rcsb.org/sequence/1CPH)\n", - "\n", - "- **Sequence**: `FVNQHLCGSHLVEALYLVCGERGFFYTPKA`\n", - "- **Sequence Length**: 30\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### Tokenization and Encoding\n", - "\n", - "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n", - "\n", - "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "da47d47e-4560-46af-b246-235596f27d82", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.reader import ProteinDataReader" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8bdbf309-29ec-4aab-a6dc-9e09bc6961a2", - "metadata": {}, - "outputs": [], - "source": [ - "protein_dr_3gram = ProteinDataReader(n_gram=3)\n", - "protein_dr = ProteinDataReader()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "68e5c87c-79c3-4d5f-91e6-635399a84d3d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[25, 28, 19, 18, 29, 17, 24, 13, 11, 29, 17, 28, 27, 14, 17, 22, 17, 28, 24, 13, 27, 16, 13, 25, 25, 22, 15, 23, 21, 14]\n", - "[5023, 2218, 3799, 2290, 6139, 2208, 6917, 4674, 484, 439, 2737, 851, 365, 2624, 3240, 4655, 1904, 3737, 1453, 2659, 5160, 3027, 2355, 7163, 4328, 3115, 6207, 1234]\n" - ] - } - ], - "source": [ - "protein = \"FVNQHLCGSHLVEALYLVCGERGFFYTPKA\"\n", - 
"print(protein_dr._read_data(protein))\n", - "print(protein_dr_3gram._read_data(protein))" - ] - }, - { - "cell_type": "markdown", - "id": "5b7211ee-2ccc-46d3-8e8f-790f344726ba", - "metadata": {}, - "source": [ - "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n", - "\n", - "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing." - ] - }, - { - "cell_type": "markdown", - "id": "93e328cf-09f9-4694-b175-28320590937d", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}