
Commit 0894ccf

drafting a method for any position-specific embedder to accept aligned sequences
1 parent 9829702 commit 0894ccf

4 files changed

Lines changed: 272 additions & 54 deletions
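
In practice, this change lets any embedder that mixes in PositionSpecificMixin accept gapped, aligned input directly: gaps are stripped before the model runs, and the results are remapped back to the requested alignment columns, with gap positions filled by gap_fill_value. A minimal usage sketch, assuming ProteinSequences can be constructed from a list of gapped strings and that the wrapper exposes the usual scikit-learn style fit/transform (neither is shown in this diff):

# Illustrative only; class and parameter names come from the diff below,
# but the ProteinSequences constructor and fit/transform calls are assumptions.
from aide_predict.utils.data_structures import ProteinSequences
from aide_predict.bespoke_models.embedders.aa_properties import AAPropertiesEmbedding

seqs = ProteinSequences(["MK-VL", "M-AVL"])            # aligned sequences with gaps (assumed constructor)
embedder = AAPropertiesEmbedding(positions=[0, 2, 4],  # alignment columns to embed
                                 pool=False, flatten=True,
                                 handle_aligned=True, gap_fill_value=0.0)
emb = embedder.fit(seqs).transform(seqs)
# Alignment columns that are gaps in a given sequence come back as gap_fill_value
# instead of raising, so every row has the same width.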


aide_predict/bespoke_models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@
 from .embedders.saprot import SaProtEmbedding
 from .embedders.kmer import KmerEmbedding
 from .embedders.ssemb import SSEmbEmbedding
+from .embedders.aa_properties import AAPropertiesEmbedding


 TOOLS = [
@@ -40,4 +41,5 @@
     SaProtEmbedding,
     KmerEmbedding,
     SSEmbEmbedding,
+    AAPropertiesEmbedding,
 ]

aide_predict/bespoke_models/base.py

Lines changed: 81 additions & 2 deletions
@@ -898,7 +898,8 @@ class PositionSpecificMixin:
 
     This mixin adds functionality for handling position-specific outputs from protein models.
     It allows selecting specific positions to analyze, pooling across positions, and
-    flattening multi-dimensional outputs.
+    flattening multi-dimensional outputs. It can also automatically handle aligned sequences
+    with gaps by stripping gaps before processing and remapping embeddings back to aligned positions.
 
     Attributes:
         positions (Optional[List[int]]): The positions to output scores for. If None, all positions are used.
@@ -908,14 +909,21 @@ class PositionSpecificMixin:
             - If callable: Uses the provided function for pooling
             - If False: No pooling is performed
         flatten (bool): Whether to flatten dimensions beyond the second dimension.
+        handle_aligned (bool): If True, automatically strip gaps before processing and remap to aligned positions.
+        gap_fill_value (float): Value to use for gap positions in aligned sequences (default 0.0).
     """
     _per_position_capable: bool = True
 
-    def _init_handler(self, positions=None, pool=True, flatten=True, **kwargs):
+    def _init_handler(self, positions=None, pool=True, flatten=True,
+                      handle_aligned=True, gap_fill_value=0.0, **kwargs):
         """Initialize position-specific attributes from kwargs."""
         self.positions = positions
         self.pool = pool
         self.flatten = flatten
+        self.handle_aligned = handle_aligned
+        self.gap_fill_value = gap_fill_value
+        # Temporary storage for alignment mapping during transform
+        self._alignment_mapping = None
 
     def _is_ragged_array(self, arr):
         """Check if the input is a ragged array (list of arrays with different shapes)."""
@@ -934,9 +942,71 @@ def _pre_transform_hook(self, X):
         if self.positions is not None:
             if not (X.aligned or len(X) == 1):
                 raise ValueError("Input sequences must be same length / aligned for position-specific output.")
+
+        # Handle aligned sequences if enabled
+        if self.handle_aligned and X.has_gaps:
+            # Store mapping in instance for post-transform hook
+            self._alignment_mapping = X.get_alignment_mapping()
+            # Convert mapping to have integer keys ascending from 0
+            self._alignment_mapping = {i: m for i, m in enumerate(self._alignment_mapping.values())}
+            X = X.with_no_gaps()
+
+            # Validate behavior is well-defined
+            if self.positions is None and not self.pool:
+                raise ValueError(
+                    "Cannot return position-specific embeddings for sequences with gaps "
+                    "unless positions are specified or pooling is enabled."
+                )
+        else:
+            self._alignment_mapping = None
 
         return X
 
+    def _remap_to_aligned_positions(self, result, mapping, positions, fill_value):
+        """
+        Remap embeddings from ungapped sequences back to aligned positions.
+
+        Args:
+            result: List of embeddings for ungapped sequences
+            mapping: Dict mapping sequence index to list of aligned positions
+            positions: List of aligned positions to extract
+            fill_value: Value to use for gap positions
+
+        Returns:
+            List of remapped embeddings with gaps represented by fill_value
+        """
+        aligned_embeddings = []
+        for i, emb in enumerate(result):
+            seq_mapping = mapping[i]
+            # emb shape: (1, seq_len, embedding_dim), (seq_len, embedding_dim), or (seq_len,) for 1D
+            # Remove batch dimension if present
+            if emb.ndim == 3 and emb.shape[0] == 1:
+                emb = emb[0]  # Now (seq_len, embedding_dim)
+
+            if emb.ndim == 1:
+                # Handle 1D case (e.g., single position or pooled)
+                emb = np.expand_dims(emb, 0)
+                squeeze_after = True
+            else:
+                squeeze_after = False
+
+            # Now emb is (seq_len, embedding_dim)
+            seq_len = emb.shape[0]
+            embedding_dim = emb.shape[-1] if emb.ndim > 1 else 1
+            aligned_emb = np.full((len(positions), embedding_dim), fill_value, dtype=emb.dtype)
+
+            for j, pos in enumerate(positions):
+                if pos in seq_mapping:
+                    aligned_pos = seq_mapping.index(pos)
+                    if aligned_pos < seq_len:
+                        aligned_emb[j] = emb[aligned_pos]
+
+            if squeeze_after:
+                aligned_emb = np.squeeze(aligned_emb, axis=-1)
+
+            aligned_embeddings.append(aligned_emb)
+        return aligned_embeddings
+
     def _post_transform_hook(self, result, X):
         """
         Process the model output to handle position selection, pooling, and flattening.
@@ -950,6 +1020,15 @@ def _post_transform_hook(self, result, X):
         """
         if result is None or len(result) == 0:
             return result
+
+        # Remap to aligned positions if we have a mapping and positions were specified
+        if self._alignment_mapping is not None and self.positions is not None:
+            result = self._remap_to_aligned_positions(
+                result, self._alignment_mapping, self.positions, self.gap_fill_value
+            )
+            # Clean up temporary storage
+            self._alignment_mapping = None
+
         if self.pool:
             # get the pool function
             if isinstance(self.pool, str):
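
To make the remapping step above concrete, here is a minimal, self-contained numpy sketch of what _remap_to_aligned_positions does for a single sequence: the mapping lists the alignment column of each ungapped residue, and requested columns missing from the sequence are filled with gap_fill_value. The values below are illustrative, not taken from the library.

import numpy as np

seq_mapping = [0, 2, 3]                 # alignment column of each ungapped residue (e.g. "M-KV")
emb = np.arange(9.0).reshape(3, 3)      # (ungapped_len, embedding_dim) embedding for that sequence
positions = [0, 1, 3]                   # alignment columns requested by the user
gap_fill_value = 0.0

aligned_emb = np.full((len(positions), emb.shape[1]), gap_fill_value, dtype=emb.dtype)
for j, pos in enumerate(positions):
    if pos in seq_mapping:              # this column exists in the ungapped sequence
        aligned_emb[j] = emb[seq_mapping.index(pos)]

print(aligned_emb)
# [[0. 1. 2.]   <- column 0
#  [0. 0. 0.]   <- column 1 is a gap, filled with gap_fill_value
#  [6. 7. 8.]]  <- column 3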
aide_predict/bespoke_models/embedders/aa_properties.py

Lines changed: 184 additions & 0 deletions

@@ -0,0 +1,184 @@
+# aide_predict/bespoke_models/embedders/aa_properties.py
+'''
+* Author: Evan Komp
+* Created: 11/24/2024
+* Company: National Renewable Energy Lab, Bioenergy Science and Technology
+* License: MIT
+
+Simple amino acid property embedder for testing position-specific functionality.
+'''
+import numpy as np
+from typing import List, Union, Optional
+
+from aide_predict.bespoke_models.base import (
+    ProteinModelWrapper,
+    PositionSpecificMixin,
+    CanHandleAlignedSequencesMixin,
+    ExpectsNoFitMixin
+)
+from aide_predict.utils.data_structures import ProteinSequences, ProteinSequence
+from aide_predict.utils.common import MessageBool
+
+import logging
+logger = logging.getLogger(__name__)
+
+AVAILABLE = MessageBool(True, "AAPropertiesEmbedding is always available")
+
+
+# Simple physicochemical properties for the 20 standard amino acids
+AA_PROPERTIES = {
+    'A': [1.8, 0.0, 0.0],    # Alanine: hydrophobicity, charge, size
+    'C': [2.5, 0.0, 0.0],    # Cysteine
+    'D': [-3.5, -1.0, 0.0],  # Aspartic acid
+    'E': [-3.5, -1.0, 0.5],  # Glutamic acid
+    'F': [2.8, 0.0, 1.0],    # Phenylalanine
+    'G': [-0.4, 0.0, -1.0],  # Glycine
+    'H': [-3.2, 0.5, 0.5],   # Histidine
+    'I': [4.5, 0.0, 0.5],    # Isoleucine
+    'K': [-3.9, 1.0, 0.5],   # Lysine
+    'L': [3.8, 0.0, 0.5],    # Leucine
+    'M': [1.9, 0.0, 0.5],    # Methionine
+    'N': [-3.5, 0.0, 0.0],   # Asparagine
+    'P': [-1.6, 0.0, 0.0],   # Proline
+    'Q': [-3.5, 0.0, 0.5],   # Glutamine
+    'R': [-4.5, 1.0, 1.0],   # Arginine
+    'S': [-0.8, 0.0, -0.5],  # Serine
+    'T': [-0.7, 0.0, 0.0],   # Threonine
+    'V': [4.2, 0.0, 0.0],    # Valine
+    'W': [-0.9, 0.0, 1.5],   # Tryptophan
+    'Y': [-1.3, 0.0, 1.0],   # Tyrosine
+}
+
+
+class AAPropertiesEmbedding(
+    ExpectsNoFitMixin,
+    PositionSpecificMixin,
+    CanHandleAlignedSequencesMixin,
+    ProteinModelWrapper
+):
+    """
+    A simple amino acid property embedder for testing position-specific functionality.
+
+    This embedder converts each amino acid to a 3-dimensional vector based on:
+    - Hydrophobicity (Kyte-Doolittle scale approximation)
+    - Charge (at physiological pH)
+    - Size (relative volume)
+
+    This is a simple, fast embedder that can handle aligned sequences with gaps
+    and is useful for testing the PositionSpecificMixin functionality.
+
+    Attributes:
+        positions (Optional[List[int]]): Specific positions to encode. If None, all positions are encoded.
+        pool (bool): Whether to pool the encoded vectors across positions.
+        flatten (bool): Whether to flatten the output array.
+        handle_aligned (bool): Whether to handle aligned sequences with gaps.
+        gap_fill_value (float): Value to use for gap positions.
+    """
+
+    _available = AVAILABLE
+
+    def __init__(
+        self,
+        metadata_folder: str = None,
+        positions: Optional[List[int]] = None,
+        flatten: bool = False,
+        pool: bool = False,
+        handle_aligned: bool = True,
+        gap_fill_value: float = 0.0,
+        wt: Optional[Union[str, ProteinSequence]] = None,
+        **kwargs
+    ):
+        """
+        Initialize the AAPropertiesEmbedding.
+
+        Args:
+            metadata_folder (str): The folder where metadata is stored.
+            positions (Optional[List[int]]): Specific positions to encode. If None, all positions are encoded.
+            flatten (bool): Whether to flatten the output array.
+            pool (bool): Whether to pool the encoded vectors across positions.
+            handle_aligned (bool): Whether to handle aligned sequences with gaps.
+            gap_fill_value (float): Value to use for gap positions.
+            wt (Optional[Union[str, ProteinSequence]]): The wild type sequence, if any.
+        """
+        super().__init__(
+            metadata_folder=metadata_folder,
+            wt=wt,
+            positions=positions,
+            pool=pool,
+            flatten=flatten,
+            handle_aligned=handle_aligned,
+            gap_fill_value=gap_fill_value,
+            **kwargs
+        )
+        self.embedding_dim_ = 3  # 3 properties per amino acid
+
+    def _fit(self, X: ProteinSequences, y: Optional[np.ndarray] = None) -> 'AAPropertiesEmbedding':
+        """
+        Fit the embedder (no actual fitting needed as properties are predefined).
+
+        Args:
+            X (ProteinSequences): The input protein sequences.
+            y (Optional[np.ndarray]): Ignored. Present for API consistency.
+
+        Returns:
+            AAPropertiesEmbedding: The fitted embedder.
+        """
+        self.fitted_ = True
+        return self
+
+    def _transform(self, X: ProteinSequences) -> List[np.ndarray]:
+        """
+        Transform the protein sequences into amino acid property embeddings.
+
+        Args:
+            X (ProteinSequences): The input protein sequences.
+
+        Returns:
+            List[np.ndarray]: The amino acid property embeddings for the sequences.
+        """
+        all_embeddings = []
+
+        for seq in X:
+            seq_str = str(seq).upper()
+            seq_len = len(seq_str)
+
+            # Create embedding matrix: (1, seq_len, 3)
+            embedding = np.zeros((1, seq_len, 3), dtype=np.float32)
+
+            for i, aa in enumerate(seq_str):
+                if aa in AA_PROPERTIES:
+                    embedding[0, i, :] = AA_PROPERTIES[aa]
+                else:
+                    # Unknown amino acid - use zeros
+                    logger.warning(f"Unknown amino acid '{aa}' in sequence {seq.id}, using zeros")
+                    embedding[0, i, :] = [0.0, 0.0, 0.0]
+
+            all_embeddings.append(embedding)
+
+        # Return as list - PositionSpecificMixin will handle position selection, pooling, and alignment remapping
+        return all_embeddings
+
+    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]:
+        """
+        Get output feature names for transformation.
+
+        Args:
+            input_features (Optional[List[str]]): Ignored. Present for API consistency.
+
+        Returns:
+            List[str]: Output feature names.
+        """
+        if not hasattr(self, 'fitted_'):
+            raise ValueError("Model has not been fitted yet. Call fit() before using this method.")
+
+        positions = self.positions
+        property_names = ['hydrophobicity', 'charge', 'size']
+
+        if self.pool:
+            return [f"AAProps_{prop}" for prop in property_names]
+        elif self.flatten:
+            if positions is None:
+                raise ValueError("Cannot return feature names for flattened embeddings without specifying positions")
+            return [f"pos{p}_{prop}" for p in positions for prop in property_names]
+        else:
+            raise ValueError("Cannot return feature names for non-flattened non-pooled embeddings.")

aide_predict/bespoke_models/embedders/esm2.py

Lines changed: 5 additions & 52 deletions
@@ -149,20 +149,9 @@ def _transform(self, X: ProteinSequences) -> np.ndarray:
             raise ValueError("Cannot flatten variable length sequences without positions or pooling.")
         warnings.warn("Variable length sequences are being processed without positions or pooling, raw shapes will be output.")
 
-        mapping = None
-        if X.has_gaps:
-            # here we need to store a mapping such that if positions were specified we can map back to
-            # the aligned positions
-            mapping = X.get_alignment_mapping()
-            # convert mapping to have integer keys ascending from 0
-            mapping = {i: m for i, m in enumerate(mapping.values())}
-
-            X = X.with_no_gaps()
-            # raise if positions were not passed - here behavior is uncertain
-            if self.positions is None and not self.pool:
-                raise ValueError("Cannot return position-specific embeddings for sequences with gaps unless positions are specified or pooling is on.")
-
-        base_index = 0
+        # Note: gap handling is now managed by PositionSpecificMixin hooks
+        # X will arrive here without gaps if handle_aligned=True
+
         bar = tqdm.tqdm(total=len(X), desc="Computing ESM2 embeddings")
         for batch in X.iter_batches(self.batch_size):
             batch_sequences = self._prepare_sequences(batch)
@@ -181,49 +170,13 @@ def _transform(self, X: ProteinSequences) -> np.ndarray:
             # Remove special tokens (assuming first and last tokens are special)
             embeddings = [emb[1:-1] for emb in embeddings]
 
-            if self.positions is not None and mapping is None:
-                # here we have fixed length so we can just use positions
-                embeddings = [emb[self.positions] for emb in embeddings]
-            elif self.positions is not None and mapping is not None:
-                # here we have variable length sequences that were input as an aligned set,
-                # the user asked for positions in the alignment
-                aligned_embeddings = []
-                for i, emb in enumerate(embeddings):
-                    seq_mapping = mapping[base_index + i]
-                    aligned_emb = np.zeros((len(self.positions), emb.shape[1]))
-                    for j, pos in enumerate(self.positions):
-                        if pos in seq_mapping:
-                            aligned_pos = seq_mapping.index(pos)
-                            aligned_emb[j] = emb[aligned_pos]
-                        # If pos is not in seq_mapping, it remains a zero vector
-                    aligned_embeddings.append(aligned_emb)
-                embeddings = aligned_embeddings
-            else:
-                # Here positions were not specified and either have fixed length or pooling
-                # is on
-                pass
-
-            if self.pool:
-                if self.pool == 'mean' or self.pool is True:
-                    embeddings = [emb.mean(axis=0) for emb in embeddings]
-                elif self.pool == 'max':
-                    embeddings = [emb.max(axis=0) for emb in embeddings]
-                elif hasattr(np, self.pool):
-                    # check if the pool is a numpy function
-                    pool_func = getattr(np, self.pool)
-                    embeddings = [pool_func(emb, axis=0) for emb in embeddings]
-                else:
-                    raise ValueError(f"Invalid pooling method: {self.pool}")
-
-            # add 0th dimension
+            # Add 0th dimension for stacking
             embeddings = [np.expand_dims(emb, 0) for emb in embeddings]
             all_embeddings.extend(embeddings)
 
-            base_index += len(batch)
-
             bar.update(len(batch))
 
-        # stack along 0 dimension
+        # Return as list - PositionSpecificMixin will handle position selection, pooling, and alignment remapping
         return all_embeddings
 
     def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]:
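
The deletions above only work if the wrapper invokes the mixin hooks around _transform. That call order is not part of this diff, so the following is just a sketch of the assumed flow, not ProteinModelWrapper's actual implementation:

# Assumed orchestration (hook names are from base.py; the wrapper method itself is not shown here).
def transform_sketch(model, X):
    X = model._pre_transform_hook(X)               # strips gaps, records model._alignment_mapping
    result = model._transform(X)                   # model-specific embedding of ungapped sequences
    return model._post_transform_hook(result, X)   # remaps to aligned positions, pools, flattens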
