55* Company: National Renewable Energy Lab, Bioeneergy Science and Technology
66* License: MIT
77
8- Simple amino acid property embedder for testing position-specific functionality .
8+ Amino acid property embedder using AAindex database for physicochemical properties .
99'''
1010import numpy as np
11- from typing import List , Union , Optional
11+ from typing import List , Union , Optional , Dict , Tuple
12+ from aaindex import aaindex1
1213
1314from aide_predict .bespoke_models .base import (
1415 ProteinModelWrapper ,
2223import logging
2324logger = logging .getLogger (__name__ )
2425
25- AVAILABLE = MessageBool (True , "AAPropertiesEmbedding is always available" )
26-
27-
28- # Simple physicochemical properties for the 20 standard amino acids
29- AA_PROPERTIES = {
30- 'A' : [1.8 , 0.0 , 0.0 ], # Alanine: hydrophobicity, charge, size
31- 'C' : [2.5 , 0.0 , 0.0 ], # Cysteine
32- 'D' : [- 3.5 , - 1.0 , 0.0 ], # Aspartic acid
33- 'E' : [- 3.5 , - 1.0 , 0.5 ], # Glutamic acid
34- 'F' : [2.8 , 0.0 , 1.0 ], # Phenylalanine
35- 'G' : [- 0.4 , 0.0 , - 1.0 ], # Glycine
36- 'H' : [- 3.2 , 0.5 , 0.5 ], # Histidine
37- 'I' : [4.5 , 0.0 , 0.5 ], # Isoleucine
38- 'K' : [- 3.9 , 1.0 , 0.5 ], # Lysine
39- 'L' : [3.8 , 0.0 , 0.5 ], # Leucine
40- 'M' : [1.9 , 0.0 , 0.5 ], # Methionine
41- 'N' : [- 3.5 , 0.0 , 0.0 ], # Asparagine
42- 'P' : [- 1.6 , 0.0 , 0.0 ], # Proline
43- 'Q' : [- 3.5 , 0.0 , 0.5 ], # Glutamine
44- 'R' : [- 4.5 , 1.0 , 1.0 ], # Arginine
45- 'S' : [- 0.8 , 0.0 , - 0.5 ], # Serine
46- 'T' : [- 0.7 , 0.0 , 0.0 ], # Threonine
47- 'V' : [4.2 , 0.0 , 0.0 ], # Valine
48- 'W' : [- 0.9 , 0.0 , 1.5 ], # Tryptophan
49- 'Y' : [- 1.3 , 0.0 , 1.0 ], # Tyrosine
50- }
26+ # Default AAindex indices to use for each property
27+ DEFAULT_AAINDEX_PROPERTIES = [
28+ ('KYTJ820101' , 'hydrophobicity' ), # Kyte-Doolittle hydropathy index
29+ ('KLEP840101' , 'charge' ), # Net charge
30+ ('FASG760104' , 'pKa_amino_terminus' ), # pKa N-terminus
31+ ('FASG760105' , 'pKa_carboxyl_terminus' ), # pKa C-terminus
32+ ('GRAR740103' , 'volume' ), # Volume
33+ ('GRAR740102' , 'polarity' ), # Polarity
34+ ('FASG760101' , 'molecular_weight' ), # Molecular weight
35+ ('ZIMJ680104' , 'isoelectric_point' ), # Isoelectric point
36+ ('ZIMJ680102' , 'bulkiness' ), # Bulkiness (related to aromaticity/structure)
37+ ]
38+
39+
40+ def _check_aaindex_available () -> MessageBool :
41+ """Check if AAindex is available and can be accessed."""
42+ try :
43+ from aaindex import aaindex1
44+ # Try to access a known index to verify it works
45+ _ = aaindex1 ['KYTJ820101' ]
46+ return MessageBool (True , "AAindex is available" )
47+ except Exception as e :
48+ return MessageBool (False , f"AAindex initialization failed: { str (e )} " )
49+
50+
51+ AVAILABLE = _check_aaindex_available ()
52+
53+
54+ def _build_aa_property_lookup (
55+ aaindex_ids : List [Tuple [str , str ]],
56+ include_aromatic : bool = True
57+ ) -> Tuple [Dict [str , np .ndarray ], List [str ]]:
58+ """
59+ Build a lookup table for amino acid properties from AAindex.
60+
61+ Args:
62+ aaindex_ids: List of tuples (aaindex_id, property_name)
63+ include_aromatic: Whether to include a boolean aromatic property
64+
65+ Returns:
66+ Tuple of (lookup_dict, property_names) where:
67+ - lookup_dict maps amino acid letter to property vector
68+ - property_names is the list of property names in order
69+ """
70+ # Standard amino acids
71+ amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
72+ aromatic_aas = {'F' , 'W' , 'Y' } # Phenylalanine, Tryptophan, Tyrosine
73+
74+ property_names = [name for _ , name in aaindex_ids ]
75+ if include_aromatic :
76+ property_names .append ('aromatic' )
77+
78+ # Build lookup table
79+ lookup = {}
80+ for aa in amino_acids :
81+ properties = []
82+ for aaindex_id , _ in aaindex_ids :
83+ try :
84+ record = aaindex1 [aaindex_id ]
85+ value = record .values .get (aa )
86+ if value is None :
87+ logger .warning (f"No value for { aa } in { aaindex_id } , using 0.0" )
88+ value = 0.0
89+ properties .append (float (value ))
90+ except Exception as e :
91+ logger .warning (f"Error getting { aaindex_id } for { aa } : { e } , using 0.0" )
92+ properties .append (0.0 )
93+
94+ # Add aromatic boolean as 1.0 or 0.0
95+ if include_aromatic :
96+ properties .append (1.0 if aa in aromatic_aas else 0.0 )
97+
98+ lookup [aa ] = np .array (properties , dtype = np .float32 )
99+
100+ return lookup , property_names
51101
52102
53103class AAPropertiesEmbedding (
@@ -57,22 +107,33 @@ class AAPropertiesEmbedding(
57107 ProteinModelWrapper
58108):
59109 """
60- A simple amino acid property embedder for testing position-specific functionality .
110+ An amino acid property embedder using AAindex database .
61111
62- This embedder converts each amino acid to a 3-dimensional vector based on:
63- - Hydrophobicity (Kyte-Doolittle scale approximation)
64- - Charge (at physiological pH)
65- - Size (relative volume)
112+ This embedder converts each amino acid to a vector of physicochemical properties
113+ from the AAindex database. By default, it uses:
114+ - Hydrophobicity (Kyte-Doolittle scale)
115+ - Net charge
116+ - pKa of amino terminus
117+ - pKa of carboxyl terminus
118+ - Volume
119+ - Polarity
120+ - Molecular weight
121+ - Isoelectric point
122+ - Bulkiness
123+ - Aromatic (boolean: 1.0 for F/W/Y, 0.0 otherwise)
66124
67- This is a simple, fast embedder that can handle aligned sequences with gaps
68- and is useful for testing the PositionSpecificMixin functionality.
125+ Custom properties can be specified by providing a list of (aaindex_id, property_name) tuples.
69126
70127 Attributes:
71128 positions (Optional[List[int]]): Specific positions to encode. If None, all positions are encoded.
72129 pool (bool): Whether to pool the encoded vectors across positions.
73130 flatten (bool): Whether to flatten the output array.
74131 handle_aligned (bool): Whether to handle aligned sequences with gaps.
75132 gap_fill_value (float): Value to use for gap positions.
133+ aaindex_properties (List[Tuple[str, str]]): List of (aaindex_id, property_name) tuples.
134+ include_aromatic (bool): Whether to include aromatic boolean property.
135+ aa_property_lookup_ (Dict[str, np.ndarray]): Lookup table for amino acid properties.
136+ property_names_ (List[str]): Names of the properties in order.
76137 """
77138
78139 _available = AVAILABLE
@@ -86,6 +147,8 @@ def __init__(
86147 handle_aligned : bool = True ,
87148 gap_fill_value : float = 0.0 ,
88149 wt : Optional [Union [str , ProteinSequence ]] = None ,
150+ aaindex_properties : Optional [List [Tuple [str , str ]]] = None ,
151+ include_aromatic : bool = True ,
89152 ** kwargs
90153 ):
91154 """
@@ -99,7 +162,15 @@ def __init__(
99162 handle_aligned (bool): Whether to handle aligned sequences with gaps.
100163 gap_fill_value (float): Value to use for gap positions.
101164 wt (Optional[Union[str, ProteinSequence]]): The wild type sequence, if any.
165+ aaindex_properties (Optional[List[Tuple[str, str]]]): List of (aaindex_id, property_name) tuples.
166+ If None, uses DEFAULT_AAINDEX_PROPERTIES.
167+ include_aromatic (bool): Whether to include a boolean aromatic property (F, W, Y = 1.0, others = 0.0).
102168 """
169+ if aaindex_properties is None :
170+ aaindex_properties = DEFAULT_AAINDEX_PROPERTIES
171+ self .aaindex_properties = aaindex_properties
172+ self .include_aromatic = include_aromatic
173+
103174 super ().__init__ (
104175 metadata_folder = metadata_folder ,
105176 wt = wt ,
@@ -110,11 +181,10 @@ def __init__(
110181 gap_fill_value = gap_fill_value ,
111182 ** kwargs
112183 )
113- self .embedding_dim_ = 3 # 3 properties per amino acid
114184
115185 def _fit (self , X : ProteinSequences , y : Optional [np .ndarray ] = None ) -> 'AAPropertiesEmbedding' :
116186 """
117- Fit the embedder (no actual fitting needed as properties are predefined) .
187+ Fit the embedder by building the amino acid property lookup table .
118188
119189 Args:
120190 X (ProteinSequences): The input protein sequences.
@@ -123,6 +193,13 @@ def _fit(self, X: ProteinSequences, y: Optional[np.ndarray] = None) -> 'AAProper
123193 Returns:
124194 AAPropertiesEmbedding: The fitted embedder.
125195 """
196+ # Build lookup table from AAindex
197+ self .aa_property_lookup_ , self .property_names_ = _build_aa_property_lookup (
198+ self .aaindex_properties ,
199+ include_aromatic = self .include_aromatic
200+ )
201+ self .embedding_dim_ = len (self .property_names_ )
202+
126203 self .fitted_ = True
127204 return self
128205
@@ -142,16 +219,19 @@ def _transform(self, X: ProteinSequences) -> List[np.ndarray]:
142219 seq_str = str (seq ).upper ()
143220 seq_len = len (seq_str )
144221
145- # Create embedding matrix: (seq_len, 3 )
146- embedding = np .zeros ((1 , seq_len , 3 ), dtype = np .float32 )
222+ # Create embedding matrix: (1, seq_len, n_properties )
223+ embedding = np .zeros ((1 , seq_len , self . embedding_dim_ ), dtype = np .float32 )
147224
148225 for i , aa in enumerate (seq_str ):
149- if aa in AA_PROPERTIES :
150- embedding [0 , i , :] = AA_PROPERTIES [aa ]
226+ if aa in self . aa_property_lookup_ :
227+ embedding [0 , i , :] = self . aa_property_lookup_ [aa ]
151228 else :
152- # Unknown amino acid - use zeros
153- logger .warning (f"Unknown amino acid '{ aa } ' in sequence { seq .id } , using zeros" )
154- embedding [0 , i , :] = [0.0 , 0.0 , 0.0 ]
229+ # Unknown amino acid (including gaps) - use zeros or gap_fill_value
230+ if aa == '-' :
231+ embedding [0 , i , :] = self .gap_fill_value
232+ else :
233+ logger .warning (f"Unknown amino acid '{ aa } ' in sequence { seq .id } , using zeros" )
234+ embedding [0 , i , :] = 0.0
155235
156236 all_embeddings .append (embedding )
157237
@@ -168,17 +248,16 @@ def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> L
168248 Returns:
169249 List[str]: Output feature names.
170250 """
171- if not hasattr (self , 'fitted_' ):
251+ if not hasattr (self , 'fitted_' ) or not self . fitted_ :
172252 raise ValueError ("Model has not been fitted yet. Call fit() before using this method." )
173253
174254 positions = self .positions
175- property_names = ['hydrophobicity' , 'charge' , 'size' ]
176255
177256 if self .pool :
178- return [f"AAProps_{ prop } " for prop in property_names ]
257+ return [f"AAProps_{ prop } " for prop in self . property_names_ ]
179258 elif self .flatten :
180259 if positions is None :
181260 raise ValueError ("Cannot return feature names for flattened embeddings without specifying positions" )
182- return [f"pos{ p } _{ prop } " for p in positions for prop in property_names ]
261+ return [f"pos{ p } _{ prop } " for p in positions for prop in self . property_names_ ]
183262 else :
184263 raise ValueError ("Cannot return feature names for non-flattened non-pooled embeddings." )
0 commit comments