Skip to content

Commit 9bbbd64

Browse files
committed
aaproperties now uses aaindex to get properties
1 parent 33b1b71 commit 9bbbd64

2 files changed

Lines changed: 128 additions & 48 deletions

File tree

aide_predict/bespoke_models/embedders/aa_properties.py

Lines changed: 127 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
* Company: National Renewable Energy Lab, Bioeneergy Science and Technology
66
* License: MIT
77
8-
Simple amino acid property embedder for testing position-specific functionality.
8+
Amino acid property embedder using AAindex database for physicochemical properties.
99
'''
1010
import numpy as np
11-
from typing import List, Union, Optional
11+
from typing import List, Union, Optional, Dict, Tuple
12+
from aaindex import aaindex1
1213

1314
from aide_predict.bespoke_models.base import (
1415
ProteinModelWrapper,
@@ -22,32 +23,81 @@
2223
import logging
2324
logger = logging.getLogger(__name__)
2425

25-
AVAILABLE = MessageBool(True, "AAPropertiesEmbedding is always available")
26-
27-
28-
# Simple physicochemical properties for the 20 standard amino acids
29-
AA_PROPERTIES = {
30-
'A': [1.8, 0.0, 0.0], # Alanine: hydrophobicity, charge, size
31-
'C': [2.5, 0.0, 0.0], # Cysteine
32-
'D': [-3.5, -1.0, 0.0], # Aspartic acid
33-
'E': [-3.5, -1.0, 0.5], # Glutamic acid
34-
'F': [2.8, 0.0, 1.0], # Phenylalanine
35-
'G': [-0.4, 0.0, -1.0], # Glycine
36-
'H': [-3.2, 0.5, 0.5], # Histidine
37-
'I': [4.5, 0.0, 0.5], # Isoleucine
38-
'K': [-3.9, 1.0, 0.5], # Lysine
39-
'L': [3.8, 0.0, 0.5], # Leucine
40-
'M': [1.9, 0.0, 0.5], # Methionine
41-
'N': [-3.5, 0.0, 0.0], # Asparagine
42-
'P': [-1.6, 0.0, 0.0], # Proline
43-
'Q': [-3.5, 0.0, 0.5], # Glutamine
44-
'R': [-4.5, 1.0, 1.0], # Arginine
45-
'S': [-0.8, 0.0, -0.5], # Serine
46-
'T': [-0.7, 0.0, 0.0], # Threonine
47-
'V': [4.2, 0.0, 0.0], # Valine
48-
'W': [-0.9, 0.0, 1.5], # Tryptophan
49-
'Y': [-1.3, 0.0, 1.0], # Tyrosine
50-
}
26+
# Default AAindex indices to use for each property
27+
DEFAULT_AAINDEX_PROPERTIES = [
28+
('KYTJ820101', 'hydrophobicity'), # Kyte-Doolittle hydropathy index
29+
('KLEP840101', 'charge'), # Net charge
30+
('FASG760104', 'pKa_amino_terminus'), # pKa N-terminus
31+
('FASG760105', 'pKa_carboxyl_terminus'), # pKa C-terminus
32+
('GRAR740103', 'volume'), # Volume
33+
('GRAR740102', 'polarity'), # Polarity
34+
('FASG760101', 'molecular_weight'), # Molecular weight
35+
('ZIMJ680104', 'isoelectric_point'), # Isoelectric point
36+
('ZIMJ680102', 'bulkiness'), # Bulkiness (related to aromaticity/structure)
37+
]
38+
39+
40+
def _check_aaindex_available() -> MessageBool:
41+
"""Check if AAindex is available and can be accessed."""
42+
try:
43+
from aaindex import aaindex1
44+
# Try to access a known index to verify it works
45+
_ = aaindex1['KYTJ820101']
46+
return MessageBool(True, "AAindex is available")
47+
except Exception as e:
48+
return MessageBool(False, f"AAindex initialization failed: {str(e)}")
49+
50+
51+
AVAILABLE = _check_aaindex_available()
52+
53+
54+
def _build_aa_property_lookup(
55+
aaindex_ids: List[Tuple[str, str]],
56+
include_aromatic: bool = True
57+
) -> Tuple[Dict[str, np.ndarray], List[str]]:
58+
"""
59+
Build a lookup table for amino acid properties from AAindex.
60+
61+
Args:
62+
aaindex_ids: List of tuples (aaindex_id, property_name)
63+
include_aromatic: Whether to include a boolean aromatic property
64+
65+
Returns:
66+
Tuple of (lookup_dict, property_names) where:
67+
- lookup_dict maps amino acid letter to property vector
68+
- property_names is the list of property names in order
69+
"""
70+
# Standard amino acids
71+
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
72+
aromatic_aas = {'F', 'W', 'Y'} # Phenylalanine, Tryptophan, Tyrosine
73+
74+
property_names = [name for _, name in aaindex_ids]
75+
if include_aromatic:
76+
property_names.append('aromatic')
77+
78+
# Build lookup table
79+
lookup = {}
80+
for aa in amino_acids:
81+
properties = []
82+
for aaindex_id, _ in aaindex_ids:
83+
try:
84+
record = aaindex1[aaindex_id]
85+
value = record.values.get(aa)
86+
if value is None:
87+
logger.warning(f"No value for {aa} in {aaindex_id}, using 0.0")
88+
value = 0.0
89+
properties.append(float(value))
90+
except Exception as e:
91+
logger.warning(f"Error getting {aaindex_id} for {aa}: {e}, using 0.0")
92+
properties.append(0.0)
93+
94+
# Add aromatic boolean as 1.0 or 0.0
95+
if include_aromatic:
96+
properties.append(1.0 if aa in aromatic_aas else 0.0)
97+
98+
lookup[aa] = np.array(properties, dtype=np.float32)
99+
100+
return lookup, property_names
51101

52102

53103
class AAPropertiesEmbedding(
@@ -57,22 +107,33 @@ class AAPropertiesEmbedding(
57107
ProteinModelWrapper
58108
):
59109
"""
60-
A simple amino acid property embedder for testing position-specific functionality.
110+
An amino acid property embedder using AAindex database.
61111
62-
This embedder converts each amino acid to a 3-dimensional vector based on:
63-
- Hydrophobicity (Kyte-Doolittle scale approximation)
64-
- Charge (at physiological pH)
65-
- Size (relative volume)
112+
This embedder converts each amino acid to a vector of physicochemical properties
113+
from the AAindex database. By default, it uses:
114+
- Hydrophobicity (Kyte-Doolittle scale)
115+
- Net charge
116+
- pKa of amino terminus
117+
- pKa of carboxyl terminus
118+
- Volume
119+
- Polarity
120+
- Molecular weight
121+
- Isoelectric point
122+
- Bulkiness
123+
- Aromatic (boolean: 1.0 for F/W/Y, 0.0 otherwise)
66124
67-
This is a simple, fast embedder that can handle aligned sequences with gaps
68-
and is useful for testing the PositionSpecificMixin functionality.
125+
Custom properties can be specified by providing a list of (aaindex_id, property_name) tuples.
69126
70127
Attributes:
71128
positions (Optional[List[int]]): Specific positions to encode. If None, all positions are encoded.
72129
pool (bool): Whether to pool the encoded vectors across positions.
73130
flatten (bool): Whether to flatten the output array.
74131
handle_aligned (bool): Whether to handle aligned sequences with gaps.
75132
gap_fill_value (float): Value to use for gap positions.
133+
aaindex_properties (List[Tuple[str, str]]): List of (aaindex_id, property_name) tuples.
134+
include_aromatic (bool): Whether to include aromatic boolean property.
135+
aa_property_lookup_ (Dict[str, np.ndarray]): Lookup table for amino acid properties.
136+
property_names_ (List[str]): Names of the properties in order.
76137
"""
77138

78139
_available = AVAILABLE
@@ -86,6 +147,8 @@ def __init__(
86147
handle_aligned: bool = True,
87148
gap_fill_value: float = 0.0,
88149
wt: Optional[Union[str, ProteinSequence]] = None,
150+
aaindex_properties: Optional[List[Tuple[str, str]]] = None,
151+
include_aromatic: bool = True,
89152
**kwargs
90153
):
91154
"""
@@ -99,7 +162,15 @@ def __init__(
99162
handle_aligned (bool): Whether to handle aligned sequences with gaps.
100163
gap_fill_value (float): Value to use for gap positions.
101164
wt (Optional[Union[str, ProteinSequence]]): The wild type sequence, if any.
165+
aaindex_properties (Optional[List[Tuple[str, str]]]): List of (aaindex_id, property_name) tuples.
166+
If None, uses DEFAULT_AAINDEX_PROPERTIES.
167+
include_aromatic (bool): Whether to include a boolean aromatic property (F, W, Y = 1.0, others = 0.0).
102168
"""
169+
if aaindex_properties is None:
170+
aaindex_properties = DEFAULT_AAINDEX_PROPERTIES
171+
self.aaindex_properties = aaindex_properties
172+
self.include_aromatic = include_aromatic
173+
103174
super().__init__(
104175
metadata_folder=metadata_folder,
105176
wt=wt,
@@ -110,11 +181,10 @@ def __init__(
110181
gap_fill_value=gap_fill_value,
111182
**kwargs
112183
)
113-
self.embedding_dim_ = 3 # 3 properties per amino acid
114184

115185
def _fit(self, X: ProteinSequences, y: Optional[np.ndarray] = None) -> 'AAPropertiesEmbedding':
116186
"""
117-
Fit the embedder (no actual fitting needed as properties are predefined).
187+
Fit the embedder by building the amino acid property lookup table.
118188
119189
Args:
120190
X (ProteinSequences): The input protein sequences.
@@ -123,6 +193,13 @@ def _fit(self, X: ProteinSequences, y: Optional[np.ndarray] = None) -> 'AAProper
123193
Returns:
124194
AAPropertiesEmbedding: The fitted embedder.
125195
"""
196+
# Build lookup table from AAindex
197+
self.aa_property_lookup_, self.property_names_ = _build_aa_property_lookup(
198+
self.aaindex_properties,
199+
include_aromatic=self.include_aromatic
200+
)
201+
self.embedding_dim_ = len(self.property_names_)
202+
126203
self.fitted_ = True
127204
return self
128205

@@ -142,16 +219,19 @@ def _transform(self, X: ProteinSequences) -> List[np.ndarray]:
142219
seq_str = str(seq).upper()
143220
seq_len = len(seq_str)
144221

145-
# Create embedding matrix: (seq_len, 3)
146-
embedding = np.zeros((1, seq_len, 3), dtype=np.float32)
222+
# Create embedding matrix: (1, seq_len, n_properties)
223+
embedding = np.zeros((1, seq_len, self.embedding_dim_), dtype=np.float32)
147224

148225
for i, aa in enumerate(seq_str):
149-
if aa in AA_PROPERTIES:
150-
embedding[0, i, :] = AA_PROPERTIES[aa]
226+
if aa in self.aa_property_lookup_:
227+
embedding[0, i, :] = self.aa_property_lookup_[aa]
151228
else:
152-
# Unknown amino acid - use zeros
153-
logger.warning(f"Unknown amino acid '{aa}' in sequence {seq.id}, using zeros")
154-
embedding[0, i, :] = [0.0, 0.0, 0.0]
229+
# Unknown amino acid (including gaps) - use zeros or gap_fill_value
230+
if aa == '-':
231+
embedding[0, i, :] = self.gap_fill_value
232+
else:
233+
logger.warning(f"Unknown amino acid '{aa}' in sequence {seq.id}, using zeros")
234+
embedding[0, i, :] = 0.0
155235

156236
all_embeddings.append(embedding)
157237

@@ -168,17 +248,16 @@ def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> L
168248
Returns:
169249
List[str]: Output feature names.
170250
"""
171-
if not hasattr(self, 'fitted_'):
251+
if not hasattr(self, 'fitted_') or not self.fitted_:
172252
raise ValueError("Model has not been fitted yet. Call fit() before using this method.")
173253

174254
positions = self.positions
175-
property_names = ['hydrophobicity', 'charge', 'size']
176255

177256
if self.pool:
178-
return [f"AAProps_{prop}" for prop in property_names]
257+
return [f"AAProps_{prop}" for prop in self.property_names_]
179258
elif self.flatten:
180259
if positions is None:
181260
raise ValueError("Cannot return feature names for flattened embeddings without specifying positions")
182-
return [f"pos{p}_{prop}" for p in positions for prop in property_names]
261+
return [f"pos{p}_{prop}" for p in positions for prop in self.property_names_]
183262
else:
184263
raise ValueError("Cannot return feature names for non-flattened non-pooled embeddings.")

environment.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ dependencies:
1919
- h5py
2020
- tqdm
2121
- biopython==1.84
22+
- aaindex
2223

2324

0 commit comments

Comments
 (0)