diff --git a/docs/index.md b/docs/index.md
index d81ea6c..aff7673 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -21,7 +21,7 @@
 
 -   :material-ruler: **Model Estimators**
 
-    Keras 3 estimators with scikit-learn API (MLP, BottleneckEncoder, LSTM, and more) + custom losses.
+    Keras 3 estimators with scikit-learn API (MLP, BottleneckEncoder, LSTM, Transformer, and more) + custom losses.
 
     [User Guide](user-guide/model-estimators.md) · [Losses](api-reference/losses.md) · [API](api-reference/model_estimators.md)
 
diff --git a/docs/user-guide/model-estimators.md b/docs/user-guide/model-estimators.md
index 7fb8e31..039c26a 100644
--- a/docs/user-guide/model-estimators.md
+++ b/docs/user-guide/model-estimators.md
@@ -202,6 +202,50 @@
 lstm.fit(X_lagged, y, epochs=50, batch_size=32)
 predictions = lstm.predict(X_lagged)
 ```
+### TransformerRegressor
+
+`centimators.model_estimators.TransformerRegressor` applies transformer encoder blocks over lagged sequence inputs. It supports three attention modes — temporal (standard self-attention over timesteps), feature (iTransformer-style attention over features), and cross (dual-axis temporal + feature attention) — and two pooling strategies for collapsing the sequence dimension before the final MLP head.
+
+```python
+from centimators.model_estimators import TransformerRegressor
+from centimators.feature_transformers import LagTransformer
+
+# Create lagged features
+lag_transformer = LagTransformer(windows=[1, 2, 3, 4, 5])
+X_lagged = lag_transformer.fit_transform(X, ticker_series=tickers)
+
+# Create Transformer model
+transformer = TransformerRegressor(
+    lag_windows=[1, 2, 3, 4, 5],  # Must match lag transformer
+    n_features_per_timestep=2,  # e.g., price and volume
+    d_model=32,  # Embedding dimension
+    num_heads=4,  # Attention heads
+    ff_dim=128,  # Feed-forward inner dimension
+    num_blocks=2,  # Stacked encoder blocks
+    attention_type="temporal",  # "temporal", "feature", or "cross"
+    pooling_type="attention",  # "attention" (learned) or "average"
+    use_pre_norm=True,  # Pre-LayerNorm (more stable training)
+    mlp_units=(64,),  # MLP head after pooling
+    dropout_rate=0.1,
+    learning_rate=0.001,
+)
+
+# Fit and predict
+transformer.fit(X_lagged, y, epochs=50, batch_size=32)
+predictions = transformer.predict(X_lagged)
+```
+
+#### Key parameters
+
+- `d_model` (int): Dimension of the internal embedding space (default: 32)
+- `num_heads` (int): Number of attention heads (default: 4)
+- `ff_dim` (int): Hidden dimension of the feed-forward network in each encoder block (default: 128)
+- `num_blocks` (int): Number of stacked encoder blocks (default: 1)
+- `attention_type` (str): `"temporal"` for standard self-attention over timesteps, `"feature"` for iTransformer-style attention over features, or `"cross"` for dual-axis (temporal + feature) attention
+- `pooling_type` (str): `"attention"` for learned weighted pooling or `"average"` for global average pooling
+- `use_pre_norm` (bool): Apply LayerNorm before attention/FFN rather than after (default: True)
+- `mlp_units` (tuple[int, ...]): Hidden layer sizes for the prediction head after pooling (default: (64,))
+
 
 ## Loss Functions
 
diff --git a/src/centimators/__init__.py b/src/centimators/__init__.py
index f6fff57..57e236d 100644
--- a/src/centimators/__init__.py
+++ b/src/centimators/__init__.py
@@ -27,6 +27,7 @@
     "MLPRegressor",
     "BottleneckEncoder",
     "LSTMRegressor",
+    "TransformerRegressor",
     "NeuralDecisionForestRegressor",
     "DSPyMator",
     "KerasCortex",
@@ -52,6 +53,7 @@
     "MLPRegressor": "centimators.model_estimators.keras_estimators.dense",
     "BottleneckEncoder": "centimators.model_estimators.keras_estimators.autoencoder",
     "LSTMRegressor": "centimators.model_estimators.keras_estimators.sequence",
+    "TransformerRegressor": "centimators.model_estimators.keras_estimators.transformer",
     "NeuralDecisionForestRegressor": "centimators.model_estimators.keras_estimators.tree",
     # DSPy estimator
     "DSPyMator": "centimators.model_estimators.dspymator",
diff --git a/src/centimators/model_estimators/__init__.py b/src/centimators/model_estimators/__init__.py
index 6b584a5..da9d670 100644
--- a/src/centimators/model_estimators/__init__.py
+++ b/src/centimators/model_estimators/__init__.py
@@ -14,6 +14,7 @@
     "MLPRegressor",
     "BottleneckEncoder",
     "LSTMRegressor",
+    "TransformerRegressor",
     "NeuralDecisionForestRegressor",
     "TemperatureAnnealing",
     # DSPy estimator
@@ -29,6 +30,7 @@
     "MLPRegressor": "centimators.model_estimators.keras_estimators.dense",
     "BottleneckEncoder": "centimators.model_estimators.keras_estimators.autoencoder",
     "LSTMRegressor": "centimators.model_estimators.keras_estimators.sequence",
+    "TransformerRegressor": "centimators.model_estimators.keras_estimators.transformer",
     "NeuralDecisionForestRegressor": "centimators.model_estimators.keras_estimators.tree",
     "TemperatureAnnealing": "centimators.model_estimators.keras_estimators.tree",
     # DSPy estimator
diff --git a/src/centimators/model_estimators/keras_estimators/__init__.py b/src/centimators/model_estimators/keras_estimators/__init__.py
index 6e111d5..d75197c 100644
--- a/src/centimators/model_estimators/keras_estimators/__init__.py
+++ b/src/centimators/model_estimators/keras_estimators/__init__.py
@@ -12,6 +12,12 @@
 from .dense import MLPRegressor
 from .autoencoder import BottleneckEncoder
 from .sequence import SequenceEstimator, LSTMRegressor
+from .transformer import (
+    AttentionPooling,
+    CrossAttention,
+    PositionEmbedding,
+    TransformerRegressor,
+)
 from .tree import NeuralDecisionForestRegressor, TemperatureAnnealing
 
 __all__ = [
@@ -20,6 +26,10 @@
     "BottleneckEncoder",
     "SequenceEstimator",
     "LSTMRegressor",
+    "TransformerRegressor",
+    "PositionEmbedding",
+    "CrossAttention",
+    "AttentionPooling",
     "NeuralDecisionForestRegressor",
     "TemperatureAnnealing",
 ]
diff --git a/src/centimators/model_estimators/keras_estimators/transformer.py b/src/centimators/model_estimators/keras_estimators/transformer.py
new file mode 100644
index 0000000..b591a01
--- /dev/null
+++ b/src/centimators/model_estimators/keras_estimators/transformer.py
@@ -0,0 +1,245 @@
+"""Transformer-based sequence estimator."""
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from keras import initializers, layers, models, ops
+from keras.saving import register_keras_serializable
+from sklearn.base import RegressorMixin
+from sklearn.preprocessing import StandardScaler
+
+from .sequence import SequenceEstimator
+
+
+@register_keras_serializable(package="centimators")
+class PositionEmbedding(layers.Layer):
+    """Learned positional embedding with fixed sequence length."""
+
+    def __init__(
+        self, sequence_length: int, initializer: str = "glorot_uniform", **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.sequence_length = int(sequence_length)
+        self.initializer = initializers.get(initializer)
+
+    def build(self, input_shape):
+        d_model = int(input_shape[-1])
+        self.position_embedding = self.add_weight(
+            name="position_embedding",
+            shape=(self.sequence_length, d_model),
+            initializer=self.initializer,
+            trainable=True,
+        )
+        super().build(input_shape)
+
+    def call(self, inputs):
+        # (seq_len, d_model) -> (1, seq_len, d_model) for broadcasting over batch
+        return ops.expand_dims(self.position_embedding, axis=0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "sequence_length": self.sequence_length,
+                "initializer": initializers.serialize(self.initializer),
+            }
+        )
+        return config
+
+
+@register_keras_serializable(package="centimators")
+class CrossAttention(layers.Layer):
+    """Dual-axis attention: temporal attention + feature attention."""
+
+    def __init__(
+        self, key_dim: int = 32, num_heads: int = 4, dropout: float = 0.1, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.key_dim = int(key_dim)
+        self.num_heads = int(num_heads)
+        self.dropout = float(dropout)
+
+        self.temporal_attention = layers.MultiHeadAttention(
+            key_dim=self.key_dim,
+            num_heads=self.num_heads,
+            dropout=self.dropout,
+            attention_axes=(1,),
+        )
+        self.feature_attention = layers.MultiHeadAttention(
+            key_dim=self.key_dim,
+            num_heads=self.num_heads,
+            dropout=self.dropout,
+            attention_axes=(2,),
+        )
+
+    def call(self, inputs):
+        temporal_out = self.temporal_attention(inputs, inputs)
+        feature_out = self.feature_attention(inputs, inputs)
+        return temporal_out + feature_out
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "key_dim": self.key_dim,
+                "num_heads": self.num_heads,
+                "dropout": self.dropout,
+            }
+        )
+        return config
+
+
+@register_keras_serializable(package="centimators")
+class AttentionPooling(layers.Layer):
+    """Learned weighted pooling over the sequence dimension."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.score = layers.Dense(1)
+
+    def call(self, inputs):
+        # inputs: (batch, seq_len, d_model)
+        logits = self.score(inputs)  # (batch, seq_len, 1)
+        weights = ops.softmax(logits, axis=1)
+        weighted = inputs * weights
+        return ops.sum(weighted, axis=1)  # (batch, d_model)
+
+
+@dataclass(kw_only=True)
+class TransformerRegressor(RegressorMixin, SequenceEstimator):
+    """Transformer encoder regressor for lagged sequence inputs.
+
+    Stacks one or more encoder blocks (multi-head attention + feed-forward)
+    over the 3-D tensor produced by :class:`SequenceEstimator`, then collapses
+    the sequence dimension via pooling before a final MLP prediction head.
+
+    Three attention modes are available:
+
+    - ``"temporal"`` -- standard self-attention over timesteps (default).
+    - ``"feature"`` -- iTransformer-style attention over the feature axis.
+    - ``"cross"`` -- dual-axis attention (temporal + feature combined).
+
+    Two pooling strategies collapse the sequence before the MLP head:
+
+    - ``"attention"`` -- learned weighted pooling (:class:`AttentionPooling`).
+    - ``"average"`` -- global average pooling.
+
+    Parameters
+    ----------
+    d_model : int
+        Internal embedding dimension (default: 32).
+    num_heads : int
+        Number of attention heads (default: 4).
+    ff_dim : int
+        Feed-forward hidden dimension per encoder block (default: 128).
+    num_blocks : int
+        Number of stacked encoder blocks (default: 1).
+    dropout_rate : float
+        Dropout applied in attention and feed-forward layers (default: 0.1).
+    attention_type : str
+        One of ``"temporal"``, ``"feature"``, or ``"cross"`` (default: ``"temporal"``).
+    pooling_type : str
+        One of ``"attention"`` or ``"average"`` (default: ``"attention"``).
+    use_pre_norm : bool
+        Apply LayerNorm before (True) or after (False) attention/FFN (default: True).
+    mlp_units : tuple[int, ...]
+        Hidden layer sizes for the prediction head (default: ``(64,)``).
+    """
+
+    d_model: int = 32
+    num_heads: int = 4
+    ff_dim: int = 128
+    num_blocks: int = 1
+    dropout_rate: float = 0.1
+    attention_type: str = "temporal"
+    pooling_type: str = "attention"
+    use_pre_norm: bool = True
+    mlp_units: tuple[int, ...] = (64,)
+    metrics: list[str] | None = field(default_factory=lambda: ["mse"])
+    target_scaler: Any = field(default_factory=StandardScaler)
+
+    def _encoder_block(self, inputs):
+        x = (
+            layers.LayerNormalization(epsilon=1e-6)(inputs)
+            if self.use_pre_norm
+            else inputs
+        )
+
+        if self.attention_type == "cross":
+            key_dim = max(1, self.d_model // self.num_heads)
+            x = CrossAttention(
+                key_dim=key_dim, num_heads=self.num_heads, dropout=self.dropout_rate
+            )(x)
+        elif self.attention_type == "temporal":
+            x = layers.MultiHeadAttention(
+                key_dim=max(1, self.d_model // self.num_heads),
+                num_heads=self.num_heads,
+                dropout=self.dropout_rate,
+            )(x, x)
+        elif self.attention_type == "feature":
+            # iTransformer-style feature attention.
+            feature_tokens = layers.Permute((2, 1))(x)
+            feature_tokens = layers.MultiHeadAttention(
+                key_dim=max(1, self.seq_length // self.num_heads),
+                num_heads=self.num_heads,
+                dropout=self.dropout_rate,
+            )(feature_tokens, feature_tokens)
+            x = layers.Permute((2, 1))(feature_tokens)
+        else:
+            raise ValueError(
+                f"Unknown attention_type={self.attention_type!r}. "
+                "Use one of {'cross', 'temporal', 'feature'}."
+            )
+
+        x = inputs + x
+        ffn_input = (
+            layers.LayerNormalization(epsilon=1e-6)(x) if self.use_pre_norm else x
+        )
+
+        ffn = layers.Dense(self.ff_dim, activation="relu")(ffn_input)
+        ffn = layers.Dropout(self.dropout_rate)(ffn)
+        ffn = layers.Dense(self.d_model)(ffn)
+        ffn = layers.Dropout(self.dropout_rate)(ffn)
+        return x + ffn
+
+    def build_model(self):
+        if self._n_features_in_ is None:
+            raise ValueError("Must call fit() before building the model")
+
+        inputs = layers.Input(
+            shape=(self.seq_length, self.n_features_per_timestep),
+            name="sequence_input",
+        )
+
+        x = layers.Dense(self.d_model)(inputs)
+        x = x + PositionEmbedding(sequence_length=self.seq_length)(x)
+
+        for _ in range(self.num_blocks):
+            x = self._encoder_block(x)
+
+        if self.use_pre_norm:
+            x = layers.LayerNormalization(epsilon=1e-6)(x)
+
+        if self.pooling_type == "attention":
+            x = AttentionPooling()(x)
+        elif self.pooling_type == "average":
+            x = layers.GlobalAveragePooling1D()(x)
+        else:
+            raise ValueError(
+                f"Unknown pooling_type={self.pooling_type!r}. Use one of {'attention', 'average'}."
+            )
+
+        for units in self.mlp_units:
+            x = layers.Dense(units, activation="relu")(x)
+            x = layers.Dropout(self.dropout_rate)(x)
+
+        outputs = layers.Dense(self.output_units, activation="linear", name="output")(x)
+        self.model = models.Model(
+            inputs=inputs, outputs=outputs, name="transformer_regressor"
+        )
+        self.model.compile(
+            optimizer=self.optimizer(learning_rate=self.learning_rate),
+            loss=self.loss_function,
+            metrics=self.metrics,
+        )
+        return self
diff --git a/tests/test_model_estimators.py b/tests/test_model_estimators.py
index d1a8a3b..85c76da 100644
--- a/tests/test_model_estimators.py
+++ b/tests/test_model_estimators.py
@@ -11,6 +11,7 @@
 from centimators.model_estimators import (
     MLPRegressor,
     LSTMRegressor,
+    TransformerRegressor,
     BottleneckEncoder,
     NeuralDecisionForestRegressor,
 )
@@ -573,3 +574,51 @@
     preds = pipeline.predict(X, verbose=0)
 
     assert preds.shape == (n_samples, 1)
+
+
+def test_transformer_regressor_fit_predict():
+    """TransformerRegressor should fit and produce expected output shape."""
+    rng = np.random.default_rng(7)
+    seq_length = 4
+    n_features_per_step = 5
+    n_samples = 18
+    X = rng.standard_normal((n_samples, seq_length * n_features_per_step)).astype(
+        "float32"
+    )
+    y = rng.standard_normal((n_samples, 1)).astype("float32")
+
+    est = TransformerRegressor(
+        lag_windows=list(range(seq_length)),
+        n_features_per_timestep=n_features_per_step,
+        output_units=1,
+        d_model=16,
+        ff_dim=64,
+        num_blocks=1,
+        attention_type="temporal",
+    )
+    est.fit(X, y, epochs=1, batch_size=6, verbose=0)
+    preds = est.predict(X, batch_size=6, verbose=0)
+    assert preds.shape == (n_samples, 1)
+
+
+@pytest.mark.parametrize("attention_type", ["temporal", "feature", "cross"])
+def test_transformer_attention_types(attention_type):
+    """TransformerRegressor supports all attention backends."""
+    rng = np.random.default_rng(9)
+    seq_length = 3
+    n_features_per_step = 4
+    X = rng.standard_normal((12, seq_length * n_features_per_step)).astype("float32")
+    y = rng.standard_normal((12, 1)).astype("float32")
+
+    est = TransformerRegressor(
+        lag_windows=list(range(seq_length)),
+        n_features_per_timestep=n_features_per_step,
+        output_units=1,
+        d_model=16,
+        ff_dim=32,
+        num_blocks=1,
+        attention_type=attention_type,
+    )
+    est.fit(X, y, epochs=1, batch_size=4, verbose=0)
+    preds = est.predict(X, batch_size=4, verbose=0)
+    assert preds.shape == (12, 1)
diff --git a/uv.lock b/uv.lock
index f03cb08..51405cc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -371,7 +371,7 @@ wheels = [
 
 [[package]]
 name = "centimators"
-version = "0.3.1"
+version = "0.3.2"
 source = { editable = "." }
 dependencies = [
     { name = "narwhals" },
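Beyond the docs example added in this patch, a minimal smoke-test sketch of the new estimator follows. It assumes the same flat lagged-input layout the new tests use (each row is `len(lag_windows) * n_features_per_timestep` wide); hyperparameter values are illustrative, not defaults, and it exercises the `cross` attention and `average` pooling paths instead of the temporal/attention defaults.

```python
import numpy as np

from centimators.model_estimators import TransformerRegressor

# Synthetic lagged input: each row holds seq_length timesteps of
# n_features_per_timestep features, flattened, as in the new tests.
rng = np.random.default_rng(0)
seq_length, n_features_per_timestep, n_samples = 5, 2, 256
X = rng.standard_normal(
    (n_samples, seq_length * n_features_per_timestep)
).astype("float32")
y = rng.standard_normal((n_samples, 1)).astype("float32")

est = TransformerRegressor(
    lag_windows=list(range(seq_length)),
    n_features_per_timestep=n_features_per_timestep,
    output_units=1,
    d_model=32,
    num_heads=4,
    ff_dim=64,
    num_blocks=2,
    attention_type="cross",    # dual-axis temporal + feature attention (CrossAttention)
    pooling_type="average",    # GlobalAveragePooling1D instead of AttentionPooling
)
est.fit(X, y, epochs=2, batch_size=32, verbose=0)
preds = est.predict(X, batch_size=32, verbose=0)
assert preds.shape == (n_samples, 1)
```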