diff --git a/README.md b/README.md index ac141d1..63680b4 100644 --- a/README.md +++ b/README.md @@ -90,12 +90,11 @@ Total time: 0.0247s | Category | Description | Examples | |----------|-------------|----------| | **col_** | Column operations | `col_rename`, `col_drop`, `col_cast`, `col_add`, `col_select` | -| **math_** | Arithmetic operations | `math_add`, `math_multiply`, `math_clamp`, `math_round`, `math_abs` | +| **math_** | Arithmetic & scaling | `math_add`, `math_multiply`, `math_standardize`, `math_minmax`, `math_clamp` | | **rows_** | Row filtering & reshaping | `rows_filter`, `rows_drop_nulls`, `rows_sort`, `rows_unique`, `rows_pivot` | | **str_** | String operations | `str_lower`, `str_upper`, `str_strip`, `str_replace`, `str_split` | | **dt_** | Datetime operations | `dt_year`, `dt_month`, `dt_parse`, `dt_age_years`, `dt_diff_days` | -| **map_** | Value mapping | `map_values`, `map_discretize`, `map_case`, `map_from_column` | -| **enc_** | Categorical encoding | `enc_onehot`, `enc_ordinal`, `enc_label` | +| **map_** | Value mapping & encoding | `map_values`, `map_discretize`, `map_onehot`, `map_ordinal` | ## Installation diff --git a/docs/api/index.md b/docs/api/index.md index c577fc3..5f83ed3 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -82,6 +82,13 @@ All TransformPlan operations at a glance. Click method names for detailed docume | [`math_percent_of`](ops/math.md) | Calculate percentage of one column relative to another | | [`math_cumsum`](ops/math.md) | Calculate cumulative sum (optionally grouped) | | [`math_rank`](ops/math.md) | Calculate rank of values | +| [`math_standardize`](ops/math.md) | Z-score standardization (mean=0, std=1) | +| [`math_minmax`](ops/math.md) | Min-max normalization to a range | +| [`math_robust_scale`](ops/math.md) | Robust scaling using median and IQR | +| [`math_log`](ops/math.md) | Logarithmic transform | +| [`math_sqrt`](ops/math.md) | Square root transform | +| [`math_power`](ops/math.md) | Power transform | +| [`math_winsorize`](ops/math.md) | Clip values to percentiles or bounds | ### Row Operations diff --git a/docs/api/ops/encoding.md b/docs/api/ops/encoding.md deleted file mode 100644 index 74554aa..0000000 --- a/docs/api/ops/encoding.md +++ /dev/null @@ -1,187 +0,0 @@ -# Encoding Operations - -Categorical encoding operations for machine learning preparation. - -## Overview - -Encoding operations transform categorical columns into numeric representations suitable for machine learning models. They support one-hot encoding, ordinal encoding, and label encoding. - -```python -from transformplan import TransformPlan - -plan = ( - TransformPlan() - .enc_onehot("color", categories=["red", "green", "blue"], drop="first") - .enc_ordinal("size", categories=["small", "medium", "large"]) -) -``` - -## Class Reference - -::: transformplan.ops.encoding.EncodingOps - options: - show_root_heading: true - members: - - enc_onehot - - enc_ordinal - - enc_label - -## Examples - -### One-Hot Encoding - -Creates binary indicator columns (0/1) for each category. - -```python -# Basic one-hot encoding -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"] -) -# Creates columns: color_red, color_green, color_blue - -# Drop first category to avoid multicollinearity (for regression models) -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - drop="first" -) -# Creates columns: color_green, color_blue (drops color_red) - -# Drop last category -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - drop="last" -) -# Creates columns: color_red, color_green (drops color_blue) - -# Drop specific category -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - drop="green" -) -# Creates columns: color_red, color_blue (drops color_green) - -# Custom prefix for new columns -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - prefix="c" -) -# Creates columns: c_red, c_green, c_blue - -# Keep original column -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - drop_original=False -) -# Keeps color column alongside color_red, color_green, color_blue -``` - -### Ordinal Encoding - -Maps categories to integers based on explicit ordering (first=0, second=1, etc.). - -```python -# Ordinal encoding with meaningful order -plan = TransformPlan().enc_ordinal( - column="size", - categories=["small", "medium", "large"] -) -# Maps: small -> 0, medium -> 1, large -> 2 - -# Output to new column -plan = TransformPlan().enc_ordinal( - column="size", - categories=["small", "medium", "large"], - new_column="size_encoded" -) - -# Custom unknown value -plan = TransformPlan().enc_ordinal( - column="size", - categories=["small", "medium", "large"], - unknown_value=-1 # Default -) -# Values not in categories get -1 -``` - -### Label Encoding - -Simple integer encoding, alphabetically sorted by default. Similar to ordinal encoding but without semantic ordering. - -```python -# Label encoding (alphabetically sorted) -plan = TransformPlan().enc_label(column="department") -# Maps alphabetically: Engineering -> 0, HR -> 1, Sales -> 2 - -# With explicit categories -plan = TransformPlan().enc_label( - column="department", - categories=["HR", "Engineering", "Sales"] -) -# Maps: HR -> 0, Engineering -> 1, Sales -> 2 -``` - -## Use Cases - -### Preparing Data for Machine Learning - -```python -# One-hot encode categorical features, dropping first to avoid multicollinearity -plan = ( - TransformPlan() - .enc_onehot("color", categories=["red", "green", "blue"], drop="first") - .enc_onehot("size", categories=["S", "M", "L", "XL"], drop="first") - .enc_ordinal("quality", categories=["low", "medium", "high"]) -) -``` - -### Handling Unknown Categories - -```python -# Unknown values get all zeros (one-hot) -plan = TransformPlan().enc_onehot( - column="color", - categories=["red", "green", "blue"], - unknown_value="all_zero" # Default -) - -# Unknown values get -1 (ordinal/label) -plan = TransformPlan().enc_ordinal( - column="size", - categories=["small", "medium", "large"], - unknown_value=-1 -) -``` - -### Deriving Categories from Data - -When categories are not specified, they are derived from the data (sorted alphabetically): - -```python -# Categories derived from data -plan = TransformPlan().enc_onehot("color") -# Uses sorted unique values from the column - -# Note: For reproducibility, explicitly specify categories -plan = TransformPlan().enc_onehot( - column="color", - categories=["blue", "green", "red"] # Explicit is better -) -``` - -## Multicollinearity Note - -When using one-hot encoding for linear models (regression, logistic regression), you should drop one category to avoid the [dummy variable trap](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)). Use the `drop` parameter: - -```python -# For regression models, drop one category -plan = TransformPlan().enc_onehot("color", drop="first") - -# Tree-based models (random forest, XGBoost) don't require this -plan = TransformPlan().enc_onehot("color") # Keep all -``` diff --git a/docs/api/ops/map.md b/docs/api/ops/map.md index e49babd..c927bfb 100644 --- a/docs/api/ops/map.md +++ b/docs/api/ops/map.md @@ -1,10 +1,10 @@ # Map Operations -Value mapping, discretization, and transformation operations. +Value mapping, discretization, encoding, and transformation operations. ## Overview -Map operations transform column values using dictionaries, bins, or other columns. They're useful for categorization, value replacement, and data normalization. +Map operations transform column values using dictionaries, bins, or encoding schemes. They're useful for categorization, value replacement, data normalization, and ML feature preparation. ```python from transformplan import TransformPlan @@ -13,6 +13,7 @@ plan = ( TransformPlan() .map_values("status", {"A": "Active", "I": "Inactive"}) .map_discretize("age", bins=[18, 35, 55], labels=["Young", "Adult", "Senior"]) + .map_onehot("color", categories=["red", "green", "blue"], drop="first") ) ``` @@ -29,6 +30,9 @@ plan = ( - map_null_to_value - map_value_to_null - map_from_column + - map_onehot + - map_ordinal + - map_label ## Examples @@ -155,3 +159,52 @@ plan = TransformPlan().map_value_to_null("score", -999) # Replace null with default plan = TransformPlan().map_null_to_value("category", "Uncategorized") ``` + +### One-Hot Encoding + +```python +# Basic one-hot encoding +plan = TransformPlan().map_onehot( + column="color", + categories=["red", "green", "blue"] +) +# Creates columns: color_red, color_green, color_blue + +# Drop first category to avoid multicollinearity (for regression models) +plan = TransformPlan().map_onehot( + column="color", + categories=["red", "green", "blue"], + drop="first" +) +# Creates columns: color_green, color_blue (drops color_red) +``` + +### Ordinal Encoding + +```python +# Ordinal encoding with meaningful order +plan = TransformPlan().map_ordinal( + column="size", + categories=["small", "medium", "large"] +) +# Maps: small -> 0, medium -> 1, large -> 2 +``` + +### Label Encoding + +```python +# Label encoding (alphabetically sorted by default) +plan = TransformPlan().map_label(column="department") +# Maps alphabetically: Engineering -> 0, HR -> 1, Sales -> 2 +``` + +### ML Feature Preparation + +```python +# One-hot encode categorical features, dropping first to avoid multicollinearity +plan = ( + TransformPlan() + .map_onehot("color", categories=["red", "green", "blue"], drop="first") + .map_ordinal("quality", categories=["low", "medium", "high"]) +) +``` diff --git a/docs/api/ops/math.md b/docs/api/ops/math.md index 070c423..6017c17 100644 --- a/docs/api/ops/math.md +++ b/docs/api/ops/math.md @@ -39,6 +39,13 @@ plan = ( - math_percent_of - math_cumsum - math_rank + - math_standardize + - math_minmax + - math_robust_scale + - math_log + - math_sqrt + - math_power + - math_winsorize ## Examples @@ -128,3 +135,52 @@ plan = TransformPlan().math_rank( group_by="category" ) ``` + +### Scaling Operations + +```python +# Z-score standardization (explicit params for reproducibility) +plan = TransformPlan().math_standardize("income", mean=50000, std=25000) + +# Derive from data +plan = TransformPlan().math_standardize("income") + +# Min-max normalization to [0, 1] +plan = TransformPlan().math_minmax("age", min_val=0, max_val=100) + +# Custom range +plan = TransformPlan().math_minmax("score", min_val=0, max_val=100, feature_range=(0, 10)) + +# Robust scaling (resistant to outliers) +plan = TransformPlan().math_robust_scale("salary", median=60000, iqr=30000) +``` + +### Transform Operations + +```python +# Natural log +plan = TransformPlan().math_log("price") + +# Log base 10 +plan = TransformPlan().math_log("price", base=10) + +# Log with offset for zeros +plan = TransformPlan().math_log("count", offset=1) # log(x + 1) + +# Square root +plan = TransformPlan().math_sqrt("variance") + +# Power transform +plan = TransformPlan().math_power("value", exponent=2) # square +plan = TransformPlan().math_power("value", exponent=0.5) # sqrt +``` + +### Outlier Handling + +```python +# Winsorize by percentiles +plan = TransformPlan().math_winsorize("salary", lower=0.05, upper=0.95) + +# Winsorize by explicit values +plan = TransformPlan().math_winsorize("salary", lower_value=20000, upper_value=200000) +``` diff --git a/docs/index.md b/docs/index.md index ae4265a..e80c103 100644 --- a/docs/index.md +++ b/docs/index.md @@ -93,8 +93,7 @@ Total time: 0.0247s | **rows_** | Row filtering & reshaping | `rows_filter`, `rows_drop_nulls`, `rows_sort`, `rows_unique`, `rows_pivot` | | **str_** | String operations | `str_lower`, `str_upper`, `str_strip`, `str_replace`, `str_split` | | **dt_** | Datetime operations | `dt_year`, `dt_month`, `dt_parse`, `dt_age_years`, `dt_diff_days` | -| **map_** | Value mapping | `map_values`, `map_discretize`, `map_case`, `map_from_column` | -| **enc_** | Categorical encoding | `enc_onehot`, `enc_ordinal`, `enc_label` | +| **map_** | Value mapping & encoding | `map_values`, `map_discretize`, `map_onehot`, `map_ordinal` | ## Getting Started diff --git a/mkdocs.yml b/mkdocs.yml index 5fc254a..4f0128f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -91,4 +91,3 @@ nav: - String Operations: api/ops/string.md - Datetime Operations: api/ops/datetime.md - Map Operations: api/ops/map.md - - Encoding Operations: api/ops/encoding.md diff --git a/tests/test_encoding.py b/tests/test_map_encoding.py similarity index 76% rename from tests/test_encoding.py rename to tests/test_map_encoding.py index d70dc6c..cb233c8 100644 --- a/tests/test_encoding.py +++ b/tests/test_map_encoding.py @@ -1,4 +1,4 @@ -"""Tests for encoding operations (ops/encoding.py).""" +"""Tests for encoding operations (map_onehot, map_ordinal, map_label).""" import polars as pl import pytest @@ -18,12 +18,12 @@ def encoding_df() -> pl.DataFrame: ) -class TestEncOnehot: - """Tests for enc_onehot operation.""" +class TestMapOnehot: + """Tests for map_onehot operation.""" - def test_enc_onehot_with_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_with_categories(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with explicit categories.""" - plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) + plan = TransformPlan().map_onehot("color", categories=["red", "green", "blue"]) result, _ = plan.process(encoding_df) # Check columns created @@ -44,9 +44,9 @@ def test_enc_onehot_with_categories(self, encoding_df: pl.DataFrame) -> None: assert result["color_green"][1] == 1 assert result["color_blue"][1] == 0 - def test_enc_onehot_derive_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_derive_categories(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding deriving categories from data.""" - plan = TransformPlan().enc_onehot("color") + plan = TransformPlan().map_onehot("color") result, _ = plan.process(encoding_df) # Should derive categories alphabetically: blue, green, red @@ -54,9 +54,9 @@ def test_enc_onehot_derive_categories(self, encoding_df: pl.DataFrame) -> None: assert "color_green" in result.columns assert "color_red" in result.columns - def test_enc_onehot_custom_prefix(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_custom_prefix(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with custom prefix.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green"], prefix="c" ) result, _ = plan.process(encoding_df) @@ -64,9 +64,9 @@ def test_enc_onehot_custom_prefix(self, encoding_df: pl.DataFrame) -> None: assert "c_red" in result.columns assert "c_green" in result.columns - def test_enc_onehot_keep_original(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_keep_original(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding keeping original column.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green"], drop_original=False ) result, _ = plan.process(encoding_df) @@ -75,10 +75,10 @@ def test_enc_onehot_keep_original(self, encoding_df: pl.DataFrame) -> None: assert "color_red" in result.columns assert "color_green" in result.columns - def test_enc_onehot_unknown_all_zero(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_unknown_all_zero(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with unknown values set to all zeros.""" # Only include some categories - "blue" will be unknown - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green"], unknown_value="all_zero" ) result, _ = plan.process(encoding_df) @@ -87,9 +87,9 @@ def test_enc_onehot_unknown_all_zero(self, encoding_df: pl.DataFrame) -> None: assert result["color_red"][2] == 0 assert result["color_green"][2] == 0 - def test_enc_onehot_unknown_ignore(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_unknown_ignore(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with unknown values returning null.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green"], unknown_value="ignore" ) result, _ = plan.process(encoding_df) @@ -98,9 +98,9 @@ def test_enc_onehot_unknown_ignore(self, encoding_df: pl.DataFrame) -> None: assert result["color_red"][2] is None assert result["color_green"][2] is None - def test_enc_onehot_drop_first(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_drop_first(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with drop='first' to avoid multicollinearity.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green", "blue"], drop="first" ) result, _ = plan.process(encoding_df) @@ -118,9 +118,9 @@ def test_enc_onehot_drop_first(self, encoding_df: pl.DataFrame) -> None: assert result["color_green"][1] == 1 assert result["color_blue"][1] == 0 - def test_enc_onehot_drop_last(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_drop_last(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with drop='last'.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green", "blue"], drop="last" ) result, _ = plan.process(encoding_df) @@ -134,9 +134,9 @@ def test_enc_onehot_drop_last(self, encoding_df: pl.DataFrame) -> None: assert result["color_red"][2] == 0 assert result["color_green"][2] == 0 - def test_enc_onehot_drop_specific_value(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_drop_specific_value(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding dropping a specific category value.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green", "blue"], drop="green" ) result, _ = plan.process(encoding_df) @@ -150,10 +150,10 @@ def test_enc_onehot_drop_specific_value(self, encoding_df: pl.DataFrame) -> None assert result["color_red"][1] == 0 assert result["color_blue"][1] == 0 - def test_enc_onehot_drop_with_derived_categories(self) -> None: + def test_map_onehot_drop_with_derived_categories(self) -> None: """Test one-hot encoding with drop and categories derived from data.""" df = pl.DataFrame({"color": ["red", "green", "blue"]}) - plan = TransformPlan().enc_onehot("color", drop="first") + plan = TransformPlan().map_onehot("color", drop="first") result, _ = plan.process(df) # Categories are derived alphabetically: blue, green, red @@ -162,11 +162,11 @@ def test_enc_onehot_drop_with_derived_categories(self) -> None: assert "color_green" in result.columns assert "color_red" in result.columns - def test_enc_onehot_drop_literal_takes_precedence(self) -> None: + def test_map_onehot_drop_literal_takes_precedence(self) -> None: """Test that literal values take precedence over 'first'/'last' keywords.""" # Category list where "first" is NOT the first element df = pl.DataFrame({"pos": ["last", "middle", "first"]}) - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "pos", categories=["last", "middle", "first"], drop="first" ) result, _ = plan.process(df) @@ -177,10 +177,10 @@ def test_enc_onehot_drop_literal_takes_precedence(self) -> None: assert "pos_middle" in result.columns assert "pos_first" not in result.columns - def test_enc_onehot_drop_keyword_when_not_in_categories(self) -> None: + def test_map_onehot_drop_keyword_when_not_in_categories(self) -> None: """Test that 'first'/'last' work as keywords when not in categories.""" df = pl.DataFrame({"color": ["red", "green", "blue"]}) - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green", "blue"], drop="first" ) result, _ = plan.process(df) @@ -192,12 +192,12 @@ def test_enc_onehot_drop_keyword_when_not_in_categories(self) -> None: assert "color_blue" in result.columns -class TestEncOrdinal: - """Tests for enc_ordinal operation.""" +class TestMapOrdinal: + """Tests for map_ordinal operation.""" - def test_enc_ordinal_with_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_with_categories(self, encoding_df: pl.DataFrame) -> None: """Test ordinal encoding with explicit ordering.""" - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"] ) result, _ = plan.process(encoding_df) @@ -206,9 +206,9 @@ def test_enc_ordinal_with_categories(self, encoding_df: pl.DataFrame) -> None: expected = [0, 1, 2, 1, 0] assert result["size"].to_list() == expected - def test_enc_ordinal_derive_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_derive_categories(self, encoding_df: pl.DataFrame) -> None: """Test ordinal encoding deriving categories alphabetically.""" - plan = TransformPlan().enc_ordinal("size") + plan = TransformPlan().map_ordinal("size") result, _ = plan.process(encoding_df) # Alphabetically: large=0, medium=1, small=2 @@ -216,9 +216,9 @@ def test_enc_ordinal_derive_categories(self, encoding_df: pl.DataFrame) -> None: expected = [2, 1, 0, 1, 2] assert result["size"].to_list() == expected - def test_enc_ordinal_new_column(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_new_column(self, encoding_df: pl.DataFrame) -> None: """Test ordinal encoding to new column.""" - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"], new_column="size_encoded", @@ -232,9 +232,9 @@ def test_enc_ordinal_new_column(self, encoding_df: pl.DataFrame) -> None: expected = [0, 1, 2, 1, 0] assert result["size_encoded"].to_list() == expected - def test_enc_ordinal_keep_original(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_keep_original(self, encoding_df: pl.DataFrame) -> None: """Test ordinal encoding keeping original column.""" - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"], new_column="size_encoded", @@ -245,10 +245,10 @@ def test_enc_ordinal_keep_original(self, encoding_df: pl.DataFrame) -> None: assert "size" in result.columns assert "size_encoded" in result.columns - def test_enc_ordinal_unknown_value(self) -> None: + def test_map_ordinal_unknown_value(self) -> None: """Test ordinal encoding with unknown values.""" df = pl.DataFrame({"size": ["small", "medium", "xl"]}) - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"], unknown_value=-1 ) result, _ = plan.process(df) @@ -256,10 +256,10 @@ def test_enc_ordinal_unknown_value(self) -> None: # "xl" is unknown assert result["size"].to_list() == [0, 1, -1] - def test_enc_ordinal_custom_unknown_value(self) -> None: + def test_map_ordinal_custom_unknown_value(self) -> None: """Test ordinal encoding with custom unknown value.""" df = pl.DataFrame({"size": ["small", "unknown"]}) - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium"], unknown_value=99 ) result, _ = plan.process(df) @@ -267,12 +267,12 @@ def test_enc_ordinal_custom_unknown_value(self) -> None: assert result["size"].to_list() == [0, 99] -class TestEncLabel: - """Tests for enc_label operation.""" +class TestMapLabel: + """Tests for map_label operation.""" - def test_enc_label_with_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_label_with_categories(self, encoding_df: pl.DataFrame) -> None: """Test label encoding with explicit categories.""" - plan = TransformPlan().enc_label( + plan = TransformPlan().map_label( "department", categories=["HR", "Engineering", "Sales"] ) result, _ = plan.process(encoding_df) @@ -281,9 +281,9 @@ def test_enc_label_with_categories(self, encoding_df: pl.DataFrame) -> None: expected = [0, 1, 2, 0, 1] assert result["department"].to_list() == expected - def test_enc_label_derive_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_label_derive_categories(self, encoding_df: pl.DataFrame) -> None: """Test label encoding deriving categories alphabetically.""" - plan = TransformPlan().enc_label("department") + plan = TransformPlan().map_label("department") result, _ = plan.process(encoding_df) # Alphabetically: Engineering=0, HR=1, Sales=2 @@ -291,9 +291,9 @@ def test_enc_label_derive_categories(self, encoding_df: pl.DataFrame) -> None: expected = [1, 0, 2, 1, 0] assert result["department"].to_list() == expected - def test_enc_label_new_column(self, encoding_df: pl.DataFrame) -> None: + def test_map_label_new_column(self, encoding_df: pl.DataFrame) -> None: """Test label encoding to new column.""" - plan = TransformPlan().enc_label( + plan = TransformPlan().map_label( "department", categories=["HR", "Engineering", "Sales"], new_column="dept_id", @@ -303,10 +303,10 @@ def test_enc_label_new_column(self, encoding_df: pl.DataFrame) -> None: assert "dept_id" in result.columns assert "department" not in result.columns - def test_enc_label_unknown_value(self) -> None: + def test_map_label_unknown_value(self) -> None: """Test label encoding with unknown values.""" df = pl.DataFrame({"dept": ["HR", "Marketing"]}) - plan = TransformPlan().enc_label( + plan = TransformPlan().map_label( "dept", categories=["HR", "Engineering"], unknown_value=-1 ) result, _ = plan.process(df) @@ -317,34 +317,34 @@ def test_enc_label_unknown_value(self) -> None: class TestEncodingValidation: """Tests for encoding validation errors.""" - def test_enc_onehot_missing_column(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_missing_column(self, encoding_df: pl.DataFrame) -> None: """Test validation error for missing column.""" - plan = TransformPlan().enc_onehot("nonexistent") + plan = TransformPlan().map_onehot("nonexistent") result = plan.validate(encoding_df) assert not result.is_valid assert any("nonexistent" in str(e) for e in result.errors) - def test_enc_onehot_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: """Test validation error for duplicate categories.""" - plan = TransformPlan().enc_onehot("color", categories=["red", "red", "blue"]) + plan = TransformPlan().map_onehot("color", categories=["red", "red", "blue"]) result = plan.validate(encoding_df) assert not result.is_valid assert any("Duplicate" in str(e) for e in result.errors) - def test_enc_onehot_column_collision(self) -> None: + def test_map_onehot_column_collision(self) -> None: """Test validation error for column name collision.""" df = pl.DataFrame({"color": ["red"], "color_red": [1]}) - plan = TransformPlan().enc_onehot("color", categories=["red"]) + plan = TransformPlan().map_onehot("color", categories=["red"]) result = plan.validate(df) assert not result.is_valid assert any("already exists" in str(e) for e in result.errors) - def test_enc_onehot_drop_invalid_value(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_drop_invalid_value(self, encoding_df: pl.DataFrame) -> None: """Test validation error for drop value not in categories.""" - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["red", "green", "blue"], drop="purple" ) result = plan.validate(encoding_df) @@ -352,11 +352,11 @@ def test_enc_onehot_drop_invalid_value(self, encoding_df: pl.DataFrame) -> None: assert not result.is_valid assert any("not in categories" in str(e) for e in result.errors) - def test_enc_onehot_drop_avoids_collision(self) -> None: + def test_map_onehot_drop_avoids_collision(self) -> None: """Test that drop='first' avoids column collision when first column exists.""" # color_blue already exists, but we're dropping blue (first alphabetically) df = pl.DataFrame({"color": ["red", "green", "blue"], "color_blue": [1, 2, 3]}) - plan = TransformPlan().enc_onehot( + plan = TransformPlan().map_onehot( "color", categories=["blue", "green", "red"], drop="first" ) result = plan.validate(df) @@ -364,17 +364,17 @@ def test_enc_onehot_drop_avoids_collision(self) -> None: # Should be valid because we're dropping color_blue assert result.is_valid - def test_enc_ordinal_missing_column(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_missing_column(self, encoding_df: pl.DataFrame) -> None: """Test validation error for missing column.""" - plan = TransformPlan().enc_ordinal("nonexistent") + plan = TransformPlan().map_ordinal("nonexistent") result = plan.validate(encoding_df) assert not result.is_valid assert any("nonexistent" in str(e) for e in result.errors) - def test_enc_ordinal_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: """Test validation error for duplicate categories.""" - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "small", "large"] ) result = plan.validate(encoding_df) @@ -382,16 +382,16 @@ def test_enc_ordinal_duplicate_categories(self, encoding_df: pl.DataFrame) -> No assert not result.is_valid assert any("Duplicate" in str(e) for e in result.errors) - def test_enc_label_missing_column(self, encoding_df: pl.DataFrame) -> None: + def test_map_label_missing_column(self, encoding_df: pl.DataFrame) -> None: """Test validation error for missing column.""" - plan = TransformPlan().enc_label("nonexistent") + plan = TransformPlan().map_label("nonexistent") result = plan.validate(encoding_df) assert not result.is_valid - def test_enc_label_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + def test_map_label_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: """Test validation error for duplicate categories.""" - plan = TransformPlan().enc_label("department", categories=["HR", "HR"]) + plan = TransformPlan().map_label("department", categories=["HR", "HR"]) result = plan.validate(encoding_df) assert not result.is_valid @@ -401,39 +401,39 @@ def test_enc_label_duplicate_categories(self, encoding_df: pl.DataFrame) -> None class TestEncodingEdgeCases: """Tests for edge cases in encoding operations.""" - def test_enc_onehot_with_nulls(self) -> None: + def test_map_onehot_with_nulls(self) -> None: """Test one-hot encoding with null values.""" df = pl.DataFrame({"color": ["red", None, "blue"]}) - plan = TransformPlan().enc_onehot("color", categories=["red", "blue"]) + plan = TransformPlan().map_onehot("color", categories=["red", "blue"]) result, _ = plan.process(df) # Null should be treated as unknown (all zeros with default setting) assert result["color_red"][1] == 0 assert result["color_blue"][1] == 0 - def test_enc_onehot_empty_dataframe(self, empty_df: pl.DataFrame) -> None: + def test_map_onehot_empty_dataframe(self, empty_df: pl.DataFrame) -> None: """Test one-hot encoding with empty DataFrame.""" df = pl.DataFrame({"color": pl.Series([], dtype=pl.Utf8)}) - plan = TransformPlan().enc_onehot("color", categories=["red", "blue"]) + plan = TransformPlan().map_onehot("color", categories=["red", "blue"]) result, _ = plan.process(df) assert "color_red" in result.columns assert "color_blue" in result.columns assert len(result) == 0 - def test_enc_onehot_single_category(self) -> None: + def test_map_onehot_single_category(self) -> None: """Test one-hot encoding with single category.""" df = pl.DataFrame({"status": ["active", "active", "active"]}) - plan = TransformPlan().enc_onehot("status", categories=["active"]) + plan = TransformPlan().map_onehot("status", categories=["active"]) result, _ = plan.process(df) assert "status_active" in result.columns assert result["status_active"].to_list() == [1, 1, 1] - def test_enc_ordinal_with_nulls(self) -> None: + def test_map_ordinal_with_nulls(self) -> None: """Test ordinal encoding with null values.""" df = pl.DataFrame({"size": ["small", None, "large"]}) - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"], unknown_value=-1 ) result, _ = plan.process(df) @@ -441,10 +441,10 @@ def test_enc_ordinal_with_nulls(self) -> None: # Null is treated as unknown assert result["size"].to_list() == [0, -1, 2] - def test_enc_ordinal_empty_categories(self) -> None: + def test_map_ordinal_empty_categories(self) -> None: """Test ordinal encoding with empty categories list.""" df = pl.DataFrame({"size": ["small", "medium"]}) - plan = TransformPlan().enc_ordinal("size", categories=[], unknown_value=-1) + plan = TransformPlan().map_ordinal("size", categories=[], unknown_value=-1) result, _ = plan.process(df) # All values should be unknown @@ -458,8 +458,8 @@ def test_multiple_encodings(self, encoding_df: pl.DataFrame) -> None: """Test chaining multiple encoding operations.""" plan = ( TransformPlan() - .enc_onehot("color", categories=["red", "green", "blue"]) - .enc_ordinal("size", categories=["small", "medium", "large"]) + .map_onehot("color", categories=["red", "green", "blue"]) + .map_ordinal("size", categories=["small", "medium", "large"]) ) result, _ = plan.process(encoding_df) @@ -472,7 +472,7 @@ def test_encoding_with_other_ops(self, encoding_df: pl.DataFrame) -> None: """Test encoding combined with other operations.""" plan = ( TransformPlan() - .enc_ordinal( + .map_ordinal( "size", categories=["small", "medium", "large"], new_column="size_encoded", @@ -490,26 +490,26 @@ def test_encoding_with_other_ops(self, encoding_df: pl.DataFrame) -> None: class TestEncodingProtocol: """Tests for encoding operations in the protocol/audit trail.""" - def test_enc_onehot_in_protocol(self, encoding_df: pl.DataFrame) -> None: + def test_map_onehot_in_protocol(self, encoding_df: pl.DataFrame) -> None: """Test that one-hot encoding is recorded in protocol.""" - plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) + plan = TransformPlan().map_onehot("color", categories=["red", "green", "blue"]) _, protocol = plan.process(encoding_df) protocol_dict = protocol.to_dict() assert len(protocol_dict["steps"]) == 1 step = protocol_dict["steps"][0] - assert step["operation"] == "enc_onehot" + assert step["operation"] == "map_onehot" assert step["params"]["column"] == "color" assert step["params"]["categories"] == ["red", "green", "blue"] - def test_enc_ordinal_in_protocol(self, encoding_df: pl.DataFrame) -> None: + def test_map_ordinal_in_protocol(self, encoding_df: pl.DataFrame) -> None: """Test that ordinal encoding is recorded in protocol.""" - plan = TransformPlan().enc_ordinal( + plan = TransformPlan().map_ordinal( "size", categories=["small", "medium", "large"] ) _, protocol = plan.process(encoding_df) protocol_dict = protocol.to_dict() step = protocol_dict["steps"][0] - assert step["operation"] == "enc_ordinal" + assert step["operation"] == "map_ordinal" assert step["params"]["categories"] == ["small", "medium", "large"] diff --git a/tests/test_math_scaling.py b/tests/test_math_scaling.py new file mode 100644 index 0000000..ffcaad1 --- /dev/null +++ b/tests/test_math_scaling.py @@ -0,0 +1,389 @@ +"""Tests for ML preprocessing operations (scaling, transforms, outlier handling).""" + +import polars as pl + +from transformplan import TransformPlan + + +class TestMathStandardize: + """Tests for math_standardize operation (z-score).""" + + def test_standardize_with_explicit_params(self, numeric_df: pl.DataFrame) -> None: + """Test standardization with explicit mean and std.""" + plan = TransformPlan().math_standardize("a", mean=3, std=1) + result, _ = plan.process(numeric_df) + # (value - mean) / std = (1-3)/1=-2, (2-3)/1=-1, etc. + expected = [-2.0, -1.0, 0.0, 1.0, 2.0] + assert result["a"].to_list() == expected + + def test_standardize_derive_from_data(self, numeric_df: pl.DataFrame) -> None: + """Test standardization deriving params from data.""" + plan = TransformPlan().math_standardize("a") + result, _ = plan.process(numeric_df) + # Mean of [1,2,3,4,5] = 3, std ≈ 1.58 + # Result should have mean ≈ 0 and std ≈ 1 + values = result["a"].to_list() + mean_val = sum(v for v in values if v is not None) / len(values) + assert abs(mean_val) < 0.001 + # Check that the values approximate a standardized distribution + # Original [1,2,3,4,5] -> z-scores should be approximately [-1.26, -0.63, 0, 0.63, 1.26] + assert values[2] is not None + assert abs(values[2]) < 0.001 # Middle value should be ~0 + + def test_standardize_to_new_column(self, numeric_df: pl.DataFrame) -> None: + """Test standardization to a new column.""" + plan = TransformPlan().math_standardize("a", mean=3, std=1, new_column="a_z") + result, _ = plan.process(numeric_df) + assert "a" in result.columns + assert "a_z" in result.columns + assert result["a"].to_list() == [1, 2, 3, 4, 5] + assert result["a_z"].to_list() == [-2.0, -1.0, 0.0, 1.0, 2.0] + + def test_standardize_zero_std(self) -> None: + """Test standardization when std is zero (constant values).""" + df = pl.DataFrame({"a": [5.0, 5.0, 5.0, 5.0, 5.0]}) + plan = TransformPlan().math_standardize("a") + result, _ = plan.process(df) + # Should return zeros to avoid division by zero + assert result["a"].to_list() == [0.0, 0.0, 0.0, 0.0, 0.0] + + def test_standardize_nonexistent_column(self, numeric_df: pl.DataFrame) -> None: + """Test validation fails for nonexistent column.""" + plan = TransformPlan().math_standardize("nonexistent") + result = plan.validate(numeric_df) + assert not result.is_valid + + def test_standardize_non_numeric_column(self, basic_df: pl.DataFrame) -> None: + """Test validation fails for non-numeric column.""" + plan = TransformPlan().math_standardize("name") + result = plan.validate(basic_df) + assert not result.is_valid + assert "expected numeric" in str(result.errors[0]) + + +class TestMathMinmax: + """Tests for math_minmax operation (min-max normalization).""" + + def test_minmax_default_range(self, numeric_df: pl.DataFrame) -> None: + """Test min-max scaling to default [0, 1] range.""" + plan = TransformPlan().math_minmax("a", min_val=1, max_val=5) + result, _ = plan.process(numeric_df) + # (value - min) / (max - min) = (1-1)/4=0, (2-1)/4=0.25, etc. + expected = [0.0, 0.25, 0.5, 0.75, 1.0] + assert result["a"].to_list() == expected + + def test_minmax_custom_range(self, numeric_df: pl.DataFrame) -> None: + """Test min-max scaling to custom range.""" + plan = TransformPlan().math_minmax( + "a", min_val=1, max_val=5, feature_range=(0, 10) + ) + result, _ = plan.process(numeric_df) + # 0 + (value - 1) * 10 / 4 = 0, 2.5, 5, 7.5, 10 + expected = [0.0, 2.5, 5.0, 7.5, 10.0] + assert result["a"].to_list() == expected + + def test_minmax_derive_from_data(self, numeric_df: pl.DataFrame) -> None: + """Test min-max scaling deriving params from data.""" + plan = TransformPlan().math_minmax("a") + result, _ = plan.process(numeric_df) + # Min=1, Max=5, so result should be [0, 0.25, 0.5, 0.75, 1.0] + assert result["a"].min() == 0.0 + assert result["a"].max() == 1.0 + + def test_minmax_to_new_column(self, numeric_df: pl.DataFrame) -> None: + """Test min-max to a new column.""" + plan = TransformPlan().math_minmax( + "a", min_val=1, max_val=5, new_column="a_norm" + ) + result, _ = plan.process(numeric_df) + assert "a" in result.columns + assert "a_norm" in result.columns + assert result["a"].to_list() == [1, 2, 3, 4, 5] + + def test_minmax_constant_values(self) -> None: + """Test min-max when all values are the same.""" + df = pl.DataFrame({"a": [5.0, 5.0, 5.0, 5.0, 5.0]}) + plan = TransformPlan().math_minmax("a") + result, _ = plan.process(df) + # Should return midpoint of range when min == max + assert all(x == 0.5 for x in result["a"].to_list()) + + +class TestMathRobustScale: + """Tests for math_robust_scale operation (median/IQR scaling).""" + + def test_robust_scale_with_explicit_params(self) -> None: + """Test robust scaling with explicit median and IQR.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + plan = TransformPlan().math_robust_scale("a", median=3.0, iqr=2.0) + result, _ = plan.process(df) + # (value - median) / iqr = (1-3)/2=-1, (2-3)/2=-0.5, etc. + expected = [-1.0, -0.5, 0.0, 0.5, 1.0] + assert result["a"].to_list() == expected + + def test_robust_scale_derive_from_data(self) -> None: + """Test robust scaling deriving params from data.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + plan = TransformPlan().math_robust_scale("a") + result, _ = plan.process(df) + # Median of [1,2,3,4,5] = 3 + # Q1 = 2, Q3 = 4, IQR = 2 + # Result: (value - 3) / 2 + expected = [-1.0, -0.5, 0.0, 0.5, 1.0] + assert result["a"].to_list() == expected + + def test_robust_scale_to_new_column(self) -> None: + """Test robust scaling to a new column.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + plan = TransformPlan().math_robust_scale( + "a", median=3.0, iqr=2.0, new_column="a_robust" + ) + result, _ = plan.process(df) + assert "a" in result.columns + assert "a_robust" in result.columns + assert result["a"].to_list() == [1.0, 2.0, 3.0, 4.0, 5.0] + + def test_robust_scale_zero_iqr(self) -> None: + """Test robust scale when IQR is zero.""" + df = pl.DataFrame({"a": [5.0, 5.0, 5.0, 5.0, 5.0]}) + plan = TransformPlan().math_robust_scale("a") + result, _ = plan.process(df) + # Should return zeros + assert result["a"].to_list() == [0.0, 0.0, 0.0, 0.0, 0.0] + + +class TestMathLog: + """Tests for math_log operation.""" + + def test_log_natural(self) -> None: + """Test natural log.""" + import math as pymath + + df = pl.DataFrame({"a": [1.0, pymath.e, pymath.e**2]}) + plan = TransformPlan().math_log("a") + result, _ = plan.process(df) + assert abs(result["a"][0] - 0.0) < 0.001 + assert abs(result["a"][1] - 1.0) < 0.001 + assert abs(result["a"][2] - 2.0) < 0.001 + + def test_log_base_10(self) -> None: + """Test log base 10.""" + df = pl.DataFrame({"a": [1.0, 10.0, 100.0, 1000.0]}) + plan = TransformPlan().math_log("a", base=10) + result, _ = plan.process(df) + expected = [0.0, 1.0, 2.0, 3.0] + for r, e in zip(result["a"], expected): + assert abs(r - e) < 0.001 + + def test_log_with_offset(self) -> None: + """Test log with offset for handling zeros.""" + df = pl.DataFrame({"a": [0.0, 1.0, 2.0]}) + plan = TransformPlan().math_log("a", offset=1) + result, _ = plan.process(df) + # log(0+1) = 0, log(1+1) = log(2), log(2+1) = log(3) + import math as pymath + + assert abs(result["a"][0] - 0.0) < 0.001 + assert abs(result["a"][1] - pymath.log(2)) < 0.001 + assert abs(result["a"][2] - pymath.log(3)) < 0.001 + + def test_log_to_new_column(self) -> None: + """Test log transform to new column.""" + df = pl.DataFrame({"a": [1.0, 10.0, 100.0]}) + plan = TransformPlan().math_log("a", base=10, new_column="a_log") + result, _ = plan.process(df) + assert "a" in result.columns + assert "a_log" in result.columns + assert result["a"].to_list() == [1.0, 10.0, 100.0] + + def test_log_custom_base(self) -> None: + """Test log with custom base.""" + df = pl.DataFrame({"a": [1.0, 2.0, 4.0, 8.0]}) + plan = TransformPlan().math_log("a", base=2) + result, _ = plan.process(df) + expected = [0.0, 1.0, 2.0, 3.0] + for r, e in zip(result["a"], expected): + assert abs(r - e) < 0.001 + + +class TestMathSqrt: + """Tests for math_sqrt operation.""" + + def test_sqrt_basic(self) -> None: + """Test basic square root.""" + df = pl.DataFrame({"a": [0.0, 1.0, 4.0, 9.0, 16.0]}) + plan = TransformPlan().math_sqrt("a") + result, _ = plan.process(df) + expected = [0.0, 1.0, 2.0, 3.0, 4.0] + for r, e in zip(result["a"], expected): + assert abs(r - e) < 0.001 + + def test_sqrt_to_new_column(self) -> None: + """Test sqrt to new column.""" + df = pl.DataFrame({"a": [1.0, 4.0, 9.0]}) + plan = TransformPlan().math_sqrt("a", new_column="a_sqrt") + result, _ = plan.process(df) + assert "a" in result.columns + assert "a_sqrt" in result.columns + assert result["a"].to_list() == [1.0, 4.0, 9.0] + + +class TestMathPower: + """Tests for math_power operation.""" + + def test_power_square(self) -> None: + """Test squaring values.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + plan = TransformPlan().math_power("a", 2) + result, _ = plan.process(df) + expected = [1.0, 4.0, 9.0, 16.0, 25.0] + assert result["a"].to_list() == expected + + def test_power_cube(self) -> None: + """Test cubing values.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0]}) + plan = TransformPlan().math_power("a", 3) + result, _ = plan.process(df) + expected = [1.0, 8.0, 27.0] + assert result["a"].to_list() == expected + + def test_power_sqrt_via_half(self) -> None: + """Test square root via power 0.5.""" + df = pl.DataFrame({"a": [1.0, 4.0, 9.0, 16.0]}) + plan = TransformPlan().math_power("a", 0.5) + result, _ = plan.process(df) + expected = [1.0, 2.0, 3.0, 4.0] + for r, e in zip(result["a"], expected): + assert abs(r - e) < 0.001 + + def test_power_to_new_column(self) -> None: + """Test power to new column.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0]}) + plan = TransformPlan().math_power("a", 2, new_column="a_squared") + result, _ = plan.process(df) + assert "a" in result.columns + assert "a_squared" in result.columns + assert result["a"].to_list() == [1.0, 2.0, 3.0] + + +class TestMathWinsorize: + """Tests for math_winsorize operation (outlier handling).""" + + def test_winsorize_percentile_based(self) -> None: + """Test winsorization with percentile bounds.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]}) + plan = TransformPlan().math_winsorize("a", lower=0.1, upper=0.9) + result, _ = plan.process(df) + # Values below 10th percentile clipped up, above 90th clipped down + assert min(result["a"]) >= df["a"].quantile(0.1) + assert max(result["a"]) <= df["a"].quantile(0.9) + + def test_winsorize_value_based(self) -> None: + """Test winsorization with explicit bounds.""" + df = pl.DataFrame({"a": [1.0, 5.0, 10.0, 50.0, 100.0]}) + plan = TransformPlan().math_winsorize("a", lower_value=5.0, upper_value=50.0) + result, _ = plan.process(df) + assert result["a"].to_list() == [5.0, 5.0, 10.0, 50.0, 50.0] + + def test_winsorize_lower_only(self) -> None: + """Test winsorization with only lower bound.""" + df = pl.DataFrame({"a": [1.0, 5.0, 10.0, 50.0, 100.0]}) + plan = TransformPlan().math_winsorize("a", lower_value=5.0) + result, _ = plan.process(df) + assert result["a"].to_list() == [5.0, 5.0, 10.0, 50.0, 100.0] + + def test_winsorize_upper_only(self) -> None: + """Test winsorization with only upper bound.""" + df = pl.DataFrame({"a": [1.0, 5.0, 10.0, 50.0, 100.0]}) + plan = TransformPlan().math_winsorize("a", upper_value=50.0) + result, _ = plan.process(df) + assert result["a"].to_list() == [1.0, 5.0, 10.0, 50.0, 50.0] + + def test_winsorize_to_new_column(self) -> None: + """Test winsorization to new column.""" + df = pl.DataFrame({"a": [1.0, 5.0, 10.0, 50.0, 100.0]}) + plan = TransformPlan().math_winsorize( + "a", lower_value=5.0, upper_value=50.0, new_column="a_win" + ) + result, _ = plan.process(df) + assert "a" in result.columns + assert "a_win" in result.columns + assert result["a"].to_list() == [1.0, 5.0, 10.0, 50.0, 100.0] + + def test_winsorize_mixed_bounds(self) -> None: + """Test winsorization with value for lower and percentile for upper.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 100.0]}) + plan = TransformPlan().math_winsorize("a", lower_value=2.0, upper=0.9) + result, _ = plan.process(df) + assert min(result["a"]) == 2.0 + # 90th percentile should clip the outlier + assert max(result["a"]) < 100.0 + + +class TestScalingEdgeCases: + """Tests for edge cases across scaling operations.""" + + def test_standardize_with_nulls(self) -> None: + """Test standardization preserves nulls.""" + df = pl.DataFrame({"a": [1.0, None, 3.0, None, 5.0]}) + plan = TransformPlan().math_standardize("a", mean=3.0, std=2.0) + result, _ = plan.process(df) + assert result["a"][1] is None + assert result["a"][3] is None + assert result["a"][0] == -1.0 # (1-3)/2 + assert result["a"][2] == 0.0 # (3-3)/2 + assert result["a"][4] == 1.0 # (5-3)/2 + + def test_empty_dataframe(self, empty_df: pl.DataFrame) -> None: + """Test operations on empty dataframe.""" + df = pl.DataFrame({"a": pl.Series([], dtype=pl.Float64)}) + plan = TransformPlan().math_standardize("a", mean=0, std=1) + result, _ = plan.process(df) + assert len(result) == 0 + + def test_single_row(self, single_row_df: pl.DataFrame) -> None: + """Test operations on single row.""" + df = pl.DataFrame({"a": [5.0]}) + plan = TransformPlan().math_standardize("a") + result, _ = plan.process(df) + # Single value has std=0, should return 0 + assert result["a"][0] == 0.0 + + +class TestScalingChaining: + """Tests for chaining scaling operations.""" + + def test_chain_multiple_transforms(self) -> None: + """Test chaining multiple transform operations.""" + df = pl.DataFrame({"a": [1.0, 4.0, 9.0, 16.0, 25.0]}) + plan = ( + TransformPlan() + .math_sqrt("a", new_column="a_sqrt") + .math_standardize("a_sqrt", mean=3.0, std=1.0, new_column="a_z") + ) + result, _ = plan.process(df) + # sqrt gives [1, 2, 3, 4, 5], standardize gives [-2, -1, 0, 1, 2] + assert "a" in result.columns + assert "a_sqrt" in result.columns + assert "a_z" in result.columns + expected_z = [-2.0, -1.0, 0.0, 1.0, 2.0] + for r, e in zip(result["a_z"], expected_z): + assert abs(r - e) < 0.001 + + def test_chain_with_winsorize(self) -> None: + """Test chaining winsorize with standardize.""" + df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0, 100.0]}) # 100 is outlier + plan = ( + TransformPlan() + .math_winsorize("a", upper_value=5.0, new_column="a_win") + .math_standardize("a_win") + ) + result, _ = plan.process(df) + # After winsorize: [1, 2, 3, 4, 5] + # After standardize: mean≈0, std≈1 + values = result["a_win"].to_list() + mean_val = sum(v for v in values if v is not None) / len(values) + assert abs(mean_val) < 0.001 + # Middle value should be ~0 + assert values[2] is not None + assert abs(values[2]) < 0.001 diff --git a/transformplan/ops/__init__.py b/transformplan/ops/__init__.py index 35b81e7..2575702 100644 --- a/transformplan/ops/__init__.py +++ b/transformplan/ops/__init__.py @@ -18,7 +18,6 @@ from .column import ColumnOps from .datetime import DatetimeOps -from .encoding import EncodingOps from .map import MapOps from .math import MathOps from .rows import RowOps @@ -27,7 +26,6 @@ __all__ = [ "ColumnOps", "DatetimeOps", - "EncodingOps", "MapOps", "MathOps", "RowOps", diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py deleted file mode 100644 index 603ed42..0000000 --- a/transformplan/ops/encoding.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Encoding operations mixin. - -This module provides the EncodingOps mixin class with categorical encoding -operations for machine learning preparation workflows. - -Classes: - EncodingOps: Mixin providing encoding operations. - -Encoding Operations: - enc_onehot: One-hot encoding (binary indicator columns). - enc_ordinal: Ordinal encoding (ordered integers). - enc_label: Label encoding (alphabetically sorted integers). - -Example: - >>> plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import polars as pl - -if TYPE_CHECKING: - from typing import Callable, Literal - - from typing_extensions import Self - - -class EncodingOps: - """Mixin providing categorical encoding operations.""" - - if TYPE_CHECKING: - - def _register( - self, - method: Callable[..., pl.DataFrame], - params: dict[str, Any], - ) -> Self: ... - - def enc_onehot( - self, - column: str, - categories: list[Any] | None = None, - prefix: str | None = None, - *, - drop: Literal["first", "last"] | Any | None = None, # noqa: ANN401 - drop_original: bool = True, - unknown_value: Literal["all_zero", "ignore"] = "all_zero", - ) -> Self: - """One-hot encode a categorical column. - - Creates binary indicator columns (0/1) for each category. - - Args: - column: Source column to encode. - categories: List of category values. If None, derived from data. - prefix: Prefix for new columns (default: column name). - drop: Drop one category column to avoid multicollinearity: - - None: Keep all columns (default). - - "first": Drop the first category. - - "last": Drop the last category. - - Any value: Drop that specific category. - drop_original: Drop source column after encoding (default: True). - unknown_value: How to handle unknown values: - - "all_zero": Set all indicator columns to 0. - - "ignore": Keep original value behavior. - - Returns: - Self for method chaining. - - Example: - >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) - # Creates: color_red, color_green, color_blue - - >>> plan.enc_onehot("color", categories=["red", "green"], drop="first") - # Creates: color_green (drops color_red) - """ - return self._register( - self._enc_onehot, - { - "column": column, - "categories": categories, - "prefix": prefix or column, - "drop": drop, - "drop_original": drop_original, - "unknown_value": unknown_value, - }, - ) - - def _enc_onehot( - self, - data: pl.DataFrame, - column: str, - categories: list[Any] | None, - prefix: str, - drop: Any | None, # noqa: ANN401 - drop_original: bool, # noqa: FBT001 - unknown_value: str, - ) -> pl.DataFrame: - # Derive categories from data if not provided - if categories is None: - categories = data[column].drop_nulls().unique().sort().to_list() - - # Determine which category to drop (if any) - # Literal values take precedence over keywords "first"/"last" - drop_category: Any | None = None - if drop is not None and categories: - if drop in categories: - # Literal value - drop this specific category - drop_category = drop - elif drop == "first": - drop_category = categories[0] - elif drop == "last": - drop_category = categories[-1] - else: - # Value not in categories - will result in no column being dropped - drop_category = drop - - # Build one-hot columns - new_columns = [] - for cat in categories: - # Skip the dropped category - if drop_category is not None and cat == drop_category: - continue - - col_name = f"{prefix}_{cat}" - if unknown_value == "all_zero": - # Unknown values get 0 for all categories - expr = ( - pl.when(pl.col(column) == cat) - .then(pl.lit(1)) - .otherwise(pl.lit(0)) - .alias(col_name) - ) - else: - # "ignore" - unknown values get null - expr = ( - pl.when(pl.col(column) == cat) - .then(pl.lit(1)) - .when(pl.col(column).is_in(categories)) - .then(pl.lit(0)) - .otherwise(pl.lit(None)) - .alias(col_name) - ) - new_columns.append(expr) - - result = data.with_columns(new_columns) - - if drop_original: - result = result.drop(column) - - return result - - def enc_ordinal( - self, - column: str, - categories: list[Any] | None = None, - new_column: str | None = None, - *, - drop_original: bool = True, - unknown_value: int = -1, - ) -> Self: - """Ordinal encode a categorical column. - - Maps categories to integers based on explicit ordering. - - Args: - column: Source column to encode. - categories: List of categories in desired order (first=0, second=1, etc.). - If None, uses sorted unique values from data. - new_column: Output column name. If None, replaces original. - drop_original: Drop source column if new_column differs (default: True). - unknown_value: Integer for unknown values (default: -1). - - Returns: - Self for method chaining. - - Example: - >>> plan.enc_ordinal("size", categories=["small", "medium", "large"]) - # Maps: small→0, medium→1, large→2 - """ - return self._register( - self._enc_ordinal, - { - "column": column, - "categories": categories, - "new_column": new_column or column, - "drop_original": drop_original, - "unknown_value": unknown_value, - }, - ) - - def _enc_ordinal( - self, - data: pl.DataFrame, - column: str, - categories: list[Any] | None, - new_column: str, - drop_original: bool, # noqa: FBT001 - unknown_value: int, - ) -> pl.DataFrame: - # Derive categories from data if not provided - if categories is None: - categories = data[column].drop_nulls().unique().sort().to_list() - - # Build when/then chain - if not categories: - return data.with_columns(pl.lit(unknown_value).alias(new_column)) - - first_cat = categories[0] - chain = pl.when(pl.col(column) == first_cat).then(pl.lit(0)) - - for idx, cat in enumerate(categories[1:], start=1): - chain = chain.when(pl.col(column) == cat).then(pl.lit(idx)) - - chain = chain.otherwise(pl.lit(unknown_value)) - result = data.with_columns(chain.alias(new_column)) - - if drop_original and new_column != column: - result = result.drop(column) - - return result - - def enc_label( - self, - column: str, - categories: list[Any] | None = None, - new_column: str | None = None, - *, - drop_original: bool = True, - unknown_value: int = -1, - ) -> Self: - """Label encode a categorical column. - - Simple integer encoding (alphabetically sorted by default). - - Args: - column: Source column to encode. - categories: List of categories. If None, uses sorted unique values. - new_column: Output column name. If None, replaces original. - drop_original: Drop source column if new_column differs (default: True). - unknown_value: Integer for unknown values (default: -1). - - Returns: - Self for method chaining. - - Example: - >>> plan.enc_label("department") - # Maps alphabetically: Engineering→0, HR→1, Sales→2 - """ - return self._register( - self._enc_label, - { - "column": column, - "categories": categories, - "new_column": new_column or column, - "drop_original": drop_original, - "unknown_value": unknown_value, - }, - ) - - def _enc_label( - self, - data: pl.DataFrame, - column: str, - categories: list[Any] | None, - new_column: str, - drop_original: bool, # noqa: FBT001 - unknown_value: int, - ) -> pl.DataFrame: - # Label encoding is the same as ordinal encoding - # The semantic difference is that ordinal implies meaningful order - return self._enc_ordinal( - data, column, categories, new_column, drop_original, unknown_value - ) diff --git a/transformplan/ops/map.py b/transformplan/ops/map.py index c01eb1a..e5ee85f 100644 --- a/transformplan/ops/map.py +++ b/transformplan/ops/map.py @@ -1,7 +1,7 @@ """Mapping and transformation operations mixin. This module provides the MapOps mixin class with value mapping, discretization, -and transformation operations. +encoding, and transformation operations. Classes: MapOps: Mixin providing value mapping operations. @@ -14,6 +14,11 @@ Discretization: map_discretize: Bin numeric values into categories. +Encoding (categorical to numeric): + map_onehot: One-hot encoding (binary indicator columns). + map_ordinal: Ordinal encoding (ordered integers). + map_label: Label encoding (alphabetically sorted integers). + Type Conversion: map_bool_to_int: Convert boolean to integer. @@ -27,12 +32,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any, Literal, Sequence import polars as pl if TYPE_CHECKING: - from typing import Any, Callable + from typing import Callable from typing_extensions import Self @@ -340,3 +345,240 @@ def _map_from_column( return data.with_columns( pl.col(column).replace(lookup, default=default).alias(new_column) ) + + def map_onehot( + self, + column: str, + categories: list[Any] | None = None, + prefix: str | None = None, + *, + drop: Literal["first", "last"] | Any | None = None, # noqa: ANN401 + drop_original: bool = True, + unknown_value: Literal["all_zero", "ignore"] = "all_zero", + ) -> Self: + """One-hot encode a categorical column. + + Creates binary indicator columns (0/1) for each category. + + Args: + column: Source column to encode. + categories: List of category values. If None, derived from data. + prefix: Prefix for new columns (default: column name). + drop: Drop one category column to avoid multicollinearity: + - None: Keep all columns (default). + - "first": Drop the first category. + - "last": Drop the last category. + - Any value: Drop that specific category. + drop_original: Drop source column after encoding (default: True). + unknown_value: How to handle unknown values: + - "all_zero": Set all indicator columns to 0. + - "ignore": Keep original value behavior. + + Returns: + Self for method chaining. + + Example: + >>> plan.map_onehot("color", categories=["red", "green", "blue"]) + # Creates: color_red, color_green, color_blue + + >>> plan.map_onehot("color", categories=["red", "green"], drop="first") + # Creates: color_green (drops color_red) + """ + return self._register( + self._map_onehot, + { + "column": column, + "categories": categories, + "prefix": prefix or column, + "drop": drop, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _map_onehot( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + prefix: str, + drop: Any | None, # noqa: ANN401 + drop_original: bool, # noqa: FBT001 + unknown_value: str, + ) -> pl.DataFrame: + # Derive categories from data if not provided + if categories is None: + categories = data[column].drop_nulls().unique().sort().to_list() + + # Determine which category to drop (if any) + # Literal values take precedence over keywords "first"/"last" + drop_category: Any | None = None + if drop is not None and categories: + if drop in categories: + # Literal value - drop this specific category + drop_category = drop + elif drop == "first": + drop_category = categories[0] + elif drop == "last": + drop_category = categories[-1] + else: + # Value not in categories - will result in no column being dropped + drop_category = drop + + # Build one-hot columns + new_columns = [] + for cat in categories: + # Skip the dropped category + if drop_category is not None and cat == drop_category: + continue + + col_name = f"{prefix}_{cat}" + if unknown_value == "all_zero": + # Unknown values get 0 for all categories + expr = ( + pl.when(pl.col(column) == cat) + .then(pl.lit(1)) + .otherwise(pl.lit(0)) + .alias(col_name) + ) + else: + # "ignore" - unknown values get null + expr = ( + pl.when(pl.col(column) == cat) + .then(pl.lit(1)) + .when(pl.col(column).is_in(categories)) + .then(pl.lit(0)) + .otherwise(pl.lit(None)) + .alias(col_name) + ) + new_columns.append(expr) + + result = data.with_columns(new_columns) + + if drop_original: + result = result.drop(column) + + return result + + def map_ordinal( + self, + column: str, + categories: list[Any] | None = None, + new_column: str | None = None, + *, + drop_original: bool = True, + unknown_value: int = -1, + ) -> Self: + """Ordinal encode a categorical column. + + Maps categories to integers based on explicit ordering. + + Args: + column: Source column to encode. + categories: List of categories in desired order (first=0, second=1, etc.). + If None, uses sorted unique values from data. + new_column: Output column name. If None, replaces original. + drop_original: Drop source column if new_column differs (default: True). + unknown_value: Integer for unknown values (default: -1). + + Returns: + Self for method chaining. + + Example: + >>> plan.map_ordinal("size", categories=["small", "medium", "large"]) + # Maps: small->0, medium->1, large->2 + """ + return self._register( + self._map_ordinal, + { + "column": column, + "categories": categories, + "new_column": new_column or column, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _map_ordinal( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + new_column: str, + drop_original: bool, # noqa: FBT001 + unknown_value: int, + ) -> pl.DataFrame: + # Derive categories from data if not provided + if categories is None: + categories = data[column].drop_nulls().unique().sort().to_list() + + # Build when/then chain + if not categories: + return data.with_columns(pl.lit(unknown_value).alias(new_column)) + + first_cat = categories[0] + chain = pl.when(pl.col(column) == first_cat).then(pl.lit(0)) + + for idx, cat in enumerate(categories[1:], start=1): + chain = chain.when(pl.col(column) == cat).then(pl.lit(idx)) + + chain = chain.otherwise(pl.lit(unknown_value)) + result = data.with_columns(chain.alias(new_column)) + + if drop_original and new_column != column: + result = result.drop(column) + + return result + + def map_label( + self, + column: str, + categories: list[Any] | None = None, + new_column: str | None = None, + *, + drop_original: bool = True, + unknown_value: int = -1, + ) -> Self: + """Label encode a categorical column. + + Simple integer encoding (alphabetically sorted by default). + + Args: + column: Source column to encode. + categories: List of categories. If None, uses sorted unique values. + new_column: Output column name. If None, replaces original. + drop_original: Drop source column if new_column differs (default: True). + unknown_value: Integer for unknown values (default: -1). + + Returns: + Self for method chaining. + + Example: + >>> plan.map_label("department") + # Maps alphabetically: Engineering->0, HR->1, Sales->2 + """ + return self._register( + self._map_label, + { + "column": column, + "categories": categories, + "new_column": new_column or column, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _map_label( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + new_column: str, + drop_original: bool, # noqa: FBT001 + unknown_value: int, + ) -> pl.DataFrame: + # Label encoding is the same as ordinal encoding + # The semantic difference is that ordinal implies meaningful order + return self._map_ordinal( + data, column, categories, new_column, drop_original, unknown_value + ) diff --git a/transformplan/ops/math.py b/transformplan/ops/math.py index d183c15..74c8c3e 100644 --- a/transformplan/ops/math.py +++ b/transformplan/ops/math.py @@ -28,13 +28,27 @@ math_cumsum: Cumulative sum. math_rank: Rank values. +Scaling Operations: + math_standardize: Z-score standardization (mean=0, std=1). + math_minmax: Min-max normalization to a range. + math_robust_scale: Robust scaling using median and IQR. + +Transform Operations: + math_log: Logarithmic transform. + math_sqrt: Square root transform. + math_power: Power transform. + +Outlier Handling: + math_winsorize: Clip values to percentiles or explicit bounds. + Example: >>> plan = TransformPlan().math_multiply("price", 1.1).math_round("price", 2) """ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Union +import math +from typing import TYPE_CHECKING, Literal, Union, cast import polars as pl @@ -45,6 +59,7 @@ Numeric = Union[int, float] RankMethod = Literal["average", "min", "max", "dense", "ordinal", "random"] +FeatureRange = tuple[Numeric, Numeric] class MathOps: @@ -405,3 +420,367 @@ def _math_rank( if group_by: expr = expr.over(group_by) return data.with_columns(expr.alias(new_column)) + + # ========================================================================= + # Scaling Operations + # ========================================================================= + + def math_standardize( + self, + column: str, + *, + mean: Numeric | None = None, + std: Numeric | None = None, + new_column: str | None = None, + ) -> Self: + """Standardize a column to have mean=0 and std=1 (z-score). + + Args: + column: Column to transform. + mean: Mean value. If None, derived from data. + std: Standard deviation. If None, derived from data. + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_standardize, + { + "column": column, + "mean": mean, + "std": std, + "new_column": new_column or column, + }, + ) + + def _math_standardize( + self, + data: pl.DataFrame, + column: str, + mean: Numeric | None, + std: Numeric | None, + new_column: str, + ) -> pl.DataFrame: + computed_mean = ( + float(mean) + if mean is not None + else cast("float", data[column].mean()) or 0.0 + ) + computed_std = ( + float(std) if std is not None else cast("float", data[column].std()) or 0.0 + ) + if computed_std == 0: + # Avoid division by zero - return zeros + return data.with_columns(pl.lit(0.0).alias(new_column)) + return data.with_columns( + ((pl.col(column) - computed_mean) / computed_std).alias(new_column) + ) + + def math_minmax( + self, + column: str, + *, + min_val: Numeric | None = None, + max_val: Numeric | None = None, + feature_range: FeatureRange = (0, 1), + new_column: str | None = None, + ) -> Self: + """Scale a column to a range using min-max normalization. + + Args: + column: Column to transform. + min_val: Minimum value. If None, derived from data. + max_val: Maximum value. If None, derived from data. + feature_range: Output range tuple (default: (0, 1)). + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_minmax, + { + "column": column, + "min_val": min_val, + "max_val": max_val, + "feature_range": feature_range, + "new_column": new_column or column, + }, + ) + + def _math_minmax( + self, + data: pl.DataFrame, + column: str, + min_val: Numeric | None, + max_val: Numeric | None, + feature_range: FeatureRange, + new_column: str, + ) -> pl.DataFrame: + computed_min = ( + float(min_val) + if min_val is not None + else cast("float", data[column].min()) or 0.0 + ) + computed_max = ( + float(max_val) + if max_val is not None + else cast("float", data[column].max()) or 0.0 + ) + a, b = feature_range + if computed_max == computed_min: + # All values are the same - return midpoint of range + return data.with_columns(pl.lit((a + b) / 2).alias(new_column)) + return data.with_columns( + ( + a + + (pl.col(column) - computed_min) + * (b - a) + / (computed_max - computed_min) + ).alias(new_column) + ) + + def math_robust_scale( + self, + column: str, + *, + median: Numeric | None = None, + iqr: Numeric | None = None, + new_column: str | None = None, + ) -> Self: + """Scale a column using median and interquartile range. + + Robust to outliers compared to standardization. + + Args: + column: Column to transform. + median: Median value. If None, derived from data. + iqr: Interquartile range (Q3 - Q1). If None, derived from data. + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_robust_scale, + { + "column": column, + "median": median, + "iqr": iqr, + "new_column": new_column or column, + }, + ) + + def _math_robust_scale( + self, + data: pl.DataFrame, + column: str, + median: Numeric | None, + iqr: Numeric | None, + new_column: str, + ) -> pl.DataFrame: + computed_median = ( + float(median) + if median is not None + else cast("float", data[column].median()) or 0.0 + ) + if iqr is None: + q1 = cast("float", data[column].quantile(0.25)) or 0.0 + q3 = cast("float", data[column].quantile(0.75)) or 0.0 + computed_iqr = q3 - q1 + else: + computed_iqr = float(iqr) + if computed_iqr == 0: + # Avoid division by zero - return zeros + return data.with_columns(pl.lit(0.0).alias(new_column)) + return data.with_columns( + ((pl.col(column) - computed_median) / computed_iqr).alias(new_column) + ) + + # ========================================================================= + # Transform Operations + # ========================================================================= + + def math_log( + self, + column: str, + *, + base: Numeric | None = None, + offset: Numeric = 0, + new_column: str | None = None, + ) -> Self: + """Apply logarithmic transform to a column. + + Args: + column: Column to transform. + base: Log base (default: natural log e). + offset: Value added before log to handle zeros (default: 0). + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_log, + { + "column": column, + "base": base, + "offset": offset, + "new_column": new_column or column, + }, + ) + + def _math_log( + self, + data: pl.DataFrame, + column: str, + base: Numeric | None, + offset: Numeric, + new_column: str, + ) -> pl.DataFrame: + expr = pl.col(column) + offset + if base is None: + expr = expr.log() + elif base == 10: + expr = expr.log10() + else: + # log_b(x) = ln(x) / ln(b) + expr = expr.log() / math.log(base) + return data.with_columns(expr.alias(new_column)) + + def math_sqrt( + self, + column: str, + *, + new_column: str | None = None, + ) -> Self: + """Apply square root transform to a column. + + Args: + column: Column to transform. + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_sqrt, + { + "column": column, + "new_column": new_column or column, + }, + ) + + def _math_sqrt( + self, + data: pl.DataFrame, + column: str, + new_column: str, + ) -> pl.DataFrame: + return data.with_columns(pl.col(column).sqrt().alias(new_column)) + + def math_power( + self, + column: str, + exponent: Numeric, + *, + new_column: str | None = None, + ) -> Self: + """Apply power transform to a column. + + Args: + column: Column to transform. + exponent: Power to raise values to. + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_power, + { + "column": column, + "exponent": exponent, + "new_column": new_column or column, + }, + ) + + def _math_power( + self, + data: pl.DataFrame, + column: str, + exponent: Numeric, + new_column: str, + ) -> pl.DataFrame: + return data.with_columns(pl.col(column).pow(exponent).alias(new_column)) + + # ========================================================================= + # Outlier Handling + # ========================================================================= + + def math_winsorize( + self, + column: str, + *, + lower: Numeric | None = None, + upper: Numeric | None = None, + lower_value: Numeric | None = None, + upper_value: Numeric | None = None, + new_column: str | None = None, + ) -> Self: + """Clip values to percentiles or explicit bounds. + + Use either percentile-based (lower/upper as 0-1 fractions) or + value-based (lower_value/upper_value as explicit bounds) clipping. + + Args: + column: Column to transform. + lower: Lower percentile (0-1). E.g., 0.05 for 5th percentile. + upper: Upper percentile (0-1). E.g., 0.95 for 95th percentile. + lower_value: Explicit lower bound (overrides lower percentile). + upper_value: Explicit upper bound (overrides upper percentile). + new_column: Output column name (default: replace original). + + Returns: + Self for method chaining. + """ + return self._register( + self._math_winsorize, + { + "column": column, + "lower": lower, + "upper": upper, + "lower_value": lower_value, + "upper_value": upper_value, + "new_column": new_column or column, + }, + ) + + def _math_winsorize( + self, + data: pl.DataFrame, + column: str, + lower: Numeric | None, + upper: Numeric | None, + lower_value: Numeric | None, + upper_value: Numeric | None, + new_column: str, + ) -> pl.DataFrame: + # Determine lower bound + lower_bound: float | None = ( + float(lower_value) if lower_value is not None else None + ) + if lower_bound is None and lower is not None: + lower_bound = cast("float", data[column].quantile(lower)) + + # Determine upper bound + upper_bound: float | None = ( + float(upper_value) if upper_value is not None else None + ) + if upper_bound is None and upper is not None: + upper_bound = cast("float", data[column].quantile(upper)) + + return data.with_columns( + pl.col(column).clip(lower_bound, upper_bound).alias(new_column) + ) diff --git a/transformplan/plan.py b/transformplan/plan.py index ad68004..a75b99a 100644 --- a/transformplan/plan.py +++ b/transformplan/plan.py @@ -20,14 +20,13 @@ """ from .core import TransformPlanBase -from .ops import ColumnOps, DatetimeOps, EncodingOps, MapOps, MathOps, RowOps, StrOps +from .ops import ColumnOps, DatetimeOps, MapOps, MathOps, RowOps, StrOps class TransformPlan( TransformPlanBase, ColumnOps, DatetimeOps, - EncodingOps, MapOps, MathOps, RowOps, diff --git a/transformplan/validation.py b/transformplan/validation.py index f27ff26..6e06470 100644 --- a/transformplan/validation.py +++ b/transformplan/validation.py @@ -813,6 +813,28 @@ def _validate_math_percent_of( tracker.add_column(new_column, pl.Float64()) +def _validate_math_scaling( + tracker: SchemaTracker, + params: dict[str, Any], + result: ValidationResult, + step: int, + op_name: str, +) -> None: + """Validate scaling/transform operations: column must exist and be numeric.""" + column = params["column"] + new_column = params.get("new_column", column) + + if _check_column_exists(tracker, column, result, step, op_name): + _check_column_numeric(tracker, column, result, step, op_name) + + # If outputting to a new column, add it to the schema + if new_column != column: + tracker.add_column(new_column, pl.Float64()) + else: + # Type may change to float + tracker.set_dtype(column, pl.Float64()) + + # ============================================================================= # String operation validators # ============================================================================= @@ -1228,7 +1250,7 @@ def _validate_map_from_column( # ============================================================================= -# Encoding operation validators +# Encoding validators (map_onehot, map_ordinal, map_label) # ============================================================================= @@ -1262,7 +1284,7 @@ def _resolve_drop_category( return None, False -def _validate_enc_onehot( +def _validate_map_onehot( tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int ) -> None: column = params["column"] @@ -1271,18 +1293,18 @@ def _validate_enc_onehot( drop = params.get("drop") drop_original = params["drop_original"] - if not _check_column_exists(tracker, column, result, step, "enc_onehot"): + if not _check_column_exists(tracker, column, result, step, "map_onehot"): return if categories is not None: # Check for duplicate categories if len(categories) != len(set(categories)): - result.add_error(step, "enc_onehot", "Duplicate values in categories list") + result.add_error(step, "map_onehot", "Duplicate values in categories list") return # Determine which category to drop (if any) drop_category, is_valid = _resolve_drop_category( - drop, categories, result, step, "enc_onehot" + drop, categories, result, step, "map_onehot" ) if not is_valid: return @@ -1294,7 +1316,7 @@ def _validate_enc_onehot( new_col = f"{prefix}_{cat}" if tracker.has_column(new_col): result.add_error( - step, "enc_onehot", f"Column '{new_col}' already exists" + step, "map_onehot", f"Column '{new_col}' already exists" ) return tracker.add_column(new_col, pl.Int64()) @@ -1306,7 +1328,7 @@ def _validate_enc_onehot( tracker.drop_column(column) -def _validate_enc_ordinal( +def _validate_map_ordinal( tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int ) -> None: column = params["column"] @@ -1314,12 +1336,12 @@ def _validate_enc_ordinal( new_column = params["new_column"] drop_original = params["drop_original"] - if not _check_column_exists(tracker, column, result, step, "enc_ordinal"): + if not _check_column_exists(tracker, column, result, step, "map_ordinal"): return # Check for duplicate categories if categories is not None and len(categories) != len(set(categories)): - result.add_error(step, "enc_ordinal", "Duplicate values in categories list") + result.add_error(step, "map_ordinal", "Duplicate values in categories list") return # Update schema @@ -1331,7 +1353,7 @@ def _validate_enc_ordinal( tracker.set_dtype(column, pl.Int64()) -def _validate_enc_label( +def _validate_map_label( tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int ) -> None: column = params["column"] @@ -1339,12 +1361,12 @@ def _validate_enc_label( new_column = params["new_column"] drop_original = params["drop_original"] - if not _check_column_exists(tracker, column, result, step, "enc_label"): + if not _check_column_exists(tracker, column, result, step, "map_label"): return # Check for duplicate categories if categories is not None and len(categories) != len(set(categories)): - result.add_error(step, "enc_label", "Duplicate values in categories list") + result.add_error(step, "map_label", "Duplicate values in categories list") return # Update schema @@ -1398,6 +1420,16 @@ def _validate_enc_label( "math_cumsum": _validate_math_cumsum, "math_rank": _validate_math_rank, "math_percent_of": _validate_math_percent_of, + # Scaling ops + "math_standardize": partial(_validate_math_scaling, op_name="math_standardize"), + "math_minmax": partial(_validate_math_scaling, op_name="math_minmax"), + "math_robust_scale": partial(_validate_math_scaling, op_name="math_robust_scale"), + # Transform ops + "math_log": partial(_validate_math_scaling, op_name="math_log"), + "math_sqrt": partial(_validate_math_scaling, op_name="math_sqrt"), + "math_power": partial(_validate_math_scaling, op_name="math_power"), + # Outlier handling + "math_winsorize": partial(_validate_math_scaling, op_name="math_winsorize"), # String ops "str_replace": partial(_validate_str_op, op_name="str_replace"), "str_slice": partial(_validate_str_op, op_name="str_slice"), @@ -1447,10 +1479,9 @@ def _validate_enc_label( "map_values": _validate_map_values, "map_discretize": _validate_map_discretize, "map_from_column": _validate_map_from_column, - # Encoding ops - "enc_onehot": _validate_enc_onehot, - "enc_ordinal": _validate_enc_ordinal, - "enc_label": _validate_enc_label, + "map_onehot": _validate_map_onehot, + "map_ordinal": _validate_map_ordinal, + "map_label": _validate_map_label, }