From 27d4ab504ef7f9298c1fd6f92289c5e202026a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 11:55:16 +0100 Subject: [PATCH 1/6] encodings --- tests/test_encoding.py | 408 ++++++++++++++++++++++++++++++++++ transformplan/ops/__init__.py | 2 + transformplan/ops/encoding.py | 246 ++++++++++++++++++++ transformplan/plan.py | 11 +- transformplan/validation.py | 97 ++++++++ 5 files changed, 762 insertions(+), 2 deletions(-) create mode 100644 tests/test_encoding.py create mode 100644 transformplan/ops/encoding.py diff --git a/tests/test_encoding.py b/tests/test_encoding.py new file mode 100644 index 0000000..27bd34a --- /dev/null +++ b/tests/test_encoding.py @@ -0,0 +1,408 @@ +"""Tests for encoding operations (ops/encoding.py).""" + +import polars as pl +import pytest + +from transformplan import TransformPlan + + +@pytest.fixture +def encoding_df() -> pl.DataFrame: + """DataFrame for encoding operations.""" + return pl.DataFrame( + { + "color": ["red", "green", "blue", "red", "green"], + "size": ["small", "medium", "large", "medium", "small"], + "department": ["HR", "Engineering", "Sales", "HR", "Engineering"], + } + ) + + +class TestEncOnehot: + """Tests for enc_onehot operation.""" + + def test_enc_onehot_with_categories(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with explicit categories.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"] + ) + result, _ = plan.process(encoding_df) + + # Check columns created + assert "color_red" in result.columns + assert "color_green" in result.columns + assert "color_blue" in result.columns + + # Original column dropped by default + assert "color" not in result.columns + + # Check values (row 0 is "red") + assert result["color_red"][0] == 1 + assert result["color_green"][0] == 0 + assert result["color_blue"][0] == 0 + + # Row 1 is "green" + assert result["color_red"][1] == 0 + assert result["color_green"][1] == 1 + assert result["color_blue"][1] == 0 + + def test_enc_onehot_derive_categories(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding deriving categories from data.""" + plan = TransformPlan().enc_onehot("color") + result, _ = plan.process(encoding_df) + + # Should derive categories alphabetically: blue, green, red + assert "color_blue" in result.columns + assert "color_green" in result.columns + assert "color_red" in result.columns + + def test_enc_onehot_custom_prefix(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with custom prefix.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green"], prefix="c" + ) + result, _ = plan.process(encoding_df) + + assert "c_red" in result.columns + assert "c_green" in result.columns + + def test_enc_onehot_keep_original(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding keeping original column.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green"], drop_original=False + ) + result, _ = plan.process(encoding_df) + + assert "color" in result.columns + assert "color_red" in result.columns + assert "color_green" in result.columns + + def test_enc_onehot_unknown_all_zero(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with unknown values set to all zeros.""" + # Only include some categories - "blue" will be unknown + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green"], unknown_value="all_zero" + ) + result, _ = plan.process(encoding_df) + + # Row 2 is "blue" which is unknown + assert result["color_red"][2] == 0 + assert result["color_green"][2] == 0 + + def test_enc_onehot_unknown_ignore(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with unknown values returning null.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green"], unknown_value="ignore" + ) + result, _ = plan.process(encoding_df) + + # Row 2 is "blue" which is unknown - should get null + assert result["color_red"][2] is None + assert result["color_green"][2] is None + + +class TestEncOrdinal: + """Tests for enc_ordinal operation.""" + + def test_enc_ordinal_with_categories(self, encoding_df: pl.DataFrame) -> None: + """Test ordinal encoding with explicit ordering.""" + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "medium", "large"] + ) + result, _ = plan.process(encoding_df) + + # small=0, medium=1, large=2 + expected = [0, 1, 2, 1, 0] + assert result["size"].to_list() == expected + + def test_enc_ordinal_derive_categories(self, encoding_df: pl.DataFrame) -> None: + """Test ordinal encoding deriving categories alphabetically.""" + plan = TransformPlan().enc_ordinal("size") + result, _ = plan.process(encoding_df) + + # Alphabetically: large=0, medium=1, small=2 + # Original: ["small", "medium", "large", "medium", "small"] + expected = [2, 1, 0, 1, 2] + assert result["size"].to_list() == expected + + def test_enc_ordinal_new_column(self, encoding_df: pl.DataFrame) -> None: + """Test ordinal encoding to new column.""" + plan = TransformPlan().enc_ordinal( + "size", + categories=["small", "medium", "large"], + new_column="size_encoded", + ) + result, _ = plan.process(encoding_df) + + assert "size_encoded" in result.columns + # Original dropped by default when new_column differs + assert "size" not in result.columns + + expected = [0, 1, 2, 1, 0] + assert result["size_encoded"].to_list() == expected + + def test_enc_ordinal_keep_original(self, encoding_df: pl.DataFrame) -> None: + """Test ordinal encoding keeping original column.""" + plan = TransformPlan().enc_ordinal( + "size", + categories=["small", "medium", "large"], + new_column="size_encoded", + drop_original=False, + ) + result, _ = plan.process(encoding_df) + + assert "size" in result.columns + assert "size_encoded" in result.columns + + def test_enc_ordinal_unknown_value(self) -> None: + """Test ordinal encoding with unknown values.""" + df = pl.DataFrame({"size": ["small", "medium", "xl"]}) + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "medium", "large"], unknown_value=-1 + ) + result, _ = plan.process(df) + + # "xl" is unknown + assert result["size"].to_list() == [0, 1, -1] + + def test_enc_ordinal_custom_unknown_value(self) -> None: + """Test ordinal encoding with custom unknown value.""" + df = pl.DataFrame({"size": ["small", "unknown"]}) + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "medium"], unknown_value=99 + ) + result, _ = plan.process(df) + + assert result["size"].to_list() == [0, 99] + + +class TestEncLabel: + """Tests for enc_label operation.""" + + def test_enc_label_with_categories(self, encoding_df: pl.DataFrame) -> None: + """Test label encoding with explicit categories.""" + plan = TransformPlan().enc_label( + "department", categories=["HR", "Engineering", "Sales"] + ) + result, _ = plan.process(encoding_df) + + # HR=0, Engineering=1, Sales=2 + expected = [0, 1, 2, 0, 1] + assert result["department"].to_list() == expected + + def test_enc_label_derive_categories(self, encoding_df: pl.DataFrame) -> None: + """Test label encoding deriving categories alphabetically.""" + plan = TransformPlan().enc_label("department") + result, _ = plan.process(encoding_df) + + # Alphabetically: Engineering=0, HR=1, Sales=2 + # Original: ["HR", "Engineering", "Sales", "HR", "Engineering"] + expected = [1, 0, 2, 1, 0] + assert result["department"].to_list() == expected + + def test_enc_label_new_column(self, encoding_df: pl.DataFrame) -> None: + """Test label encoding to new column.""" + plan = TransformPlan().enc_label( + "department", + categories=["HR", "Engineering", "Sales"], + new_column="dept_id", + ) + result, _ = plan.process(encoding_df) + + assert "dept_id" in result.columns + assert "department" not in result.columns + + def test_enc_label_unknown_value(self) -> None: + """Test label encoding with unknown values.""" + df = pl.DataFrame({"dept": ["HR", "Marketing"]}) + plan = TransformPlan().enc_label( + "dept", categories=["HR", "Engineering"], unknown_value=-1 + ) + result, _ = plan.process(df) + + assert result["dept"].to_list() == [0, -1] + + +class TestEncodingValidation: + """Tests for encoding validation errors.""" + + def test_enc_onehot_missing_column(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for missing column.""" + plan = TransformPlan().enc_onehot("nonexistent") + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("nonexistent" in str(e) for e in result.errors) + + def test_enc_onehot_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for duplicate categories.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "red", "blue"] + ) + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("Duplicate" in str(e) for e in result.errors) + + def test_enc_onehot_column_collision(self) -> None: + """Test validation error for column name collision.""" + df = pl.DataFrame({"color": ["red"], "color_red": [1]}) + plan = TransformPlan().enc_onehot("color", categories=["red"]) + result = plan.validate(df) + + assert not result.is_valid + assert any("already exists" in str(e) for e in result.errors) + + def test_enc_ordinal_missing_column(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for missing column.""" + plan = TransformPlan().enc_ordinal("nonexistent") + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("nonexistent" in str(e) for e in result.errors) + + def test_enc_ordinal_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for duplicate categories.""" + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "small", "large"] + ) + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("Duplicate" in str(e) for e in result.errors) + + def test_enc_label_missing_column(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for missing column.""" + plan = TransformPlan().enc_label("nonexistent") + result = plan.validate(encoding_df) + + assert not result.is_valid + + def test_enc_label_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for duplicate categories.""" + plan = TransformPlan().enc_label( + "department", categories=["HR", "HR"] + ) + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("Duplicate" in str(e) for e in result.errors) + + +class TestEncodingEdgeCases: + """Tests for edge cases in encoding operations.""" + + def test_enc_onehot_with_nulls(self) -> None: + """Test one-hot encoding with null values.""" + df = pl.DataFrame({"color": ["red", None, "blue"]}) + plan = TransformPlan().enc_onehot("color", categories=["red", "blue"]) + result, _ = plan.process(df) + + # Null should be treated as unknown (all zeros with default setting) + assert result["color_red"][1] == 0 + assert result["color_blue"][1] == 0 + + def test_enc_onehot_empty_dataframe(self, empty_df: pl.DataFrame) -> None: + """Test one-hot encoding with empty DataFrame.""" + df = pl.DataFrame({"color": pl.Series([], dtype=pl.Utf8)}) + plan = TransformPlan().enc_onehot("color", categories=["red", "blue"]) + result, _ = plan.process(df) + + assert "color_red" in result.columns + assert "color_blue" in result.columns + assert len(result) == 0 + + def test_enc_onehot_single_category(self) -> None: + """Test one-hot encoding with single category.""" + df = pl.DataFrame({"status": ["active", "active", "active"]}) + plan = TransformPlan().enc_onehot("status", categories=["active"]) + result, _ = plan.process(df) + + assert "status_active" in result.columns + assert result["status_active"].to_list() == [1, 1, 1] + + def test_enc_ordinal_with_nulls(self) -> None: + """Test ordinal encoding with null values.""" + df = pl.DataFrame({"size": ["small", None, "large"]}) + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "medium", "large"], unknown_value=-1 + ) + result, _ = plan.process(df) + + # Null is treated as unknown + assert result["size"].to_list() == [0, -1, 2] + + def test_enc_ordinal_empty_categories(self) -> None: + """Test ordinal encoding with empty categories list.""" + df = pl.DataFrame({"size": ["small", "medium"]}) + plan = TransformPlan().enc_ordinal("size", categories=[], unknown_value=-1) + result, _ = plan.process(df) + + # All values should be unknown + assert result["size"].to_list() == [-1, -1] + + +class TestEncodingChaining: + """Tests for chaining encoding operations.""" + + def test_multiple_encodings(self, encoding_df: pl.DataFrame) -> None: + """Test chaining multiple encoding operations.""" + plan = ( + TransformPlan() + .enc_onehot("color", categories=["red", "green", "blue"]) + .enc_ordinal("size", categories=["small", "medium", "large"]) + ) + result, _ = plan.process(encoding_df) + + # Check both encodings applied + assert "color_red" in result.columns + # Polars uses Int32 for small integer literals + assert result["size"].dtype in (pl.Int32, pl.Int64) + + def test_encoding_with_other_ops(self, encoding_df: pl.DataFrame) -> None: + """Test encoding combined with other operations.""" + plan = ( + TransformPlan() + .enc_ordinal( + "size", + categories=["small", "medium", "large"], + new_column="size_encoded", + drop_original=False, + ) + .col_drop("department") + ) + result, _ = plan.process(encoding_df) + + assert "size_encoded" in result.columns + assert "size" in result.columns + assert "department" not in result.columns + + +class TestEncodingProtocol: + """Tests for encoding operations in the protocol/audit trail.""" + + def test_enc_onehot_in_protocol(self, encoding_df: pl.DataFrame) -> None: + """Test that one-hot encoding is recorded in protocol.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"] + ) + _, protocol = plan.process(encoding_df) + + protocol_dict = protocol.to_dict() + assert len(protocol_dict["steps"]) == 1 + step = protocol_dict["steps"][0] + assert step["operation"] == "enc_onehot" + assert step["params"]["column"] == "color" + assert step["params"]["categories"] == ["red", "green", "blue"] + + def test_enc_ordinal_in_protocol(self, encoding_df: pl.DataFrame) -> None: + """Test that ordinal encoding is recorded in protocol.""" + plan = TransformPlan().enc_ordinal( + "size", categories=["small", "medium", "large"] + ) + _, protocol = plan.process(encoding_df) + + protocol_dict = protocol.to_dict() + step = protocol_dict["steps"][0] + assert step["operation"] == "enc_ordinal" + assert step["params"]["categories"] == ["small", "medium", "large"] diff --git a/transformplan/ops/__init__.py b/transformplan/ops/__init__.py index 2575702..35b81e7 100644 --- a/transformplan/ops/__init__.py +++ b/transformplan/ops/__init__.py @@ -18,6 +18,7 @@ from .column import ColumnOps from .datetime import DatetimeOps +from .encoding import EncodingOps from .map import MapOps from .math import MathOps from .rows import RowOps @@ -26,6 +27,7 @@ __all__ = [ "ColumnOps", "DatetimeOps", + "EncodingOps", "MapOps", "MathOps", "RowOps", diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py new file mode 100644 index 0000000..001df9d --- /dev/null +++ b/transformplan/ops/encoding.py @@ -0,0 +1,246 @@ +"""Encoding operations mixin. + +This module provides the EncodingOps mixin class with categorical encoding +operations for machine learning preparation workflows. + +Classes: + EncodingOps: Mixin providing encoding operations. + +Encoding Operations: + enc_onehot: One-hot encoding (binary indicator columns). + enc_ordinal: Ordinal encoding (ordered integers). + enc_label: Label encoding (alphabetically sorted integers). + +Example: + >>> plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import polars as pl + +if TYPE_CHECKING: + from typing import Callable, Literal + + from typing_extensions import Self + + +class EncodingOps: + """Mixin providing categorical encoding operations.""" + + if TYPE_CHECKING: + + def _register( + self, + method: Callable[..., pl.DataFrame], + params: dict[str, Any], + ) -> Self: ... + + def enc_onehot( + self, + column: str, + categories: list[Any] | None = None, + prefix: str | None = None, + *, + drop_original: bool = True, + unknown_value: Literal["all_zero", "ignore"] = "all_zero", + ) -> Self: + """One-hot encode a categorical column. + + Creates binary indicator columns (0/1) for each category. + + Args: + column: Source column to encode. + categories: List of category values. If None, derived from data. + prefix: Prefix for new columns (default: column name). + drop_original: Drop source column after encoding (default: True). + unknown_value: How to handle unknown values: + - "all_zero": Set all indicator columns to 0. + - "ignore": Keep original value behavior. + + Returns: + Self for method chaining. + + Example: + >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) + # Creates columns: color_red, color_green, color_blue + """ + return self._register( + self._enc_onehot, + { + "column": column, + "categories": categories, + "prefix": prefix or column, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _enc_onehot( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + prefix: str, + drop_original: bool, # noqa: FBT001 + unknown_value: str, + ) -> pl.DataFrame: + # Derive categories from data if not provided + if categories is None: + categories = data[column].drop_nulls().unique().sort().to_list() + + # Build one-hot columns + new_columns = [] + for cat in categories: + col_name = f"{prefix}_{cat}" + if unknown_value == "all_zero": + # Unknown values get 0 for all categories + expr = ( + pl.when(pl.col(column) == cat) + .then(pl.lit(1)) + .otherwise(pl.lit(0)) + .alias(col_name) + ) + else: + # "ignore" - unknown values get null + expr = ( + pl.when(pl.col(column) == cat) + .then(pl.lit(1)) + .when(pl.col(column).is_in(categories)) + .then(pl.lit(0)) + .otherwise(pl.lit(None)) + .alias(col_name) + ) + new_columns.append(expr) + + result = data.with_columns(new_columns) + + if drop_original: + result = result.drop(column) + + return result + + def enc_ordinal( + self, + column: str, + categories: list[Any] | None = None, + new_column: str | None = None, + *, + drop_original: bool = True, + unknown_value: int = -1, + ) -> Self: + """Ordinal encode a categorical column. + + Maps categories to integers based on explicit ordering. + + Args: + column: Source column to encode. + categories: List of categories in desired order (first=0, second=1, etc.). + If None, uses sorted unique values from data. + new_column: Output column name. If None, replaces original. + drop_original: Drop source column if new_column differs (default: True). + unknown_value: Integer for unknown values (default: -1). + + Returns: + Self for method chaining. + + Example: + >>> plan.enc_ordinal("size", categories=["small", "medium", "large"]) + # Maps: small→0, medium→1, large→2 + """ + return self._register( + self._enc_ordinal, + { + "column": column, + "categories": categories, + "new_column": new_column or column, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _enc_ordinal( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + new_column: str, + drop_original: bool, # noqa: FBT001 + unknown_value: int, + ) -> pl.DataFrame: + # Derive categories from data if not provided + if categories is None: + categories = data[column].drop_nulls().unique().sort().to_list() + + # Build when/then chain + if not categories: + return data.with_columns(pl.lit(unknown_value).alias(new_column)) + + first_cat = categories[0] + chain = pl.when(pl.col(column) == first_cat).then(pl.lit(0)) + + for idx, cat in enumerate(categories[1:], start=1): + chain = chain.when(pl.col(column) == cat).then(pl.lit(idx)) + + chain = chain.otherwise(pl.lit(unknown_value)) + result = data.with_columns(chain.alias(new_column)) + + if drop_original and new_column != column: + result = result.drop(column) + + return result + + def enc_label( + self, + column: str, + categories: list[Any] | None = None, + new_column: str | None = None, + *, + drop_original: bool = True, + unknown_value: int = -1, + ) -> Self: + """Label encode a categorical column. + + Simple integer encoding (alphabetically sorted by default). + + Args: + column: Source column to encode. + categories: List of categories. If None, uses sorted unique values. + new_column: Output column name. If None, replaces original. + drop_original: Drop source column if new_column differs (default: True). + unknown_value: Integer for unknown values (default: -1). + + Returns: + Self for method chaining. + + Example: + >>> plan.enc_label("department") + # Maps alphabetically: Engineering→0, HR→1, Sales→2 + """ + return self._register( + self._enc_label, + { + "column": column, + "categories": categories, + "new_column": new_column or column, + "drop_original": drop_original, + "unknown_value": unknown_value, + }, + ) + + def _enc_label( + self, + data: pl.DataFrame, + column: str, + categories: list[Any] | None, + new_column: str, + drop_original: bool, # noqa: FBT001 + unknown_value: int, + ) -> pl.DataFrame: + # Label encoding is the same as ordinal encoding + # The semantic difference is that ordinal implies meaningful order + return self._enc_ordinal( + data, column, categories, new_column, drop_original, unknown_value + ) diff --git a/transformplan/plan.py b/transformplan/plan.py index ea36cc9..ad68004 100644 --- a/transformplan/plan.py +++ b/transformplan/plan.py @@ -20,11 +20,18 @@ """ from .core import TransformPlanBase -from .ops import ColumnOps, DatetimeOps, MapOps, MathOps, RowOps, StrOps +from .ops import ColumnOps, DatetimeOps, EncodingOps, MapOps, MathOps, RowOps, StrOps class TransformPlan( - TransformPlanBase, ColumnOps, DatetimeOps, MapOps, MathOps, RowOps, StrOps + TransformPlanBase, + ColumnOps, + DatetimeOps, + EncodingOps, + MapOps, + MathOps, + RowOps, + StrOps, ): """Data processor with tracked transformations. diff --git a/transformplan/validation.py b/transformplan/validation.py index f4e3070..15e48fe 100644 --- a/transformplan/validation.py +++ b/transformplan/validation.py @@ -1227,6 +1227,99 @@ def _validate_map_from_column( tracker.add_column(new_column, tracker.get_dtype(value_column)) +# ============================================================================= +# Encoding operation validators +# ============================================================================= + + +def _validate_enc_onehot( + tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int +) -> None: + column = params["column"] + categories = params.get("categories") + prefix = params["prefix"] + drop_original = params["drop_original"] + + if not _check_column_exists(tracker, column, result, step, "enc_onehot"): + return + + if categories is not None: + # Check for duplicate categories + if len(categories) != len(set(categories)): + result.add_error(step, "enc_onehot", "Duplicate values in categories list") + return + + # Check for column name collisions + for cat in categories: + new_col = f"{prefix}_{cat}" + if tracker.has_column(new_col): + result.add_error( + step, "enc_onehot", f"Column '{new_col}' already exists" + ) + return + + # Update schema + for cat in categories: + new_col = f"{prefix}_{cat}" + tracker.add_column(new_col, pl.Int64()) + + # If categories is None, we can't fully validate the output schema + # The validator will only check that the source column exists + + if drop_original: + tracker.drop_column(column) + + +def _validate_enc_ordinal( + tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int +) -> None: + column = params["column"] + categories = params.get("categories") + new_column = params["new_column"] + drop_original = params["drop_original"] + + if not _check_column_exists(tracker, column, result, step, "enc_ordinal"): + return + + # Check for duplicate categories + if categories is not None and len(categories) != len(set(categories)): + result.add_error(step, "enc_ordinal", "Duplicate values in categories list") + return + + # Update schema + if new_column != column: + tracker.add_column(new_column, pl.Int64()) + if drop_original: + tracker.drop_column(column) + else: + tracker.set_dtype(column, pl.Int64()) + + +def _validate_enc_label( + tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int +) -> None: + column = params["column"] + categories = params.get("categories") + new_column = params["new_column"] + drop_original = params["drop_original"] + + if not _check_column_exists(tracker, column, result, step, "enc_label"): + return + + # Check for duplicate categories + if categories is not None and len(categories) != len(set(categories)): + result.add_error(step, "enc_label", "Duplicate values in categories list") + return + + # Update schema + if new_column != column: + tracker.add_column(new_column, pl.Int64()) + if drop_original: + tracker.drop_column(column) + else: + tracker.set_dtype(column, pl.Int64()) + + # ============================================================================= # Validator registry # ============================================================================= @@ -1318,6 +1411,10 @@ def _validate_map_from_column( "map_values": _validate_map_values, "map_discretize": _validate_map_discretize, "map_from_column": _validate_map_from_column, + # Encoding ops + "enc_onehot": _validate_enc_onehot, + "enc_ordinal": _validate_enc_ordinal, + "enc_label": _validate_enc_label, } From a1b204abfb6a76db287bf76fa8231e3e38913005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 12:10:04 +0100 Subject: [PATCH 2/6] encoding multicollinearity --- tests/test_encoding.py | 115 ++++++++++++++++++++++++++++++++++ transformplan/ops/encoding.py | 31 +++++++++ transformplan/validation.py | 48 ++++++++++++-- 3 files changed, 189 insertions(+), 5 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 27bd34a..00a93bb 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -100,6 +100,99 @@ def test_enc_onehot_unknown_ignore(self, encoding_df: pl.DataFrame) -> None: assert result["color_red"][2] is None assert result["color_green"][2] is None + def test_enc_onehot_drop_first(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with drop='first' to avoid multicollinearity.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"], drop="first" + ) + result, _ = plan.process(encoding_df) + + # "red" (first category) should be dropped + assert "color_red" not in result.columns + assert "color_green" in result.columns + assert "color_blue" in result.columns + + # Row 0 is "red" - should have 0 for green and blue + assert result["color_green"][0] == 0 + assert result["color_blue"][0] == 0 + + # Row 1 is "green" + assert result["color_green"][1] == 1 + assert result["color_blue"][1] == 0 + + def test_enc_onehot_drop_last(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding with drop='last'.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"], drop="last" + ) + result, _ = plan.process(encoding_df) + + # "blue" (last category) should be dropped + assert "color_red" in result.columns + assert "color_green" in result.columns + assert "color_blue" not in result.columns + + # Row 2 is "blue" - should have 0 for red and green + assert result["color_red"][2] == 0 + assert result["color_green"][2] == 0 + + def test_enc_onehot_drop_specific_value(self, encoding_df: pl.DataFrame) -> None: + """Test one-hot encoding dropping a specific category value.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"], drop="green" + ) + result, _ = plan.process(encoding_df) + + # "green" should be dropped + assert "color_red" in result.columns + assert "color_green" not in result.columns + assert "color_blue" in result.columns + + # Row 1 is "green" - should have 0 for red and blue + assert result["color_red"][1] == 0 + assert result["color_blue"][1] == 0 + + def test_enc_onehot_drop_with_derived_categories(self) -> None: + """Test one-hot encoding with drop and categories derived from data.""" + df = pl.DataFrame({"color": ["red", "green", "blue"]}) + plan = TransformPlan().enc_onehot("color", drop="first") + result, _ = plan.process(df) + + # Categories are derived alphabetically: blue, green, red + # "blue" (first alphabetically) should be dropped + assert "color_blue" not in result.columns + assert "color_green" in result.columns + assert "color_red" in result.columns + + def test_enc_onehot_drop_literal_takes_precedence(self) -> None: + """Test that literal values take precedence over 'first'/'last' keywords.""" + # Category list where "first" is NOT the first element + df = pl.DataFrame({"pos": ["last", "middle", "first"]}) + plan = TransformPlan().enc_onehot( + "pos", categories=["last", "middle", "first"], drop="first" + ) + result, _ = plan.process(df) + + # "first" should be interpreted as the literal value, not positional + # So "first" (the value) should be dropped, NOT "last" (the first position) + assert "pos_last" in result.columns + assert "pos_middle" in result.columns + assert "pos_first" not in result.columns + + def test_enc_onehot_drop_keyword_when_not_in_categories(self) -> None: + """Test that 'first'/'last' work as keywords when not in categories.""" + df = pl.DataFrame({"color": ["red", "green", "blue"]}) + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"], drop="first" + ) + result, _ = plan.process(df) + + # "first" is not in categories, so it's interpreted as keyword + # Should drop "red" (first in list) + assert "color_red" not in result.columns + assert "color_green" in result.columns + assert "color_blue" in result.columns + class TestEncOrdinal: """Tests for enc_ordinal operation.""" @@ -253,6 +346,28 @@ def test_enc_onehot_column_collision(self) -> None: assert not result.is_valid assert any("already exists" in str(e) for e in result.errors) + def test_enc_onehot_drop_invalid_value(self, encoding_df: pl.DataFrame) -> None: + """Test validation error for drop value not in categories.""" + plan = TransformPlan().enc_onehot( + "color", categories=["red", "green", "blue"], drop="purple" + ) + result = plan.validate(encoding_df) + + assert not result.is_valid + assert any("not in categories" in str(e) for e in result.errors) + + def test_enc_onehot_drop_avoids_collision(self) -> None: + """Test that drop='first' avoids column collision when first column exists.""" + # color_blue already exists, but we're dropping blue (first alphabetically) + df = pl.DataFrame({"color": ["red", "green", "blue"], "color_blue": [1, 2, 3]}) + plan = TransformPlan().enc_onehot( + "color", categories=["blue", "green", "red"], drop="first" + ) + result = plan.validate(df) + + # Should be valid because we're dropping color_blue + assert result.is_valid + def test_enc_ordinal_missing_column(self, encoding_df: pl.DataFrame) -> None: """Test validation error for missing column.""" plan = TransformPlan().enc_ordinal("nonexistent") diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py index 001df9d..dc1090f 100644 --- a/transformplan/ops/encoding.py +++ b/transformplan/ops/encoding.py @@ -44,6 +44,7 @@ def enc_onehot( categories: list[Any] | None = None, prefix: str | None = None, *, + drop: Literal["first", "last"] | Any | None = None, # noqa: ANN401 drop_original: bool = True, unknown_value: Literal["all_zero", "ignore"] = "all_zero", ) -> Self: @@ -55,6 +56,11 @@ def enc_onehot( column: Source column to encode. categories: List of category values. If None, derived from data. prefix: Prefix for new columns (default: column name). + drop: Drop one category column to avoid multicollinearity: + - None: Keep all columns (default). + - "first": Drop the first category. + - "last": Drop the last category. + - Any value: Drop that specific category. drop_original: Drop source column after encoding (default: True). unknown_value: How to handle unknown values: - "all_zero": Set all indicator columns to 0. @@ -66,6 +72,10 @@ def enc_onehot( Example: >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) # Creates columns: color_red, color_green, color_blue + + >>> plan.enc_onehot("color", drop="first", + ... categories=["red", "green", "blue"]) + # Creates columns: color_green, color_blue (drops color_red) """ return self._register( self._enc_onehot, @@ -73,6 +83,7 @@ def enc_onehot( "column": column, "categories": categories, "prefix": prefix or column, + "drop": drop, "drop_original": drop_original, "unknown_value": unknown_value, }, @@ -84,6 +95,7 @@ def _enc_onehot( column: str, categories: list[Any] | None, prefix: str, + drop: Any | None, # noqa: ANN401 drop_original: bool, # noqa: FBT001 unknown_value: str, ) -> pl.DataFrame: @@ -91,9 +103,28 @@ def _enc_onehot( if categories is None: categories = data[column].drop_nulls().unique().sort().to_list() + # Determine which category to drop (if any) + # Literal values take precedence over keywords "first"/"last" + drop_category: Any | None = None + if drop is not None and categories: + if drop in categories: + # Literal value - drop this specific category + drop_category = drop + elif drop == "first": + drop_category = categories[0] + elif drop == "last": + drop_category = categories[-1] + else: + # Value not in categories - will result in no column being dropped + drop_category = drop + # Build one-hot columns new_columns = [] for cat in categories: + # Skip the dropped category + if drop_category is not None and cat == drop_category: + continue + col_name = f"{prefix}_{cat}" if unknown_value == "all_zero": # Unknown values get 0 for all categories diff --git a/transformplan/validation.py b/transformplan/validation.py index 15e48fe..db56003 100644 --- a/transformplan/validation.py +++ b/transformplan/validation.py @@ -1232,12 +1232,45 @@ def _validate_map_from_column( # ============================================================================= +def _resolve_drop_category( + drop: Any | None, # noqa: ANN401 + categories: list[Any], + result: ValidationResult, + step: int, + op_name: str, +) -> tuple[Any | None, bool]: + """Resolve the category to drop for one-hot encoding. + + Literal values take precedence over keywords "first"/"last". + + Returns: + Tuple of (drop_category, is_valid). If is_valid is False, an error was added. + """ + if drop is None: + return None, True + if not categories: + return None, True + # Literal values take precedence over keywords + if drop in categories: + return drop, True + if drop == "first": + return categories[0], True + if drop == "last": + return categories[-1], True + # Value not in categories and not a keyword + result.add_error( + step, op_name, f"Drop value '{drop}' not in categories list" + ) + return None, False + + def _validate_enc_onehot( tracker: SchemaTracker, params: dict[str, Any], result: ValidationResult, step: int ) -> None: column = params["column"] categories = params.get("categories") prefix = params["prefix"] + drop = params.get("drop") drop_original = params["drop_original"] if not _check_column_exists(tracker, column, result, step, "enc_onehot"): @@ -1249,18 +1282,23 @@ def _validate_enc_onehot( result.add_error(step, "enc_onehot", "Duplicate values in categories list") return - # Check for column name collisions + # Determine which category to drop (if any) + drop_category, is_valid = _resolve_drop_category( + drop, categories, result, step, "enc_onehot" + ) + if not is_valid: + return + + # Check for column name collisions and update schema for cat in categories: + if cat == drop_category: + continue new_col = f"{prefix}_{cat}" if tracker.has_column(new_col): result.add_error( step, "enc_onehot", f"Column '{new_col}' already exists" ) return - - # Update schema - for cat in categories: - new_col = f"{prefix}_{cat}" tracker.add_column(new_col, pl.Int64()) # If categories is None, we can't fully validate the output schema From 65f6ffca05ded8c223edeefb017e3ae22e1580f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 12:14:42 +0100 Subject: [PATCH 3/6] docs update --- README.md | 1 + docs/api/ops/encoding.md | 187 +++++++++++++++++++++++++++++++++++++++ docs/index.md | 1 + mkdocs.yml | 1 + 4 files changed, 190 insertions(+) create mode 100644 docs/api/ops/encoding.md diff --git a/README.md b/README.md index fe542bf..ac141d1 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ Total time: 0.0247s | **str_** | String operations | `str_lower`, `str_upper`, `str_strip`, `str_replace`, `str_split` | | **dt_** | Datetime operations | `dt_year`, `dt_month`, `dt_parse`, `dt_age_years`, `dt_diff_days` | | **map_** | Value mapping | `map_values`, `map_discretize`, `map_case`, `map_from_column` | +| **enc_** | Categorical encoding | `enc_onehot`, `enc_ordinal`, `enc_label` | ## Installation diff --git a/docs/api/ops/encoding.md b/docs/api/ops/encoding.md new file mode 100644 index 0000000..74554aa --- /dev/null +++ b/docs/api/ops/encoding.md @@ -0,0 +1,187 @@ +# Encoding Operations + +Categorical encoding operations for machine learning preparation. + +## Overview + +Encoding operations transform categorical columns into numeric representations suitable for machine learning models. They support one-hot encoding, ordinal encoding, and label encoding. + +```python +from transformplan import TransformPlan + +plan = ( + TransformPlan() + .enc_onehot("color", categories=["red", "green", "blue"], drop="first") + .enc_ordinal("size", categories=["small", "medium", "large"]) +) +``` + +## Class Reference + +::: transformplan.ops.encoding.EncodingOps + options: + show_root_heading: true + members: + - enc_onehot + - enc_ordinal + - enc_label + +## Examples + +### One-Hot Encoding + +Creates binary indicator columns (0/1) for each category. + +```python +# Basic one-hot encoding +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"] +) +# Creates columns: color_red, color_green, color_blue + +# Drop first category to avoid multicollinearity (for regression models) +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + drop="first" +) +# Creates columns: color_green, color_blue (drops color_red) + +# Drop last category +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + drop="last" +) +# Creates columns: color_red, color_green (drops color_blue) + +# Drop specific category +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + drop="green" +) +# Creates columns: color_red, color_blue (drops color_green) + +# Custom prefix for new columns +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + prefix="c" +) +# Creates columns: c_red, c_green, c_blue + +# Keep original column +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + drop_original=False +) +# Keeps color column alongside color_red, color_green, color_blue +``` + +### Ordinal Encoding + +Maps categories to integers based on explicit ordering (first=0, second=1, etc.). + +```python +# Ordinal encoding with meaningful order +plan = TransformPlan().enc_ordinal( + column="size", + categories=["small", "medium", "large"] +) +# Maps: small -> 0, medium -> 1, large -> 2 + +# Output to new column +plan = TransformPlan().enc_ordinal( + column="size", + categories=["small", "medium", "large"], + new_column="size_encoded" +) + +# Custom unknown value +plan = TransformPlan().enc_ordinal( + column="size", + categories=["small", "medium", "large"], + unknown_value=-1 # Default +) +# Values not in categories get -1 +``` + +### Label Encoding + +Simple integer encoding, alphabetically sorted by default. Similar to ordinal encoding but without semantic ordering. + +```python +# Label encoding (alphabetically sorted) +plan = TransformPlan().enc_label(column="department") +# Maps alphabetically: Engineering -> 0, HR -> 1, Sales -> 2 + +# With explicit categories +plan = TransformPlan().enc_label( + column="department", + categories=["HR", "Engineering", "Sales"] +) +# Maps: HR -> 0, Engineering -> 1, Sales -> 2 +``` + +## Use Cases + +### Preparing Data for Machine Learning + +```python +# One-hot encode categorical features, dropping first to avoid multicollinearity +plan = ( + TransformPlan() + .enc_onehot("color", categories=["red", "green", "blue"], drop="first") + .enc_onehot("size", categories=["S", "M", "L", "XL"], drop="first") + .enc_ordinal("quality", categories=["low", "medium", "high"]) +) +``` + +### Handling Unknown Categories + +```python +# Unknown values get all zeros (one-hot) +plan = TransformPlan().enc_onehot( + column="color", + categories=["red", "green", "blue"], + unknown_value="all_zero" # Default +) + +# Unknown values get -1 (ordinal/label) +plan = TransformPlan().enc_ordinal( + column="size", + categories=["small", "medium", "large"], + unknown_value=-1 +) +``` + +### Deriving Categories from Data + +When categories are not specified, they are derived from the data (sorted alphabetically): + +```python +# Categories derived from data +plan = TransformPlan().enc_onehot("color") +# Uses sorted unique values from the column + +# Note: For reproducibility, explicitly specify categories +plan = TransformPlan().enc_onehot( + column="color", + categories=["blue", "green", "red"] # Explicit is better +) +``` + +## Multicollinearity Note + +When using one-hot encoding for linear models (regression, logistic regression), you should drop one category to avoid the [dummy variable trap](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)). Use the `drop` parameter: + +```python +# For regression models, drop one category +plan = TransformPlan().enc_onehot("color", drop="first") + +# Tree-based models (random forest, XGBoost) don't require this +plan = TransformPlan().enc_onehot("color") # Keep all +``` diff --git a/docs/index.md b/docs/index.md index b36a863..ae4265a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -94,6 +94,7 @@ Total time: 0.0247s | **str_** | String operations | `str_lower`, `str_upper`, `str_strip`, `str_replace`, `str_split` | | **dt_** | Datetime operations | `dt_year`, `dt_month`, `dt_parse`, `dt_age_years`, `dt_diff_days` | | **map_** | Value mapping | `map_values`, `map_discretize`, `map_case`, `map_from_column` | +| **enc_** | Categorical encoding | `enc_onehot`, `enc_ordinal`, `enc_label` | ## Getting Started diff --git a/mkdocs.yml b/mkdocs.yml index 4f0128f..5fc254a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -91,3 +91,4 @@ nav: - String Operations: api/ops/string.md - Datetime Operations: api/ops/datetime.md - Map Operations: api/ops/map.md + - Encoding Operations: api/ops/encoding.md From e9db47f91570bc6261984fee438d0aedea37db8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 12:15:35 +0100 Subject: [PATCH 4/6] format --- tests/test_encoding.py | 16 ++++------------ transformplan/ops/encoding.py | 3 +-- transformplan/validation.py | 4 +--- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 00a93bb..d70dc6c 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -23,9 +23,7 @@ class TestEncOnehot: def test_enc_onehot_with_categories(self, encoding_df: pl.DataFrame) -> None: """Test one-hot encoding with explicit categories.""" - plan = TransformPlan().enc_onehot( - "color", categories=["red", "green", "blue"] - ) + plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) result, _ = plan.process(encoding_df) # Check columns created @@ -329,9 +327,7 @@ def test_enc_onehot_missing_column(self, encoding_df: pl.DataFrame) -> None: def test_enc_onehot_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: """Test validation error for duplicate categories.""" - plan = TransformPlan().enc_onehot( - "color", categories=["red", "red", "blue"] - ) + plan = TransformPlan().enc_onehot("color", categories=["red", "red", "blue"]) result = plan.validate(encoding_df) assert not result.is_valid @@ -395,9 +391,7 @@ def test_enc_label_missing_column(self, encoding_df: pl.DataFrame) -> None: def test_enc_label_duplicate_categories(self, encoding_df: pl.DataFrame) -> None: """Test validation error for duplicate categories.""" - plan = TransformPlan().enc_label( - "department", categories=["HR", "HR"] - ) + plan = TransformPlan().enc_label("department", categories=["HR", "HR"]) result = plan.validate(encoding_df) assert not result.is_valid @@ -498,9 +492,7 @@ class TestEncodingProtocol: def test_enc_onehot_in_protocol(self, encoding_df: pl.DataFrame) -> None: """Test that one-hot encoding is recorded in protocol.""" - plan = TransformPlan().enc_onehot( - "color", categories=["red", "green", "blue"] - ) + plan = TransformPlan().enc_onehot("color", categories=["red", "green", "blue"]) _, protocol = plan.process(encoding_df) protocol_dict = protocol.to_dict() diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py index dc1090f..2899561 100644 --- a/transformplan/ops/encoding.py +++ b/transformplan/ops/encoding.py @@ -73,8 +73,7 @@ def enc_onehot( >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) # Creates columns: color_red, color_green, color_blue - >>> plan.enc_onehot("color", drop="first", - ... categories=["red", "green", "blue"]) + >>> plan.enc_onehot("color", drop="first", categories=["red", "green", "blue"]) # Creates columns: color_green, color_blue (drops color_red) """ return self._register( diff --git a/transformplan/validation.py b/transformplan/validation.py index db56003..f27ff26 100644 --- a/transformplan/validation.py +++ b/transformplan/validation.py @@ -1258,9 +1258,7 @@ def _resolve_drop_category( if drop == "last": return categories[-1], True # Value not in categories and not a keyword - result.add_error( - step, op_name, f"Drop value '{drop}' not in categories list" - ) + result.add_error(step, op_name, f"Drop value '{drop}' not in categories list") return None, False From ab999e4ea1a4e23f44d1847d8dd5d5c3b1987b25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 12:17:02 +0100 Subject: [PATCH 5/6] linting --- transformplan/ops/encoding.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py index 2899561..5b33e56 100644 --- a/transformplan/ops/encoding.py +++ b/transformplan/ops/encoding.py @@ -73,8 +73,9 @@ def enc_onehot( >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) # Creates columns: color_red, color_green, color_blue - >>> plan.enc_onehot("color", drop="first", categories=["red", "green", "blue"]) - # Creates columns: color_green, color_blue (drops color_red) + >>> plan.enc_onehot("color", drop="first", + ... categories=["red", "green", "blue"]) + # Creates: color_green, color_blue (drops color_red) """ return self._register( self._enc_onehot, From aaeaaa8024e688cb54774dff00ff6ec998f36f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gro=C3=9Fer?= Date: Fri, 30 Jan 2026 12:19:31 +0100 Subject: [PATCH 6/6] format and lint --- transformplan/ops/encoding.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/transformplan/ops/encoding.py b/transformplan/ops/encoding.py index 5b33e56..603ed42 100644 --- a/transformplan/ops/encoding.py +++ b/transformplan/ops/encoding.py @@ -71,11 +71,10 @@ def enc_onehot( Example: >>> plan.enc_onehot("color", categories=["red", "green", "blue"]) - # Creates columns: color_red, color_green, color_blue + # Creates: color_red, color_green, color_blue - >>> plan.enc_onehot("color", drop="first", - ... categories=["red", "green", "blue"]) - # Creates: color_green, color_blue (drops color_red) + >>> plan.enc_onehot("color", categories=["red", "green"], drop="first") + # Creates: color_green (drops color_red) """ return self._register( self._enc_onehot,