Convert example functions to bind() and add varargs support to worker matching

rustyconover · claude · rustyconover · commit b87f40ed18df · 2026-01-06T20:59:43.000-05:00
- Convert AddNumericColumnsFunction to use bind() instead of __init__
- Convert RepeatInputsFunction, SumAllColumnsFunction, and SumAllColumnsFunctionWithLogging to use bind()
- Fix worker _match_function to support varargs parameters (unlimited positional args)
- Add comprehensive tests for SumColumnsFunction (7 test cases)
- Remove unused imports from examples

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/tests/scalar/test_client.py b/tests/scalar/test_client.py
@@ -235,6 +235,145 @@ def test_add_columns_accepts_mixed_int_types(self, example_worker: str) -> None:
         assert outputs[0].schema.field("result").type == pa.int64()
 
 
+class TestSumColumns:
+    """Tests for SumColumnsFunction via Client."""
+
+    def test_sum_two_columns(self, example_worker: str) -> None:
+        """Sum of two columns."""
+        schema = pa.schema([("a", pa.int64()), ("b", pa.int64())])
+        batch = pa.RecordBatch.from_pydict(
+            {"a": [1, 2, 3], "b": [10, 20, 30]}, schema=schema
+        )
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+        assert len(outputs) == 1
+        assert outputs[0].to_pydict() == {"result": [11, 22, 33]}
+
+    def test_sum_three_columns(self, example_worker: str) -> None:
+        """Sum of three columns using varargs."""
+        schema = pa.schema([("a", pa.int64()), ("b", pa.int64()), ("c", pa.int64())])
+        batch = pa.RecordBatch.from_pydict(
+            {"a": [1, 2], "b": [10, 20], "c": [100, 200]}, schema=schema
+        )
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch]),
+                    arguments=Arguments(
+                        positional=(pa.scalar("a"), pa.scalar("b"), pa.scalar("c"))
+                    ),
+                )
+            )
+
+        assert len(outputs) == 1
+        assert outputs[0].to_pydict() == {"result": [111, 222]}
+
+    def test_sum_with_type_promotion(self, example_worker: str) -> None:
+        """Different int types promote correctly."""
+        schema = pa.schema([("a", pa.int32()), ("b", pa.int64())])
+        batch = pa.RecordBatch.from_pydict({"a": [1, 2], "b": [10, 20]}, schema=schema)
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+        assert len(outputs) == 1
+        assert outputs[0].to_pydict() == {"result": [11, 22]}
+        # Output should be int64 (promoted from int32)
+        assert outputs[0].schema.field("result").type == pa.int64()
+
+    def test_sum_rejects_string_column(self, example_worker: str) -> None:
+        """Type bound rejects non-numeric columns."""
+        schema = pa.schema([("a", pa.int64()), ("b", pa.string())])  # type: ignore[arg-type]
+        batch = pa.RecordBatch.from_pydict(
+            {"a": [1, 2], "b": ["x", "y"]}, schema=schema
+        )
+
+        with (
+            Client(example_worker) as client,
+            pytest.raises(Exception, match="does not match any of"),
+        ):
+            list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+    def test_sum_multiple_batches(self, example_worker: str) -> None:
+        """Multiple input batches processed correctly."""
+        schema = pa.schema([("a", pa.int64()), ("b", pa.int64())])
+        batch1 = pa.RecordBatch.from_pydict({"a": [1, 2], "b": [10, 20]}, schema=schema)
+        batch2 = pa.RecordBatch.from_pydict({"a": [3, 4], "b": [30, 40]}, schema=schema)
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch1, batch2]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+        assert_total_rows(outputs, 4)
+        all_values: list[int] = []
+        for batch in outputs:
+            all_values.extend(cast(list[int], batch.column("result").to_pylist()))
+        assert sorted(all_values) == [11, 22, 33, 44]
+
+    def test_sum_empty_batch(self, example_worker: str) -> None:
+        """Empty batch returns empty output."""
+        schema = pa.schema([("a", pa.int64()), ("b", pa.int64())])
+        empty_batch = pa.RecordBatch.from_pydict({"a": [], "b": []}, schema=schema)
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([empty_batch]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+        assert len(outputs) == 1
+        assert outputs[0].num_rows == 0
+
+    def test_sum_float_columns(self, example_worker: str) -> None:
+        """Sum of float columns."""
+        schema = pa.schema([("a", pa.float64()), ("b", pa.float64())])
+        batch = pa.RecordBatch.from_pydict(
+            {"a": [1.5, 2.5], "b": [0.5, 0.5]}, schema=schema
+        )
+
+        with Client(example_worker) as client:
+            outputs = list(
+                client.scalar_function(
+                    function_name="sum_columns",
+                    input=iter([batch]),
+                    arguments=Arguments(positional=(pa.scalar("a"), pa.scalar("b"))),
+                )
+            )
+
+        assert len(outputs) == 1
+        assert outputs[0].to_pydict() == {"result": [2.0, 3.0]}
+
+
 class TestScalarFunctionParallel:
     """Tests for scalar functions with parallel processing."""
 
diff --git a/vgi/examples/scalar.py b/vgi/examples/scalar.py
@@ -16,9 +16,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import structlog
 
-import vgi.invocation
 from vgi.arguments import AnyArrow, Arg
 from vgi.exceptions import SchemaValidationError
 from vgi.metadata import FunctionExample
@@ -151,18 +149,12 @@ class Meta:
     col1 = Arg[AnyArrow](0, doc="First column name", type_bound=_is_addable_type)
     col2 = Arg[AnyArrow](1, doc="Second column name", type_bound=_is_addable_type)
 
-    def __init__(
-        self,
-        invocation: vgi.invocation.Invocation,
-        logger: structlog.stdlib.BoundLogger,
-    ):
-        """Initialize and compute output type based on input column types."""
-        super().__init__(invocation, logger)
-        assert invocation.input_schema is not None  # Required for scalar functions
-
-        # Type validation is automatic via type_bound - we just compute output type
-        field1 = invocation.input_schema.field(self.col1.value)
-        field2 = invocation.input_schema.field(self.col2.value)
+    _output_type: pa.DataType
+
+    def bind(self) -> None:
+        """Compute output type from input column types."""
+        field1 = self.input_schema.field(self.col1.value)
+        field2 = self.input_schema.field(self.col2.value)
 
         # Compute the output type by promoting to the wider of the two types,
         # then promoting again to reduce overflow risk.
diff --git a/vgi/examples/table_in_out.py b/vgi/examples/table_in_out.py
@@ -27,10 +27,8 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import structlog
 
 from vgi.arguments import Arg, TableInput
-from vgi.invocation import Invocation
 from vgi.ipc_utils import RecordBatchState
 from vgi.log import Level, Message
 from vgi.metadata import FunctionExample
@@ -239,13 +237,8 @@ class Meta:
     repeat_count = Arg[int](0, doc="Number of times to repeat each input batch")
     data: TableInput = Arg[TableInput](1, doc="Input table to repeat")  # type: ignore[assignment]
 
-    def __init__(
-        self, invocation: Invocation, logger: structlog.stdlib.BoundLogger
-    ) -> None:
-        """Initialize and validate repeat count argument."""
-        super().__init__(invocation=invocation, logger=logger)
-
-        # Access to trigger validation early
+    def bind(self) -> None:
+        """Validate repeat count argument."""
         if self.repeat_count < 1:
             raise ValueError("Repeat count must be at least 1")
 
@@ -363,11 +356,8 @@ def cardinality(self) -> TableCardinality | None:
         """Return cardinality estimate of exactly 1 row."""
         return TableCardinality(estimate=1, max=1)
 
-    def __init__(
-        self, invocation: Invocation, logger: structlog.stdlib.BoundLogger
-    ) -> None:
+    def bind(self) -> None:
         """Initialize the sum accumulator."""
-        super().__init__(invocation=invocation, logger=logger)
         self.sums: dict[str, pa.Scalar[Any]] = {}
 
     @property
@@ -671,11 +661,8 @@ class Meta:
 
     data: TableInput = Arg[TableInput](0, doc="Input table with numeric columns")  # type: ignore[assignment]
 
-    def __init__(
-        self, invocation: Invocation, logger: structlog.stdlib.BoundLogger
-    ) -> None:
+    def bind(self) -> None:
         """Initialize with empty sums dict."""
-        super().__init__(invocation=invocation, logger=logger)
         self.sums: dict[str, pa.Scalar[Any]] = {}
 
     @property
diff --git a/vgi/worker.py b/vgi/worker.py
@@ -263,11 +263,18 @@ def _match_function(
 
             # Check positional arguments
             required_positional = [p for p in positional_params if p.required]
-            max_positional = len(positional_params)
+            has_varargs = any(p.is_varargs for p in positional_params)
             min_positional = len(required_positional)
 
-            if not (min_positional <= num_positional <= max_positional):
-                continue  # Wrong number of positional arguments
+            if has_varargs:
+                # Varargs: allow any number >= min_positional
+                if num_positional < min_positional:
+                    continue  # Too few positional arguments
+            else:
+                # Fixed positional: must be within [min, max]
+                max_positional = len(positional_params)
+                if not (min_positional <= num_positional <= max_positional):
+                    continue  # Wrong number of positional arguments
 
             # Check named arguments
             valid_named_keys = {p.position for p in named_params}