Add MultiColumnGeneratorFunction example demonstrating varargs

rustyconover · claude · rustyconover · commit a76592df0fbf · 2026-01-15T11:01:40.000-05:00
New table function that takes column names as varargs and generates
a dynamic output schema based on the provided arguments. Each column
contains sequential integer values.

Example: multi_column_generator(3, 'x', 'y') produces:
  {"x": 0, "y": 0}, {"x": 1, "y": 1}, {"x": 2, "y": 2}

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/vgi/examples/table.py b/vgi/examples/table.py
@@ -6,10 +6,13 @@
 AVAILABLE FUNCTIONS
 -------------------
 SequenceFunction              - Generates a sequence of integers 0..n-1
-RangeFunction                 - Generates integers in a start..end range
 ConstantTableFunction         - Returns a constant single-row table
-RandomSampleFunction          - Generates random sample data (parallelizable)
 GeneratorExceptionFunction    - Demonstrates exception handling
+LoggingGeneratorFunction      - Demonstrates log message emission
+MultiColumnGeneratorFunction  - Demonstrates varargs with dynamic output schema
+PartitionedSequenceFunction   - Demonstrates multi-worker parallel execution
+ProjectedDataFunction         - Demonstrates projection pushdown
+SettingsAwareFunction         - Demonstrates settings-aware output schema
 """
 
 import struct
@@ -35,6 +38,7 @@
     "ConstantTableFunction",
     "GeneratorExceptionFunction",
     "LoggingGeneratorFunction",
+    "MultiColumnGeneratorFunction",
     "PartitionedSequenceFunction",
     "ProjectedDataFunction",
     "SettingsAwareFunction",
@@ -601,3 +605,95 @@ def process(self) -> OutputGenerator:
                 data["details"] = [f"row_{i}"]
 
             yield Output(pa.RecordBatch.from_pydict(data, schema=output_schema))
+
+
+class MultiColumnGeneratorFunction(TableFunctionGenerator):
+    """Generates a table with dynamic columns based on varargs.
+
+    USE CASE
+    --------
+    Demonstrates varargs where the output schema is determined by the
+    column names provided as arguments. Each column name becomes an int64
+    column holding the sequential values 0..count-1.
+
+    At least one column name is required (ValueError otherwise). This shows
+    how varargs let the output structure depend on the arguments given.
+
+    SCHEMA
+    ------
+    Output schema is dynamic based on provided column names.
+    Example: multi_column_generator(3, 'a', 'b', 'c')
+    Output: {"a": int64, "b": int64, "c": int64}
+
+    PARALLELIZATION
+    ---------------
+    Single worker only (max_workers=1).
+
+    Example:
+    -------
+    SELECT * FROM multi_column_generator(5, 'x', 'y')
+    Returns: [{"x": 0, "y": 0}, {"x": 1, "y": 1}, ..., {"x": 4, "y": 4}]
+
+    SELECT * FROM multi_column_generator(3, 'id', 'value', 'score')
+    Returns: [{"id": 0, "value": 0, "score": 0}, ...]
+
+    """
+
+    class Meta:
+        """Metadata for MultiColumnGeneratorFunction."""
+
+        name = "multi_column_generator"
+        description = "Generates a table with columns specified via varargs"
+        categories = ["generator", "utility"]
+        max_workers = 1
+        examples = [
+            FunctionExample(
+                sql="SELECT * FROM multi_column_generator(5, 'x', 'y')",
+                description="Generate 5 rows with columns x and y",
+            ),
+            FunctionExample(
+                sql="SELECT * FROM multi_column_generator(10, 'a', 'b', 'c')",
+                description="Generate 10 rows with columns a, b, and c",
+            ),
+        ]
+
+    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
+    columns: Annotated[
+        tuple[str, ...],
+        Arg(
+            1,
+            varargs=True,
+            arrow_type=pa.string(),
+            doc="Column names to generate (at least one required)",
+        ),
+    ]
+
+    # Upper bound on the number of rows in each emitted RecordBatch.
+    BATCH_SIZE: int = 1000
+
+    @property
+    def output_schema(self) -> pa.Schema:
+        """Return output schema with one int64 column per vararg.
+
+        Raises ValueError if no column names were supplied.
+        """
+        if not self.columns:
+            raise ValueError(
+                "multi_column_generator requires at least one column name"
+            )
+        return pa.schema([pa.field(name, pa.int64()) for name in self.columns])
+
+    @property
+    def cardinality(self) -> TableCardinality:
+        """Return exact cardinality since we know the count."""
+        return TableCardinality(estimate=self.count, max=self.count)
+
+    def process(self) -> OutputGenerator:
+        """Yield batches of at most BATCH_SIZE rows until count rows are emitted."""
+        output_schema = self.output_schema  # also rejects an empty column list
+        for start in range(0, self.count, self.BATCH_SIZE):
+            stop = min(start + self.BATCH_SIZE, self.count)
+            # Every column holds the same values: convert once, share the array.
+            column = pa.array(list(range(start, stop)), type=pa.int64())
+            arrays = [column] * len(self.columns)
+            yield Output(pa.RecordBatch.from_arrays(arrays, schema=output_schema))
diff --git a/vgi/examples/worker.py b/vgi/examples/worker.py
@@ -23,6 +23,7 @@
     ConstantTableFunction,
     GeneratorExceptionFunction,
     LoggingGeneratorFunction,
+    MultiColumnGeneratorFunction,
     PartitionedSequenceFunction,
     ProjectedDataFunction,
     SequenceFunction,
@@ -67,6 +68,7 @@ class ExampleWorker(Worker):
         ConstantTableFunction,
         GeneratorExceptionFunction,
         LoggingGeneratorFunction,
+        MultiColumnGeneratorFunction,
         PartitionedSequenceFunction,
         ProjectedDataFunction,
         SettingsAwareFunction,