From 85f134482c9d0e709a1d0a02b43f213a6d5b4578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?= Date: Fri, 20 Feb 2026 14:59:47 +0100 Subject: [PATCH] polars docs --- docs/examples/dataset.md | 32 ++++++++++++++----- .../foundry_dev_tools/resources/dataset.py | 11 +++++-- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/docs/examples/dataset.md b/docs/examples/dataset.md index fa042a9..9ba851b 100644 --- a/docs/examples/dataset.md +++ b/docs/examples/dataset.md @@ -249,9 +249,21 @@ rest_client.download_dataset_files(dataset_rid=rid, output_directory='/paht/to/o ``` ```` -### Polars DataFrame from Spark SQL dialect +### Polars -Queries the Foundry SQL server with Spark SQL dialect, load arrow stream using [polars](https://www.pola.rs/). +There are three ways to get Polars data from a Foundry dataset. Choose the one that fits your workload: + +| Method | Data path | Evaluation | Best for | +|---|---|---|---| +| `to_polars()` | FoundrySqlServer | Eager (full dataset) | Quick exploration of small-medium datasets | +| `query_foundry_sql(..., "polars")` | FoundrySqlServer | Eager (SQL-filtered) | Aggregations, joins, complex SQL queries | +| `to_lazy_polars()` | Direct parquet scan (S3) | Lazy | Filtering/selection on large datasets; portable code for Foundry transforms | + +`to_lazy_polars()` scans parquet files directly via the S3-compatible API using `polars.scan_parquet`. Combined with Polars' lazy evaluation, this enables predicate pushdown: filters applied to the LazyFrame are pushed down to the parquet reader, so only relevant data is read from storage. + +The lazy Polars API uses the same syntax as [Foundry lightweight transforms](https://www.palantir.com/docs/foundry/transforms-python/polars-lazy), so code written with `to_lazy_polars()` can be moved into a Foundry transform without rewriting. + +#### Eager via FoundrySqlServer ````{tab} v2 ```python @@ -260,7 +272,13 @@ import polars as pl ctx = FoundryContext() ds = ctx.get_dataset_by_path("/path/to/test_dataset") -df = ds.query_foundry_sql("SELECT *", return_type="polars") + +# Fetch the full dataset +df = ds.to_polars() +print(df) + +# Or use SQL to filter/aggregate server-side +df = ds.query_foundry_sql("SELECT * WHERE age > 25", return_type="polars") print(df) ``` ```` @@ -282,9 +300,7 @@ print(df) ``` ```` -### Polars LazyFrame with direct S3-compatible API access - -Access dataset files directly via the S3-compatible API as a Polars LazyFrame for efficient lazy evaluation. This method bypasses FoundrySqlServer and works with both regular and hive-partitioned parquet datasets. +#### Lazy via direct S3 parquet scan ````{tab} v2 ```python @@ -293,10 +309,10 @@ import polars as pl ctx = FoundryContext() ds = ctx.get_dataset_by_path("/path/to/test_dataset") -lazy_df = ds.to_lazy_polars() +lazy_df: pl.LazyFrame = ds.to_lazy_polars() # Perform lazy operations (not executed yet) -result = lazy_df.filter(pl.col("age") > 25).select(["name", "age"]) +result = lazy_df.filter(pl.col("age") > 25).select("name", "age") # Execute and collect results df = result.collect() diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 4aad2ee..6f58041 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -801,6 +801,9 @@ def to_pandas(self) -> pandas.core.frame.DataFrame: def to_polars(self) -> pl.DataFrame: """Get dataset as a :py:class:`polars.DataFrame`. + Fetches the full dataset via FoundrySqlServer. For lazy evaluation + with predicate pushdown on large datasets, see :py:meth:`to_lazy_polars`. + Via :py:meth:`foundry_dev_tools.resources.dataset.Dataset.query_foundry_sql` """ return self.query_foundry_sql("SELECT *", return_type="polars") @@ -822,8 +825,8 @@ def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: Example: >>> ds = ctx.get_dataset_by_path("/path/to/dataset") - >>> lf = ds.to_lazy_polars() - >>> result = lf.filter(pl.col("age") > 25).select(["name", "age"]) + >>> lazy_df = ds.to_lazy_polars() + >>> result = lazy_df.filter(pl.col("age") > 25).select("name", "age") >>> # Execute and collect results >>> df = result.collect() @@ -831,6 +834,10 @@ def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: This method uses the S3-compatible API to directly access dataset files. For hive-partitioned datasets, polars will automatically read the partition structure. + + See Also: + :py:meth:`to_polars`: Eager alternative via FoundrySqlServer. + :py:meth:`query_foundry_sql`: For SQL-based filtering and aggregations. """ from foundry_dev_tools._optional.polars import pl