From 64cf451b4729097e264213b16914364dc452a178 Mon Sep 17 00:00:00 2001 From: nicornk Date: Thu, 19 Feb 2026 14:19:23 +0100 Subject: [PATCH 1/9] feat: add to_lazy_polars to Dataset --- .../foundry_dev_tools/resources/dataset.py | 40 ++++++++ tests/integration/resources/test_dataset.py | 22 +++++ tests/integration/utils.py | 97 +++++++++++++++++++ 3 files changed, 159 insertions(+) diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 8a73187..586dde3 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -14,6 +14,7 @@ from foundry_dev_tools.errors.dataset import ( BranchNotFoundError, DatasetHasNoOpenTransactionError, + DatasetHasNoTransactionsError, DatasetNotFoundError, TransactionTypeMismatchError, ) @@ -799,6 +800,45 @@ def to_polars(self) -> pl.DataFrame: """ return self.query_foundry_sql("SELECT *", return_type="polars") + def to_lazy_polars(self) -> pl.LazyFrame: + """Get dataset as a :py:class:`polars.LazyFrame`. + + Returns a lazy polars DataFrame that can be queried efficiently using + polars' lazy evaluation API. The data is accessed directly from S3 + without going through FoundrySqlServer. + + Example: + >>> ds = ctx.get_dataset_by_path("/path/to/dataset") + >>> lf = ds.to_lazy_polars() + >>> # Lazy operations - not executed yet + >>> result = lf.filter(pl.col("age") > 25).select(["name", "age"]) + >>> # Execute and collect results + >>> df = result.collect() + + Returns: + pl.LazyFrame: A lazy polars DataFrame + + Note: + This method uses the S3 API to directly access dataset files. + For hive-partitioned datasets, polars will automatically read + the partition structure. + """ + from foundry_dev_tools._optional.polars import pl + + last_transaction = self.get_last_transaction() + if last_transaction is None: + msg = f"Dataset has no transactions: {self.path=} {self.rid=}" + raise DatasetHasNoTransactionsError(msg) + + bucket_path = f"s3://{self.rid}.{last_transaction['rid']}/" + + storage_options = self._context.s3.get_polars_storage_options() + + return pl.scan_parquet( + bucket_path, + storage_options=storage_options, + ) + @contextmanager def transaction_context( self, diff --git a/tests/integration/resources/test_dataset.py b/tests/integration/resources/test_dataset.py index 6e20239..3bb7efb 100644 --- a/tests/integration/resources/test_dataset.py +++ b/tests/integration/resources/test_dataset.py @@ -130,3 +130,25 @@ def test_crud_dataset(spark_session, tmp_path): # noqa: PLR0915 # # check that deletion was successful with pytest.raises(DatasetNotFoundError): ds.sync() + + +def test_to_lazy_polars_parquet_dataset(): + ds = TEST_SINGLETON.iris_parquet + lazy_df = ds.to_lazy_polars() + + assert isinstance(lazy_df, pl.LazyFrame) + + df = lazy_df.collect() + assert df.shape == (150, 5) + assert df.columns == ["sepal_width", "sepal_length", "petal_width", "petal_length", "is_setosa"] + + +def test_to_lazy_polars_hive_partitioned(): + ds = TEST_SINGLETON.iris_hive_partitioned + lazy_df = ds.to_lazy_polars() + + assert isinstance(lazy_df, pl.LazyFrame) + + df = lazy_df.collect() + assert df.shape == (150, 5) + assert df.columns == ["sepal_width", "sepal_length", "petal_width", "petal_length", "is_setosa"] diff --git a/tests/integration/utils.py b/tests/integration/utils.py index 69414a8..e627d55 100644 --- a/tests/integration/utils.py +++ b/tests/integration/utils.py @@ -144,6 +144,79 @@ }, } +IRIS_SCHEMA_HIVE = { + "fieldSchemaList": [ + { + "type": "DOUBLE", + "name": "sepal_width", + "nullable": None, + "userDefinedTypeClass": None, + "customMetadata": {}, + "arraySubtype": None, + "precision": None, + "scale": None, + "mapKeyType": None, + "mapValueType": None, + "subSchemas": None, + }, + { + "type": "DOUBLE", + "name": "sepal_length", + "nullable": None, + "userDefinedTypeClass": None, + "customMetadata": {}, + "arraySubtype": None, + "precision": None, + "scale": None, + "mapKeyType": None, + "mapValueType": None, + "subSchemas": None, + }, + { + "type": "DOUBLE", + "name": "petal_width", + "nullable": None, + "userDefinedTypeClass": None, + "customMetadata": {}, + "arraySubtype": None, + "precision": None, + "scale": None, + "mapKeyType": None, + "mapValueType": None, + "subSchemas": None, + }, + { + "type": "DOUBLE", + "name": "petal_length", + "nullable": None, + "userDefinedTypeClass": None, + "customMetadata": {}, + "arraySubtype": None, + "precision": None, + "scale": None, + "mapKeyType": None, + "mapValueType": None, + "subSchemas": None, + }, + { + "type": "STRING", + "name": "is_setosa", + "nullable": None, + "userDefinedTypeClass": None, + "customMetadata": {}, + "arraySubtype": None, + "precision": None, + "scale": None, + "mapKeyType": None, + "mapValueType": None, + "subSchemas": None, + }, + ], + "primaryKey": None, + "dataFrameReaderClass": "com.palantir.foundry.spark.input.ParquetDataFrameReader", + "customMetadata": {"format": "parquet"}, +} + FOUNDRY_SCHEMA_COMPLEX_DATASET = { "fieldSchemaList": [ { @@ -515,6 +588,30 @@ def iris_no_schema(self) -> Dataset: ) return _iris_no_schema + @cached_property + def iris_hive_partitioned(self) -> Dataset: + _iris_hive_partitioned = self.ctx.get_dataset_by_path( + INTEGRATION_TEST_COMPASS_ROOT_PATH + "/iris_hive_partitioned", + create_if_not_exist=True, + ) + if _iris_hive_partitioned.__created__: + _ = _iris_hive_partitioned.upload_folder(TEST_FOLDER.joinpath("test_data", "iris", "iris_hive_partitioned")) + _iris_hive_partitioned.upload_schema( + _iris_hive_partitioned.get_last_transaction()["rid"], schema=IRIS_SCHEMA_HIVE + ) + return _iris_hive_partitioned + + @cached_property + def iris_parquet(self) -> Dataset: + _iris_parquet = self.ctx.get_dataset_by_path( + INTEGRATION_TEST_COMPASS_ROOT_PATH + "/iris_parquet", + create_if_not_exist=True, + ) + if _iris_parquet.__created__: + _ = _iris_parquet.upload_folder(TEST_FOLDER.joinpath("test_data", "iris", "iris_parquet")) + _iris_parquet.upload_schema(_iris_parquet.get_last_transaction()["rid"], schema=IRIS_SCHEMA_HIVE) + return _iris_parquet + @cached_property def empty_dataset(self) -> Dataset: return self.ctx.get_dataset_by_path( From 08aeb1ae2e4cc756c8a6a4fd464bcd9e5fc8ec81 Mon Sep 17 00:00:00 2001 From: nicornk Date: Thu, 19 Feb 2026 14:22:23 +0100 Subject: [PATCH 2/9] remove not required variables --- .../src/foundry_dev_tools/resources/dataset.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 586dde3..51b321f 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -810,7 +810,6 @@ def to_lazy_polars(self) -> pl.LazyFrame: Example: >>> ds = ctx.get_dataset_by_path("/path/to/dataset") >>> lf = ds.to_lazy_polars() - >>> # Lazy operations - not executed yet >>> result = lf.filter(pl.col("age") > 25).select(["name", "age"]) >>> # Execute and collect results >>> df = result.collect() @@ -830,13 +829,9 @@ def to_lazy_polars(self) -> pl.LazyFrame: msg = f"Dataset has no transactions: {self.path=} {self.rid=}" raise DatasetHasNoTransactionsError(msg) - bucket_path = f"s3://{self.rid}.{last_transaction['rid']}/" - - storage_options = self._context.s3.get_polars_storage_options() - return pl.scan_parquet( - bucket_path, - storage_options=storage_options, + f"s3://{self.rid}.{last_transaction['rid']}/", + storage_options=self._context.s3.get_polars_storage_options(), ) @contextmanager From 9294e1107d984e362bf7b9fb9861cdc5b7af3167 Mon Sep 17 00:00:00 2001 From: nicornk Date: Thu, 19 Feb 2026 15:48:13 +0100 Subject: [PATCH 3/9] add test data --- .../is_setosa=Iris-setosa/file.parquet | Bin 0 -> 2054 bytes .../is_setosa=Iris-versicolor/file.parquet | Bin 0 -> 2113 bytes .../is_setosa=Iris-virginica/file.parquet | Bin 0 -> 2095 bytes .../iris/iris_parquet/spark/iris.parquet | Bin 0 -> 3170 bytes 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_data/iris/iris_hive_partitioned/is_setosa=Iris-setosa/file.parquet create mode 100644 tests/test_data/iris/iris_hive_partitioned/is_setosa=Iris-versicolor/file.parquet create mode 100644 tests/test_data/iris/iris_hive_partitioned/is_setosa=Iris-virginica/file.parquet create mode 100644 tests/test_data/iris/iris_parquet/spark/iris.parquet diff --git a/tests/test_data/iris/iris_hive_partitioned/is_setosa=Iris-setosa/file.parquet b/tests/test_data/iris/iris_hive_partitioned/is_setosa=Iris-setosa/file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7ffebed270b897b082f6679e92c07bd340a807ef GIT binary patch literal 2054 zcmb_eYe*bd6uvvNxX)xAb;g;xP(j^_Ml_lFNKtpMZ;5Spw=_Qb8g+LyWPRZJScH^~ zf)%>Af?%n_Wo@DLKsC zd+t4FzB%7H=MH0r)`K*lKaQi%69;G;Qsd}2eF4Z+@}eZ0*pNn<005bv<^C<40XcHe zOAu&nlR1)16XS`dg+PMIkzfKe#V1n9Oj1$;08|3S^K@3Khu|(1Z%jBncBNO|udER! z_57P>3ThX2ru*Ji$qj~pYNk&3!2i+jBQ3RL!O0s+Zz7w(nmgYaT}c1)myW(GSsL5y z&)(-7jqK#FQ}^yS+TE>bDaU*it>!X79LH*0_U>lqTLZ^zz3sG?u`xpmW|J@|i~o&? zM0t7-<%w}D6r$`9g@~fuCkH=8$bCCR*4FW!aZyO!k#)ZJ95!~l@CdU7| zyMcXo9S(G1*)|?X?H=M@ot5z^N8^PL&j*;JN-rP`;GC{%N7x$vZTN=5owqvo_Ox;F zo^V02GxzX;;pQpJCxTZP^{WXDv7z#gy%s2D3ofzG3?`;g?NN3^Syo zOMziCE`k#iO!&L<-$#o%MbRwd7m?P~5u_*YRAv$b?TSF&{Sl!5(IJcj;D{SPmhY zEil`R>;nVXUJGGc$PfEh9LI|eJXGO9dbkk#4~lEixmq$+pveNuY9V?tnkz9r5)l_d za}Ro`2yHdOvNnu^_sa&F8m#RtRfkXU1iOC=a(8D4WAs65<}|7XF!;nr(>|AqJf$JEylTpv=U?FTMAi z^L^j>-S79E-@U0BR^`xuJ|Uw=h)kMCQyDGOZwBcU2%DbMQ9duWWDYsfPi5V)dBFQp zaEu_(nwJ1Q=kv1bQIJXHW&#XnO6HteZHkUbHJMU$sq1O2c9X$Ir0qUH*3;H3$xc6~ z@-*dbs=t_W-L}_%UHbZP$I^SUJXzb#~Cu@fPEUO!E+sD5xq2t*NK*s74Tn(*uVQ=S4e0vkE zWO$}q&h(N-D*yjTiS#G#qHhXY0-}#-V&l*ip?^vWo*+f~KCc3?rZ^`di|0(FJdOjD zBn|+nlr$wcodnt?b%ssGWg5)gmcY;CdBf1GtWI00)+vPK;GU}=ys7w6J#qZ4zwcq%{iU%CY4gHOwy?FbCR^{5>*-vrRJV8^bxH8w1oGnf~@2e)=i=>n8YXPTqyN&GnI=dli|Av9UOI zcC@c|k3`O^n=rMrRGd)t;6P>$P_}xl6MBp5nU|%egwzagh z(-ffNHxR2{bJ4R(^pT_$GKeItY!hI3R?k)%KO*(fHa_szg9#{2cQ2nf{zNw+vt_zY`*m4>RA%w-~f?GP!dcOrv(K_tf384$_fDnxq zm|;fo&Q2UJhp;h{hx3bG#*1crE5SE$a~{rrHM|Zdmx`exG+1C^IYbXeV==}@7RH6p z+=e!7K`tvSY{WR&U)Wh&Q`O`s+3KpTer6wE8is=p2j8_4-(;_@%C^HI7b+ra7cNUI z&Wfkdf;_0`Vq6y{eq*JN7&dN4*m|@ nnkc@BZ>=q>YJqUuHlUN4TLgQXi>=h7Z$tD4+#~>I^ylF}nu8YU>C_b{H}v9?5c%)}Pjg04J5(c*q?cF<#{@?S8~`Bwv&_HE6W~J% zs6;|Mw+eJDr%UAQ8V)2SbCceLz1+fll7>~oWB?d7WEsug6jq(2PDFWBR6)j$sc1G0VuDwoZ%Vb=Mo# z?p9h3t@k=-iUU$kC!r{eFOQ}; zyIl-`G?o%Xi*+(-%nPX_LiUsU@l1u8a`(qGhFk49hYSC7ClAIgy2d_!_1vLt(lFYn2y@7mXX_*Hq~yrOd1T<)1xr!_D4{iZ&UXHFA* zKV+Y&TJE+y`gHKQtbOL&ao(RTWBq&5qZ;%4O(!PaDIBc#nWulT4g2=j`dzCtLyKp# zhCy#4YE!cUF5#s9li-zzNK599AJ@x1Z(kFomN53jv693pnf!otQ{Bud5{P&@qX0 zhjNGAd?XbT*FY&fD#Et`tOgEJ^oB4v-jr;QS7h{7V4W7Sua#3>9Ua}Y6womTg*%>T zer3mb4+cdXhG0-+ngF!28n#M1Dbj?}_|z{7IIo}~T#BO$fz~4l>+IM}=#BH=4O=I&_JtJ3LnZ;q-2a7nM}v( zUUD|K+S)9PPn1b4yQK&rRD%B~?`=HqF_Iyk2bL-ka-yRM;ogWc4A}2-5q||jEx|Yv zUwnd0+DKDM8vf!JN&LycJP=&QyNYpVL`4+{-x&8LL?2r~7a>Czz9YXJcJ%yR|GJhn|hTGg4n?sWW9+P_YwlBD0!oi{D%W86qQI!J952`y#!O z%1a64K_n2C0P(oTf}q$g z*2NX6MDf+?qCBmNKxqr2sK-D>TSU=T3hS<0rBtky+TzYl2!eGzXSFq%I9+d;BYxoDFQ^evrsDK3oulO;m)|oRq7%V zxVnm5+?)jhB4JVf2_ugO=V|J@3_dzRpEEMlL<9*YQgT?0n6DcuY_dT(S1dy>m*$nhr2sE z4~Yjaew}`E;)nF>jki@_m9GymJn3ZXfbhMRIW?D83`I6>k38_z`8@A7foL)VywOq9 z+xc(D{7A(^SxiC7r>ji6zTFZ!|D@p_vGwD^T|e==|9b3l-|R1xPm>~;b0h`G;sIhp zlKS0k*VUO739)N*kEWI*Ku3cx)C^C`|Hd;YL zTX|?BUCa#lZ^5?k()w~_ZUAo4!P$=--y#0X2i&SZVyD;iw+$FHgQ`It#1qQe}|WoTb4Fg za z?2_$$n2J8h&7JaT4=2VOBd&SfQ>@IY;m7XLbcenFj%(o}QF2h#zbbBBbC3&+n{pBt z6zuJ9d|)$tVrRy_8>}j+O>wqTv3HYl#`^fUHKwzHRVHC~E8lJH0MqkCPw@&6nf+_% z$-YZpKN43X<&K{%yyL#1^uj{Zx!O-^Cgn-i;zRWQk2~fJHc34LwNnoCRNit>Og`1t zz_${?wh*CJrmDv7zR;h-$1ef4B5h~d1rBX+>6Pv?zqVkQycbMeJtb&?CSFW2dY19mL`(IEj<9?tjPIh8NYZszToT^}cz?b}C-;&f*ZX#>2Zltle%p?h=DAGA@3WvMsGl+BPwrQLof~iBu_6*t5|2 zc_qCv!ORwgEB{`@gH+p(5?BFrD@Uv}WyWukGQkwdH|D1-F_~t~3 z>cZ4!w|~Z;i1siQmmHd(xs2Iz_)U>#lDOQf$5i59)4P($;WuaVGRo$i&ay3}9XkFU z_YRR3d@}N^_o~0FulAW``&zG%Rb0RE($QOQR0^-v_C`I>%J$xtWOfxN-|c6y&fjO% zPb+5&@Ra7*WL{Z;ulkqk6*mS=LCYTRT&j7x<&rA3FEhcNb7Qb(W6X)F36Fcnr7oq(-YGV~o5q6OkaUgSQ?Ja>>E(`D%5=RZQK!|VJ806?np91qoCbUgrk#tW zN{|SdV=hKc)?(yfw#~9m#aLsotP45(34oD&qbY#GMnOZ^xDQ}B;n+ltew}4u%gwum zh8k(&gE+N~S3~0*8veWhLeW|k5rCrrSk#t40Af_lA%3eK2pm;Ak5gOEB~)90?8d6c z?i9yfaCT%B5z0gw>Chd3i_kg-vfnDVvym%Sxrm1pS>>Zbq{xPvkI-=KQoD4WE(5U# zh`k4GT$!?rjJq#KCXGqYkx66E3P9Kp>3PsSK$r+F25TNJ!ehnf=pyS8``?Qq`mP&- zqD>;?O(5=MjRK8xa6Zoj8)TFS`{toglGKw6NfLf#X2WU+pY z*9L7uiejGJJok^mUr2?@iZ4~9MK6uejl{zmMCh+29wZ!jMLc||{lbt#zoV$FnYW(k z6={fCy+M_*Br-Ev3bCnXiTD4DA4Wl0@fBX$WW^FS2{1;uPAgB> Date: Thu, 19 Feb 2026 15:55:27 +0100 Subject: [PATCH 4/9] add unit test for exception --- .../foundry_dev_tools/resources/dataset.py | 4 ++-- tests/unit/resources/test_dataset.py | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 tests/unit/resources/test_dataset.py diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 51b321f..5fdadae 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -826,8 +826,8 @@ def to_lazy_polars(self) -> pl.LazyFrame: last_transaction = self.get_last_transaction() if last_transaction is None: - msg = f"Dataset has no transactions: {self.path=} {self.rid=}" - raise DatasetHasNoTransactionsError(msg) + msg = f"Dataset has no transactions: {self.rid=}" + raise DatasetHasNoTransactionsError(info=msg) return pl.scan_parquet( f"s3://{self.rid}.{last_transaction['rid']}/", diff --git a/tests/unit/resources/test_dataset.py b/tests/unit/resources/test_dataset.py new file mode 100644 index 0000000..bfadcd4 --- /dev/null +++ b/tests/unit/resources/test_dataset.py @@ -0,0 +1,24 @@ +from unittest import mock + +import pytest + +from foundry_dev_tools.errors.dataset import DatasetHasNoTransactionsError +from foundry_dev_tools.resources.dataset import Dataset + + +def test_to_lazy_polars_no_transaction(): + with mock.patch.object(Dataset, "__created__", True): + ds = Dataset.__new__(Dataset) + ds.rid = "ri.foundry.main.dataset.test-dataset" + ds.path = "/test/dataset/path" + + # Mock get_last_transaction to return None + with mock.patch.object(ds, "get_last_transaction", return_value=None): + # Assert that the correct exception is raised with the expected message + with pytest.raises(DatasetHasNoTransactionsError) as exc_info: + ds.to_lazy_polars() + + # Verify the error message contains the expected information + error_message = str(exc_info.value) + assert "Dataset has no transactions" in error_message + assert ds.rid in error_message From f8c945a28d7aee579afaf02a44e699b4f86cffda Mon Sep 17 00:00:00 2001 From: nicornk Date: Thu, 19 Feb 2026 17:01:16 +0100 Subject: [PATCH 5/9] allow passing arbitrary transaction_rid --- .../foundry_dev_tools/resources/dataset.py | 15 +++++++------ tests/unit/resources/test_dataset.py | 21 ++++++++++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 5fdadae..196e75a 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -800,7 +800,7 @@ def to_polars(self) -> pl.DataFrame: """ return self.query_foundry_sql("SELECT *", return_type="polars") - def to_lazy_polars(self) -> pl.LazyFrame: + def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: """Get dataset as a :py:class:`polars.LazyFrame`. Returns a lazy polars DataFrame that can be queried efficiently using @@ -824,14 +824,17 @@ def to_lazy_polars(self) -> pl.LazyFrame: """ from foundry_dev_tools._optional.polars import pl - last_transaction = self.get_last_transaction() - if last_transaction is None: - msg = f"Dataset has no transactions: {self.rid=}" - raise DatasetHasNoTransactionsError(info=msg) + if transaction_rid is None: + maybe_transaction = self.get_last_transaction() + if maybe_transaction is None: + msg = f"Dataset has no transactions: {self.rid=}" + raise DatasetHasNoTransactionsError(info=msg) + transaction_rid = maybe_transaction["rid"] return pl.scan_parquet( - f"s3://{self.rid}.{last_transaction['rid']}/", + f"s3://{self.rid}.{transaction_rid}/**/*.parquet", storage_options=self._context.s3.get_polars_storage_options(), + hive_partitioning=True, ) @contextmanager diff --git a/tests/unit/resources/test_dataset.py b/tests/unit/resources/test_dataset.py index bfadcd4..12236b6 100644 --- a/tests/unit/resources/test_dataset.py +++ b/tests/unit/resources/test_dataset.py @@ -12,13 +12,28 @@ def test_to_lazy_polars_no_transaction(): ds.rid = "ri.foundry.main.dataset.test-dataset" ds.path = "/test/dataset/path" - # Mock get_last_transaction to return None with mock.patch.object(ds, "get_last_transaction", return_value=None): - # Assert that the correct exception is raised with the expected message with pytest.raises(DatasetHasNoTransactionsError) as exc_info: ds.to_lazy_polars() - # Verify the error message contains the expected information error_message = str(exc_info.value) assert "Dataset has no transactions" in error_message assert ds.rid in error_message + + +def test_to_lazy_polars_transaction_rid_logic(): + with mock.patch.object(Dataset, "__created__", True): + ds = Dataset.__new__(Dataset) + ds.rid = "ri.foundry.main.dataset.abc123" + ds._context = mock.MagicMock() + ds._context.s3.get_polars_storage_options.return_value = {"aws_access_key_id": "test"} + + with mock.patch("foundry_dev_tools._optional.polars.pl.scan_parquet") as mock_scan: + mock_scan.return_value = mock.MagicMock() + ds.to_lazy_polars(transaction_rid="test") + + mock_scan.assert_called_once() + call_args = mock_scan.call_args + assert call_args[0][0] == f"s3://{ds.rid}.test/**/*.parquet" + assert call_args[1]["storage_options"] == ds._context.s3.get_polars_storage_options() + assert call_args[1]["hive_partitioning"] is True From e0a23b656ab8557b8fe5cdd101e2cb0d28cc92f3 Mon Sep 17 00:00:00 2001 From: nicornk Date: Thu, 19 Feb 2026 17:04:16 +0100 Subject: [PATCH 6/9] add doc entry --- docs/examples/dataset.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/examples/dataset.md b/docs/examples/dataset.md index f8687b8..fa042a9 100644 --- a/docs/examples/dataset.md +++ b/docs/examples/dataset.md @@ -282,6 +282,28 @@ print(df) ``` ```` +### Polars LazyFrame with direct S3-compatible API access + +Access dataset files directly via the S3-compatible API as a Polars LazyFrame for efficient lazy evaluation. This method bypasses FoundrySqlServer and works with both regular and hive-partitioned parquet datasets. + +````{tab} v2 +```python +from foundry_dev_tools import FoundryContext +import polars as pl + +ctx = FoundryContext() +ds = ctx.get_dataset_by_path("/path/to/test_dataset") +lazy_df = ds.to_lazy_polars() + +# Perform lazy operations (not executed yet) +result = lazy_df.filter(pl.col("age") > 25).select(["name", "age"]) + +# Execute and collect results +df = result.collect() +print(df) +``` +```` + ### DuckDB Table from Spark SQL dialect Queries the Foundry SQL server with Spark SQL dialect, load arrow stream using [duckdb](https://duckdb.org/). From e6d325d11748e133b01f80ba3ff0fa01e1b68d2b Mon Sep 17 00:00:00 2001 From: nicornk Date: Fri, 20 Feb 2026 08:18:04 +0100 Subject: [PATCH 7/9] fix docstring --- .../src/foundry_dev_tools/resources/dataset.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 196e75a..3d5f05e 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -804,8 +804,16 @@ def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: """Get dataset as a :py:class:`polars.LazyFrame`. Returns a lazy polars DataFrame that can be queried efficiently using - polars' lazy evaluation API. The data is accessed directly from S3 - without going through FoundrySqlServer. + polars' lazy evaluation API. The data is accessed directly via the + S3-compatible API without going through FoundrySqlServer. + + Args: + transaction_rid: The transaction RID to read from. If None, uses the + last committed transaction. Useful for reading specific historical + versions of the dataset. + + Returns: + pl.LazyFrame: A lazy polars DataFrame Example: >>> ds = ctx.get_dataset_by_path("/path/to/dataset") @@ -814,11 +822,8 @@ def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: >>> # Execute and collect results >>> df = result.collect() - Returns: - pl.LazyFrame: A lazy polars DataFrame - Note: - This method uses the S3 API to directly access dataset files. + This method uses the S3-compatible API to directly access dataset files. For hive-partitioned datasets, polars will automatically read the partition structure. """ From 0a930413ea92f27f7ce3ce471ee9c14f01b241cb Mon Sep 17 00:00:00 2001 From: nicornk Date: Fri, 20 Feb 2026 08:43:17 +0100 Subject: [PATCH 8/9] fix: get_last_transaction() now excludes open transactions in to_lazy_polars() --- .../src/foundry_dev_tools/resources/dataset.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py index 3d5f05e..4aad2ee 100644 --- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py +++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py @@ -262,11 +262,16 @@ def get_transactions( ).json()["values"] ] - def get_last_transaction(self) -> api_types.Transaction | None: - """Returns the last transaction or None if there are no transactions.""" + def get_last_transaction(self, include_open_exclusive_transaction: bool = True) -> api_types.Transaction | None: + """Returns the last transaction or None if there are no transactions. + + Args: + include_open_exclusive_transaction: If True, includes open transactions + in the results. If False, only returns committed transactions. + """ v = self.get_transactions( page_size=1, - include_open_exclusive_transaction=True, + include_open_exclusive_transaction=include_open_exclusive_transaction, ) if v is not None and len(v) > 0: return v[0] @@ -830,9 +835,9 @@ def to_lazy_polars(self, transaction_rid: str | None = None) -> pl.LazyFrame: from foundry_dev_tools._optional.polars import pl if transaction_rid is None: - maybe_transaction = self.get_last_transaction() + maybe_transaction = self.get_last_transaction(include_open_exclusive_transaction=False) if maybe_transaction is None: - msg = f"Dataset has no transactions: {self.rid=}" + msg = f"Dataset has no committed transactions: {self.rid=}" raise DatasetHasNoTransactionsError(info=msg) transaction_rid = maybe_transaction["rid"] From c9cbd5ac4f69b263653c1b3b2ac922acd59563e5 Mon Sep 17 00:00:00 2001 From: nicornk Date: Fri, 20 Feb 2026 08:51:31 +0100 Subject: [PATCH 9/9] test with explicit transaction passing --- tests/integration/resources/test_dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration/resources/test_dataset.py b/tests/integration/resources/test_dataset.py index 3bb7efb..00ee93d 100644 --- a/tests/integration/resources/test_dataset.py +++ b/tests/integration/resources/test_dataset.py @@ -143,6 +143,17 @@ def test_to_lazy_polars_parquet_dataset(): assert df.columns == ["sepal_width", "sepal_length", "petal_width", "petal_length", "is_setosa"] +def test_to_lazy_polars_parquet_dataset_explicit_transaction(): + ds = TEST_SINGLETON.iris_parquet + lazy_df = ds.to_lazy_polars(ds.get_last_transaction()["rid"]) + + assert isinstance(lazy_df, pl.LazyFrame) + + df = lazy_df.collect() + assert df.shape == (150, 5) + assert df.columns == ["sepal_width", "sepal_length", "petal_width", "petal_length", "is_setosa"] + + def test_to_lazy_polars_hive_partitioned(): ds = TEST_SINGLETON.iris_hive_partitioned lazy_df = ds.to_lazy_polars()