diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 427359befa73..e0fdf49d917c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,16 +16,15 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml -- repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black -- repo: https://github.com/pycqa/flake8 - rev: 3.9.2 # version-scanner: ignore - hooks: - - id: flake8 +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.14.14 + hooks: + # Run the linter. + - id: ruff-check + args: [ --select, I, --fix, --target-version=py310, --line-length=88 ] diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/bigquery/_googlesql.py new file mode 100644 index 000000000000..4f15d1f3c277 --- /dev/null +++ b/packages/bigframes/bigframes/bigquery/_googlesql.py @@ -0,0 +1,81 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for working with GoogleSqlScalarOps.""" + +from __future__ import annotations + +from typing import Any, Union + +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +import bigframes.series as series +from bigframes.operations import googlesql + + +def apply_googlesql_scalar_op( + op: googlesql.GoogleSqlScalarOp, + *args: Any, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Applies a GoogleSQL scalar operator to the given arguments. + + Handles a mix of Series, Expression, and literal inputs. + + Args: + op (googlesql.GoogleSqlScalarOp): + The operator to apply. + *args (Any): + The arguments to apply the operator to. + + Returns: + bigframes.pandas.Series | bigframes.core.col.Expression: + The result of the operation. If any of ``args`` is a Series, returns + a Series. Otherwise, returns an Expression. + """ + # Find the first Series to use for alignment + first_series = None + for arg in args: + if isinstance(arg, series.Series): + first_series = arg + break + + if first_series is not None: + processed_args = [] + block = first_series._block + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + block, col_id = block.project_expr(bigframes.core.col._as_bf_expr(arg)) + processed_args.append(series.Series(block.select_column(col_id))) + elif arg is sentinels.DEFAULT: + processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) + else: + processed_args.append(arg) + + # Apply the n-ary op. _apply_nary_op handles alignment of Series and literals. 
+ result = first_series._apply_nary_op(op, processed_args, ignore_self=True) + result.name = None + return result + + # No Series, return an Expression + expr_args = [] + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + expr_args.append(bigframes.core.col._as_bf_expr(arg)) + elif arg is sentinels.DEFAULT: + expr_args.append(ex.OmittedArg()) + else: + expr_args.append(ex.const(arg)) + + return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py new file mode 100644 index 000000000000..1a7c02ec2e63 --- /dev/null +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -0,0 +1,88 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: scripts/data/sql-functions/aead.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +from __future__ import annotations + +from typing import TypeVar, Union + +import bigframes.bigquery._googlesql +import bigframes.core.col +import bigframes.series as series +from bigframes import dtypes +from bigframes.operations import googlesql + +T = TypeVar("T", series.Series, bigframes.core.col.Expression) + +_DECRYPT_BYTES_OP = googlesql.GoogleSqlScalarOp( + "AEAD.DECRYPT_BYTES", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BYTES_DTYPE, +) +_DECRYPT_STRING_OP = googlesql.GoogleSqlScalarOp( + "AEAD.DECRYPT_STRING", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.STRING_DTYPE, +) +_ENCRYPT_OP = googlesql.GoogleSqlScalarOp( + "AEAD.ENCRYPT", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BYTES_DTYPE, +) + + +def decrypt_bytes( + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, bytes], + additional_data: Union[T, bigframes.core.col.Expression, bytes], +) -> T: + """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. 
Returns an error if decryption or verification fails.""" + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + _DECRYPT_BYTES_OP, + keyset, + ciphertext, + additional_data, + ) # type: ignore + + +def decrypt_string( + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, bytes], + additional_data: Union[T, bigframes.core.col.Expression, str], +) -> T: + """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + _DECRYPT_STRING_OP, + keyset, + ciphertext, + additional_data, + ) # type: ignore + + +def encrypt( + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + plaintext: Union[T, bigframes.core.col.Expression, Union[bytes, str]], + additional_data: Union[T, bigframes.core.col.Expression, Union[bytes, str]], +) -> T: + """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL.""" + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + _ENCRYPT_OP, + keyset, + plaintext, + additional_data, + ) # type: ignore diff --git a/packages/bigframes/bigframes/core/sentinels.py b/packages/bigframes/bigframes/core/sentinels.py new file mode 100644 index 000000000000..fc2bfac970e5 --- /dev/null +++ b/packages/bigframes/bigframes/core/sentinels.py @@ -0,0 +1,33 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sentinel values used throughout BigFrames.""" + +from __future__ import annotations + +from enum import Enum + + +class Default(Enum): + """Default values used throughout BigFrames. + + When a parameter is set to this, that parameter is explicitly omitted + from the SQL text. This allows for NULL (None in Python) to be explicitly + passed in to optional parameters. + """ + + token = 0 + + +DEFAULT = Default.token diff --git a/packages/bigframes/scripts/data/sql-functions/aead.yaml b/packages/bigframes/scripts/data/sql-functions/aead.yaml new file mode 100644 index 000000000000..6c289a96e886 --- /dev/null +++ b/packages/bigframes/scripts/data/sql-functions/aead.yaml @@ -0,0 +1,131 @@ +urn: extension:google:bq_scalar_functions +scalar_functions: + - name: "aead.decrypt_bytes" + description: "Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. Returns an error if decryption or verification fails." 
+ impls: + # Signature: aead.decrypt_bytes:vbin_vbin_vbin + - args: + - name: "keyset" + value: binary + optional: false + keyword_only: false + - name: "ciphertext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: binary + optional: false + keyword_only: false + return: binary + # Signature: aead.decrypt_bytes:struct_vbin_vbin + - args: + - name: "keyset" + value: struct + optional: false + keyword_only: false + - name: "ciphertext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: binary + optional: false + keyword_only: false + return: binary + - name: "aead.decrypt_string" + description: "Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING." + impls: + # Signature: aead.decrypt_string:vbin_vbin_str + - args: + - name: "keyset" + value: binary + optional: false + keyword_only: false + - name: "ciphertext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: string + optional: false + keyword_only: false + return: string + # Signature: aead.decrypt_string:struct_vbin_str + - args: + - name: "keyset" + value: struct + optional: false + keyword_only: false + - name: "ciphertext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: string + optional: false + keyword_only: false + return: string + - name: "aead.encrypt" + description: "Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL." 
+ impls: + # Signature: aead.encrypt:vbin_str_str + - args: + - name: "keyset" + value: binary + optional: false + keyword_only: false + - name: "plaintext" + value: string + optional: false + keyword_only: false + - name: "additional_data" + value: string + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:vbin_vbin_vbin + - args: + - name: "keyset" + value: binary + optional: false + keyword_only: false + - name: "plaintext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: binary + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:struct_str_str + - args: + - name: "keyset" + value: struct + optional: false + keyword_only: false + - name: "plaintext" + value: string + optional: false + keyword_only: false + - name: "additional_data" + value: string + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:struct_vbin_vbin + - args: + - name: "keyset" + value: struct + optional: false + keyword_only: false + - name: "plaintext" + value: binary + optional: false + keyword_only: false + - name: "additional_data" + value: binary + optional: false + keyword_only: false + return: binary diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py new file mode 100755 index 000000000000..ff5dcb1232cd --- /dev/null +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -0,0 +1,362 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# dependencies = [ +# "autoflake", +# "jinja2", +# "pyyaml", +# ] +# /// +# +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +import re +import subprocess + +import jinja2 +import yaml + +# Directory containing the YAML files +DATA_DIR = pathlib.Path("scripts/data/sql-functions") +# Directory where the generated Python files will be placed +OUTPUT_DIR = pathlib.Path("bigframes/bigquery/_operations") +# Directory where the generated test files will be placed +TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") + +LICENSE_HEADER = """# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +TEMPLATE = """{{ license_header }} +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from __future__ import annotations + +import datetime +from typing import Any, Optional, TypeVar, Union + +from bigframes import dtypes +import bigframes.bigquery._googlesql +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +from bigframes.operations import googlesql +import bigframes.operations as ops +import bigframes.series as series + +T = TypeVar("T", series.Series, bigframes.core.col.Expression) + +{% for op in ops %} +{{ op.internal_name }} = googlesql.GoogleSqlScalarOp( + "{{ op.sql_name }}", + args=({{ op.arg_specs }}), + signature={{ op.signature }}, +) +{% endfor %} +{% for func in functions %} + + +def {{ func.name }}( +{% for arg in func.args %} + {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, +{% endfor %} +) -> T: + \"\"\"{{ func.description }}\"\"\" + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + {{ func.op_name }}, +{% for arg in func.args %} + {{ arg.name }}, +{% endfor %} + ) # type: ignore +{% endfor %} +""" + +TEST_TEMPLATE = """{{ license_header }} +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from typing import cast + +import pytest + +import bigframes.pandas as bpd +import {{ import_path }} as {{ short_name }} + +pytest.importorskip("pytest_snapshot") + + +{% for func in functions %} +def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): + result = {{ short_name }}.{{ func.name }}( +{% for arg in func.test_args %} + cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), +{% endfor %} + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +{% endfor %} +""" + +DTYPE_MAP = { + "binary": "dtypes.BYTES_DTYPE", + "string": "dtypes.STRING_DTYPE", + "int64": "dtypes.INT_DTYPE", + "float64": "dtypes.FLOAT_DTYPE", + "bool": "dtypes.BOOL_DTYPE", + "geography": "dtypes.GEO_DTYPE", + "json": "dtypes.JSON_DTYPE", + "date": "dtypes.DATE_DTYPE", + "time": "dtypes.TIME_DTYPE", + "datetime": "dtypes.DATETIME_DTYPE", + "timestamp": "dtypes.TIMESTAMP_DTYPE", +} + +PY_TYPE_MAP = { + "binary": "bytes", + "string": "str", + "int64": "int", + "float64": "float", + "bool": "bool", + "geography": "Any", + "json": "Any", + "date": "datetime.date", + "time": "datetime.time", + "datetime": "datetime.datetime", + "timestamp": "datetime.datetime", + "struct": "dict", +} + +YAML_TYPE_TO_COL = { + "binary": "bytes_col", + "string": "string_col", + "int64": "int64_col", + "float64": "float64_col", + "bool": "bool_col", + "geography": "geography_col", + "date": "date_col", + "time": "time_col", + "datetime": "datetime_col", + "timestamp": "timestamp_col", +} + + +def to_snake_case(name): + # Replace dots with underscores + name = name.replace(".", "_") + # Handle CamelCase to snake_case + name = re.sub(r"(? 
1 + else types[0] + ) + default = "sentinels.DEFAULT" if arg_info["optional"] else "" + func_args.append( + { + "name": name, + "type_hint": type_hint, + "default": default, + } + ) + + # Clean up default values for mandatory args + # In Python, mandatory args come first. + for arg in func_args: + if not arg["default"]: + del arg["default"] + + # Test args + test_args = [] + for name in arg_order: + arg_info = args_by_name[name] + some_type = list(arg_info["types"])[0] + col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") + test_args.append({"col_name": col_name}) + + functions_list.append( + { + "name": python_name, + "op_name": internal_op_name, + "description": func_data["description"], + "args": func_args, + "test_args": test_args, + } + ) + + # Render and write + output_file.parent.mkdir(parents=True, exist_ok=True) + content = template.render( + license_header=LICENSE_HEADER, + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + ops=ops_list, + functions=functions_list, + ) + with open(output_file, "w") as f: + f.write(content) + + subprocess.run( + [ + "autoflake", + "--in-place", + "--remove-all-unused-imports", + str(output_file), + ], + check=True, + ) + print(f" Generated {output_file}") + + # Render and write test + import_path = "bigframes.bigquery._operations." 
+ ".".join(module_path.parts) + test_output_file = TEST_OUTPUT_DIR.joinpath( + module_path.with_name(f"test_{module_path.name}") + ).with_suffix(".py") + + test_output_file.parent.mkdir(parents=True, exist_ok=True) + test_content = test_template.render( + license_header=LICENSE_HEADER, + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + import_path=import_path, + short_name=module_path.name, + functions=functions_list, + ) + with open(test_output_file, "w") as f: + f.write(test_content) + + subprocess.run( + [ + "autoflake", + "--in-place", + "--remove-all-unused-imports", + str(test_output_file), + ], + check=True, + ) + print(f" Generated {test_output_file}") + + print(f" Updating snapshots for {test_output_file}...") + subprocess.run( + [ + "pytest", + str(test_output_file), + "--snapshot-update", + ], + check=False, + ) + + +if __name__ == "__main__": + main() diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock new file mode 100644 index 000000000000..3cba9097522d --- /dev/null +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock @@ -0,0 +1,99 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" + +[manifest] +requirements = [ + { name = "autoflake" }, + { name = "jinja2" }, + { name = "pyyaml" }, +] + +[[package]] +name = "autoflake" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyflakes" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/0b/70c277eef225133763bf05c02c88df182e57d5c5c0730d3998958096a82e/autoflake-2.3.3.tar.gz", hash = "sha256:c24809541e23999f7a7b0d2faadf15deb0bc04cdde49728a2fd943a0c8055504", size = 16515, upload-time = "2026-02-20T05:01:43.448Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/21/26f1680ec3a598ea31768f9ebcd427e42986d077a005416094b580635532/autoflake-2.3.3-py3-none-any.whl", hash = 
"sha256:a51a3412aff16135ee5b3ec25922459fef10c1f23ce6d6c4977188df859e8b53", size = 17715, upload-time = "2026-02-20T05:01:42.137Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, + { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, + { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, + { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, + { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, + { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, + { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, + { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, + { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, + { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, + { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, + { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, + { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, +] + +[[package]] +name = "pyflakes" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/dc/fd034dc20b4b264b3d015808458391acbf9df40b1e54750ef175d39180b1/pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58", size = 64669, upload-time = "2025-06-20T18:45:27.834Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/2f/81d580a0fb83baeb066698975cb14a618bdbed7720678566f1b046a95fe8/pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f", size = 63551, upload-time = "2025-06-20T18:45:26.937Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = 
"2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = 
"2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] diff --git a/packages/bigframes/specs/bigframes-bigquery-generator.md b/packages/bigframes/specs/bigframes-bigquery-generator.md new file mode 100644 index 000000000000..26d1d45c9f51 --- /dev/null +++ b/packages/bigframes/specs/bigframes-bigquery-generator.md @@ -0,0 +1,101 @@ +# Code generation for bigframes.bigquery + +This document describes code generation for the `bigframes.bigquery` modules. 
+For detailed specifications on input and output types, refer to +[Contributing to bigframes.bigquery](./bigframes-bigquery-contributing.md). + +## Overview + +The script at `packages/bigframes/scripts/generate_bigframes_bigquery.py` +generates Python submodules for the `bigframes.bigquery` module. When run +without any arguments, it iterates through all YAML files at +`packages/bigframes/scripts/data/sql-functions/**/*.yaml` to generate the code. + +The script at `packages/bigframes/scripts/check_bigframes_bigquery.py` iterates +through the same YAML files and checks that the functions have been included +in the `bigframes.bigquery` module, as the `__init__.py` file requires manual +updates. + +## Running the generator + +Since the dependencies for the script differ from those of bigframes +and its test suite, use the self-contained Python script technique described at +https://docs.astral.sh/uv/guides/scripts/ +to automatically manage dependencies using `uv`. Therefore, the header of the +script will look something like: + +```python +#!/usr/bin/env -S uv run --script +# +# /// script +# dependencies = [ +# "jinja2", +# "pyyaml", +# ] +# /// +# +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# ... +``` + +To run the script: + +```bash +cd packages/bigframes +uv run scripts/generate_bigframes_bigquery.py +``` + +To improve reproducibility, we also check in the uv lock file generated by +running `uv lock --script scripts/generate_bigframes_bigquery.py`. + +## Generated code organization + +The `generate_bigframes_bigquery.py` script generates submodules of +`bigframes.bigquery._operations`, with the full path reflecting the organization +of the YAML files. For example, a YAML file at +`packages/bigframes/scripts/data/sql-functions/aead.yaml` corresponds to a +generated Python module at `bigframes.bigquery._operations.aead`.
Likewise, +`packages/bigframes/scripts/data/sql-functions/builtins/bit.yaml` corresponds +to the `bigframes.bigquery._operations.builtins.bit` submodule. + +## Generated module implementation + +Each generated module has all functions defined in the YAML file converted to +the equivalent Python definition, including keyword arguments and docstrings. + +### Code generation + +The code will be templated using the jinja2 template engine. This allows +proposed changes to the templated code to be reviewed more easily. + +### Handling optional arguments + +When the user calls a Python function without specifying the optional +argument, that argument is omitted from the SQL text. To allow for explicit +NULL values to be passed in (`None` in Python), the default value is set +to the sentinel enum value `bigframes.core.sentinels.DEFAULT`. For +example: + +```python +import bigframes.core.sentinels + +def current_date( + time_zone_expression: str | bigframes.core.sentinels.Default = bigframes.core.sentinels.DEFAULT, +): + ... +``` + +### Input and output types + +Refer to the table in +[Contributing to bigframes.bigquery](./bigframes-bigquery-contributing.md). + +### Internal bigframes operator + +Scalar functions should generate an expression using the `GoogleSqlScalarOp`. +This keeps the implementation of scalar SQL functions consistent. + +Aggregate, analytic, and table-valued functions currently require custom ops. As +such, those functions are currently out of scope for this generator. diff --git a/packages/bigframes/tests/unit/bigquery/_operations/conftest.py b/packages/bigframes/tests/unit/bigquery/_operations/conftest.py new file mode 100644 index 000000000000..127902241acb --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/conftest.py @@ -0,0 +1,280 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +import typing + +import pandas as pd +import pyarrow as pa +import pytest +from google.cloud import bigquery + +import bigframes.core as core +import bigframes.pandas as bpd +import bigframes.testing.mocks as mocks +import bigframes.testing.utils +from bigframes import dtypes + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" # NOTE(review): four .parent hops from tests/unit/bigquery/_operations resolve to packages/bigframes; confirm the JSONL fixtures live in packages/bigframes/data and not tests/data. + + +def _create_compiler_session(table_name, table_schema): + """Create a session via `mocks.create_bigquery_session` and swap in a `SQLCompilerExecutor`.""" + from bigframes.testing import compiler_session + + anonymous_dataset = bigquery.DatasetReference.from_string( + "bigframes-dev.sqlglot_test" + ) + session = mocks.create_bigquery_session( + table_name=table_name, + table_schema=table_schema, + anonymous_dataset=anonymous_dataset, + ) + session._executor = compiler_session.SQLCompilerExecutor() + return session + + +@pytest.fixture(scope="session") +def compiler_session(scalar_types_table_schema): + """Compiler session for scalar types.""" + return _create_compiler_session("scalar_types", scalar_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_repeated_types(repeated_types_table_schema): + """Compiler session for repeated data types.""" + return _create_compiler_session("repeated_types", repeated_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): + """Compiler session for nested STRUCT data types.""" + return _create_compiler_session( + "nested_structs_types",
nested_structs_types_table_schema + ) + + +@pytest.fixture(scope="session") +def compiler_session_w_json_types(json_types_table_schema): + """Compiler session for JSON data types.""" + return _create_compiler_session("json_types", json_types_table_schema) + + +@pytest.fixture(scope="session") +def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("geography_col", "GEOGRAPHY"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("int64_too", "INTEGER"), + bigquery.SchemaField("numeric_col", "NUMERIC"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("rowindex_2", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("duration_col", "INTEGER"), + ] + + +@pytest.fixture(scope="session") +def scalar_types_df(compiler_session) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.scalar_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def scalar_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + # TODO: add tests for empty dataframes + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + return df + + +@pytest.fixture(scope="module")
+def scalar_types_array_value( + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session +) -> core.ArrayValue: + managed_data_source = core.local_data.ManagedArrowTable.from_pandas( + scalar_types_pandas_df + ) + return core.ArrayValue.from_managed(managed_data_source, compiler_session) + + +@pytest.fixture(scope="session") +def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("id", "INTEGER"), + bigquery.SchemaField( + "people", + "RECORD", + fields=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("age", "INTEGER"), + bigquery.SchemaField( + "address", + "RECORD", + fields=[ + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("country", "STRING"), + ], + ), + ], + ), + ] + + +@pytest.fixture(scope="session") +def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing nested STRUCT types and using the `id` + column as the index.""" + bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.nested_structs_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("id", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing STRUCT types and using the `id` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) # NOTE(review): the table schema fixture names this record "people"; confirm the JSONL column is really "person". + return df + + +@pytest.fixture(scope="session") +def repeated_types_table_schema() ->
typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), + bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), + bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), + bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), + bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), + bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), + bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), + ] + + +@pytest.fixture(scope="session") +def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing repeated types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_repeated_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.repeated_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def repeated_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing LIST types and using the `rowindex` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + # TODO: add dtype conversion here if needed. + df = df.set_index("rowindex") + return df + + +@pytest.fixture(scope="session") +def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("json_col", "JSON"), + ] + + +@pytest.fixture(scope="session") +def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_json_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.json_types", + enable_snapshot=False, + ) + # TODO(b/427305807): Why does `drop=False` produce two "rowindex" columns?
+ bf_df = bf_df.set_index("rowindex", drop=True) + return bf_df + + +@pytest.fixture(scope="session") +def json_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing JSON types and using the `rowindex` + column as the index.""" + json_data = [ + "null", + "true", + "100", + "0.98", + '"a string"', + "[]", + "[1, 2, 3]", + '[{"a": 1}, {"a": 2}, {"a": null}, {}]', + '"100"', + '{"date": "2024-07-16"}', + '{"int_value": 2, "null_filed": null}', + '{"list_data": [10, 20, 30]}', + ] + df = pd.DataFrame( + { + "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), + "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), + }, + ) + # TODO(b/427305807): Why does `drop=False` produce two "rowindex" columns? + df = df.set_index("rowindex", drop=True) + return df diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql new file mode 100644 index 000000000000..d74f1fa20eee --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.DECRYPT_BYTES(`string_col`, `bytes_col`, `bytes_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql new file mode 100644 index 000000000000..1c2b75812b9e --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.DECRYPT_STRING(`string_col`, `bytes_col`, `string_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git
a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql new file mode 100644 index 000000000000..e62f74d8fd9d --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.ENCRYPT(`string_col`, `bytes_col`, `bytes_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py new file mode 100644 index 000000000000..95ab84c447d3 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -0,0 +1,53 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: scripts/data/sql-functions/aead.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +from typing import cast + +import bigframes.bigquery._operations.aead as aead +import bigframes.pandas as bpd +import pytest + +pytest.importorskip("pytest_snapshot") + + +def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.decrypt_bytes( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.decrypt_string( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["string_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.encrypt( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql")