From 3ccb3ceb005eac30a4a1842f048ee9c604b67472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 18 May 2026 16:01:30 +0000 Subject: [PATCH 01/20] feat: add `bigframes.bigquery.aead.*` scalar functions --- .../scripts/data/sql-functions/aead.yaml | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 packages/bigframes/scripts/data/sql-functions/aead.yaml diff --git a/packages/bigframes/scripts/data/sql-functions/aead.yaml b/packages/bigframes/scripts/data/sql-functions/aead.yaml new file mode 100644 index 000000000000..ba5dc1fb3af9 --- /dev/null +++ b/packages/bigframes/scripts/data/sql-functions/aead.yaml @@ -0,0 +1,216 @@ +urn: extension:google:bq_scalar_functions +scalar_functions: + - name: "aead.decrypt_bytes" + description: "GoogleSQL scalar function aead.decrypt_bytes" + impls: + # Signature: aead.decrypt_bytes:vbin_vbin_vbin + - args: + - name: "arg0" + value: binary + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + return: binary + # Signature: aead.decrypt_bytes:struct_vbin_vbin + - args: + - name: "arg0" + value: struct + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + return: binary + - name: "aead.decrypt_string" + description: "GoogleSQL scalar function aead.decrypt_string" + impls: + # Signature: aead.decrypt_string:vbin_vbin_str + - args: + - name: "arg0" + value: binary + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: string + optional: false + keyword_only: false + return: string + # Signature: aead.decrypt_string:struct_vbin_str + - args: + - name: "arg0" + value: struct + optional: false + keyword_only: false + - name: "arg1" + value: binary + 
optional: false + keyword_only: false + - name: "arg2" + value: string + optional: false + keyword_only: false + return: string + - name: "aead.encrypt" + description: "GoogleSQL scalar function aead.encrypt" + impls: + # Signature: aead.encrypt:vbin_str_str + - args: + - name: "arg0" + value: binary + optional: false + keyword_only: false + - name: "arg1" + value: string + optional: false + keyword_only: false + - name: "arg2" + value: string + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:vbin_vbin_vbin + - args: + - name: "arg0" + value: binary + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:struct_str_str + - args: + - name: "arg0" + value: struct + optional: false + keyword_only: false + - name: "arg1" + value: string + optional: false + keyword_only: false + - name: "arg2" + value: string + optional: false + keyword_only: false + return: binary + # Signature: aead.encrypt:struct_vbin_vbin + - args: + - name: "arg0" + value: struct + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + return: binary + - name: "aead.envelope_decrypt_bytes" + description: "GoogleSQL scalar function aead.envelope_decrypt_bytes" + impls: + # Signature: aead.envelope_decrypt_bytes:str_vbin_vbin_vbin + - args: + - name: "arg0" + value: string + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + - name: "arg3" + value: binary + optional: false + keyword_only: false + return: binary + - name: "aead.envelope_decrypt_string" + description: "GoogleSQL scalar function aead.envelope_decrypt_string" + impls: + 
# Signature: aead.envelope_decrypt_string:str_vbin_vbin_str + - args: + - name: "arg0" + value: string + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + - name: "arg3" + value: string + optional: false + keyword_only: false + return: string + - name: "aead.envelope_encrypt" + description: "GoogleSQL scalar function aead.envelope_encrypt" + impls: + # Signature: aead.envelope_encrypt:str_vbin_str_str + - args: + - name: "arg0" + value: string + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: string + optional: false + keyword_only: false + - name: "arg3" + value: string + optional: false + keyword_only: false + return: binary + # Signature: aead.envelope_encrypt:str_vbin_vbin_vbin + - args: + - name: "arg0" + value: string + optional: false + keyword_only: false + - name: "arg1" + value: binary + optional: false + keyword_only: false + - name: "arg2" + value: binary + optional: false + keyword_only: false + - name: "arg3" + value: binary + optional: false + keyword_only: false + return: binary From 4c94ec3e1d81dee1f7cb45ac3c2ccdf7f900e590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 18 May 2026 16:13:34 +0000 Subject: [PATCH 02/20] update function definitions to match https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/aead_encryption_functions --- .../scripts/data/sql-functions/aead.yaml | 139 ++++-------------- 1 file changed, 27 insertions(+), 112 deletions(-) diff --git a/packages/bigframes/scripts/data/sql-functions/aead.yaml b/packages/bigframes/scripts/data/sql-functions/aead.yaml index ba5dc1fb3af9..6c289a96e886 100644 --- a/packages/bigframes/scripts/data/sql-functions/aead.yaml +++ b/packages/bigframes/scripts/data/sql-functions/aead.yaml @@ -1,215 +1,130 @@ urn: 
extension:google:bq_scalar_functions scalar_functions: - name: "aead.decrypt_bytes" - description: "GoogleSQL scalar function aead.decrypt_bytes" + description: "Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. Returns an error if decryption or verification fails." impls: # Signature: aead.decrypt_bytes:vbin_vbin_vbin - args: - - name: "arg0" + - name: "keyset" value: binary optional: false keyword_only: false - - name: "arg1" + - name: "ciphertext" value: binary optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: binary optional: false keyword_only: false return: binary # Signature: aead.decrypt_bytes:struct_vbin_vbin - args: - - name: "arg0" + - name: "keyset" value: struct optional: false keyword_only: false - - name: "arg1" + - name: "ciphertext" value: binary optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: binary optional: false keyword_only: false return: binary - name: "aead.decrypt_string" - description: "GoogleSQL scalar function aead.decrypt_string" + description: "Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING." 
impls: # Signature: aead.decrypt_string:vbin_vbin_str - args: - - name: "arg0" + - name: "keyset" value: binary optional: false keyword_only: false - - name: "arg1" + - name: "ciphertext" value: binary optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: string optional: false keyword_only: false return: string # Signature: aead.decrypt_string:struct_vbin_str - args: - - name: "arg0" + - name: "keyset" value: struct optional: false keyword_only: false - - name: "arg1" + - name: "ciphertext" value: binary optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: string optional: false keyword_only: false return: string - name: "aead.encrypt" - description: "GoogleSQL scalar function aead.encrypt" + description: "Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL." 
impls: # Signature: aead.encrypt:vbin_str_str - args: - - name: "arg0" + - name: "keyset" value: binary optional: false keyword_only: false - - name: "arg1" + - name: "plaintext" value: string optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: string optional: false keyword_only: false return: binary # Signature: aead.encrypt:vbin_vbin_vbin - args: - - name: "arg0" + - name: "keyset" value: binary optional: false keyword_only: false - - name: "arg1" + - name: "plaintext" value: binary optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: binary optional: false keyword_only: false return: binary # Signature: aead.encrypt:struct_str_str - args: - - name: "arg0" + - name: "keyset" value: struct optional: false keyword_only: false - - name: "arg1" + - name: "plaintext" value: string optional: false keyword_only: false - - name: "arg2" + - name: "additional_data" value: string optional: false keyword_only: false return: binary # Signature: aead.encrypt:struct_vbin_vbin - args: - - name: "arg0" + - name: "keyset" value: struct optional: false keyword_only: false - - name: "arg1" + - name: "plaintext" value: binary optional: false keyword_only: false - - name: "arg2" - value: binary - optional: false - keyword_only: false - return: binary - - name: "aead.envelope_decrypt_bytes" - description: "GoogleSQL scalar function aead.envelope_decrypt_bytes" - impls: - # Signature: aead.envelope_decrypt_bytes:str_vbin_vbin_vbin - - args: - - name: "arg0" - value: string - optional: false - keyword_only: false - - name: "arg1" - value: binary - optional: false - keyword_only: false - - name: "arg2" - value: binary - optional: false - keyword_only: false - - name: "arg3" - value: binary - optional: false - keyword_only: false - return: binary - - name: "aead.envelope_decrypt_string" - description: "GoogleSQL scalar function aead.envelope_decrypt_string" - impls: - # Signature: 
aead.envelope_decrypt_string:str_vbin_vbin_str - - args: - - name: "arg0" - value: string - optional: false - keyword_only: false - - name: "arg1" - value: binary - optional: false - keyword_only: false - - name: "arg2" - value: binary - optional: false - keyword_only: false - - name: "arg3" - value: string - optional: false - keyword_only: false - return: string - - name: "aead.envelope_encrypt" - description: "GoogleSQL scalar function aead.envelope_encrypt" - impls: - # Signature: aead.envelope_encrypt:str_vbin_str_str - - args: - - name: "arg0" - value: string - optional: false - keyword_only: false - - name: "arg1" - value: binary - optional: false - keyword_only: false - - name: "arg2" - value: string - optional: false - keyword_only: false - - name: "arg3" - value: string - optional: false - keyword_only: false - return: binary - # Signature: aead.envelope_encrypt:str_vbin_vbin_vbin - - args: - - name: "arg0" - value: string - optional: false - keyword_only: false - - name: "arg1" - value: binary - optional: false - keyword_only: false - - name: "arg2" - value: binary - optional: false - keyword_only: false - - name: "arg3" + - name: "additional_data" value: binary optional: false keyword_only: false From c6b94fa2aaedae84d82d7f433acedf4e65b9d9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 18 May 2026 19:15:38 +0000 Subject: [PATCH 03/20] create a spec for code generation --- .pre-commit-config.yaml | 6 +- .../bigframes/bigframes/core/sentinels.py | 33 ++++++++++ .../specs/bigframes-bigquery-generator.md | 62 +++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 packages/bigframes/bigframes/core/sentinels.py create mode 100644 packages/bigframes/specs/bigframes-bigquery-generator.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 427359befa73..15b074bf647e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,16 +16,16 @@ # See 
https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.7.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 3.9.2 # version-scanner: ignore + rev: 6.1.0 # version-scanner: ignore hooks: - id: flake8 diff --git a/packages/bigframes/bigframes/core/sentinels.py b/packages/bigframes/bigframes/core/sentinels.py new file mode 100644 index 000000000000..fc2bfac970e5 --- /dev/null +++ b/packages/bigframes/bigframes/core/sentinels.py @@ -0,0 +1,33 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sentinel values used throughout BigFrames.""" + +from __future__ import annotations + +from enum import Enum + + +class Default(Enum): + """Default values used throughout BigFrames. + + When a parameter is set to this, that parameter is explicitly omitted + from the SQL text. This allows for NULL (None in Python) to be explicitly + passed in to optional parameters. 
+ """ + + token = 0 + + +DEFAULT = Default.token diff --git a/packages/bigframes/specs/bigframes-bigquery-generator.md b/packages/bigframes/specs/bigframes-bigquery-generator.md new file mode 100644 index 000000000000..75ad578f844c --- /dev/null +++ b/packages/bigframes/specs/bigframes-bigquery-generator.md @@ -0,0 +1,62 @@ +# Code generation for bigframes.bigquery + +This document describes code generation for the `bigframes.bigquery` modules. +For detailed specifications on input and output types, refer to +[Contributing to bigframes.bigquery](./bigframes-bigquery-contributing.md). + +## Overview + +The script at `packages/bigframes/scripts/generate_bigframes_bigquery.py` +generates python submodules for the `bigframes.bigquery` module. When run +without any arguments, it iterates through all yaml files at +`packages/bigframes/scripts/data/sql-functions/**/*.yaml` to generate the code. + +The script at `packages/bigframes/scripts/check_bigframes_bigquery.py` iterates +through all the same yaml files and checks that the functions have been included +in the `bigframes.bigquery` module, as the `__init__.py` file requires manual +updates. + +## Generated code organization + +The `generate_bigframes_bigquery.py` script generates submodules of +`bigframes.bigquery._operations`, with the full path reflecting the organization +of the YAML files. For example, a YAML file at +`packages/bigframes/scripts/data/sql-functions/aead.yaml` corresponds to a +generated Python module at `bigframes.bigquery._operations.aead`. Likewise, +`packages/bigframes/scripts/data/sql-functions/builtins/bit.yaml` corresponds +to the `bigframes.bigquery._operations.builtins.bit` submodule. + +## Generated module implementation + +Each generated module has all functions defined in the YAML file converted to +the equivalent Python definition, including keyword arguments and docstrings. 
+ +### Handling optional arguments + +When the user calls a Python function without specifying the optional +argument, that argument is omitted from the SQL text. To allow for explicit +NULL values to be passed in (None in Python), the default value is specified +to be a default sentinel value enum `bigframes.core.sentinels.DEFAULT`. For +example: + +```python +import bigframes.core.sentinels + +def current_date( + time_zone_expression: str | bigframes.core.sentinels.Default = bigframes.core.sentinels.DEFAULT, +): + ... +``` + +### Input and output types + +Refer to the table in +[Contributing to bigframes.bigquery](./bigframes-bigquery-contributing.md). + +### Internal bigframes operator + +Scalar functions should generate an expression using the `GoogleSqlScalarOp`. +This keeps the implementation of scalar SQL functions consistent. + +Aggregate, analytic, and table-valued functions currently require custom ops. As +such, those functions are currently out of scope for this generator. From 7cb7e76dd4d81bb89d2db68e1438da59f7a55a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 18 May 2026 19:32:01 +0000 Subject: [PATCH 04/20] use uv for script reproducibility --- .../specs/bigframes-bigquery-generator.md | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/packages/bigframes/specs/bigframes-bigquery-generator.md b/packages/bigframes/specs/bigframes-bigquery-generator.md index 75ad578f844c..26d1d45c9f51 100644 --- a/packages/bigframes/specs/bigframes-bigquery-generator.md +++ b/packages/bigframes/specs/bigframes-bigquery-generator.md @@ -16,6 +16,40 @@ through all the same yaml files and checks that the functions have been included in the `bigframes.bigquery` module, as the `__init__.py` file requires manual updates. 
+## Running the generator + +Since the dependencies for the script differ from those of bigframes +and its test suite, use the self-contained Python script technique described at +https://docs.astral.sh/uv/guides/scripts/ +to automatically manage dependencies using `uv`. Therefore, the header of the +script will look something like: + +```python +#!/usr/bin/env -S uv run --script +# +# /// script +# dependencies = [ +# "jinja2", +# "pyyaml", +# ] +# /// +# +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# ... +``` + +To run the script: + +```bash +cd packages/bigframes +uv run scripts/generate_bigframes_bigquery.py +``` + +To improve reproducibility, we also check in the uv lock file generated by +running `uv lock --script scripts/generate_bigframes_bigquery.py`. + ## Generated code organization The `generate_bigframes_bigquery.py` script generates submodules of @@ -31,6 +65,11 @@ to the `bigframes.bigquery._operations.builtins.bit` submodule. Each generated module has all functions defined in the YAML file converted to the equivalent Python definition, including keyword arguments and docstrings. +### Code generation + +The code will be templated using the jinja2 template engine. This allows +proposed changes to the templated code to be reviewed more easily. 
+ ### Handling optional arguments When the user calls a Python function without specifying the optional From d2f1e985b5802fec5e57468dea023e34df127302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 18 May 2026 20:00:28 +0000 Subject: [PATCH 05/20] first pass at code generation --- .../bigframes/bigquery/_operations/aead.py | 138 ++++++++ .../scripts/generate_bigframes_bigquery.py | 301 ++++++++++++++++++ .../generate_bigframes_bigquery.py.lock | 77 +++++ 3 files changed, 516 insertions(+) create mode 100644 packages/bigframes/bigframes/bigquery/_operations/aead.py create mode 100755 packages/bigframes/scripts/generate_bigframes_bigquery.py create mode 100644 packages/bigframes/scripts/generate_bigframes_bigquery.py.lock diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py new file mode 100644 index 000000000000..1bde8d748638 --- /dev/null +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -0,0 +1,138 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: scripts/data/sql-functions/aead.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +from __future__ import annotations + +import datetime +from typing import Any, Optional, TypeVar, Union + +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +import bigframes.operations as ops +import bigframes.series as series +from bigframes import dtypes +from bigframes.operations import googlesql + +T = TypeVar("T", series.Series, bigframes.core.col.Expression) + +_DECRYPT_BYTES_OP = googlesql.GoogleSqlScalarOp( + "AEAD.DECRYPT_BYTES", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BYTES_DTYPE, +) +_DECRYPT_STRING_OP = googlesql.GoogleSqlScalarOp( + "AEAD.DECRYPT_STRING", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.STRING_DTYPE, +) +_ENCRYPT_OP = googlesql.GoogleSqlScalarOp( + "AEAD.ENCRYPT", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BYTES_DTYPE, +) + + +def _apply_googlesql_op( + op: googlesql.GoogleSqlScalarOp, + *args: Any, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Applies a GoogleSQL scalar operator to the given arguments. + + Handles a mix of Series, Expression, and literal inputs. 
+ """ + # Find the first Series to use for alignment + first_series = None + for arg in args: + if isinstance(arg, series.Series): + first_series = arg + break + + if first_series is not None: + processed_args = [] + block = first_series._block + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + # Project expression onto the block + block, col_id = block.project_expr(arg._expr) + processed_args.append(series.Series(block.select_column(col_id))) + elif arg is sentinels.DEFAULT: + # OmittedArg is handled by GoogleSqlScalarOp in compiler + processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) + else: + processed_args.append(arg) + + # Apply the n-ary op. _apply_nary_op handles alignment of Series and literals. + result = first_series._apply_nary_op(op, processed_args, ignore_self=True) + result.name = None + return result + + # No Series, return an Expression + expr_args = [] + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + expr_args.append(arg._expr) + elif arg is sentinels.DEFAULT: + expr_args.append(ex.OmittedArg()) + else: + expr_args.append(ex.const(arg)) + + return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) + + +def decrypt_bytes( + keyset: Union[T, Union[bytes, dict]], + ciphertext: Union[T, bytes], + additional_data: Union[T, bytes], +) -> T: + """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. 
Returns an error if decryption or verification fails.""" + return _apply_googlesql_op( + _DECRYPT_BYTES_OP, + keyset, + ciphertext, + additional_data, + ) # type: ignore + + +def decrypt_string( + keyset: Union[T, Union[bytes, dict]], + ciphertext: Union[T, bytes], + additional_data: Union[T, str], +) -> T: + """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" + return _apply_googlesql_op( + _DECRYPT_STRING_OP, + keyset, + ciphertext, + additional_data, + ) # type: ignore + + +def encrypt( + keyset: Union[T, Union[bytes, dict]], + plaintext: Union[T, Union[bytes, str]], + additional_data: Union[T, Union[bytes, str]], +) -> T: + """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL.""" + return _apply_googlesql_op( + _ENCRYPT_OP, + keyset, + plaintext, + additional_data, + ) # type: ignore diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py new file mode 100755 index 000000000000..b15aace5f17c --- /dev/null +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -0,0 +1,301 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# dependencies = [ +# "jinja2", +# "pyyaml", +# ] +# /// +# +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pathlib +import re +import yaml +import jinja2 + +# Directory containing the YAML files +DATA_DIR = pathlib.Path("scripts/data/sql-functions") +# Directory where the generated Python files will be placed +OUTPUT_DIR = pathlib.Path("bigframes/bigquery/_operations") + +LICENSE_HEADER = """# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +TEMPLATE = """{{ license_header }} +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from __future__ import annotations + +import datetime +from typing import Any, Optional, TypeVar, Union + +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +import bigframes.operations as ops +import bigframes.series as series +from bigframes import dtypes +from bigframes.operations import googlesql + +T = TypeVar("T", series.Series, bigframes.core.col.Expression) + +{% for op in ops %} +{{ op.internal_name }} = googlesql.GoogleSqlScalarOp( + "{{ op.sql_name }}", + args=({{ op.arg_specs }}), + signature={{ op.signature }}, +) +{% endfor %} + +def _apply_googlesql_op( + op: googlesql.GoogleSqlScalarOp, + *args: Any, +) -> Union[series.Series, bigframes.core.col.Expression]: + \"\"\"Applies a GoogleSQL scalar operator to the given arguments. + + Handles a mix of Series, Expression, and literal inputs. 
+ \"\"\" + # Find the first Series to use for alignment + first_series = None + for arg in args: + if isinstance(arg, series.Series): + first_series = arg + break + + if first_series is not None: + processed_args = [] + block = first_series._block + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + # Project expression onto the block + block, col_id = block.project_expr(arg._expr) + processed_args.append(series.Series(block.select_column(col_id))) + elif arg is sentinels.DEFAULT: + # OmittedArg is handled by GoogleSqlScalarOp in compiler + processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) + else: + processed_args.append(arg) + + # Apply the n-ary op. _apply_nary_op handles alignment of Series and literals. + result = first_series._apply_nary_op(op, processed_args, ignore_self=True) + result.name = None + return result + + # No Series, return an Expression + expr_args = [] + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + expr_args.append(arg._expr) + elif arg is sentinels.DEFAULT: + expr_args.append(ex.OmittedArg()) + else: + expr_args.append(ex.const(arg)) + + return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) + +{% for func in functions %} +def {{ func.name }}( +{% for arg in func.args %} + {{ arg.name }}: Union[T, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, +{% endfor %} +) -> T: + \"\"\"{{ func.description }}\"\"\" + return _apply_googlesql_op( + {{ func.op_name }}, +{% for arg in func.args %} + {{ arg.name }}, +{% endfor %} + ) # type: ignore + +{% endfor %} +""" + +DTYPE_MAP = { + "binary": "dtypes.BYTES_DTYPE", + "string": "dtypes.STRING_DTYPE", + "int64": "dtypes.INT_DTYPE", + "float64": "dtypes.FLOAT_DTYPE", + "bool": "dtypes.BOOL_DTYPE", + "geography": "dtypes.GEO_DTYPE", + "json": "dtypes.JSON_DTYPE", + "date": "dtypes.DATE_DTYPE", + "time": "dtypes.TIME_DTYPE", + "datetime": "dtypes.DATETIME_DTYPE", + "timestamp": 
"dtypes.TIMESTAMP_DTYPE", +} + +PY_TYPE_MAP = { + "binary": "bytes", + "string": "str", + "int64": "int", + "float64": "float", + "bool": "bool", + "geography": "Any", + "json": "Any", + "date": "datetime.date", + "time": "datetime.time", + "datetime": "datetime.datetime", + "timestamp": "datetime.datetime", + "struct": "dict", +} + + +def to_snake_case(name): + # Replace dots with underscores + name = name.replace(".", "_") + # Handle CamelCase to snake_case + name = re.sub(r"(? 1 + else types[0] + ) + default = "sentinels.DEFAULT" if arg_info["optional"] else "" + func_args.append( + { + "name": name, + "type_hint": type_hint, + "default": default, + } + ) + + # Clean up default values for mandatory args + # In Python, mandatory args come first. + for arg in func_args: + if not arg["default"]: + del arg["default"] + + functions_list.append( + { + "name": python_name, + "op_name": internal_op_name, + "description": func_data["description"], + "args": func_args, + } + ) + + # Render and write + output_file.parent.mkdir(parents=True, exist_ok=True) + content = template.render( + license_header=LICENSE_HEADER, + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + ops=ops_list, + functions=functions_list, + ) + with open(output_file, "w") as f: + f.write(content) + print(f" Generated {output_file}") + + +if __name__ == "__main__": + main() diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock new file mode 100644 index 000000000000..0d28e42101bd --- /dev/null +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock @@ -0,0 +1,77 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" + +[manifest] +requirements = [ + { name = "jinja2" }, + { name = "pyyaml" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, + { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, + { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, + { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, + { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, + { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, + { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, + { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, + { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, + { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, + { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, + { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, + { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, + { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = 
"2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url 
= "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] From 3a06e11d4ae9b8449def1f6173fe0a38f55a8156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 15:20:09 +0000 Subject: [PATCH 06/20] manual edits to generator --- .pre-commit-config.yaml | 1 + .../bigframes/bigquery/_googlesql.py | 81 ++++++++++++++++++ .../bigframes/bigquery/_operations/aead.py | 82 ++++--------------- packages/bigframes/bigframes/dtypes.py | 5 +- .../scripts/generate_bigframes_bigquery.py | 54 ++---------- 5 files changed, 106 insertions(+), 117 deletions(-) create mode 100644 packages/bigframes/bigframes/bigquery/_googlesql.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 15b074bf647e..101c6bdc13d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,3 +29,4 @@ repos: rev: 6.1.0 # version-scanner: ignore hooks: - id: flake8 + args: [--config, packages/google-cloud-alloydb/.flake8] diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/bigquery/_googlesql.py new file mode 100644 index 000000000000..ebf0bb81dc7e --- /dev/null +++ b/packages/bigframes/bigframes/bigquery/_googlesql.py @@ -0,0 +1,81 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for working with GoogleSqlScalarOps.""" + +from __future__ import annotations + +from typing import Any, Union + +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +from bigframes.operations import googlesql +import bigframes.series as series + + +def apply_googlesql_scalar_op( + op: googlesql.GoogleSqlScalarOp, + *args: Any, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Applies a GoogleSQL scalar operator to the given arguments. + + Handles a mix of Series, Expression, and literal inputs. + + Args: + op (googlesql.GoogleSqlScalarOp): + The operator to apply. + *args (Any): + The arguments to apply the operator to. + + Returns: + bigframes.pandas.Series | bigframes.core.col.Expression: + The result of the operation. If any of ``args`` is a Series, returns + a Series. Otherwise, returns an Expression. + """ + # Find the first Series to use for alignment + first_series = None + for arg in args: + if isinstance(arg, series.Series): + first_series = arg + break + + if first_series is not None: + processed_args = [] + block = first_series._block + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + block, col_id = block.project_expr(bigframes.core.col._as_bf_expr(arg)) + processed_args.append(series.Series(block.select_column(col_id))) + elif arg is sentinels.DEFAULT: + processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) + else: + processed_args.append(arg) + + # Apply the n-ary op. 
_apply_nary_op handles alignment of Series and literals. + result = first_series._apply_nary_op(op, processed_args, ignore_self=True) + result.name = None + return result + + # No Series, return an Expression + expr_args = [] + for arg in args: + if isinstance(arg, bigframes.core.col.Expression): + expr_args.append(bigframes.core.col._as_bf_expr(arg)) + elif arg is sentinels.DEFAULT: + expr_args.append(ex.OmittedArg()) + else: + expr_args.append(ex.const(arg)) + + return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py index 1bde8d748638..cbc995b1d5d7 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/aead.py +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -19,16 +19,13 @@ from __future__ import annotations -import datetime -from typing import Any, Optional, TypeVar, Union +from typing import TypeVar, Union -import bigframes.core.col -import bigframes.core.expression as ex -import bigframes.core.sentinels as sentinels -import bigframes.operations as ops -import bigframes.series as series from bigframes import dtypes +import bigframes.bigquery._googlesql +import bigframes.core.col from bigframes.operations import googlesql +import bigframes.series as series T = TypeVar("T", series.Series, bigframes.core.col.Expression) @@ -49,60 +46,13 @@ ) -def _apply_googlesql_op( - op: googlesql.GoogleSqlScalarOp, - *args: Any, -) -> Union[series.Series, bigframes.core.col.Expression]: - """Applies a GoogleSQL scalar operator to the given arguments. - - Handles a mix of Series, Expression, and literal inputs. 
- """ - # Find the first Series to use for alignment - first_series = None - for arg in args: - if isinstance(arg, series.Series): - first_series = arg - break - - if first_series is not None: - processed_args = [] - block = first_series._block - for arg in args: - if isinstance(arg, bigframes.core.col.Expression): - # Project expression onto the block - block, col_id = block.project_expr(arg._expr) - processed_args.append(series.Series(block.select_column(col_id))) - elif arg is sentinels.DEFAULT: - # OmittedArg is handled by GoogleSqlScalarOp in compiler - processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) - else: - processed_args.append(arg) - - # Apply the n-ary op. _apply_nary_op handles alignment of Series and literals. - result = first_series._apply_nary_op(op, processed_args, ignore_self=True) - result.name = None - return result - - # No Series, return an Expression - expr_args = [] - for arg in args: - if isinstance(arg, bigframes.core.col.Expression): - expr_args.append(arg._expr) - elif arg is sentinels.DEFAULT: - expr_args.append(ex.OmittedArg()) - else: - expr_args.append(ex.const(arg)) - - return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) - - def decrypt_bytes( - keyset: Union[T, Union[bytes, dict]], - ciphertext: Union[T, bytes], - additional_data: Union[T, bytes], + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, bytes], + additional_data: Union[T, bigframes.core.col.Expression, bytes], ) -> T: """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. 
Returns an error if decryption or verification fails.""" - return _apply_googlesql_op( + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( _DECRYPT_BYTES_OP, keyset, ciphertext, @@ -111,12 +61,12 @@ def decrypt_bytes( def decrypt_string( - keyset: Union[T, Union[bytes, dict]], - ciphertext: Union[T, bytes], - additional_data: Union[T, str], + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, bytes], + additional_data: Union[T, bigframes.core.col.Expression, str], ) -> T: """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" - return _apply_googlesql_op( + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( _DECRYPT_STRING_OP, keyset, ciphertext, @@ -125,12 +75,12 @@ def decrypt_string( def encrypt( - keyset: Union[T, Union[bytes, dict]], - plaintext: Union[T, Union[bytes, str]], - additional_data: Union[T, Union[bytes, str]], + keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], + plaintext: Union[T, bigframes.core.col.Expression, Union[bytes, str]], + additional_data: Union[T, bigframes.core.col.Expression, Union[bytes, str]], ) -> T: """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. 
Returns NULL if any input is NULL.""" - return _apply_googlesql_op( + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( _ENCRYPT_OP, keyset, plaintext, diff --git a/packages/bigframes/bigframes/dtypes.py b/packages/bigframes/bigframes/dtypes.py index e7539c59c7d7..95689b91dbd2 100644 --- a/packages/bigframes/bigframes/dtypes.py +++ b/packages/bigframes/bigframes/dtypes.py @@ -14,13 +14,13 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" +from dataclasses import dataclass import datetime import decimal import textwrap import typing -import warnings -from dataclasses import dataclass from typing import Any, Dict, List, Literal, Sequence, Union +import warnings import bigframes_vendored.constants as constants import db_dtypes # type: ignore @@ -39,6 +39,7 @@ pd.Float64Dtype, pd.Int64Dtype, pd.StringDtype, + pd.StringDtype[Literal["pyarrow"]], pd.ArrowDtype, gpd.array.GeometryDtype, ] diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index b15aace5f17c..5479c040eddd 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -23,8 +23,9 @@ import pathlib import re -import yaml + import jinja2 +import yaml # Directory containing the YAML files DATA_DIR = pathlib.Path("scripts/data/sql-functions") @@ -64,6 +65,7 @@ import bigframes.series as series from bigframes import dtypes from bigframes.operations import googlesql +import bigframes.bigquery._googlesql T = TypeVar("T", series.Series, bigframes.core.col.Expression) @@ -75,60 +77,14 @@ ) {% endfor %} -def _apply_googlesql_op( - op: googlesql.GoogleSqlScalarOp, - *args: Any, -) -> Union[series.Series, bigframes.core.col.Expression]: - \"\"\"Applies a GoogleSQL scalar operator to the given arguments. - - Handles a mix of Series, Expression, and literal inputs. 
- \"\"\" - # Find the first Series to use for alignment - first_series = None - for arg in args: - if isinstance(arg, series.Series): - first_series = arg - break - - if first_series is not None: - processed_args = [] - block = first_series._block - for arg in args: - if isinstance(arg, bigframes.core.col.Expression): - # Project expression onto the block - block, col_id = block.project_expr(arg._expr) - processed_args.append(series.Series(block.select_column(col_id))) - elif arg is sentinels.DEFAULT: - # OmittedArg is handled by GoogleSqlScalarOp in compiler - processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) - else: - processed_args.append(arg) - - # Apply the n-ary op. _apply_nary_op handles alignment of Series and literals. - result = first_series._apply_nary_op(op, processed_args, ignore_self=True) - result.name = None - return result - - # No Series, return an Expression - expr_args = [] - for arg in args: - if isinstance(arg, bigframes.core.col.Expression): - expr_args.append(arg._expr) - elif arg is sentinels.DEFAULT: - expr_args.append(ex.OmittedArg()) - else: - expr_args.append(ex.const(arg)) - - return bigframes.core.col.Expression(ex.OpExpression(op, tuple(expr_args))) - {% for func in functions %} def {{ func.name }}( {% for arg in func.args %} - {{ arg.name }}: Union[T, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, + {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, {% endfor %} ) -> T: \"\"\"{{ func.description }}\"\"\" - return _apply_googlesql_op( + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( {{ func.op_name }}, {% for arg in func.args %} {{ arg.name }}, From 47c47af85bd1ac487d407f54a7b06c1276743864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 15:35:13 +0000 Subject: [PATCH 07/20] remove imports after generation --- .../scripts/generate_bigframes_bigquery.py | 22 
++++++++++++++----- .../generate_bigframes_bigquery.py.lock | 22 +++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 5479c040eddd..7f0f54aec87d 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -2,6 +2,7 @@ # # /// script # dependencies = [ +# "autoflake", # "jinja2", # "pyyaml", # ] @@ -23,6 +24,7 @@ import pathlib import re +import subprocess import jinja2 import yaml @@ -58,14 +60,14 @@ import datetime from typing import Any, Optional, TypeVar, Union +from bigframes import dtypes +import bigframes.bigquery._googlesql import bigframes.core.col import bigframes.core.expression as ex import bigframes.core.sentinels as sentinels +from bigframes.operations import googlesql import bigframes.operations as ops import bigframes.series as series -from bigframes import dtypes -from bigframes.operations import googlesql -import bigframes.bigquery._googlesql T = TypeVar("T", series.Series, bigframes.core.col.Expression) @@ -76,8 +78,9 @@ signature={{ op.signature }}, ) {% endfor %} - {% for func in functions %} + + def {{ func.name }}( {% for arg in func.args %} {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, @@ -90,7 +93,6 @@ def {{ func.name }}( {{ arg.name }}, {% endfor %} ) # type: ignore - {% endfor %} """ @@ -250,6 +252,16 @@ def main(): ) with open(output_file, "w") as f: f.write(content) + + subprocess.run( + [ + "autoflake", + "--in-place", + "--remove-all-unused-imports", + str(output_file), + ], + check=True, + ) print(f" Generated {output_file}") diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock index 0d28e42101bd..3cba9097522d 100644 --- 
a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock @@ -4,10 +4,23 @@ requires-python = ">=3.14" [manifest] requirements = [ + { name = "autoflake" }, { name = "jinja2" }, { name = "pyyaml" }, ] +[[package]] +name = "autoflake" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyflakes" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/0b/70c277eef225133763bf05c02c88df182e57d5c5c0730d3998958096a82e/autoflake-2.3.3.tar.gz", hash = "sha256:c24809541e23999f7a7b0d2faadf15deb0bc04cdde49728a2fd943a0c8055504", size = 16515, upload-time = "2026-02-20T05:01:43.448Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/21/26f1680ec3a598ea31768f9ebcd427e42986d077a005416094b580635532/autoflake-2.3.3-py3-none-any.whl", hash = "sha256:a51a3412aff16135ee5b3ec25922459fef10c1f23ce6d6c4977188df859e8b53", size = 17715, upload-time = "2026-02-20T05:01:42.137Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -50,6 +63,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, ] +[[package]] +name = "pyflakes" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/dc/fd034dc20b4b264b3d015808458391acbf9df40b1e54750ef175d39180b1/pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58", size = 64669, upload-time = "2025-06-20T18:45:27.834Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/2f/81d580a0fb83baeb066698975cb14a618bdbed7720678566f1b046a95fe8/pyflakes-3.4.0-py2.py3-none-any.whl", hash = 
"sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f", size = 63551, upload-time = "2025-06-20T18:45:26.937Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" From 639cfc7ca69291e54994646f9738e43d4f583b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 15:52:10 +0000 Subject: [PATCH 08/20] generate tests --- packages/bigframes/bigframes/dtypes.py | 1 - .../scripts/generate_bigframes_bigquery.py | 93 ++++++ .../unit/bigquery/_operations/conftest.py | 280 ++++++++++++++++++ .../test_aead/test_decrypt_bytes/out.sql | 4 + .../test_aead/test_decrypt_string/out.sql | 4 + .../snapshots/test_aead/test_encrypt/out.sql | 4 + .../unit/bigquery/_operations/test_aead.py | 54 ++++ 7 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 packages/bigframes/tests/unit/bigquery/_operations/conftest.py create mode 100644 packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql create mode 100644 packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql create mode 100644 packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql create mode 100644 packages/bigframes/tests/unit/bigquery/_operations/test_aead.py diff --git a/packages/bigframes/bigframes/dtypes.py b/packages/bigframes/bigframes/dtypes.py index 95689b91dbd2..e2bd9dd601f7 100644 --- a/packages/bigframes/bigframes/dtypes.py +++ b/packages/bigframes/bigframes/dtypes.py @@ -39,7 +39,6 @@ pd.Float64Dtype, pd.Int64Dtype, pd.StringDtype, - pd.StringDtype[Literal["pyarrow"]], pd.ArrowDtype, gpd.array.GeometryDtype, ] diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 7f0f54aec87d..ff5dcb1232cd 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -33,6 +33,8 @@ DATA_DIR = 
pathlib.Path("scripts/data/sql-functions") # Directory where the generated Python files will be placed OUTPUT_DIR = pathlib.Path("bigframes/bigquery/_operations") +# Directory where the generated test files will be placed +TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") LICENSE_HEADER = """# Copyright 2026 Google LLC # @@ -96,6 +98,35 @@ def {{ func.name }}( {% endfor %} """ +TEST_TEMPLATE = """{{ license_header }} +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from typing import cast + +import pytest + +import bigframes.pandas as bpd +import {{ import_path }} as {{ short_name }} + +pytest.importorskip("pytest_snapshot") + + +{% for func in functions %} +def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): + result = {{ short_name }}.{{ func.name }}( +{% for arg in func.test_args %} + cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), +{% endfor %} + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +{% endfor %} +""" + DTYPE_MAP = { "binary": "dtypes.BYTES_DTYPE", "string": "dtypes.STRING_DTYPE", @@ -125,6 +156,19 @@ def {{ func.name }}( "struct": "dict", } +YAML_TYPE_TO_COL = { + "binary": "bytes_col", + "string": "string_col", + "int64": "int64_col", + "float64": "float64_col", + "bool": "bool_col", + "geography": "geography_col", + "date": "date_col", + "time": "time_col", + "datetime": "datetime_col", + "timestamp": "timestamp_col", +} + def to_snake_case(name): # Replace dots with underscores @@ -139,6 +183,7 @@ def to_snake_case(name): def main(): env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True) template = env.from_string(TEMPLATE) + test_template = env.from_string(TEST_TEMPLATE) for yaml_file in DATA_DIR.glob("**/*.yaml"): print(f"Processing {yaml_file}...") @@ -232,12 +277,21 @@ def main(): if not arg["default"]: del arg["default"] + # Test args + test_args = [] + for name in arg_order: + arg_info = 
args_by_name[name] + some_type = list(arg_info["types"])[0] + col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") + test_args.append({"col_name": col_name}) + functions_list.append( { "name": python_name, "op_name": internal_op_name, "description": func_data["description"], "args": func_args, + "test_args": test_args, } ) @@ -264,6 +318,45 @@ def main(): ) print(f" Generated {output_file}") + # Render and write test + import_path = "bigframes.bigquery._operations." + ".".join(module_path.parts) + test_output_file = TEST_OUTPUT_DIR.joinpath( + module_path.with_name(f"test_{module_path.name}") + ).with_suffix(".py") + + test_output_file.parent.mkdir(parents=True, exist_ok=True) + test_content = test_template.render( + license_header=LICENSE_HEADER, + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + import_path=import_path, + short_name=module_path.name, + functions=functions_list, + ) + with open(test_output_file, "w") as f: + f.write(test_content) + + subprocess.run( + [ + "autoflake", + "--in-place", + "--remove-all-unused-imports", + str(test_output_file), + ], + check=True, + ) + print(f" Generated {test_output_file}") + + print(f" Updating snapshots for {test_output_file}...") + subprocess.run( + [ + "pytest", + str(test_output_file), + "--snapshot-update", + ], + check=False, + ) + if __name__ == "__main__": main() diff --git a/packages/bigframes/tests/unit/bigquery/_operations/conftest.py b/packages/bigframes/tests/unit/bigquery/_operations/conftest.py new file mode 100644 index 000000000000..127902241acb --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/conftest.py @@ -0,0 +1,280 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +import typing + +import pandas as pd +import pyarrow as pa +import pytest +from google.cloud import bigquery + +import bigframes.core as core +import bigframes.pandas as bpd +import bigframes.testing.mocks as mocks +import bigframes.testing.utils +from bigframes import dtypes + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" + + +def _create_compiler_session(table_name, table_schema): + """Helper function to create a compiler session.""" + from bigframes.testing import compiler_session + + anonymous_dataset = bigquery.DatasetReference.from_string( + "bigframes-dev.sqlglot_test" + ) + session = mocks.create_bigquery_session( + table_name=table_name, + table_schema=table_schema, + anonymous_dataset=anonymous_dataset, + ) + session._executor = compiler_session.SQLCompilerExecutor() + return session + + +@pytest.fixture(scope="session") +def compiler_session(scalar_types_table_schema): + """Compiler session for scalar types.""" + return _create_compiler_session("scalar_types", scalar_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_repeated_types(repeated_types_table_schema): + """Compiler session for repeated data types.""" + return _create_compiler_session("repeated_types", repeated_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): + """Compiler session for nested STRUCT data types.""" + return _create_compiler_session( + "nested_structs_types", 
nested_structs_types_table_schema + ) + + +@pytest.fixture(scope="session") +def compiler_session_w_json_types(json_types_table_schema): + """Compiler session for JSON data types.""" + return _create_compiler_session("json_types", json_types_table_schema) + + +@pytest.fixture(scope="session") +def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("geography_col", "GEOGRAPHY"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("int64_too", "INTEGER"), + bigquery.SchemaField("numeric_col", "NUMERIC"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("rowindex_2", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("duration_col", "INTEGER"), + ] + + +@pytest.fixture(scope="session") +def scalar_types_df(compiler_session) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.scalar_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def scalar_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + # TODO: add tests for empty dataframes + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + return df + + +@pytest.fixture(scope="module") 
+def scalar_types_array_value( + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session +) -> core.ArrayValue: + managed_data_source = core.local_data.ManagedArrowTable.from_pandas( + scalar_types_pandas_df + ) + return core.ArrayValue.from_managed(managed_data_source, compiler_session) + + +@pytest.fixture(scope="session") +def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("id", "INTEGER"), + bigquery.SchemaField( + "people", + "RECORD", + fields=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("age", "INTEGER"), + bigquery.SchemaField( + "address", + "RECORD", + fields=[ + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("country", "STRING"), + ], + ), + ], + ), + ] + + +@pytest.fixture(scope="session") +def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.nested_structs_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("id", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing STRUCT types and using the `id` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) + return df + + +@pytest.fixture(scope="session") +def repeated_types_table_schema() -> 
typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), + bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), + bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), + bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), + bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), + bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), + bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), + ] + + +@pytest.fixture(scope="session") +def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_repeated_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.repeated_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def repeated_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing LIST types and using the `rowindex` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + # TODO: add dtype conversion here if needed. + df = df.set_index("rowindex") + return df + + +@pytest.fixture(scope="session") +def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("json_col", "JSON"), + ] + + +@pytest.fixture(scope="session") +def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_json_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.json_types", + enable_snapshot=False, + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? 
+ bf_df = bf_df.set_index("rowindex", drop=True) + return bf_df + + +@pytest.fixture(scope="session") +def json_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing JSON types and using the `rowindex` + column as the index.""" + json_data = [ + "null", + "true", + "100", + "0.98", + '"a string"', + "[]", + "[1, 2, 3]", + '[{"a": 1}, {"a": 2}, {"a": null}, {}]', + '"100"', + '{"date": "2024-07-16"}', + '{"int_value": 2, "null_filed": null}', + '{"list_data": [10, 20, 30]}', + ] + df = pd.DataFrame( + { + "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), + "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), + }, + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? + df = df.set_index("rowindex", drop=True) + return df diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql new file mode 100644 index 000000000000..d74f1fa20eee --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.DECRYPT_BYTES(`string_col`, `bytes_col`, `bytes_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql new file mode 100644 index 000000000000..1c2b75812b9e --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.DECRYPT_STRING(`string_col`, `bytes_col`, `string_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git 
a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql new file mode 100644 index 000000000000..e62f74d8fd9d --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql @@ -0,0 +1,4 @@ +SELECT + `rowindex`, + AEAD.ENCRYPT(`string_col`, `bytes_col`, `bytes_col`) AS `0` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py new file mode 100644 index 000000000000..1b9168687b41 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -0,0 +1,54 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: scripts/data/sql-functions/aead.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +from typing import cast + +import pytest + +import bigframes.pandas as bpd +import bigframes.bigquery._operations.aead as aead + +pytest.importorskip("pytest_snapshot") + + +def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.decrypt_bytes( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.decrypt_string( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["string_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") + + +def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): + result = aead.encrypt( + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + ).to_frame() + snapshot.assert_match(result.sql, "out.sql") From e84cbdcdba4e3c8a7d3aa0fba35a112e82eb493e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:03:03 +0000 Subject: [PATCH 09/20] align pre-commit with migration to ruff --- .pre-commit-config.yaml | 16 +++++++--------- .../bigframes/bigframes/bigquery/_googlesql.py | 2 +- .../bigframes/bigquery/_operations/aead.py | 4 ++-- packages/bigframes/bigframes/dtypes.py | 4 ++-- .../tests/unit/bigquery/_operations/test_aead.py | 5 ++--- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 101c6bdc13d1..e0fdf49d917c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,12 +21,10 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - 
id: check-yaml -- repo: https://github.com/psf/black - rev: 23.7.0 - hooks: - - id: black -- repo: https://github.com/pycqa/flake8 - rev: 6.1.0 # version-scanner: ignore - hooks: - - id: flake8 - args: [--config, packages/google-cloud-alloydb/.flake8] +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.14.14 + hooks: + # Run the linter. + - id: ruff-check + args: [ --select, I, --fix, --target-version=py310, --line-length=88 ] diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/bigquery/_googlesql.py index ebf0bb81dc7e..4f15d1f3c277 100644 --- a/packages/bigframes/bigframes/bigquery/_googlesql.py +++ b/packages/bigframes/bigframes/bigquery/_googlesql.py @@ -21,8 +21,8 @@ import bigframes.core.col import bigframes.core.expression as ex import bigframes.core.sentinels as sentinels -from bigframes.operations import googlesql import bigframes.series as series +from bigframes.operations import googlesql def apply_googlesql_scalar_op( diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py index cbc995b1d5d7..1a7c02ec2e63 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/aead.py +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -21,11 +21,11 @@ from typing import TypeVar, Union -from bigframes import dtypes import bigframes.bigquery._googlesql import bigframes.core.col -from bigframes.operations import googlesql import bigframes.series as series +from bigframes import dtypes +from bigframes.operations import googlesql T = TypeVar("T", series.Series, bigframes.core.col.Expression) diff --git a/packages/bigframes/bigframes/dtypes.py b/packages/bigframes/bigframes/dtypes.py index e2bd9dd601f7..e7539c59c7d7 100644 --- a/packages/bigframes/bigframes/dtypes.py +++ b/packages/bigframes/bigframes/dtypes.py @@ -14,13 +14,13 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" 
-from dataclasses import dataclass import datetime import decimal import textwrap import typing -from typing import Any, Dict, List, Literal, Sequence, Union import warnings +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Sequence, Union import bigframes_vendored.constants as constants import db_dtypes # type: ignore diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py index 1b9168687b41..95ab84c447d3 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -19,10 +19,9 @@ from typing import cast -import pytest - -import bigframes.pandas as bpd import bigframes.bigquery._operations.aead as aead +import bigframes.pandas as bpd +import pytest pytest.importorskip("pytest_snapshot") From 8fa1159204860c22a75f14573c03ec24052a5ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:09:08 +0000 Subject: [PATCH 10/20] sort imports --- packages/bigframes/pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/bigframes/pyproject.toml b/packages/bigframes/pyproject.toml index fed528d4a7a1..fb658d69dafe 100644 --- a/packages/bigframes/pyproject.toml +++ b/packages/bigframes/pyproject.toml @@ -1,3 +1,6 @@ [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" + +[tool.ruff.lint.isort] +known-first-party = ["bigframes", "bigframes_vendored"] From 8e4b7c551d9b2fe614de5e83eda6a06ffd6ab030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:10:08 +0000 Subject: [PATCH 11/20] sort imports --- .../bigframes/tests/unit/bigquery/_operations/test_aead.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py index 
95ab84c447d3..d30474183ae2 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -19,9 +19,10 @@ from typing import cast +import pytest + import bigframes.bigquery._operations.aead as aead import bigframes.pandas as bpd -import pytest pytest.importorskip("pytest_snapshot") From 1332bc5eb7e2d9769c34387e5dc52d546f82f7bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:14:49 +0000 Subject: [PATCH 12/20] update gen to run ruff --- .../bigframes/bigquery/_operations/aead.py | 6 ++- .../scripts/generate_bigframes_bigquery.py | 22 +++++---- .../generate_bigframes_bigquery.py.lock | 49 ++++++++++--------- .../test_aead/test_decrypt_bytes/out.sql | 2 +- .../test_aead/test_decrypt_string/out.sql | 2 +- .../snapshots/test_aead/test_encrypt/out.sql | 2 +- .../unit/bigquery/_operations/test_aead.py | 6 +-- 7 files changed, 51 insertions(+), 38 deletions(-) diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py index 1a7c02ec2e63..a7b3f9dfae54 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/aead.py +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -19,10 +19,14 @@ from __future__ import annotations -from typing import TypeVar, Union +import datetime +from typing import Any, Optional, TypeVar, Union import bigframes.bigquery._googlesql import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +import bigframes.operations as ops import bigframes.series as series from bigframes import dtypes from bigframes.operations import googlesql diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index ff5dcb1232cd..02914b24dae7 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ 
b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -2,9 +2,9 @@ # # /// script # dependencies = [ -# "autoflake", # "jinja2", # "pyyaml", +# "ruff==0.14.14", # ] # /// # @@ -36,6 +36,16 @@ # Directory where the generated test files will be placed TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") +RUFF_ARGS = [ + "ruff", + "check", + "--select", + "I", + "--fix", + "--target-version=py310", + "--line-length=88", +] + LICENSE_HEADER = """# Copyright 2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -308,10 +318,7 @@ def main(): f.write(content) subprocess.run( - [ - "autoflake", - "--in-place", - "--remove-all-unused-imports", + RUFF_ARGS + [ str(output_file), ], check=True, @@ -337,10 +344,7 @@ def main(): f.write(test_content) subprocess.run( - [ - "autoflake", - "--in-place", - "--remove-all-unused-imports", + RUFF_ARGS + [ str(test_output_file), ], check=True, diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock index 3cba9097522d..0c89fde6d406 100644 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock @@ -4,21 +4,9 @@ requires-python = ">=3.14" [manifest] requirements = [ - { name = "autoflake" }, { name = "jinja2" }, { name = "pyyaml" }, -] - -[[package]] -name = "autoflake" -version = "2.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyflakes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c3/0b/70c277eef225133763bf05c02c88df182e57d5c5c0730d3998958096a82e/autoflake-2.3.3.tar.gz", hash = "sha256:c24809541e23999f7a7b0d2faadf15deb0bc04cdde49728a2fd943a0c8055504", size = 16515, upload-time = "2026-02-20T05:01:43.448Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/da/21/26f1680ec3a598ea31768f9ebcd427e42986d077a005416094b580635532/autoflake-2.3.3-py3-none-any.whl", hash = "sha256:a51a3412aff16135ee5b3ec25922459fef10c1f23ce6d6c4977188df859e8b53", size = 17715, upload-time = "2026-02-20T05:01:42.137Z" }, + { name = "ruff", specifier = "==0.14.14" }, ] [[package]] @@ -63,15 +51,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, ] -[[package]] -name = "pyflakes" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/45/dc/fd034dc20b4b264b3d015808458391acbf9df40b1e54750ef175d39180b1/pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58", size = 64669, upload-time = "2025-06-20T18:45:27.834Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/2f/81d580a0fb83baeb066698975cb14a618bdbed7720678566f1b046a95fe8/pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f", size = 63551, upload-time = "2025-06-20T18:45:26.937Z" }, -] - [[package]] name = "pyyaml" version = "6.0.3" @@ -97,3 +76,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size 
= 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] + +[[package]] +name = "ruff" +version = "0.14.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/06/f71e3a86b2df0dfa2d2f72195941cd09b44f87711cb7fa5193732cb9a5fc/ruff-0.14.14.tar.gz", hash = "sha256:2d0f819c9a90205f3a867dbbd0be083bee9912e170fd7d9704cc8ae45824896b", size = 4515732, upload-time = "2026-01-22T22:30:17.527Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/89/20a12e97bc6b9f9f68343952da08a8099c57237aef953a56b82711d55edd/ruff-0.14.14-py3-none-linux_armv6l.whl", hash = "sha256:7cfe36b56e8489dee8fbc777c61959f60ec0f1f11817e8f2415f429552846aed", size = 10467650, upload-time = "2026-01-22T22:30:08.578Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b1/c5de3fd2d5a831fcae21beda5e3589c0ba67eec8202e992388e4b17a6040/ruff-0.14.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6006a0082336e7920b9573ef8a7f52eec837add1265cc74e04ea8a4368cd704c", size = 10883245, upload-time = "2026-01-22T22:30:04.155Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7c/3c1db59a10e7490f8f6f8559d1db8636cbb13dccebf18686f4e3c9d7c772/ruff-0.14.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:026c1d25996818f0bf498636686199d9bd0d9d6341c9c2c3b62e2a0198b758de", size = 10231273, upload-time = "2026-01-22T22:30:34.642Z" }, + { url = "https://files.pythonhosted.org/packages/a1/6e/5e0e0d9674be0f8581d1f5e0f0a04761203affce3232c1a1189d0e3b4dad/ruff-0.14.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f666445819d31210b71e0a6d1c01e24447a20b85458eea25a25fe8142210ae0e", size = 10585753, upload-time = "2026-01-22T22:30:31.781Z" }, + { url = "https://files.pythonhosted.org/packages/23/09/754ab09f46ff1884d422dc26d59ba18b4e5d355be147721bb2518aa2a014/ruff-0.14.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c0f18b922c6d2ff9a5e6c3ee16259adc513ca775bcf82c67ebab7cbd9da5bc8", size = 10286052, 
upload-time = "2026-01-22T22:30:24.827Z" }, + { url = "https://files.pythonhosted.org/packages/c8/cc/e71f88dd2a12afb5f50733851729d6b571a7c3a35bfdb16c3035132675a0/ruff-0.14.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1629e67489c2dea43e8658c3dba659edbfd87361624b4040d1df04c9740ae906", size = 11043637, upload-time = "2026-01-22T22:30:13.239Z" }, + { url = "https://files.pythonhosted.org/packages/67/b2/397245026352494497dac935d7f00f1468c03a23a0c5db6ad8fc49ca3fb2/ruff-0.14.14-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:27493a2131ea0f899057d49d303e4292b2cae2bb57253c1ed1f256fbcd1da480", size = 12194761, upload-time = "2026-01-22T22:30:22.542Z" }, + { url = "https://files.pythonhosted.org/packages/5b/06/06ef271459f778323112c51b7587ce85230785cd64e91772034ddb88f200/ruff-0.14.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ff589aab3f5b539e35db38425da31a57521efd1e4ad1ae08fc34dbe30bd7df", size = 12005701, upload-time = "2026-01-22T22:30:20.499Z" }, + { url = "https://files.pythonhosted.org/packages/41/d6/99364514541cf811ccc5ac44362f88df66373e9fec1b9d1c4cc830593fe7/ruff-0.14.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc12d74eef0f29f51775f5b755913eb523546b88e2d733e1d701fe65144e89b", size = 11282455, upload-time = "2026-01-22T22:29:59.679Z" }, + { url = "https://files.pythonhosted.org/packages/ca/71/37daa46f89475f8582b7762ecd2722492df26421714a33e72ccc9a84d7a5/ruff-0.14.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb8481604b7a9e75eff53772496201690ce2687067e038b3cc31aaf16aa0b974", size = 11215882, upload-time = "2026-01-22T22:29:57.032Z" }, + { url = "https://files.pythonhosted.org/packages/2c/10/a31f86169ec91c0705e618443ee74ede0bdd94da0a57b28e72db68b2dbac/ruff-0.14.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:14649acb1cf7b5d2d283ebd2f58d56b75836ed8c6f329664fa91cdea19e76e66", size = 11180549, upload-time = 
"2026-01-22T22:30:27.175Z" }, + { url = "https://files.pythonhosted.org/packages/fd/1e/c723f20536b5163adf79bdd10c5f093414293cdf567eed9bdb7b83940f3f/ruff-0.14.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8058d2145566510790eab4e2fad186002e288dec5e0d343a92fe7b0bc1b3e13", size = 10543416, upload-time = "2026-01-22T22:30:01.964Z" }, + { url = "https://files.pythonhosted.org/packages/3e/34/8a84cea7e42c2d94ba5bde1d7a4fae164d6318f13f933d92da6d7c2041ff/ruff-0.14.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e651e977a79e4c758eb807f0481d673a67ffe53cfa92209781dfa3a996cf8412", size = 10285491, upload-time = "2026-01-22T22:30:29.51Z" }, + { url = "https://files.pythonhosted.org/packages/55/ef/b7c5ea0be82518906c978e365e56a77f8de7678c8bb6651ccfbdc178c29f/ruff-0.14.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cc8b22da8d9d6fdd844a68ae937e2a0adf9b16514e9a97cc60355e2d4b219fc3", size = 10733525, upload-time = "2026-01-22T22:30:06.499Z" }, + { url = "https://files.pythonhosted.org/packages/6a/5b/aaf1dfbcc53a2811f6cc0a1759de24e4b03e02ba8762daabd9b6bd8c59e3/ruff-0.14.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:16bc890fb4cc9781bb05beb5ab4cd51be9e7cb376bf1dd3580512b24eb3fda2b", size = 11315626, upload-time = "2026-01-22T22:30:36.848Z" }, + { url = "https://files.pythonhosted.org/packages/2c/aa/9f89c719c467dfaf8ad799b9bae0df494513fb21d31a6059cb5870e57e74/ruff-0.14.14-py3-none-win32.whl", hash = "sha256:b530c191970b143375b6a68e6f743800b2b786bbcf03a7965b06c4bf04568167", size = 10502442, upload-time = "2026-01-22T22:30:38.93Z" }, + { url = "https://files.pythonhosted.org/packages/87/44/90fa543014c45560cae1fffc63ea059fb3575ee6e1cb654562197e5d16fb/ruff-0.14.14-py3-none-win_amd64.whl", hash = "sha256:3dde1435e6b6fe5b66506c1dff67a421d0b7f6488d466f651c07f4cab3bf20fd", size = 11630486, upload-time = "2026-01-22T22:30:10.852Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, +] diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql index d74f1fa20eee..5b8b6416b36f 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql @@ -1,4 +1,4 @@ SELECT `rowindex`, - AEAD.DECRYPT_BYTES(`string_col`, `bytes_col`, `bytes_col`) AS `0` + AEAD.DECRYPT_BYTES(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql index 1c2b75812b9e..97b1ccff9c75 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql @@ -1,4 +1,4 @@ SELECT `rowindex`, - AEAD.DECRYPT_STRING(`string_col`, `bytes_col`, `string_col`) AS `0` + AEAD.DECRYPT_STRING(`bytes_col`, `bytes_col`, `string_col`) AS `0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql index e62f74d8fd9d..9ab9f8c0a7bb 100644 --- 
a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql @@ -1,4 +1,4 @@ SELECT `rowindex`, - AEAD.ENCRYPT(`string_col`, `bytes_col`, `bytes_col`) AS `0` + AEAD.ENCRYPT(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py index d30474183ae2..cafcd7d95278 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -29,7 +29,7 @@ def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): result = aead.decrypt_bytes( - cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), ).to_frame() @@ -38,7 +38,7 @@ def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): result = aead.decrypt_string( - cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["string_col"]), ).to_frame() @@ -47,7 +47,7 @@ def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): result = aead.encrypt( - cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), ).to_frame() From 4f3564426ea9462a3a145b8c8e6dc5dc09890776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:38:33 +0000 Subject: [PATCH 13/20] fix 
mypy --- packages/bigframes/bigframes/bigquery/_googlesql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/bigquery/_googlesql.py index 4f15d1f3c277..5ac8ce826e9b 100644 --- a/packages/bigframes/bigframes/bigquery/_googlesql.py +++ b/packages/bigframes/bigframes/bigquery/_googlesql.py @@ -52,7 +52,7 @@ def apply_googlesql_scalar_op( break if first_series is not None: - processed_args = [] + processed_args: list[Union[bigframes.core.col.Expression, series.Series]] = [] block = first_series._block for arg in args: if isinstance(arg, bigframes.core.col.Expression): From c69a7f34681f0f62a9d51a519fd4a905f2921200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 16:59:19 +0000 Subject: [PATCH 14/20] include namespace in bbq --- .../bigframes/bigframes/bigquery/__init__.py | 3 ++- packages/bigframes/bigframes/bigquery/aead.py | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 packages/bigframes/bigframes/bigquery/aead.py diff --git a/packages/bigframes/bigframes/bigquery/__init__.py b/packages/bigframes/bigframes/bigquery/__init__.py index a31d7dd83f93..86a45546b748 100644 --- a/packages/bigframes/bigframes/bigquery/__init__.py +++ b/packages/bigframes/bigframes/bigquery/__init__.py @@ -47,7 +47,7 @@ import sys -from bigframes.bigquery import ai, ml, obj +from bigframes.bigquery import aead, ai, ml, obj from bigframes.bigquery._operations.approx_agg import approx_top_count from bigframes.bigquery._operations.array import ( array_agg, @@ -208,6 +208,7 @@ # io ops "load_data", # Modules / SQL namespaces + "aead", "ai", "ml", "obj", diff --git a/packages/bigframes/bigframes/bigquery/aead.py b/packages/bigframes/bigframes/bigquery/aead.py new file mode 100644 index 000000000000..f18e12bc5cf5 --- /dev/null +++ b/packages/bigframes/bigframes/bigquery/aead.py @@ -0,0 +1,25 @@ +# Copyright 2026 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AEAD encryption functions""" + +from __future__ import annotations + +from bigframes.bigquery._operations.aead import decrypt_bytes, decrypt_string, encrypt + +__all__ = [ + "decrypt_bytes", + "decrypt_string", + "encrypt", +] From c555f4a5668da7420fb3cc83c19c442100cbed8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 17:09:31 +0000 Subject: [PATCH 15/20] attempt to fix snapshot --- .../bigframes/scripts/generate_bigframes_bigquery.py | 4 ++-- .../snapshots/test_aead/test_encrypt/out.sql | 2 +- .../tests/unit/bigquery/_operations/test_aead.py | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 02914b24dae7..bd634dd0c815 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -108,7 +108,7 @@ def {{ func.name }}( {% endfor %} """ -TEST_TEMPLATE = """{{ license_header }} +TEST_TEMPLATE = r"""{{ license_header }} # # DO NOT MODIFY THIS FILE DIRECTLY. 
# This file was generated from: {{ yaml_path }} @@ -131,7 +131,7 @@ def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), {% endfor %} ).to_frame() - snapshot.assert_match(result.sql, "out.sql") + snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") {% endfor %} diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql index 9ab9f8c0a7bb..eba30737f631 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql @@ -1,4 +1,4 @@ SELECT `rowindex`, - AEAD.ENCRYPT(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` + AEAD.ENCRYPT(`bytes_col`, `string_col`, `string_col`) AS `0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py index cafcd7d95278..62a3b0d95725 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -33,7 +33,7 @@ def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["bytes_col"]), ).to_frame() - snapshot.assert_match(result.sql, "out.sql") + snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): @@ -42,13 +42,13 @@ def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): cast(bpd.Series, scalar_types_df["bytes_col"]), cast(bpd.Series, scalar_types_df["string_col"]), ).to_frame() - snapshot.assert_match(result.sql, "out.sql") + snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") 
def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): result = aead.encrypt( cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["string_col"]), ).to_frame() - snapshot.assert_match(result.sql, "out.sql") + snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") From d8d2cbfbe75b8535ef532d9b961679b4caf35dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 18:07:19 +0000 Subject: [PATCH 16/20] remove bigframes vendored as first-party --- packages/bigframes/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/pyproject.toml b/packages/bigframes/pyproject.toml index fb658d69dafe..e7d9c326a936 100644 --- a/packages/bigframes/pyproject.toml +++ b/packages/bigframes/pyproject.toml @@ -3,4 +3,4 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [tool.ruff.lint.isort] -known-first-party = ["bigframes", "bigframes_vendored"] +known-first-party = ["bigframes"] From 5ea61c269b4dc50828f9c547c427cec9f07fc8d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 18:09:33 +0000 Subject: [PATCH 17/20] ruff --- .../scripts/generate_bigframes_bigquery.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index bd634dd0c815..39063c7bee1d 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -37,13 +37,13 @@ TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") RUFF_ARGS = [ - "ruff", - "check", - "--select", - "I", - "--fix", - "--target-version=py310", - "--line-length=88", + "ruff", + "check", + "--select", + "I", + 
"--fix", + "--target-version=py310", + "--line-length=88", ] LICENSE_HEADER = """# Copyright 2026 Google LLC @@ -318,7 +318,8 @@ def main(): f.write(content) subprocess.run( - RUFF_ARGS + [ + RUFF_ARGS + + [ str(output_file), ], check=True, @@ -344,7 +345,8 @@ def main(): f.write(test_content) subprocess.run( - RUFF_ARGS + [ + RUFF_ARGS + + [ str(test_output_file), ], check=True, From a463ea6cbcfcea9353bb74ef1756441ab5b71802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 21:52:44 +0000 Subject: [PATCH 18/20] create template files --- .../bigframes/bigquery/_googlesql.py | 4 +- .../bigframes/bigquery/_operations/aead.py | 21 +- .../bigframes/bigframes/core/sentinels.py | 14 +- .../scripts/generate_bigframes_bigquery.py | 109 +------ .../bigframes/scripts/templates/license.py.j2 | 13 + .../scripts/templates/operation.py.j2 | 46 +++ .../scripts/templates/test_operation.py.j2 | 28 ++ .../unit/bigquery/_operations/conftest.py | 280 ------------------ .../snapshots/test_aead/test_encrypt/out.sql | 2 +- .../unit/bigquery/_operations/test_aead.py | 5 +- packages/bigframes/tests/unit/conftest.py | 265 +++++++++++++++++ .../unit/core/compile/sqlglot/conftest.py | 280 ------------------ 12 files changed, 384 insertions(+), 683 deletions(-) create mode 100644 packages/bigframes/scripts/templates/license.py.j2 create mode 100644 packages/bigframes/scripts/templates/operation.py.j2 create mode 100644 packages/bigframes/scripts/templates/test_operation.py.j2 delete mode 100644 packages/bigframes/tests/unit/bigquery/_operations/conftest.py delete mode 100644 packages/bigframes/tests/unit/core/compile/sqlglot/conftest.py diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/bigquery/_googlesql.py index 5ac8ce826e9b..a37c9790ff56 100644 --- a/packages/bigframes/bigframes/bigquery/_googlesql.py +++ b/packages/bigframes/bigframes/bigquery/_googlesql.py @@ -58,7 +58,7 @@ def apply_googlesql_scalar_op( if 
isinstance(arg, bigframes.core.col.Expression): block, col_id = block.project_expr(bigframes.core.col._as_bf_expr(arg)) processed_args.append(series.Series(block.select_column(col_id))) - elif arg is sentinels.DEFAULT: + elif arg is sentinels.Sentinel.ARGUMENT_DEFAULT: processed_args.append(bigframes.core.col.Expression(ex.OmittedArg())) else: processed_args.append(arg) @@ -73,7 +73,7 @@ def apply_googlesql_scalar_op( for arg in args: if isinstance(arg, bigframes.core.col.Expression): expr_args.append(bigframes.core.col._as_bf_expr(arg)) - elif arg is sentinels.DEFAULT: + elif arg is sentinels.Sentinel.ARGUMENT_DEFAULT: expr_args.append(ex.OmittedArg()) else: expr_args.append(ex.const(arg)) diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/bigquery/_operations/aead.py index a7b3f9dfae54..fb98bed59be9 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/aead.py +++ b/packages/bigframes/bigframes/bigquery/_operations/aead.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # # DO NOT MODIFY THIS FILE DIRECTLY. 
# This file was generated from: scripts/data/sql-functions/aead.yaml @@ -20,7 +19,7 @@ from __future__ import annotations import datetime -from typing import Any, Optional, TypeVar, Union +from typing import Any, Literal, Optional, TypeVar, Union import bigframes.bigquery._googlesql import bigframes.core.col @@ -51,9 +50,9 @@ def decrypt_bytes( - keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], - ciphertext: Union[T, bigframes.core.col.Expression, bytes], - additional_data: Union[T, bigframes.core.col.Expression, bytes], + keyset: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes]], + additional_data: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes]], ) -> T: """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. 
Returns an error if decryption or verification fails.""" return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( @@ -65,9 +64,9 @@ def decrypt_bytes( def decrypt_string( - keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], - ciphertext: Union[T, bigframes.core.col.Expression, bytes], - additional_data: Union[T, bigframes.core.col.Expression, str], + keyset: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict]], + ciphertext: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes]], + additional_data: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str]], ) -> T: """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( @@ -79,9 +78,9 @@ def decrypt_string( def encrypt( - keyset: Union[T, bigframes.core.col.Expression, Union[bytes, dict]], - plaintext: Union[T, bigframes.core.col.Expression, Union[bytes, str]], - additional_data: Union[T, bigframes.core.col.Expression, Union[bytes, str]], + keyset: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict]], + plaintext: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str]], + additional_data: Union[T, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str]], ) -> T: """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. 
Returns NULL if any input is NULL.""" return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( diff --git a/packages/bigframes/bigframes/core/sentinels.py b/packages/bigframes/bigframes/core/sentinels.py index fc2bfac970e5..ff9913f7c6fd 100644 --- a/packages/bigframes/bigframes/core/sentinels.py +++ b/packages/bigframes/bigframes/core/sentinels.py @@ -16,18 +16,16 @@ from __future__ import annotations -from enum import Enum +import enum -class Default(Enum): - """Default values used throughout BigFrames. +class Sentinel(enum.Enum): + """Default values used throughout BigFrames.""" + + """Default value for an optional argument. When a parameter is set to this, that parameter is explicitly omitted from the SQL text. This allows for NULL (None in Python) to be explicitly passed in to optional parameters. """ - - token = 0 - - -DEFAULT = Default.token + ARGUMENT_DEFAULT = enum.auto() diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 39063c7bee1d..4baaef3ce6b0 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -35,6 +35,8 @@ OUTPUT_DIR = pathlib.Path("bigframes/bigquery/_operations") # Directory where the generated test files will be placed TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") +# Directory containing the Jinja2 templates +TEMPLATE_DIR = pathlib.Path("scripts/templates") RUFF_ARGS = [ "ruff", @@ -46,97 +48,6 @@ "--line-length=88", ] -LICENSE_HEADER = """# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -TEMPLATE = """{{ license_header }} -# -# DO NOT MODIFY THIS FILE DIRECTLY. -# This file was generated from: {{ yaml_path }} -# by the script: {{ script_path }} - -from __future__ import annotations - -import datetime -from typing import Any, Optional, TypeVar, Union - -from bigframes import dtypes -import bigframes.bigquery._googlesql -import bigframes.core.col -import bigframes.core.expression as ex -import bigframes.core.sentinels as sentinels -from bigframes.operations import googlesql -import bigframes.operations as ops -import bigframes.series as series - -T = TypeVar("T", series.Series, bigframes.core.col.Expression) - -{% for op in ops %} -{{ op.internal_name }} = googlesql.GoogleSqlScalarOp( - "{{ op.sql_name }}", - args=({{ op.arg_specs }}), - signature={{ op.signature }}, -) -{% endfor %} -{% for func in functions %} - - -def {{ func.name }}( -{% for arg in func.args %} - {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, -{% endfor %} -) -> T: - \"\"\"{{ func.description }}\"\"\" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( - {{ func.op_name }}, -{% for arg in func.args %} - {{ arg.name }}, -{% endfor %} - ) # type: ignore -{% endfor %} -""" - -TEST_TEMPLATE = r"""{{ license_header }} -# -# DO NOT MODIFY THIS FILE DIRECTLY. 
-# This file was generated from: {{ yaml_path }} -# by the script: {{ script_path }} - -from typing import cast - -import pytest - -import bigframes.pandas as bpd -import {{ import_path }} as {{ short_name }} - -pytest.importorskip("pytest_snapshot") - - -{% for func in functions %} -def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): - result = {{ short_name }}.{{ func.name }}( -{% for arg in func.test_args %} - cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), -{% endfor %} - ).to_frame() - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") - - -{% endfor %} -""" - DTYPE_MAP = { "binary": "dtypes.BYTES_DTYPE", "string": "dtypes.STRING_DTYPE", @@ -191,9 +102,13 @@ def to_snake_case(name): def main(): - env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True) - template = env.from_string(TEMPLATE) - test_template = env.from_string(TEST_TEMPLATE) + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(TEMPLATE_DIR), + trim_blocks=True, + lstrip_blocks=True, + ) + template = env.get_template("operation.py.j2") + test_template = env.get_template("test_operation.py.j2") for yaml_file in DATA_DIR.glob("**/*.yaml"): print(f"Processing {yaml_file}...") @@ -266,13 +181,13 @@ def main(): func_args = [] for name in arg_order: arg_info = args_by_name[name] - types = [PY_TYPE_MAP.get(t, "Any") for t in arg_info["types"]] + types = [PY_TYPE_MAP.get(t, "Any") for t in arg_info["types"]] + ["Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]"] type_hint = ( "Union[" + ", ".join(sorted(set(types))) + "]" if len(types) > 1 else types[0] ) - default = "sentinels.DEFAULT" if arg_info["optional"] else "" + default = "sentinels.Sentinel.ARGUMENT_DEFAULT" if arg_info["optional"] else "" func_args.append( { "name": name, @@ -308,7 +223,6 @@ def main(): # Render and write output_file.parent.mkdir(parents=True, exist_ok=True) content = template.render( - license_header=LICENSE_HEADER, yaml_path=str(yaml_file), 
script_path="scripts/generate_bigframes_bigquery.py", ops=ops_list, @@ -334,7 +248,6 @@ def main(): test_output_file.parent.mkdir(parents=True, exist_ok=True) test_content = test_template.render( - license_header=LICENSE_HEADER, yaml_path=str(yaml_file), script_path="scripts/generate_bigframes_bigquery.py", import_path=import_path, diff --git a/packages/bigframes/scripts/templates/license.py.j2 b/packages/bigframes/scripts/templates/license.py.j2 new file mode 100644 index 000000000000..58d482ea3866 --- /dev/null +++ b/packages/bigframes/scripts/templates/license.py.j2 @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/packages/bigframes/scripts/templates/operation.py.j2 b/packages/bigframes/scripts/templates/operation.py.j2 new file mode 100644 index 000000000000..7e7c7f95b62f --- /dev/null +++ b/packages/bigframes/scripts/templates/operation.py.j2 @@ -0,0 +1,46 @@ +{% include 'license.py.j2' %} + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from __future__ import annotations + +import datetime +from typing import Any, Literal, Optional, TypeVar, Union + +from bigframes import dtypes +import bigframes.bigquery._googlesql +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.sentinels as sentinels +from bigframes.operations import googlesql +import bigframes.operations as ops +import bigframes.series as series + +T = TypeVar("T", series.Series, bigframes.core.col.Expression) + +{% for op in ops %} +{{ op.internal_name }} = googlesql.GoogleSqlScalarOp( + "{{ op.sql_name }}", + args=({{ op.arg_specs }}), + signature={{ op.signature }}, +) +{% endfor %} +{% for func in functions %} + + +def {{ func.name }}( +{% for arg in func.args %} + {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, +{% endfor %} +) -> T: + """{{ func.description }}""" + return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + {{ func.op_name }}, +{% for arg in func.args %} + {{ arg.name }}, +{% endfor %} + ) # type: ignore +{% endfor %} diff --git a/packages/bigframes/scripts/templates/test_operation.py.j2 b/packages/bigframes/scripts/templates/test_operation.py.j2 new file mode 100644 index 000000000000..aa87fcb94bee --- /dev/null +++ b/packages/bigframes/scripts/templates/test_operation.py.j2 @@ -0,0 +1,28 @@ +{% include 'license.py.j2' %} + +# +# DO NOT MODIFY THIS FILE DIRECTLY. 
+# This file was generated from: {{ yaml_path }} +# by the script: {{ script_path }} + +from typing import cast + +import pytest + +import bigframes.pandas as bpd +import {{ import_path }} as {{ short_name }} + +pytest.importorskip("pytest_snapshot") + + +{% for func in functions %} +def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): + result = {{ short_name }}.{{ func.name }}( +{% for arg in func.test_args %} + cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), +{% endfor %} + ).to_frame() + snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") + + +{% endfor %} diff --git a/packages/bigframes/tests/unit/bigquery/_operations/conftest.py b/packages/bigframes/tests/unit/bigquery/_operations/conftest.py deleted file mode 100644 index 127902241acb..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/conftest.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pathlib -import typing - -import pandas as pd -import pyarrow as pa -import pytest -from google.cloud import bigquery - -import bigframes.core as core -import bigframes.pandas as bpd -import bigframes.testing.mocks as mocks -import bigframes.testing.utils -from bigframes import dtypes - -CURRENT_DIR = pathlib.Path(__file__).parent -DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" - - -def _create_compiler_session(table_name, table_schema): - """Helper function to create a compiler session.""" - from bigframes.testing import compiler_session - - anonymous_dataset = bigquery.DatasetReference.from_string( - "bigframes-dev.sqlglot_test" - ) - session = mocks.create_bigquery_session( - table_name=table_name, - table_schema=table_schema, - anonymous_dataset=anonymous_dataset, - ) - session._executor = compiler_session.SQLCompilerExecutor() - return session - - -@pytest.fixture(scope="session") -def compiler_session(scalar_types_table_schema): - """Compiler session for scalar types.""" - return _create_compiler_session("scalar_types", scalar_types_table_schema) - - -@pytest.fixture(scope="session") -def compiler_session_w_repeated_types(repeated_types_table_schema): - """Compiler session for repeated data types.""" - return _create_compiler_session("repeated_types", repeated_types_table_schema) - - -@pytest.fixture(scope="session") -def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): - """Compiler session for nested STRUCT data types.""" - return _create_compiler_session( - "nested_structs_types", nested_structs_types_table_schema - ) - - -@pytest.fixture(scope="session") -def compiler_session_w_json_types(json_types_table_schema): - """Compiler session for JSON data types.""" - return _create_compiler_session("json_types", json_types_table_schema) - - -@pytest.fixture(scope="session") -def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("bool_col", "BOOLEAN"), - 
bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("datetime_col", "DATETIME"), - bigquery.SchemaField("geography_col", "GEOGRAPHY"), - bigquery.SchemaField("int64_col", "INTEGER"), - bigquery.SchemaField("int64_too", "INTEGER"), - bigquery.SchemaField("numeric_col", "NUMERIC"), - bigquery.SchemaField("float64_col", "FLOAT"), - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("rowindex_2", "INTEGER", mode="REQUIRED"), - bigquery.SchemaField("string_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("timestamp_col", "TIMESTAMP"), - bigquery.SchemaField("duration_col", "INTEGER"), - ] - - -@pytest.fixture(scope="session") -def scalar_types_df(compiler_session) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.scalar_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("rowindex", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def scalar_types_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - # TODO: add tests for empty dataframes - df = pd.read_json( - DATA_DIR / "scalars.jsonl", - lines=True, - ) - bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) - - df = df.set_index("rowindex", drop=False) - return df - - -@pytest.fixture(scope="module") -def scalar_types_array_value( - scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session -) -> core.ArrayValue: - managed_data_source = core.local_data.ManagedArrowTable.from_pandas( - scalar_types_pandas_df - ) - return core.ArrayValue.from_managed(managed_data_source, compiler_session) - - -@pytest.fixture(scope="session") -def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - 
return [ - bigquery.SchemaField("id", "INTEGER"), - bigquery.SchemaField( - "people", - "RECORD", - fields=[ - bigquery.SchemaField("name", "STRING"), - bigquery.SchemaField("age", "INTEGER"), - bigquery.SchemaField( - "address", - "RECORD", - fields=[ - bigquery.SchemaField("city", "STRING"), - bigquery.SchemaField("country", "STRING"), - ], - ), - ], - ), - ] - - -@pytest.fixture(scope="session") -def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.nested_structs_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("id", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def nested_structs_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing STRUCT types and using the `id` - column as the index.""" - - df = pd.read_json( - DATA_DIR / "nested_structs.jsonl", - lines=True, - ) - df = df.set_index("id") - - address_struct_schema = pa.struct( - [pa.field("city", pa.string()), pa.field("country", pa.string())] - ) - person_struct_schema = pa.struct( - [ - pa.field("name", pa.string()), - pa.field("age", pa.int64()), - pa.field("address", address_struct_schema), - ] - ) - df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) - return df - - -@pytest.fixture(scope="session") -def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), - bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), - bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), - bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), - bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), - 
bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), - bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), - ] - - -@pytest.fixture(scope="session") -def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_repeated_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.repeated_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("rowindex", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def repeated_types_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing LIST types and using the `rowindex` - column as the index.""" - - df = pd.read_json( - DATA_DIR / "repeated.jsonl", - lines=True, - ) - # TODO: add dtype conversion here if needed. - df = df.set_index("rowindex") - return df - - -@pytest.fixture(scope="session") -def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("json_col", "JSON"), - ] - - -@pytest.fixture(scope="session") -def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_json_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.json_types", - enable_snapshot=False, - ) - # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? 
- bf_df = bf_df.set_index("rowindex", drop=True) - return bf_df - - -@pytest.fixture(scope="session") -def json_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing JSON types and using the `rowindex` - column as the index.""" - json_data = [ - "null", - "true", - "100", - "0.98", - '"a string"', - "[]", - "[1, 2, 3]", - '[{"a": 1}, {"a": 2}, {"a": null}, {}]', - '"100"', - '{"date": "2024-07-16"}', - '{"int_value": 2, "null_filed": null}', - '{"list_data": [10, 20, 30]}', - ] - df = pd.DataFrame( - { - "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), - "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), - }, - ) - # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? - df = df.set_index("rowindex", drop=True) - return df diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql index eba30737f631..9ab9f8c0a7bb 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql +++ b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql @@ -1,4 +1,4 @@ SELECT `rowindex`, - AEAD.ENCRYPT(`bytes_col`, `string_col`, `string_col`) AS `0` + AEAD.ENCRYPT(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py index 62a3b0d95725..f9f8fdd2dd88 100644 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- # # DO NOT MODIFY THIS FILE DIRECTLY. # This file was generated from: scripts/data/sql-functions/aead.yaml @@ -48,7 +47,7 @@ def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): result = aead.encrypt( cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["string_col"]), - cast(bpd.Series, scalar_types_df["string_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), + cast(bpd.Series, scalar_types_df["bytes_col"]), ).to_frame() snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") diff --git a/packages/bigframes/tests/unit/conftest.py b/packages/bigframes/tests/unit/conftest.py index a9b26afeef29..3ab217cf09ba 100644 --- a/packages/bigframes/tests/unit/conftest.py +++ b/packages/bigframes/tests/unit/conftest.py @@ -12,7 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib +import typing + +import pandas as pd +import pyarrow as pa import pytest +from google.cloud import bigquery + +import bigframes.core as core +import bigframes.pandas as bpd +import bigframes.testing.mocks as mocks +import bigframes.testing.utils +from bigframes import dtypes + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" @pytest.fixture(scope="session") @@ -22,3 +37,253 @@ def polars_session(): from bigframes.testing import polars_session return polars_session.TestSession() + + +def _create_compiler_session(table_name, table_schema): + """Helper function to create a compiler session.""" + from bigframes.testing import compiler_session + + anonymous_dataset = bigquery.DatasetReference.from_string( + "bigframes-dev.sqlglot_test" + ) + session = mocks.create_bigquery_session( + table_name=table_name, + table_schema=table_schema, + anonymous_dataset=anonymous_dataset, + ) + session._executor = compiler_session.SQLCompilerExecutor() + return session + + 
+@pytest.fixture(scope="session") +def compiler_session(scalar_types_table_schema): + """Compiler session for scalar types.""" + return _create_compiler_session("scalar_types", scalar_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_repeated_types(repeated_types_table_schema): + """Compiler session for repeated data types.""" + return _create_compiler_session("repeated_types", repeated_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): + """Compiler session for nested STRUCT data types.""" + return _create_compiler_session( + "nested_structs_types", nested_structs_types_table_schema + ) + + +@pytest.fixture(scope="session") +def compiler_session_w_json_types(json_types_table_schema): + """Compiler session for JSON data types.""" + return _create_compiler_session("json_types", json_types_table_schema) + + +@pytest.fixture(scope="session") +def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("geography_col", "GEOGRAPHY"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("int64_too", "INTEGER"), + bigquery.SchemaField("numeric_col", "NUMERIC"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("rowindex_2", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("duration_col", "INTEGER"), + ] + + +@pytest.fixture(scope="session") +def scalar_types_df(compiler_session) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column 
as the index.""" + bf_df = compiler_session._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.scalar_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def scalar_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + # TODO: add tests for empty dataframes + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + return df + + +@pytest.fixture(scope="module") +def scalar_types_array_value( + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session +) -> core.ArrayValue: + managed_data_source = core.local_data.ManagedArrowTable.from_pandas( + scalar_types_pandas_df + ) + return core.ArrayValue.from_managed(managed_data_source, compiler_session) + + +@pytest.fixture(scope="session") +def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("id", "INTEGER"), + bigquery.SchemaField( + "people", + "RECORD", + fields=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("age", "INTEGER"), + bigquery.SchemaField( + "address", + "RECORD", + fields=[ + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("country", "STRING"), + ], + ), + ], + ), + ] + + +@pytest.fixture(scope="session") +def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.nested_structs_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("id", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> 
pd.DataFrame: + """Returns a pandas DataFrame containing STRUCT types and using the `id` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) + return df + + +@pytest.fixture(scope="session") +def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), + bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), + bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), + bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), + bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), + bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), + bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), + ] + + +@pytest.fixture(scope="session") +def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_repeated_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.repeated_types", + enable_snapshot=False, + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def repeated_types_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing LIST types and using the `rowindex` + column as the index.""" + + df = pd.read_json( + DATA_DIR / "repeated.jsonl", + lines=True, + ) + # TODO: add dtype conversion here if needed. 
+ df = df.set_index("rowindex") + return df + + +@pytest.fixture(scope="session") +def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("json_col", "JSON"), + ] + + +@pytest.fixture(scope="session") +def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_json_types._loader.read_gbq_table( + "bigframes-dev.sqlglot_test.json_types", + enable_snapshot=False, + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? + bf_df = bf_df.set_index("rowindex", drop=True) + return bf_df + + +@pytest.fixture(scope="session") +def json_pandas_df() -> pd.DataFrame: + """Returns a pandas DataFrame containing JSON types and using the `rowindex` + column as the index.""" + json_data = [ + "null", + "true", + "100", + "0.98", + '"a string"', + "[]", + "[1, 2, 3]", + '[{"a": 1}, {"a": 2}, {"a": null}, {}]', + '"100"', + '{"date": "2024-07-16"}', + '{"int_value": 2, "null_filed": null}', + '{"list_data": [10, 20, 30]}', + ] + df = pd.DataFrame( + { + "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), + "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), + }, + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? + df = df.set_index("rowindex", drop=True) + return df diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/conftest.py b/packages/bigframes/tests/unit/core/compile/sqlglot/conftest.py deleted file mode 100644 index fd914f589a50..000000000000 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/conftest.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pathlib -import typing - -import pandas as pd -import pyarrow as pa -import pytest -from google.cloud import bigquery - -import bigframes.core as core -import bigframes.pandas as bpd -import bigframes.testing.mocks as mocks -import bigframes.testing.utils -from bigframes import dtypes - -CURRENT_DIR = pathlib.Path(__file__).parent -DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" - - -def _create_compiler_session(table_name, table_schema): - """Helper function to create a compiler session.""" - from bigframes.testing import compiler_session - - anonymous_dataset = bigquery.DatasetReference.from_string( - "bigframes-dev.sqlglot_test" - ) - session = mocks.create_bigquery_session( - table_name=table_name, - table_schema=table_schema, - anonymous_dataset=anonymous_dataset, - ) - session._executor = compiler_session.SQLCompilerExecutor() - return session - - -@pytest.fixture(scope="session") -def compiler_session(scalar_types_table_schema): - """Compiler session for scalar types.""" - return _create_compiler_session("scalar_types", scalar_types_table_schema) - - -@pytest.fixture(scope="session") -def compiler_session_w_repeated_types(repeated_types_table_schema): - """Compiler session for repeated data types.""" - return _create_compiler_session("repeated_types", repeated_types_table_schema) - - -@pytest.fixture(scope="session") -def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): - """Compiler session for nested STRUCT data types.""" - return _create_compiler_session( - "nested_structs_types", 
nested_structs_types_table_schema - ) - - -@pytest.fixture(scope="session") -def compiler_session_w_json_types(json_types_table_schema): - """Compiler session for JSON data types.""" - return _create_compiler_session("json_types", json_types_table_schema) - - -@pytest.fixture(scope="session") -def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("datetime_col", "DATETIME"), - bigquery.SchemaField("geography_col", "GEOGRAPHY"), - bigquery.SchemaField("int64_col", "INTEGER"), - bigquery.SchemaField("int64_too", "INTEGER"), - bigquery.SchemaField("numeric_col", "NUMERIC"), - bigquery.SchemaField("float64_col", "FLOAT"), - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("rowindex_2", "INTEGER", mode="REQUIRED"), - bigquery.SchemaField("string_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("timestamp_col", "TIMESTAMP"), - bigquery.SchemaField("duration_col", "INTEGER"), - ] - - -@pytest.fixture(scope="session") -def scalar_types_df(compiler_session) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.scalar_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("rowindex", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def scalar_types_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - # TODO: add tests for empty dataframes - df = pd.read_json( - DATA_DIR / "scalars.jsonl", - lines=True, - ) - bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) - - df = df.set_index("rowindex", drop=False) - return df - - -@pytest.fixture(scope="module") 
-def scalar_types_array_value( - scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session -) -> core.ArrayValue: - managed_data_source = core.local_data.ManagedArrowTable.from_pandas( - scalar_types_pandas_df - ) - return core.ArrayValue.from_managed(managed_data_source, compiler_session) - - -@pytest.fixture(scope="session") -def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("id", "INTEGER"), - bigquery.SchemaField( - "people", - "RECORD", - fields=[ - bigquery.SchemaField("name", "STRING"), - bigquery.SchemaField("age", "INTEGER"), - bigquery.SchemaField( - "address", - "RECORD", - fields=[ - bigquery.SchemaField("city", "STRING"), - bigquery.SchemaField("country", "STRING"), - ], - ), - ], - ), - ] - - -@pytest.fixture(scope="session") -def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.nested_structs_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("id", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def nested_structs_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing STRUCT types and using the `id` - column as the index.""" - - df = pd.read_json( - DATA_DIR / "nested_structs.jsonl", - lines=True, - ) - df = df.set_index("id") - - address_struct_schema = pa.struct( - [pa.field("city", pa.string()), pa.field("country", pa.string())] - ) - person_struct_schema = pa.struct( - [ - pa.field("name", pa.string()), - pa.field("age", pa.int64()), - pa.field("address", address_struct_schema), - ] - ) - df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) - return df - - -@pytest.fixture(scope="session") -def repeated_types_table_schema() -> 
typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), - bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), - bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), - bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), - bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), - bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), - bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), - ] - - -@pytest.fixture(scope="session") -def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_repeated_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.repeated_types", - enable_snapshot=False, - ) - bf_df = bf_df.set_index("rowindex", drop=False) - return bf_df - - -@pytest.fixture(scope="session") -def repeated_types_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing LIST types and using the `rowindex` - column as the index.""" - - df = pd.read_json( - DATA_DIR / "repeated.jsonl", - lines=True, - ) - # TODO: add dtype conversion here if needed. - df = df.set_index("rowindex") - return df - - -@pytest.fixture(scope="session") -def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: - return [ - bigquery.SchemaField("rowindex", "INTEGER"), - bigquery.SchemaField("json_col", "JSON"), - ] - - -@pytest.fixture(scope="session") -def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: - """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` - column as the index.""" - bf_df = compiler_session_w_json_types._loader.read_gbq_table( - "bigframes-dev.sqlglot_test.json_types", - enable_snapshot=False, - ) - # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? 
- bf_df = bf_df.set_index("rowindex", drop=True) - return bf_df - - -@pytest.fixture(scope="session") -def json_pandas_df() -> pd.DataFrame: - """Returns a pandas DataFrame containing JSON types and using the `rowindex` - column as the index.""" - json_data = [ - "null", - "true", - "100", - "0.98", - '"a string"', - "[]", - "[1, 2, 3]", - '[{"a": 1}, {"a": 2}, {"a": null}, {}]', - '"100"', - '{"date": "2024-07-16"}', - '{"int_value": 2, "null_filed": null}', - '{"list_data": [10, 20, 30]}', - ] - df = pd.DataFrame( - { - "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), - "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), - }, - ) - # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? - df = df.set_index("rowindex", drop=True) - return df From 7ad2c2b469f1a76bbca7eed0eabb9d237a1cf2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 21:56:06 +0000 Subject: [PATCH 19/20] split main --- .../scripts/generate_bigframes_bigquery.py | 349 +++++++++--------- 1 file changed, 183 insertions(+), 166 deletions(-) diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 4baaef3ce6b0..6c7078af789a 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -101,181 +101,198 @@ def to_snake_case(name): return name -def main(): +def load_templates(): env = jinja2.Environment( loader=jinja2.FileSystemLoader(TEMPLATE_DIR), trim_blocks=True, lstrip_blocks=True, ) - template = env.get_template("operation.py.j2") - test_template = env.get_template("test_operation.py.j2") - - for yaml_file in DATA_DIR.glob("**/*.yaml"): - print(f"Processing {yaml_file}...") - with open(yaml_file, "r") as f: - data = yaml.safe_load(f) - - rel_path = yaml_file.relative_to(DATA_DIR) - module_path = rel_path.with_suffix("") - module_name = module_path.name - 
output_file = OUTPUT_DIR.joinpath(module_path).with_suffix(".py") - - ops_list = [] - functions_list = [] - - if "scalar_functions" in data: - for func_data in data["scalar_functions"]: - sql_name = func_data["name"] - python_name = to_snake_case(sql_name) - if python_name.startswith(module_name + "_"): - python_name = python_name[len(module_name) + 1 :] - - internal_op_name = f"_{python_name.upper()}_OP" - - # Aggregate args across impls - args_by_name = {} - arg_order = [] - for impl in func_data["impls"]: - for arg in impl["args"]: - name = arg["name"] - if name not in args_by_name: - args_by_name[name] = { - "types": set(), - "optional": arg["optional"], - "keyword_only": arg["keyword_only"], - } - arg_order.append(name) - args_by_name[name]["types"].add(arg["value"]) - - # Build ArgSpecs - arg_specs = [] - for name in arg_order: - arg_info = args_by_name[name] - spec = "googlesql.ArgSpec(" - if arg_info["keyword_only"]: - spec += f'arg_name="{name}", ' - if arg_info["optional"]: - spec += "optional=True, " - spec = spec.rstrip(", ") + ")" - arg_specs.append(spec) - - # Determine return dtype - return_types = {impl["return"] for impl in func_data["impls"]} - if len(return_types) == 1: - ret_type = list(return_types)[0] - signature = f"lambda *args: {DTYPE_MAP.get(ret_type, 'None')}" - else: - # Fallback to Any/None if ambiguous - signature = "lambda *args: None" - - ops_list.append( - { - "internal_name": internal_op_name, - "sql_name": sql_name.upper(), - "arg_specs": ", ".join(arg_specs), - "signature": signature, - } - ) - - # Function args - func_args = [] - for name in arg_order: - arg_info = args_by_name[name] - types = [PY_TYPE_MAP.get(t, "Any") for t in arg_info["types"]] + ["Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]"] - type_hint = ( - "Union[" + ", ".join(sorted(set(types))) + "]" - if len(types) > 1 - else types[0] - ) - default = "sentinels.Sentinel.ARGUMENT_DEFAULT" if arg_info["optional"] else "" - func_args.append( - { - "name": name, - 
"type_hint": type_hint, - "default": default, - } - ) - - # Clean up default values for mandatory args - # In Python, mandatory args come first. - for arg in func_args: - if not arg["default"]: - del arg["default"] - - # Test args - test_args = [] - for name in arg_order: - arg_info = args_by_name[name] - some_type = list(arg_info["types"])[0] - col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") - test_args.append({"col_name": col_name}) - - functions_list.append( - { - "name": python_name, - "op_name": internal_op_name, - "description": func_data["description"], - "args": func_args, - "test_args": test_args, + return env.get_template("operation.py.j2"), env.get_template("test_operation.py.j2") + + +def parse_scalar_functions(data, module_name): + ops_list = [] + functions_list = [] + + if "scalar_functions" not in data: + return ops_list, functions_list + + for func_data in data["scalar_functions"]: + sql_name = func_data["name"] + python_name = to_snake_case(sql_name) + if python_name.startswith(module_name + "_"): + python_name = python_name[len(module_name) + 1 :] + + internal_op_name = f"_{python_name.upper()}_OP" + + # Aggregate args across impls + args_by_name = {} + arg_order = [] + for impl in func_data["impls"]: + for arg in impl["args"]: + name = arg["name"] + if name not in args_by_name: + args_by_name[name] = { + "types": set(), + "optional": arg["optional"], + "keyword_only": arg["keyword_only"], } - ) - - # Render and write - output_file.parent.mkdir(parents=True, exist_ok=True) - content = template.render( - yaml_path=str(yaml_file), - script_path="scripts/generate_bigframes_bigquery.py", - ops=ops_list, - functions=functions_list, - ) - with open(output_file, "w") as f: - f.write(content) - - subprocess.run( - RUFF_ARGS - + [ - str(output_file), - ], - check=True, - ) - print(f" Generated {output_file}") - - # Render and write test - import_path = "bigframes.bigquery._operations." 
+ ".".join(module_path.parts) - test_output_file = TEST_OUTPUT_DIR.joinpath( - module_path.with_name(f"test_{module_path.name}") - ).with_suffix(".py") - - test_output_file.parent.mkdir(parents=True, exist_ok=True) - test_content = test_template.render( - yaml_path=str(yaml_file), - script_path="scripts/generate_bigframes_bigquery.py", - import_path=import_path, - short_name=module_path.name, - functions=functions_list, - ) - with open(test_output_file, "w") as f: - f.write(test_content) - - subprocess.run( - RUFF_ARGS - + [ - str(test_output_file), - ], - check=True, + arg_order.append(name) + args_by_name[name]["types"].add(arg["value"]) + + # Build ArgSpecs + arg_specs = [] + for name in arg_order: + arg_info = args_by_name[name] + spec = "googlesql.ArgSpec(" + if arg_info["keyword_only"]: + spec += f'arg_name="{name}", ' + if arg_info["optional"]: + spec += "optional=True, " + spec = spec.rstrip(", ") + ")" + arg_specs.append(spec) + + # Determine return dtype + return_types = {impl["return"] for impl in func_data["impls"]} + if len(return_types) == 1: + ret_type = list(return_types)[0] + signature = f"lambda *args: {DTYPE_MAP.get(ret_type, 'None')}" + else: + # Fallback to Any/None if ambiguous + signature = "lambda *args: None" + + ops_list.append( + { + "internal_name": internal_op_name, + "sql_name": sql_name.upper(), + "arg_specs": ", ".join(arg_specs), + "signature": signature, + } ) - print(f" Generated {test_output_file}") - - print(f" Updating snapshots for {test_output_file}...") - subprocess.run( - [ - "pytest", - str(test_output_file), - "--snapshot-update", - ], - check=False, + + # Function args + func_args = [] + for name in arg_order: + arg_info = args_by_name[name] + types = [PY_TYPE_MAP.get(t, "Any") for t in arg_info["types"]] + [ + "Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]" + ] + type_hint = ( + "Union[" + ", ".join(sorted(set(types))) + "]" + if len(types) > 1 + else types[0] + ) + default = ( + "sentinels.Sentinel.ARGUMENT_DEFAULT" if 
arg_info["optional"] else "" + ) + func_args.append( + { + "name": name, + "type_hint": type_hint, + "default": default, + } + ) + + # Clean up default values for mandatory args + # In Python, mandatory args come first. + for arg in func_args: + if not arg.get("default"): + arg.pop("default", None) + + # Test args + test_args = [] + for name in arg_order: + arg_info = args_by_name[name] + some_type = list(arg_info["types"])[0] + col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") + test_args.append({"col_name": col_name}) + + functions_list.append( + { + "name": python_name, + "op_name": internal_op_name, + "description": func_data["description"], + "args": func_args, + "test_args": test_args, + } ) + return ops_list, functions_list + + +def run_ruff(path: pathlib.Path): + subprocess.run( + RUFF_ARGS + + [ + str(path), + ], + check=True, + ) + + +def process_yaml_file(yaml_file, template, test_template): + print(f"Processing {yaml_file}...") + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + + rel_path = yaml_file.relative_to(DATA_DIR) + module_path = rel_path.with_suffix("") + module_name = module_path.name + output_file = OUTPUT_DIR.joinpath(module_path).with_suffix(".py") + + ops_list, functions_list = parse_scalar_functions(data, module_name) + + # Render and write + output_file.parent.mkdir(parents=True, exist_ok=True) + content = template.render( + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + ops=ops_list, + functions=functions_list, + ) + with open(output_file, "w") as f: + f.write(content) + + run_ruff(output_file) + print(f" Generated {output_file}") + + # Render and write test + import_path = "bigframes.bigquery._operations." 
+ ".".join(module_path.parts) + test_output_file = TEST_OUTPUT_DIR.joinpath( + module_path.with_name(f"test_{module_path.name}") + ).with_suffix(".py") + + test_output_file.parent.mkdir(parents=True, exist_ok=True) + test_content = test_template.render( + yaml_path=str(yaml_file), + script_path="scripts/generate_bigframes_bigquery.py", + import_path=import_path, + short_name=module_path.name, + functions=functions_list, + ) + with open(test_output_file, "w") as f: + f.write(test_content) + + run_ruff(test_output_file) + print(f" Generated {test_output_file}") + + print(f" Updating snapshots for {test_output_file}...") + subprocess.run( + [ + "pytest", + str(test_output_file), + "--snapshot-update", + ], + check=False, + ) + + +def main(): + template, test_template = load_templates() + + for yaml_file in DATA_DIR.glob("**/*.yaml"): + process_yaml_file(yaml_file, template, test_template) + if __name__ == "__main__": main() From 8a48769fbed822789b8777e358191ff5f3f29a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 19 May 2026 22:09:57 +0000 Subject: [PATCH 20/20] make generation more deterministic --- .../scripts/generate_bigframes_bigquery.py | 151 ++++++++++-------- 1 file changed, 87 insertions(+), 64 deletions(-) diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 6c7078af789a..4cd98bddbf21 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -110,6 +110,87 @@ def load_templates(): return env.get_template("operation.py.j2"), env.get_template("test_operation.py.j2") +def _collect_args(impls): + args_by_name = {} + arg_order = [] + for impl in impls: + for arg in impl["args"]: + name = arg["name"] + if name not in args_by_name: + args_by_name[name] = { + "types": set(), + "optional": arg["optional"], + "keyword_only": arg["keyword_only"], + } + arg_order.append(name) + 
args_by_name[name]["types"].add(arg["value"]) + return args_by_name, arg_order + + +def _build_arg_specs(args_by_name, arg_order): + arg_specs = [] + for name in arg_order: + arg_info = args_by_name[name] + spec = "googlesql.ArgSpec(" + if arg_info["keyword_only"]: + spec += f'arg_name="{name}", ' + if arg_info["optional"]: + spec += "optional=True, " + spec = spec.rstrip(", ") + ")" + arg_specs.append(spec) + return arg_specs + + +def _get_return_signature(impls): + return_types = {impl["return"] for impl in impls} + if len(return_types) == 1: + ret_type = sorted(return_types)[0] + return f"lambda *args: {DTYPE_MAP.get(ret_type, 'None')}" + else: + # Fallback to Any/None if ambiguous + return "lambda *args: None" + + +def _get_func_args(args_by_name, arg_order): + func_args = [] + for name in arg_order: + arg_info = args_by_name[name] + types = [PY_TYPE_MAP.get(t, "Any") for t in sorted(arg_info["types"])] + [ + "Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]" + ] + type_hint = ( + "Union[" + ", ".join(sorted(set(types))) + "]" + if len(types) > 1 + else types[0] + ) + default = "sentinels.Sentinel.ARGUMENT_DEFAULT" if arg_info["optional"] else "" + func_args.append( + { + "name": name, + "type_hint": type_hint, + "default": default, + } + ) + + # Clean up default values for mandatory args + # In Python, mandatory args come first. 
+ for arg in func_args: + if not arg.get("default"): + arg.pop("default", None) + + return func_args + + +def _get_test_args(args_by_name, arg_order): + test_args = [] + for name in arg_order: + arg_info = args_by_name[name] + some_type = sorted(arg_info["types"])[0] + col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") + test_args.append({"col_name": col_name}) + return test_args + + def parse_scalar_functions(data, module_name): ops_list = [] functions_list = [] @@ -126,40 +207,13 @@ def parse_scalar_functions(data, module_name): internal_op_name = f"_{python_name.upper()}_OP" # Aggregate args across impls - args_by_name = {} - arg_order = [] - for impl in func_data["impls"]: - for arg in impl["args"]: - name = arg["name"] - if name not in args_by_name: - args_by_name[name] = { - "types": set(), - "optional": arg["optional"], - "keyword_only": arg["keyword_only"], - } - arg_order.append(name) - args_by_name[name]["types"].add(arg["value"]) + args_by_name, arg_order = _collect_args(func_data["impls"]) # Build ArgSpecs - arg_specs = [] - for name in arg_order: - arg_info = args_by_name[name] - spec = "googlesql.ArgSpec(" - if arg_info["keyword_only"]: - spec += f'arg_name="{name}", ' - if arg_info["optional"]: - spec += "optional=True, " - spec = spec.rstrip(", ") + ")" - arg_specs.append(spec) + arg_specs = _build_arg_specs(args_by_name, arg_order) # Determine return dtype - return_types = {impl["return"] for impl in func_data["impls"]} - if len(return_types) == 1: - ret_type = list(return_types)[0] - signature = f"lambda *args: {DTYPE_MAP.get(ret_type, 'None')}" - else: - # Fallback to Any/None if ambiguous - signature = "lambda *args: None" + signature = _get_return_signature(func_data["impls"]) ops_list.append( { @@ -171,41 +225,10 @@ def parse_scalar_functions(data, module_name): ) # Function args - func_args = [] - for name in arg_order: - arg_info = args_by_name[name] - types = [PY_TYPE_MAP.get(t, "Any") for t in arg_info["types"]] + [ - 
"Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]" - ] - type_hint = ( - "Union[" + ", ".join(sorted(set(types))) + "]" - if len(types) > 1 - else types[0] - ) - default = ( - "sentinels.Sentinel.ARGUMENT_DEFAULT" if arg_info["optional"] else "" - ) - func_args.append( - { - "name": name, - "type_hint": type_hint, - "default": default, - } - ) - - # Clean up default values for mandatory args - # In Python, mandatory args come first. - for arg in func_args: - if not arg.get("default"): - arg.pop("default", None) + func_args = _get_func_args(args_by_name, arg_order) # Test args - test_args = [] - for name in arg_order: - arg_info = args_by_name[name] - some_type = list(arg_info["types"])[0] - col_name = YAML_TYPE_TO_COL.get(some_type, "string_col") - test_args.append({"col_name": col_name}) + test_args = _get_test_args(args_by_name, arg_order) functions_list.append( { @@ -290,7 +313,7 @@ def process_yaml_file(yaml_file, template, test_template): def main(): template, test_template = load_templates() - for yaml_file in DATA_DIR.glob("**/*.yaml"): + for yaml_file in sorted(DATA_DIR.glob("**/*.yaml")): process_yaml_file(yaml_file, template, test_template)