diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8582983..c9007fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ concurrency: env: CARGO_TERM_COLOR: always INCAN_REF: release/v0.3 - EXPECTED_INCAN_VERSION: 0.3.0-rc21 + EXPECTED_INCAN_VERSION: 0.3.0-rc23 RUST_BACKTRACE: 1 INCAN_NO_BANNER: 1 diff --git a/docs/language/reference/functions/format.md b/docs/language/reference/functions/format.md new file mode 100644 index 0000000..ae282bc --- /dev/null +++ b/docs/language/reference/functions/format.md @@ -0,0 +1,57 @@ +# Format Functions (Reference) + +Format functions transform scalar values that are already present in a relation. Source discovery, file reads, and +relation reshaping belong to the session and relational APIs rather than this function family. + +The format catalog includes deterministic hashes, URL helpers, JSON helpers, and CSV helpers: + +| Function | Meaning | +| --- | --- | +| `md5(expr)` | Return the lowercase hexadecimal MD5 digest for a string expression. | +| `sha1(expr)` | Return the lowercase hexadecimal SHA-1 digest for a string expression. | +| `sha224(expr)` | Return the lowercase hexadecimal SHA-224 digest for a string expression. | +| `sha256(expr)` | Return the lowercase hexadecimal SHA-256 digest for a string expression. | +| `sha384(expr)` | Return the lowercase hexadecimal SHA-384 digest for a string expression. | +| `sha512(expr)` | Return the lowercase hexadecimal SHA-512 digest for a string expression. | +| `sha2(expr, bit_length)` | Compatibility helper that rewrites to `sha224`, `sha256`, `sha384`, or `sha512` for supported literal bit lengths. | +| `crc32(expr)` | Return the lowercase eight-character hexadecimal CRC-32 digest for a string expression. | +| `xxhash64(expr)` | Return the lowercase sixteen-character hexadecimal xxHash64 digest for a string expression. | +| `parse_url(expr, part)` | Extract `scheme`, `host`, `path`, `query`, `fragment`, `port`, `username`, or `password` from a URL string. | +| `url_encode(expr)` | Percent-encode a URL component string. | +| `url_decode(expr)` | Decode a percent-encoded URL component string and fail on malformed escapes. | +| `try_url_decode(expr)` | Decode a percent-encoded URL component string, returning null on malformed escapes. | +| `parse_json(expr)` | Validate and normalize a JSON payload string. | +| `check_json(expr)` | Return whether a string expression contains valid JSON. | +| `schema_of_json(expr)` | Infer a deterministic schema description from a JSON payload string. | +| `json_array_length(expr)` | Return the number of array elements for a JSON array payload, or null for non-array payloads. | +| `json_object_keys(expr)` | Return object keys from a JSON object payload as a JSON array string. | +| `get_json_object(expr, path)` | Extract a JSON value at a literal path and return it as JSON text. | +| `json_extract_path_text(expr, path)` | Extract a JSON value at a literal path and return scalar strings as plain text. | +| `from_json(expr, schema)` | Validate JSON with an explicit schema description and return a normalized JSON payload string. | +| `try_from_json(expr, schema)` | Validate JSON with an explicit schema description and return null when the payload is invalid. | +| `to_json(expr)` | Serialize a scalar expression as JSON text. | +| `schema_of_csv(expr)` | Infer a deterministic schema description from a CSV row string. | +| `from_csv(expr, schema)` | Parse a CSV row string into a JSON payload string, using schema field names when provided. | +| `to_csv(expr)` | Serialize a scalar or JSON array/object payload as a CSV row string. | + +```incan +from pub::inql.functions import col, from_json, get_json_object, parse_url, sha2, to_json + +projected = ( + events + .with_column("user_hash", sha2(col("user_id"), 256)) + .with_column("host", parse_url(col("landing_page"), "host")) + .with_column("event_type", get_json_object(col("payload"), "$.type")) + .with_column("payload_obj", from_json(col("payload"), "STRUCT")) + .with_column("payload_out", to_json(col("event_type"))) +) +``` + +Hash helpers operate on UTF-8 string bytes and return lowercase hexadecimal strings. `sha2(...)` accepts `224`, `256`, +`384`, and `512`; other digest lengths are rejected during expression construction. + +JSON and CSV helpers validate, normalize, and project payload text. They do not read external files or introduce a +dynamic variant value type. + +The DataFusion adapter executes the full RFC 022 catalog with native DataFusion functions where available and +Incan-authored adapter callbacks for helpers that DataFusion does not expose natively. diff --git a/docs/language/reference/functions/index.md b/docs/language/reference/functions/index.md index 77861a2..5b0615b 100644 --- a/docs/language/reference/functions/index.md +++ b/docs/language/reference/functions/index.md @@ -10,10 +10,11 @@ Today the concrete shipped surfaces are documented here: - [Generator and table-valued functions](generators.md) - [Nested data functions](nested.md) - [Window functions](windows.md) +- [Format functions](format.md) The canonical scalar literal helper is `lit(...)`. Typed literal helpers construct the same scalar-expression representation. -The current registry-backed helper surface covers references, literals, casts, operators, predicates, conditionals, math, ordering, aggregates, generators, nested data, and windows. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), function policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. +The current registry-backed helper surface covers references, literals, casts, operators, predicates, conditionals, math, ordering, aggregates, generators, nested data, windows, and format helpers. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), function policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. The registry is the source for non-derivable machine facts. Public helper declarations are the source for argument names, argument types, and return types. Docstrings remain human-facing explanation, examples, and parameter intent. The `registry-metadata` check validates the checked API metadata projections produced from public facade aliases, registry decorators, and decorated callable signatures. Runtime registry entries are lazy and process-local: they support helper execution and lowering for loaded helpers, while the complete public catalog comes from checked metadata. This matters for generated docs, diagnostics, Prism lowering, and backend capability checks as the catalog grows. @@ -37,6 +38,7 @@ The registered helper surface currently includes: | `array(...)`, `cardinality(...)`, `array_contains(...)`, `arrays_overlap(...)`, `array_position(...)`, `array_range(...)`, `element_at(...)`, `array_sort(...)`, `array_distinct(...)`, `array_except(...)`, `array_intersect(...)`, `array_union(...)`, `array_join(...)`, `array_slice(...)`, `array_reverse(...)`, `array_flatten(...)`, `map_from_arrays(...)`, `map_extract(...)`, `map_contains_key(...)`, `map_keys(...)`, `map_values(...)`, `map_entries(...)`, `named_struct(...)` | scalar | registered nested scalar helpers backed by Substrait extension mappings; `array_range(...)` registers canonical `range` for positional generator lowering and `map_contains_key(...)` lowers as a documented predicate rewrite | | `explode(...)`, `explode_outer(...)`, `posexplode(...)`, `posexplode_outer(...)`, `inline(...)`, `inline_outer(...)`, `flatten(...)`, `stack(...)` | generator | relation-extension mappings consumed by `generate(...)`; positional forms use zero-based positions | | `window()`, `unbounded_preceding()`, `preceding(...)`, `current_row()`, `following(...)`, `unbounded_following()`, `row_number()`, `rank()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile(...)`, `lag(...)`, `lead(...)`, `first_value(...)`, `last_value(...)`, `nth_value(...)` | window | `window()` and bound helpers build structural window-spec metadata; placed ranking, distribution, offset, value, and aggregate-over-window helpers lower through `ConsistentPartitionWindowRel` and execute through the DataFusion session adapter | +| `md5(...)`, `sha1(...)`, `sha224(...)`, `sha256(...)`, `sha384(...)`, `sha512(...)`, `sha2(...)`, `crc32(...)`, `xxhash64(...)`, JSON helpers, CSV helpers, URL helpers | scalar | registered RFC 022 format helpers; concrete helpers lower through Substrait extension mappings, while `sha2(...)` rewrites to a supported concrete SHA-2 helper | | `asc(...)`, `desc(...)`, `asc_nulls_first(...)`, `asc_nulls_last(...)`, `desc_nulls_first(...)`, `desc_nulls_last(...)` | ordering | structural sort-field helpers consumed by `order_by(...)` and lowered to Substrait `SortRel.sorts` | | `sum(...)`, `count(...)`, `count_expr(...)`, `count_distinct(...)`, `count_if(...)`, `avg(...)`, `min(...)`, `max(...)` | aggregate | registered Substrait extension functions for core aggregates plus compatibility rewrites for `count_expr(...)`, `count_distinct(...)`, and `count_if(...)`; core aggregates allow `DISTINCT` and aggregate-local `FILTER` where the aggregate shape is valid | diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index 00824cb..ec232d9 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -17,6 +17,7 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Nested data functions:** RFC 020 adds registry-backed scalar helpers for array construction/access, cardinality, containment, overlap, sorting, set-like operations, joining, slicing, reversing, scalar array flattening, map construction/access, map key/value/entry extraction, map key containment, and named struct construction. These helpers lower through Substrait extension metadata without introducing generator semantics, with representative DataFusion-backed Session coverage for composable array projection paths. - **Generator functions:** RFC 021 adds registry-backed generator applications for `explode(...)`, `explode_outer(...)`, `posexplode(...)`, `posexplode_outer(...)`, `inline(...)`, `inline_outer(...)`, portable `flatten(...)`, and `stack(...)`. Generators remain relation-shaping operations applied with `generate(...)`; they preserve input columns, require explicit output aliases, lower through the current Substrait extension-relation gap encoding, and execute through the DataFusion Session adapter with concrete output-column materialization. - **Window functions:** RFC 019 adds `window()` specs, explicit row/range frame bounds, ranking and distribution helpers (`row_number`, `rank`, `dense_rank`, `percent_rank`, `cume_dist`, `ntile`), offset and value helpers (`lag`, `lead`, `first_value`, `last_value`, `nth_value`), and aggregate-over-window placement through `with_window_column(...)`. Portable window helpers require explicit ordering where appropriate, lower through Substrait `ConsistentPartitionWindowRel`, and execute through the DataFusion session adapter. +- **Format functions:** RFC 022 adds scalar payload helpers for deterministic hashes (`md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `sha2`, `crc32`, and `xxhash64`), URL parsing/encoding/decoding, JSON validation/path/schema helpers, and CSV row/schema helpers. Format helpers lower through registry-owned Substrait metadata; the DataFusion adapter executes the full helper set with native functions where available and Incan-authored adapter callbacks for non-native helpers. - **Function registry:** RFC 014 adds declaration-site registry decorators for the current public helper surface, including stable function references, checked signature projection, lifecycle metadata, behavior categories, alias policy, Substrait mapping categories, and checked API metadata drift validation. - **Function extension policy:** InQL RFC 024 policy metadata now distinguishes portable core functions, namespaced extension-only functions, opt-in compatibility aliases, engine-specific functions, and rejected compatibility requests without adding an extension plugin system or backend-owned semantics. - **Projection:** builder-based `with_column`, `add`, `mul`, and literal expression helpers now lower derived columns through Prism, Substrait, and Session execution. diff --git a/docs/rfcs/022_semi_structured_format_functions.md b/docs/rfcs/022_semi_structured_format_functions.md index 8df68b7..bcf4bea 100644 --- a/docs/rfcs/022_semi_structured_format_functions.md +++ b/docs/rfcs/022_semi_structured_format_functions.md @@ -1,6 +1,6 @@ # InQL RFC 022: Semi-structured and format functions -- **Status:** Draft +- **Status:** Implemented - **Created:** 2026-04-27 - **Author(s):** Danny Meijer (@dannymeijer) - **Related:** @@ -10,14 +10,15 @@ - InQL RFC 013 (function catalog program) - InQL RFC 014 (function registry and catalog governance) - InQL RFC 020 (nested data functions) + - InQL RFC 026 (semi-structured variant logical values) - **Issue:** [InQL #39](https://github.com/dannys-code-corner/InQL/issues/39) -- **RFC PR:** — -- **Written against:** Incan v0.2 -- **Shipped in:** — +- **RFC PR:** [InQL #49](https://github.com/dannys-code-corner/InQL/pull/49) +- **Written against:** Incan v0.3-era InQL +- **Shipped in:** v0.1 ## Summary -This RFC defines InQL's semi-structured and format-oriented function families: JSON value functions, CSV value functions, schema inference helpers, type predicates for dynamic values, URL helpers, and hashing functions. These functions are practical data-engineering tools, but they should live in explicit format families rather than the core scalar catalog. +This RFC defines InQL's semi-structured and format-oriented function families: JSON value functions, CSV value functions, schema inference helpers, URL helpers, and hashing functions. These functions are practical data-engineering tools, but they should live in explicit format families rather than the core scalar catalog. Semi-structured variant values and their type predicates are defined separately by InQL RFC 026. ## Motivation @@ -29,7 +30,7 @@ Without a separate RFC, format helpers risk leaking ingestion policy into the sc - Define JSON scalar and schema helper functions. - Define CSV scalar and schema helper functions. -- Define dynamic-value type predicates where InQL supports variant-like values. +- Keep string-backed format helpers compatible with the variant value model defined by InQL RFC 026. - Define URL parse/encode/decode helpers. - Define deterministic hash functions for data engineering. - Keep format functions separate from source reading and writing contracts. @@ -37,7 +38,8 @@ Without a separate RFC, format helpers risk leaking ingestion policy into the sc ## Non-Goals - Defining source discovery, file scanning, or format handler registration. -- Defining XML, variant, geospatial, crypto, or sketch functions. +- Defining XML, geospatial, crypto, or sketch functions. +- Defining semi-structured variant logical values or variant predicates; see InQL RFC 026. - Defining nested array/map/struct functions except as return values of parsing functions. - Defining physical input-file metadata functions. @@ -69,9 +71,7 @@ InQL should define URL functions including `parse_url`, `url_encode`, `url_decod InQL should define hash functions including `crc32`, `md5`, `sha1`, `sha2`, and `xxhash64`, with input encoding and output representation specified. -Where InQL supports variant-like dynamic values, it should define type inspection and predicate functions such as `typeof`, `is_array`, `is_object`, `is_integer`, `is_timestamp`, and `is_null_value`. These functions must not be accepted before the value model they inspect is defined. - -Format functions that return structured values must return typed arrays, maps, structs, or declared model-compatible values according to InQL's nested type rules. +InQL RFC 026 defines semi-structured variant logical values and their type inspection predicates. Parser helpers in this RFC return validated, normalized payload text. RFC 026 must not silently change the meaning of the RFC 022 string-backed payload helpers. Schema inference helper functions must be deterministic for the same input values and options. They must not inspect external files or session state unless explicitly defined as source-discovery functions outside this RFC. @@ -99,7 +99,7 @@ This RFC is additive. It should not change existing CSV ingestion behavior. - **Place all format helpers in the common scalar catalog.** Rejected because format parsing has option, schema, and I/O-adjacent concerns that deserve a separate boundary. - **Make JSON and CSV functions source-only.** Rejected because scalar payload parsing is common inside already-loaded datasets. -- **Add full XML and variant support in the same RFC.** Rejected because those need their own type and compatibility discussion, even though this RFC may reserve JSON parsing and dynamic-value predicate names. +- **Add full XML and variant support in the same RFC.** Rejected because XML and semi-structured variant values need their own type and compatibility discussion. InQL RFC 026 owns the variant value model. ## Drawbacks @@ -110,17 +110,40 @@ This RFC is additive. It should not change existing CSV ingestion behavior. ## Layers affected - **InQL specification** — format functions must stay distinct from source and sink contracts. -- **InQL library package** — public helpers should expose JSON, CSV scalar, URL, and hash functions with option typing. -- **Incan compiler** — typechecking must validate structured return types and schema/model arguments. -- **Execution / interchange** — Prism and Substrait lowering must preserve parser options, hash encodings, and structured return values or diagnose unsupported functions. +- **InQL library package** — public helpers should expose JSON, CSV scalar, URL, and hash functions with explicit scalar argument contracts. +- **Incan compiler** — typechecking must validate current scalar argument shapes; no new compiler syntax is required by this RFC. +- **Execution / interchange** — Prism and Substrait lowering must preserve parser options, hash encodings, and scalar payload contracts or diagnose unsupported functions. - **Documentation** — docs should distinguish scalar format functions from session read/write APIs. -## Unresolved questions - -- Should `from_json` accept model types directly as schema arguments, or only explicit schema values? -- Should invalid JSON path expressions be compile-time errors when literal and runtime errors otherwise? -- What option-record shape should CSV and JSON scalar parsers use? -- Should hash functions return binary values or lowercase hexadecimal strings by default? -- Which variant-style type predicates are portable enough for InQL core, and which should stay in a Snowflake-compatibility extension? - - +## Design Decisions + +### Resolved + +- Hash helpers operate on UTF-8 string bytes and return lowercase hexadecimal strings. +- Portable concrete hash helpers are `md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `crc32`, and `xxhash64`, each with an honest Substrait extension mapping. The DataFusion adapter validates materialized execution for the full helper set, using native DataFusion functions where available and Incan-authored adapter callbacks where DataFusion has no built-in implementation. +- `sha2(expr, bit_length)` is a compatibility helper, not a separate backend mapping. It rewrites to `sha224`, `sha256`, `sha384`, or `sha512` for supported literal bit lengths and rejects unsupported values. +- URL helpers accept scalar URL or component strings. `url_decode(...)` is strict and fails malformed percent escapes; `try_url_decode(...)` returns null for malformed percent escapes. +- JSON helpers accept scalar JSON payload strings. Strict helpers fail invalid JSON; `try_from_json(...)` returns null for invalid JSON; schema and path helpers are deterministic over the provided payload and literal schema/path arguments. +- CSV helpers accept scalar row strings and explicit schema-description strings. They operate on payload values only and do not replace the session CSV read/write contract. +- Semi-structured variant predicates such as `typeof`, `is_array`, `is_object`, `is_integer`, `is_timestamp`, and `is_null_value` belong to InQL RFC 026. RFC 022 does not accept them as JSON-text parser helpers. + +## Implementation Plan + +1. Add registry-backed hashing helpers under a logical function family. +2. Add registry-backed URL, JSON, and CSV scalar payload helpers under logical function families. +3. Add stable Substrait extension anchors for concrete helpers. +4. Keep `sha2(...)` as a compatibility rewrite over concrete helpers rather than a second mapping. +5. Add focused helper, registry, Substrait lowering, and DataFusion-backed session tests with concrete output values across the full helper set. +6. Add user-facing format-function docs and release notes. +7. Record semi-structured variant values as InQL RFC 026 rather than accepting fake JSON-text predicates in RFC 022. + +## Progress Checklist + +- [x] RFC 022 marked Implemented with full scalar format helper design decisions recorded. +- [x] `md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `sha2`, `crc32`, and `xxhash64` helpers added under the function catalog. +- [x] JSON, CSV, and URL scalar payload helpers added under the function catalog. +- [x] Concrete helpers registered with Substrait extension metadata. +- [x] `sha2(...)` implemented as a literal-bit-length rewrite with invalid-input diagnostics. +- [x] Focused helper, registry, Substrait lowering, and DataFusion-backed session tests added. +- [x] User-facing format-function docs and release notes added. +- [x] Semi-structured variant predicates delegated to InQL RFC 026 instead of being accepted as JSON-text parser helpers. diff --git a/docs/rfcs/026_semi_structured_variant_values.md b/docs/rfcs/026_semi_structured_variant_values.md new file mode 100644 index 0000000..65ff770 --- /dev/null +++ b/docs/rfcs/026_semi_structured_variant_values.md @@ -0,0 +1,144 @@ +# InQL RFC 026: Semi-structured variant logical values + +- **Status:** Draft +- **Created:** 2026-05-28 +- **Author(s):** Danny Meijer (@dannymeijer) +- **Related:** + - InQL RFC 002 (Apache Substrait integration) + - InQL RFC 014 (function registry and catalog governance) + - InQL RFC 020 (nested data functions) + - InQL RFC 022 (semi-structured and format functions) + - InQL RFC 024 (function extension policy) +- **Issue:** [InQL #52](https://github.com/dannys-code-corner/InQL/issues/52) +- **RFC PR:** [InQL #49](https://github.com/dannys-code-corner/InQL/pull/49) +- **Written against:** Incan v0.3-era InQL +- **Shipped in:** — + +## Summary + +This RFC defines semi-structured variant logical values for InQL. A variant value is distinct from ordinary `str` and `bytes` payloads: it carries a logical kind such as null, boolean, integer, floating point, string, timestamp, array, or object, and InQL predicates inspect that logical value rather than reparsing arbitrary JSON text. + +## Core model + +1. A semi-structured variant is a first-class logical value, even when a backend stores it in an opaque native representation. +2. Variant null and relation-level SQL null are distinct. Variant predicates must not erase that distinction. +3. Type predicates such as `typeof`, `is_array`, `is_object`, `is_integer`, `is_timestamp`, and `is_null_value` operate on variant expressions, not raw strings. +4. RFC 022 string-backed JSON and CSV payload helpers remain stable; this RFC may add variant-returning parse helpers, but it must not silently change existing helper return types. +5. Backend adapters may implement, emulate, or reject variant operations, but backend-native variant semantics do not define the portable InQL contract. + +## Motivation + +JSON and semi-structured payloads are common in data pipelines, but predicates such as `is_array(...)` and `typeof(...)` are ambiguous if InQL only has string payloads. `is_array(col("payload"))` could mean "parse this string as JSON and inspect the root value", or it could mean "inspect a typed semi-structured value that was already parsed by the logical plan." Those are different contracts with different null behavior, error behavior, and backend portability. + +If InQL accepts variant predicate names before defining variant values, it either squats on better names with string-parser semantics or leaves backend adapters to decide meaning at execution time. This RFC records the missing logical value model so those predicates have a precise home. + +## Goals + +- Define a semi-structured variant logical value family. +- Define the predicate and inspection functions that operate on variant values. +- Distinguish relation-level SQL null from semi-structured null values. +- Define how variant-returning parse helpers interact with RFC 022 string-backed payload helpers. +- Keep variant semantics backend-neutral across Prism, Substrait, and backend adapters. + +## Non-Goals + +- Changing RFC 022 string-backed helpers such as `parse_json`, `from_json`, `to_json`, `from_csv`, or `to_csv`. +- Defining XML, geospatial, sketch, or binary-format functions. +- Inferring timestamps, decimals, or other rich scalar kinds from untyped JSON strings without an explicit schema or parse option. +- Requiring every backend to support native variant storage. +- Defining query-block syntax for variant destructuring. + +## Guide-level explanation (how authors think about it) + +Authors should parse payload text into a variant value before using variant predicates. The exact helper names remain open in this Draft, but the shape is: + +```incan +from pub::inql.functions import col, is_array, is_null_value, parse_variant_json, typeof, variant_get + +events_with_payload = events.with_column("payload_value", parse_variant_json(col("payload"))) + +projected = ( + events_with_payload + .with_column("payload_kind", typeof(col("payload_value"))) + .with_column("is_items_array", is_array(variant_get(col("payload_value"), "$.items"))) + .with_column("deleted_was_json_null", is_null_value(variant_get(col("payload_value"), "$.deleted_at"))) +) +``` + +Authors who only need text validation or normalized JSON strings should keep using the RFC 022 helpers. Variant helpers are for plans that need logical semi-structured values and type-aware predicates. + +## Reference-level explanation (precise rules) + +InQL must define a variant logical value type that can represent at least null, boolean, integer, floating point, string, timestamp, array, and object values. Decimal, binary, date, and interval kinds may be added by design decision before this RFC moves beyond Draft. + +Variant predicates must accept variant expressions. They must not accept ordinary `str` expressions as an implicit parse-and-inspect shortcut. Authors must use an explicit variant parse or cast helper when starting from JSON text. + +`typeof(expr)` must return a stable lowercase kind name for a non-null variant value. It must distinguish at least `null`, `boolean`, `integer`, `float`, `string`, `timestamp`, `array`, and `object`. It must not report `timestamp` for a plain JSON string unless an explicit schema or parse option produced a typed timestamp variant. + +`is_array(expr)`, `is_object(expr)`, `is_integer(expr)`, `is_timestamp(expr)`, and `is_null_value(expr)` must inspect the variant kind. `is_integer(...)` must be true only for integer variant values, not floating point values whose runtime value happens to have no fractional component. `is_null_value(...)` must be true only for semi-structured null values. + +SQL null must remain distinct from variant null. If a predicate input is SQL null rather than a present variant value, the predicate result must follow InQL's scalar null behavior for missing inputs rather than returning true for `is_null_value(...)`. + +Variant parse helpers must define strict and recoverable forms. Strict parse helpers must fail malformed payloads according to registry error metadata. Recoverable parse helpers must return SQL null or another explicitly documented recoverable result for malformed payloads. A JSON `null` payload must produce a present variant null, not SQL null. + +Variant field/path access must preserve whether a missing path produced SQL null, variant null, or a present value. If a backend cannot preserve that distinction, the adapter must reject the operation or require an explicit compatibility mode. + +Substrait lowering must preserve variant logical type identity through extension type metadata or reject the operation before execution. A backend adapter may map variant values and predicates to native functions only when it preserves the InQL variant contract. + +## Design details + +### Syntax + +This RFC does not require new language syntax. Variant values and predicates may use ordinary helper calls and dataframe method chains. Future query-block syntax must lower to the same variant expression model. + +### Semantics + +Variant arrays and objects are semi-structured values, not InQL relation shapes. They may be projected, inspected, and passed to variant-aware helpers. They must not change relation cardinality unless used with a generator or table-valued function that explicitly defines such a change. + +Variant ordering, equality, grouping, and serialization are not implicit. If InQL supports those operations for variants, the operation must define kind ordering, null behavior, and backend compatibility rather than inheriting an arbitrary backend default. + +### Interaction with other InQL surfaces + +RFC 020 nested data functions operate on typed array, map, and struct expressions. Variant arrays and objects may interoperate with those functions only through explicit conversion rules. + +RFC 022 format helpers that return normalized JSON or CSV text remain string-backed helpers. This RFC may add variant-returning helpers with different names or explicit options, but existing RFC 022 helpers must not silently change return type. + +Prism may use variant kind metadata for validation, projection pruning, and rewrite safety. Prism must not rewrite variant operations in ways that collapse SQL null and variant null, drop schema-directed typed scalar information, or turn variant predicates into text parser calls. + +The Substrait boundary remains between InQL semantics and backend execution. DataFusion or any other backend may be an implementation target, but backend-native variant names, path syntaxes, and null rules do not define the portable InQL semantics. + +### Compatibility / migration + +This RFC is additive. Existing string payload columns remain strings. Existing RFC 022 functions keep their current string-backed behavior. Authors who want variant semantics must opt into variant-returning helpers or explicit casts. + +## Alternatives considered + +- **Make `typeof` and `is_array` parse strings directly.** Rejected because it couples predicate semantics to JSON text parsing and makes typed variant values incompatible with the obvious function names. +- **Change RFC 022 JSON helpers to return variants.** Rejected because it would silently change the meaning of already-defined string-backed payload helpers and make simple normalization workflows depend on a richer value model. +- **Expose backend-native variant functions as compatibility aliases.** Rejected as the portable core because backend-native null behavior, path syntax, and type names differ. +- **Represent variants as ordinary structs or maps.** Rejected unless the value carries distinct variant logical type identity; ordinary nested values do not by themselves encode semi-structured null and kind semantics. + +## Drawbacks + +- Variant logical values add a new type family to expression planning and backend capability reporting. +- Cross-backend support will be uneven because engines differ in native semi-structured support. +- Keeping SQL null and variant null distinct requires careful documentation, tests, and adapter validation. +- Variant path access can become a large surface if not kept separate from parser and generator responsibilities. + +## Layers affected + +- **InQL specification** — variant values must be distinct from strings, bytes, typed nested values, and sketch values. +- **InQL library package** — public helpers must expose variant parse, path access, and predicate functions only with explicit registry metadata. +- **Incan compiler** — typechecking may need enough helper metadata to reject string expressions where variant expressions are required. +- **Execution / interchange** — Prism, Substrait lowering, and backend adapters must preserve variant type identity and SQL-null versus variant-null behavior or reject unsupported operations. +- **Documentation** — function references must present variant predicates as variant operations, not as JSON-text parser shortcuts. + +## Unresolved questions + +- What is the public type spelling for variant expressions? +- Should variant-returning JSON parse helpers use new names such as `parse_variant_json`, or should RFC 022 helpers gain explicit options that return variants? +- Which scalar kinds are required before Planned status: decimal, binary, date, interval, or only the minimum JSON-compatible set plus timestamp? +- What path expression grammar should variant access use, and should it match RFC 022 JSON path helper strings? +- Should missing object keys and out-of-range array indexes return SQL null, a missing sentinel, or an error in strict modes? + + diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index 523de58..fd30dfc 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -28,9 +28,10 @@ InQL uses its **own** RFC series (starting at 000), independent of the [Incan la | [019][rfc-019] | Implemented | Window functions | | | [020][rfc-020] | Implemented | Nested data functions | | | [021][rfc-021] | Implemented | Generator and table-valued functions | | -| [022][rfc-022] | Draft | Semi-structured and format functions | | +| [022][rfc-022] | Implemented | Semi-structured and format functions | | | [023][rfc-023] | Draft | Approximate and sketch functions | | | [024][rfc-024] | Implemented | Function extension policy | | +| [026][rfc-026] | Draft | Semi-structured variant logical values | | @@ -67,4 +68,5 @@ New RFCs should follow [TEMPLATE.md] (aligned with Incan’s RFC structure, adap [rfc-022]: 022_semi_structured_format_functions.md [rfc-023]: 023_approximate_sketch_functions.md [rfc-024]: 024_function_extension_policy.md +[rfc-026]: 026_semi_structured_variant_values.md [incan-rfcs]: https://github.com/dannys-code-corner/incan/tree/main/workspaces/docs-site/docs/RFCs diff --git a/incan.lock b/incan.lock index 2a6d844..97b484b 100644 --- a/incan.lock +++ b/incan.lock @@ -3,8 +3,8 @@ [incan] format = 1 -incan-version = "0.3.0-rc17" -deps-fingerprint = "sha256:424fd53b12f0e810ffe3ba4188a82203bb011b5dd0769ad2b33ab1b854027487" +incan-version = "0.3.0-rc23" +deps-fingerprint = "sha256:825926ed0a97c47f458cb9e9ed6c142473cb37e7f047d3de1a32263f1cef5097" cargo-features = [] cargo-no-default-features = false cargo-all-features = false @@ -433,9 +433,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.2" +version = "8.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -444,9 +444,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.0" +version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1371,9 +1371,9 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -1675,9 +1675,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" dependencies = [ "bytes", "itoa", @@ -1824,14 +1824,14 @@ dependencies = [ [[package]] name = "incan_core" -version = "0.3.0-rc17" +version = "0.3.0-rc23" dependencies = [ "serde", ] [[package]] name = "incan_derive" -version = "0.3.0-rc17" +version = "0.3.0-rc23" dependencies = [ "proc-macro2", "quote", @@ -1840,10 +1840,12 @@ dependencies = [ [[package]] name = "incan_stdlib" -version = "0.3.0-rc17" +version = "0.3.0-rc23" dependencies = [ "incan_core", "incan_derive", + "serde", + "serde_json", "tokio", "xxhash-rust", ] @@ -1862,10 +1864,13 @@ dependencies = [ [[package]] name = "inql" -version = "0.3.0-rc17" +version = "0.3.0-rc23" dependencies = [ "byteorder", + "crc32fast", "datafusion", + "datafusion-common", + "datafusion-expr", "datafusion-substrait", "encoding_rs", "incan_derive", @@ -1873,7 +1878,12 @@ dependencies = [ "prost", "prost-types", "rustix", + "serde", + "sha1", + "sha2", "substrait 0.63.0", + "url", + "xxhash-rust", ] [[package]] @@ -2068,9 +2078,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" [[package]] name = "miniz_oxide" @@ -2731,6 +2741,17 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest", +] + [[package]] name = "sha2" version = "0.10.9" @@ -3523,18 +3544,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "bce33a6288fa3f072a8c2c7d0f2fdbb90e28298f0135c1f99b96c3db2efcc60b" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "8fd425244944f4ab65ccff928e7323354c5a018c75838362fdce749dfad2ee1e" dependencies = [ "proc-macro2", "quote", diff --git a/incan.toml b/incan.toml index 30f2701..cada06b 100644 --- a/incan.toml +++ b/incan.toml @@ -10,4 +10,11 @@ prost-types = "0.14" substrait = { version = "0.63", features = ["protoc"] } # Datafusion is InQL default query engine, and provides the Substrait to Datafusion translation. datafusion = "53" +datafusion_common = { package = "datafusion-common", version = "53" } +datafusion_expr = { package = "datafusion-expr", version = "53" } datafusion-substrait = { version = "53", features = ["protoc"] } +crc32fast = "1.5" +sha1 = "0.10" +sha2 = "0.10.9" +url = "2.5" +xxhash_rust = { package = "xxhash-rust", version = "0.8", features = ["xxh64"] } diff --git a/src/functions/csv/from_csv.incn b/src/functions/csv/from_csv.incn new file mode 100644 index 0000000..0aef426 --- /dev/null +++ b/src/functions/csv/from_csv.incn @@ -0,0 +1,35 @@ +"""CSV scalar parsing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr, str_expr +from substrait.function_extensions import FROM_CSV_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("from_csv", FROM_CSV_FUNCTION_ANCHOR), +)) +pub def from_csv(expr: ColumnExpr, schema: str) -> ColumnExpr: + """ + Parse one CSV row string into a JSON payload using an optional schema description for field names. + + Examples: + row = from_csv(col("line"), "STRUCT") + + Parameters: + expr: String expression containing one CSV row. + schema: Explicit schema description used for output field names. + """ + return registered_application("from_csv", [expr, str_expr(schema)]) diff --git a/src/functions/csv/mod.incn b/src/functions/csv/mod.incn new file mode 100644 index 0000000..2a4e1a2 --- /dev/null +++ b/src/functions/csv/mod.incn @@ -0,0 +1,5 @@ +"""CSV scalar format helpers.""" + +pub from functions.csv.from_csv import from_csv +pub from functions.csv.schema_of_csv import schema_of_csv +pub from functions.csv.to_csv import to_csv diff --git a/src/functions/csv/schema_of_csv.incn b/src/functions/csv/schema_of_csv.incn new file mode 100644 index 0000000..fbc53e5 --- /dev/null +++ b/src/functions/csv/schema_of_csv.incn @@ -0,0 +1,34 @@ +"""CSV scalar schema inference helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SCHEMA_OF_CSV_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("schema_of_csv", SCHEMA_OF_CSV_FUNCTION_ANCHOR), +)) +pub def schema_of_csv(expr: ColumnExpr) -> ColumnExpr: + """ + Infer a deterministic schema description from one CSV row string. + + Examples: + schema = schema_of_csv(col("line")) + + Parameters: + expr: String expression containing one CSV row. + """ + return registered_application("schema_of_csv", [expr]) diff --git a/src/functions/csv/to_csv.incn b/src/functions/csv/to_csv.incn new file mode 100644 index 0000000..77924ad --- /dev/null +++ b/src/functions/csv/to_csv.incn @@ -0,0 +1,32 @@ +"""CSV scalar serialization helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import TO_CSV_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("to_csv", TO_CSV_FUNCTION_ANCHOR), +)) +pub def to_csv(expr: ColumnExpr) -> ColumnExpr: + """ + Serialize one scalar or JSON array/object payload as one CSV row string. + + Examples: + line = to_csv(col("payload")) + + Parameters: + expr: String-backed scalar or JSON array/object payload to serialize as one CSV row. + """ + return registered_application("to_csv", [expr]) diff --git a/src/functions/hashing/crc32.incn b/src/functions/hashing/crc32.incn new file mode 100644 index 0000000..f2d0d97 --- /dev/null +++ b/src/functions/hashing/crc32.incn @@ -0,0 +1,32 @@ +"""CRC-32 hashing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import CRC32_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("crc32", CRC32_FUNCTION_ANCHOR), +)) +pub def crc32(expr: ColumnExpr) -> ColumnExpr: + """ + Return the lowercase eight-character hexadecimal CRC-32 digest for one UTF-8 string expression. + + Examples: + digest = crc32(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("crc32", [expr]) diff --git a/src/functions/hashing/md5.incn b/src/functions/hashing/md5.incn new file mode 100644 index 0000000..69fe160 --- /dev/null +++ b/src/functions/hashing/md5.incn @@ -0,0 +1,51 @@ +""" +MD5 hash helper. + +`md5` hashes a string expression and returns its lowercase hexadecimal digest. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import MD5_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("md5", MD5_FUNCTION_ANCHOR), +)) +pub def md5(expr: ColumnExpr) -> ColumnExpr: + """ + Build an MD5 hexadecimal digest expression. + + Examples: + user_digest = md5(col("user_id")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("md5", [expr]) + + +module tests: + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_md5_builds_registered_application() -> None: + expr = md5(col("payload")) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "md5" + assert column_expr_argument_count(expr) == 1 diff --git a/src/functions/hashing/sha1.incn b/src/functions/hashing/sha1.incn new file mode 100644 index 0000000..ef82a2e --- /dev/null +++ b/src/functions/hashing/sha1.incn @@ -0,0 +1,32 @@ +"""SHA-1 hashing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SHA1_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("sha1", SHA1_FUNCTION_ANCHOR), +)) +pub def sha1(expr: ColumnExpr) -> ColumnExpr: + """ + Return the lowercase hexadecimal SHA-1 digest for one UTF-8 string expression. + + Examples: + digest = sha1(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("sha1", [expr]) diff --git a/src/functions/hashing/sha2.incn b/src/functions/hashing/sha2.incn new file mode 100644 index 0000000..6871e1b --- /dev/null +++ b/src/functions/hashing/sha2.incn @@ -0,0 +1,76 @@ +""" +SHA-2 compatibility helper. + +`sha2(expr, bits)` rewrites to the matching concrete SHA-2 helper for supported digest lengths. +""" + +from rust::incan_stdlib::errors import raise_value_error +from function_registry import ( + CORE_FUNCTION_NAMESPACE, + FunctionClass, + FunctionDeterminism, + FunctionErrorBehavior, + FunctionLifecycle, + FunctionNullBehavior, + compatibility_alias_spec, + rewrite_mapping, + v0_1, +) +from functions.hashing.sha224 import sha224 +from functions.hashing.sha256 import sha256 +from functions.hashing.sha384 import sha384 +from functions.hashing.sha512 import sha512 +from functions.registry import register_function +from projection_builders import ColumnExpr + + +@register_function(compatibility_alias_spec( + namespace=CORE_FUNCTION_NAMESPACE, + function_class=FunctionClass.Scalar, + aliases=["sha224", "sha256", "sha384", "sha512"], + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + determinism=FunctionDeterminism.Deterministic, + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=rewrite_mapping("sha2(expr, bits) -> sha224/sha256/sha384/sha512(expr) for supported literal bit lengths"), +)) +pub def sha2(expr: ColumnExpr, bit_length: int) -> ColumnExpr: + """ + Build a SHA-2 hexadecimal digest expression for a supported digest length. + + Examples: + user_digest = sha2(col("user_id"), 256) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + bit_length: Supported digest size: 224, 256, 384, or 512. + """ + if bit_length == 224: + return sha224(expr) + if bit_length == 256: + return sha256(expr) + if bit_length == 384: + return sha384(expr) + if bit_length == 512: + return sha512(expr) + return raise_value_error("sha2 bit_length must be one of 224, 256, 384, or 512") + + +module tests: + from std.testing import assert_raises + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_sha2_rewrites_to_supported_sha2_helper() -> None: + expr = sha2(col("payload"), 256) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "sha256" + assert column_expr_argument_count(expr) == 1 + def _call_sha2_with_unsupported_length() -> None: + sha2(col("payload"), 1) + def test_sha2_rejects_unsupported_bit_length() -> None: + assert_raises[ValueError](_call_sha2_with_unsupported_length) diff --git a/src/functions/hashing/sha224.incn b/src/functions/hashing/sha224.incn new file mode 100644 index 0000000..a31807a --- /dev/null +++ b/src/functions/hashing/sha224.incn @@ -0,0 +1,51 @@ +""" +SHA-224 hash helper. + +`sha224` hashes a string expression and returns its lowercase hexadecimal digest. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SHA224_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("sha224", SHA224_FUNCTION_ANCHOR), +)) +pub def sha224(expr: ColumnExpr) -> ColumnExpr: + """ + Build a SHA-224 hexadecimal digest expression. + + Examples: + payload_digest = sha224(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("sha224", [expr]) + + +module tests: + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_sha224_builds_registered_application() -> None: + expr = sha224(col("payload")) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "sha224" + assert column_expr_argument_count(expr) == 1 diff --git a/src/functions/hashing/sha256.incn b/src/functions/hashing/sha256.incn new file mode 100644 index 0000000..39d11db --- /dev/null +++ b/src/functions/hashing/sha256.incn @@ -0,0 +1,51 @@ +""" +SHA-256 hash helper. + +`sha256` hashes a string expression and returns its lowercase hexadecimal digest. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SHA256_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("sha256", SHA256_FUNCTION_ANCHOR), +)) +pub def sha256(expr: ColumnExpr) -> ColumnExpr: + """ + Build a SHA-256 hexadecimal digest expression. + + Examples: + payload_digest = sha256(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("sha256", [expr]) + + +module tests: + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_sha256_builds_registered_application() -> None: + expr = sha256(col("payload")) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "sha256" + assert column_expr_argument_count(expr) == 1 diff --git a/src/functions/hashing/sha384.incn b/src/functions/hashing/sha384.incn new file mode 100644 index 0000000..de0e371 --- /dev/null +++ b/src/functions/hashing/sha384.incn @@ -0,0 +1,51 @@ +""" +SHA-384 hash helper. + +`sha384` hashes a string expression and returns its lowercase hexadecimal digest. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SHA384_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("sha384", SHA384_FUNCTION_ANCHOR), +)) +pub def sha384(expr: ColumnExpr) -> ColumnExpr: + """ + Build a SHA-384 hexadecimal digest expression. + + Examples: + payload_digest = sha384(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("sha384", [expr]) + + +module tests: + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_sha384_builds_registered_application() -> None: + expr = sha384(col("payload")) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "sha384" + assert column_expr_argument_count(expr) == 1 diff --git a/src/functions/hashing/sha512.incn b/src/functions/hashing/sha512.incn new file mode 100644 index 0000000..bfc9780 --- /dev/null +++ b/src/functions/hashing/sha512.incn @@ -0,0 +1,51 @@ +""" +SHA-512 hash helper. + +`sha512` hashes a string expression and returns its lowercase hexadecimal digest. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SHA512_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("sha512", SHA512_FUNCTION_ANCHOR), +)) +pub def sha512(expr: ColumnExpr) -> ColumnExpr: + """ + Build a SHA-512 hexadecimal digest expression. + + Examples: + payload_digest = sha512(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("sha512", [expr]) + + +module tests: + from projection_builders import ( + ColumnExprKind, + col, + column_expr_argument_count, + column_expr_function_name, + column_expr_kind, + ) + def test_sha512_builds_registered_application() -> None: + expr = sha512(col("payload")) + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction + assert column_expr_function_name(expr) == "sha512" + assert column_expr_argument_count(expr) == 1 diff --git a/src/functions/hashing/xxhash64.incn b/src/functions/hashing/xxhash64.incn new file mode 100644 index 0000000..4422427 --- /dev/null +++ b/src/functions/hashing/xxhash64.incn @@ -0,0 +1,32 @@ +"""xxHash64 hashing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import XXHASH64_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("xxhash64", XXHASH64_FUNCTION_ANCHOR), +)) +pub def xxhash64(expr: ColumnExpr) -> ColumnExpr: + """ + Return the lowercase sixteen-character hexadecimal xxHash64 digest for one UTF-8 string expression. + + Examples: + digest = xxhash64(col("payload")) + + Parameters: + expr: String expression whose UTF-8 bytes should be hashed. + """ + return registered_application("xxhash64", [expr]) diff --git a/src/functions/json/check_json.incn b/src/functions/json/check_json.incn new file mode 100644 index 0000000..da1ade3 --- /dev/null +++ b/src/functions/json/check_json.incn @@ -0,0 +1,32 @@ +"""JSON validity predicate helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import CHECK_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("check_json", CHECK_JSON_FUNCTION_ANCHOR), +)) +pub def check_json(expr: ColumnExpr) -> ColumnExpr: + """ + Return whether one string expression contains valid JSON. + + Examples: + valid = check_json(col("payload")) + + Parameters: + expr: String expression containing a JSON payload. + """ + return registered_application("check_json", [expr]) diff --git a/src/functions/json/common.incn b/src/functions/json/common.incn new file mode 100644 index 0000000..089a2c8 --- /dev/null +++ b/src/functions/json/common.incn @@ -0,0 +1,13 @@ +"""Shared validation helpers for JSON scalar payload functions.""" + +from projection_builders import ColumnExpr, str_expr +from rust::incan_stdlib::errors import raise_value_error + + +pub def json_path_expr(path: str) -> ColumnExpr: + """Build a path literal expression after validating the supported JSON path prefix.""" + if path == "$": + return str_expr(path) + if len(path) >= 2 and path[0] == "$" and (path[1] == "." or path[1] == "["): + return str_expr(path) + return raise_value_error("JSON path must be `$`, start with `$.`, or start with `$[`") diff --git a/src/functions/json/from_json.incn b/src/functions/json/from_json.incn new file mode 100644 index 0000000..1edc210 --- /dev/null +++ b/src/functions/json/from_json.incn @@ -0,0 +1,35 @@ +"""JSON parse helper with an explicit schema description.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr, str_expr +from substrait.function_extensions import FROM_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("from_json", FROM_JSON_FUNCTION_ANCHOR), +)) +pub def from_json(expr: ColumnExpr, schema: str) -> ColumnExpr: + """ + Validate JSON using an explicit schema description and return a normalized JSON payload. + + Examples: + payload = from_json(col("payload"), "STRUCT") + + Parameters: + expr: String expression containing a JSON payload. + schema: Explicit schema description for the expected JSON payload. + """ + return registered_application("from_json", [expr, str_expr(schema)]) diff --git a/src/functions/json/get_json_object.incn b/src/functions/json/get_json_object.incn new file mode 100644 index 0000000..29c9bbb --- /dev/null +++ b/src/functions/json/get_json_object.incn @@ -0,0 +1,36 @@ +"""JSON object-path extraction helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.json.common import json_path_expr +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import GET_JSON_OBJECT_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("get_json_object", GET_JSON_OBJECT_FUNCTION_ANCHOR), +)) +pub def get_json_object(expr: ColumnExpr, path: str) -> ColumnExpr: + """ + Extract one JSON value at a literal object path and return it as JSON text. + + Examples: + event_type = get_json_object(col("payload"), "$.type") + + Parameters: + expr: String expression containing a JSON payload. + path: Literal JSON path beginning with `$`, `$.`, or `$[`. + """ + return registered_application("get_json_object", [expr, json_path_expr(path)]) diff --git a/src/functions/json/json_array_length.incn b/src/functions/json/json_array_length.incn new file mode 100644 index 0000000..c5c5e76 --- /dev/null +++ b/src/functions/json/json_array_length.incn @@ -0,0 +1,34 @@ +"""JSON array length helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import JSON_ARRAY_LENGTH_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("json_array_length", JSON_ARRAY_LENGTH_FUNCTION_ANCHOR), +)) +pub def json_array_length(expr: ColumnExpr) -> ColumnExpr: + """ + Return the number of elements in a JSON array, or null for non-array JSON values. + + Examples: + tag_count = json_array_length(col("tags_json")) + + Parameters: + expr: String expression containing a JSON array payload. + """ + return registered_application("json_array_length", [expr]) diff --git a/src/functions/json/json_extract_path_text.incn b/src/functions/json/json_extract_path_text.incn new file mode 100644 index 0000000..33e5363 --- /dev/null +++ b/src/functions/json/json_extract_path_text.incn @@ -0,0 +1,36 @@ +"""JSON path text extraction helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.json.common import json_path_expr +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import JSON_EXTRACT_PATH_TEXT_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("json_extract_path_text", JSON_EXTRACT_PATH_TEXT_FUNCTION_ANCHOR), +)) +pub def json_extract_path_text(expr: ColumnExpr, path: str) -> ColumnExpr: + """ + Extract one JSON value at a literal object path and return scalar strings as plain text. + + Examples: + event_type = json_extract_path_text(col("payload"), "$.type") + + Parameters: + expr: String expression containing a JSON payload. + path: Literal JSON path beginning with `$`, `$.`, or `$[`. + """ + return registered_application("json_extract_path_text", [expr, json_path_expr(path)]) diff --git a/src/functions/json/json_object_keys.incn b/src/functions/json/json_object_keys.incn new file mode 100644 index 0000000..fb39d29 --- /dev/null +++ b/src/functions/json/json_object_keys.incn @@ -0,0 +1,34 @@ +"""JSON object key listing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import JSON_OBJECT_KEYS_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("json_object_keys", JSON_OBJECT_KEYS_FUNCTION_ANCHOR), +)) +pub def json_object_keys(expr: ColumnExpr) -> ColumnExpr: + """ + Return object keys from a JSON payload as a JSON array string. + + Examples: + keys = json_object_keys(col("payload")) + + Parameters: + expr: String expression containing a JSON object payload. + """ + return registered_application("json_object_keys", [expr]) diff --git a/src/functions/json/mod.incn b/src/functions/json/mod.incn new file mode 100644 index 0000000..e72bc12 --- /dev/null +++ b/src/functions/json/mod.incn @@ -0,0 +1,12 @@ +"""JSON scalar format helpers.""" + +pub from functions.json.check_json import check_json +pub from functions.json.from_json import from_json +pub from functions.json.get_json_object import get_json_object +pub from functions.json.json_array_length import json_array_length +pub from functions.json.json_extract_path_text import json_extract_path_text +pub from functions.json.json_object_keys import json_object_keys +pub from functions.json.parse_json import parse_json +pub from functions.json.schema_of_json import schema_of_json +pub from functions.json.to_json import to_json +pub from functions.json.try_from_json import try_from_json diff --git a/src/functions/json/parse_json.incn b/src/functions/json/parse_json.incn new file mode 100644 index 0000000..afb757b --- /dev/null +++ b/src/functions/json/parse_json.incn @@ -0,0 +1,34 @@ +"""Strict JSON normalization helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import PARSE_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("parse_json", PARSE_JSON_FUNCTION_ANCHOR), +)) +pub def parse_json(expr: ColumnExpr) -> ColumnExpr: + """ + Validate and normalize one JSON scalar payload. + + Examples: + normalized = parse_json(col("payload")) + + Parameters: + expr: String expression containing a JSON payload. + """ + return registered_application("parse_json", [expr]) diff --git a/src/functions/json/schema_of_json.incn b/src/functions/json/schema_of_json.incn new file mode 100644 index 0000000..809f292 --- /dev/null +++ b/src/functions/json/schema_of_json.incn @@ -0,0 +1,34 @@ +"""JSON schema inference helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import SCHEMA_OF_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("schema_of_json", SCHEMA_OF_JSON_FUNCTION_ANCHOR), +)) +pub def schema_of_json(expr: ColumnExpr) -> ColumnExpr: + """ + Infer a deterministic schema description from one JSON scalar payload. + + Examples: + schema = schema_of_json(col("payload")) + + Parameters: + expr: String expression containing a JSON payload. + """ + return registered_application("schema_of_json", [expr]) diff --git a/src/functions/json/to_json.incn b/src/functions/json/to_json.incn new file mode 100644 index 0000000..345e065 --- /dev/null +++ b/src/functions/json/to_json.incn @@ -0,0 +1,32 @@ +"""JSON serialization helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import TO_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("to_json", TO_JSON_FUNCTION_ANCHOR), +)) +pub def to_json(expr: ColumnExpr) -> ColumnExpr: + """ + Serialize one scalar expression as JSON text. + + Examples: + payload = to_json(col("event_type")) + + Parameters: + expr: String-backed scalar expression to serialize as JSON text. + """ + return registered_application("to_json", [expr]) diff --git a/src/functions/json/try_from_json.incn b/src/functions/json/try_from_json.incn new file mode 100644 index 0000000..2d9ecc5 --- /dev/null +++ b/src/functions/json/try_from_json.incn @@ -0,0 +1,33 @@ +"""Recoverable JSON parse helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr, str_expr +from substrait.function_extensions import TRY_FROM_JSON_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("try_from_json", TRY_FROM_JSON_FUNCTION_ANCHOR), +)) +pub def try_from_json(expr: ColumnExpr, schema: str) -> ColumnExpr: + """ + Parse JSON with an explicit schema description and return null when the payload is invalid. + + Examples: + payload = try_from_json(col("payload"), "STRUCT") + + Parameters: + expr: String expression containing a JSON payload. + schema: Explicit schema description for the expected JSON payload. + """ + return registered_application("try_from_json", [expr, str_expr(schema)]) diff --git a/src/functions/mod.incn b/src/functions/mod.incn index 9292041..6a2ae8b 100644 --- a/src/functions/mod.incn +++ b/src/functions/mod.incn @@ -81,6 +81,32 @@ pub from functions.windows.first_value import first_value pub from functions.windows.last_value import last_value pub from functions.windows.nth_value import nth_value pub from functions.windows.bounds import current_row, following, preceding, unbounded_following, unbounded_preceding +pub from functions.hashing.md5 import md5 +pub from functions.hashing.crc32 import crc32 +pub from functions.hashing.sha1 import sha1 +pub from functions.hashing.sha2 import sha2 +pub from functions.hashing.sha224 import sha224 +pub from functions.hashing.sha256 import sha256 +pub from functions.hashing.sha384 import sha384 +pub from functions.hashing.sha512 import sha512 +pub from functions.hashing.xxhash64 import xxhash64 +pub from functions.urls.parse_url import parse_url +pub from functions.urls.try_url_decode import try_url_decode +pub from functions.urls.url_decode import url_decode +pub from functions.urls.url_encode import url_encode +pub from functions.json.check_json import check_json +pub from functions.json.from_json import from_json +pub from functions.json.get_json_object import get_json_object +pub from functions.json.json_array_length import json_array_length +pub from functions.json.json_extract_path_text import json_extract_path_text +pub from functions.json.json_object_keys import json_object_keys +pub from functions.json.parse_json import parse_json +pub from functions.json.schema_of_json import schema_of_json +pub from functions.json.to_json import to_json +pub from functions.json.try_from_json import try_from_json +pub from functions.csv.from_csv import from_csv +pub from functions.csv.schema_of_csv import schema_of_csv +pub from functions.csv.to_csv import to_csv pub from functions.operators.add import add pub from functions.operators.and_ import and_ pub from functions.operators.div import div diff --git a/src/functions/urls/mod.incn b/src/functions/urls/mod.incn new file mode 100644 index 0000000..a664957 --- /dev/null +++ b/src/functions/urls/mod.incn @@ -0,0 +1,6 @@ +"""URL format helpers.""" + +pub from functions.urls.parse_url import parse_url +pub from functions.urls.try_url_decode import try_url_decode +pub from functions.urls.url_decode import url_decode +pub from functions.urls.url_encode import url_encode diff --git a/src/functions/urls/parse_url.incn b/src/functions/urls/parse_url.incn new file mode 100644 index 0000000..147708b --- /dev/null +++ b/src/functions/urls/parse_url.incn @@ -0,0 +1,40 @@ +"""URL component parsing helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr, str_expr +from rust::incan_stdlib::errors import raise_value_error +from substrait.function_extensions import PARSE_URL_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("parse_url", PARSE_URL_FUNCTION_ANCHOR), +)) +pub def parse_url(expr: ColumnExpr, part: str) -> ColumnExpr: + """ + Extract one URL component such as `scheme`, `host`, `path`, `query`, `fragment`, `port`, `username`, or `password`. + + Examples: + host = parse_url(col("landing_page"), "host") + + Parameters: + expr: String expression containing one absolute URL. + part: URL component to extract. + """ + if part == "scheme" or part == "host" or part == "path" or part == "query" or part == "fragment" or part == "port" or part == "username" or part == "password": + return registered_application("parse_url", [expr, str_expr(part)]) + return raise_value_error( + "parse_url part must be one of scheme, host, path, query, fragment, port, username, or password", + ) diff --git a/src/functions/urls/try_url_decode.incn b/src/functions/urls/try_url_decode.incn new file mode 100644 index 0000000..21c2748 --- /dev/null +++ b/src/functions/urls/try_url_decode.incn @@ -0,0 +1,32 @@ +"""Recoverable URL component decoding helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import TRY_URL_DECODE_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("try_url_decode", TRY_URL_DECODE_FUNCTION_ANCHOR), +)) +pub def try_url_decode(expr: ColumnExpr) -> ColumnExpr: + """ + Percent-decode one URL component and return null for malformed input. + + Examples: + decoded = try_url_decode(col("encoded_query")) + + Parameters: + expr: String expression containing one percent-encoded URL component. + """ + return registered_application("try_url_decode", [expr]) diff --git a/src/functions/urls/url_decode.incn b/src/functions/urls/url_decode.incn new file mode 100644 index 0000000..e50fd39 --- /dev/null +++ b/src/functions/urls/url_decode.incn @@ -0,0 +1,34 @@ +"""Strict URL component decoding helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionErrorBehavior, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import URL_DECODE_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + error_behavior=FunctionErrorBehavior.InvalidInputDiagnostic, + substrait=extension_mapping("url_decode", URL_DECODE_FUNCTION_ANCHOR), +)) +pub def url_decode(expr: ColumnExpr) -> ColumnExpr: + """ + Percent-decode one URL component and fail on malformed percent escapes or invalid UTF-8. + + Examples: + decoded = url_decode(col("encoded_query")) + + Parameters: + expr: String expression containing one percent-encoded URL component. + """ + return registered_application("url_decode", [expr]) diff --git a/src/functions/urls/url_encode.incn b/src/functions/urls/url_encode.incn new file mode 100644 index 0000000..f6b4371 --- /dev/null +++ b/src/functions/urls/url_encode.incn @@ -0,0 +1,32 @@ +"""URL component encoding helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.registry import register_function, registered_application +from projection_builders import ColumnExpr +from substrait.function_extensions import URL_ENCODE_FUNCTION_ANCHOR + + +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("url_encode", URL_ENCODE_FUNCTION_ANCHOR), +)) +pub def url_encode(expr: ColumnExpr) -> ColumnExpr: + """ + Percent-encode one UTF-8 string for URL component use. + + Examples: + encoded = url_encode(col("query_text")) + + Parameters: + expr: String expression to percent-encode as a URL component. + """ + return registered_application("url_encode", [expr]) diff --git a/src/lib.incn b/src/lib.incn index 08cddb9..79a72eb 100644 --- a/src/lib.incn +++ b/src/lib.incn @@ -133,6 +133,32 @@ pub from functions.windows.first_value import first_value pub from functions.windows.last_value import last_value pub from functions.windows.nth_value import nth_value pub from functions.windows.bounds import current_row, following, preceding, unbounded_following, unbounded_preceding +pub from functions.hashing.md5 import md5 +pub from functions.hashing.crc32 import crc32 +pub from functions.hashing.sha1 import sha1 +pub from functions.hashing.sha2 import sha2 +pub from functions.hashing.sha224 import sha224 +pub from functions.hashing.sha256 import sha256 +pub from functions.hashing.sha384 import sha384 +pub from functions.hashing.sha512 import sha512 +pub from functions.hashing.xxhash64 import xxhash64 +pub from functions.urls.parse_url import parse_url +pub from functions.urls.try_url_decode import try_url_decode +pub from functions.urls.url_decode import url_decode +pub from functions.urls.url_encode import url_encode +pub from functions.json.check_json import check_json +pub from functions.json.from_json import from_json +pub from functions.json.get_json_object import get_json_object +pub from functions.json.json_array_length import json_array_length +pub from functions.json.json_extract_path_text import json_extract_path_text +pub from functions.json.json_object_keys import json_object_keys +pub from functions.json.parse_json import parse_json +pub from functions.json.schema_of_json import schema_of_json +pub from functions.json.to_json import to_json +pub from functions.json.try_from_json import try_from_json +pub from functions.csv.from_csv import from_csv +pub from functions.csv.schema_of_csv import schema_of_csv +pub from functions.csv.to_csv import to_csv pub from functions.operators.add import add pub from functions.operators.and_ import and_ pub from functions.operators.div import div diff --git a/src/session/datafusion_backend.incn b/src/session/datafusion_backend.incn index 8aa8c1d..920cf2f 100644 --- a/src/session/datafusion_backend.incn +++ b/src/session/datafusion_backend.incn @@ -95,6 +95,7 @@ from substrait.relations import project_rel_with_expressions, read_named_table_r from substrait.schema import RowColumnSpec, SubstraitPrimitiveKind from substrait.schema_registry import register_named_table_schema from substrait.window_metadata import WINDOW_NULL_TREATMENT_OPTION, WINDOW_OUTPUT_NAME_OPTION +from session.datafusion_format_functions import register_format_udfs @derive(Clone) @@ -146,6 +147,7 @@ pub async def datafusion_execute_async( ) -> Result[None, BackendError]: """Execute one Substrait plan via DataFusion and discard collected rows.""" ctx = SessionContext.new() + register_format_udfs(ctx.clone()) await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? @@ -161,6 +163,7 @@ pub async def datafusion_collect_materialization_async( """Execute one Substrait plan via DataFusion and return structured DataFrame materialization.""" ctx = SessionContext.new() resolved_columns = root_names(plan.clone()) + register_format_udfs(ctx.clone()) await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? @@ -189,6 +192,7 @@ pub async def datafusion_write_csv_async( ) -> Result[None, BackendError]: """Execute one plan and write result rows to a CSV sink URI via DataFusion.""" ctx = SessionContext.new() + register_format_udfs(ctx.clone()) await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? @@ -204,6 +208,7 @@ pub async def datafusion_write_parquet_async( ) -> Result[None, BackendError]: """Execute one plan and write result rows to a Parquet sink URI via DataFusion.""" ctx = SessionContext.new() + register_format_udfs(ctx.clone()) await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? diff --git a/src/session/datafusion_format_functions.incn b/src/session/datafusion_format_functions.incn new file mode 100644 index 0000000..57f7a97 --- /dev/null +++ b/src/session/datafusion_format_functions.incn @@ -0,0 +1,876 @@ +"""DataFusion execution bridge for InQL-owned scalar format functions.""" + +from rust::crc32fast import Hasher as Crc32Hasher +from rust::datafusion::arrow::array import ArrayRef, BooleanArray, Int64Array, StringArray +from rust::datafusion::arrow::datatypes import DataType +from rust::datafusion::execution::context import SessionContext +from rust::datafusion_common import DataFusionError, ScalarValue +from rust::datafusion_expr import ColumnarValue, ScalarFunctionImplementation, ScalarUDF, Volatility, create_udf +from rust::sha1 import Sha1 +from rust::sha2 import Digest +from rust::std::primitive import i64 as RustI64, usize as RustUsize +from rust::std::string import String as RustString +from rust::std::sync import Arc +from rust::url import Url +from rust::xxhash_rust::xxh64 import Xxh64 +from std.encoding.hex import hexlify +from std.json import JsonError, JsonKind, JsonValue + + +pub def register_format_udfs(ctx: SessionContext) -> None: + """Register RFC 022 scalar format functions that DataFusion does not expose natively.""" + # DataFusion stores UDF callbacks as 'static Rust closures. Keep the captured function identity as a literal inside + # each callback instead of borrowing a constructor-local name. + ctx.register_udf(_unary_string_udf("sha1", Arc.from((args) => _apply_unary_string("sha1", args.to_vec())))) + ctx.register_udf(_unary_string_udf("crc32", Arc.from((args) => _apply_unary_string("crc32", args.to_vec())))) + ctx.register_udf(_unary_string_udf("xxhash64", Arc.from((args) => _apply_unary_string("xxhash64", args.to_vec())))) + ctx.register_udf( + _unary_string_udf("url_encode", Arc.from((args) => _apply_unary_string("url_encode", args.to_vec()))), + ) + ctx.register_udf( + _unary_string_udf("url_decode", Arc.from((args) => _apply_unary_string("url_decode", args.to_vec()))), + ) + ctx.register_udf( + _unary_string_udf("try_url_decode", Arc.from((args) => _apply_unary_string("try_url_decode", args.to_vec()))), + ) + ctx.register_udf( + _binary_string_udf("parse_url", Arc.from((args) => _apply_binary_string("parse_url", args.to_vec()))), + ) + ctx.register_udf( + _unary_string_udf("parse_json", Arc.from((args) => _apply_unary_string("parse_json", args.to_vec()))), + ) + ctx.register_udf(_unary_bool_udf("check_json", Arc.from((args) => _apply_unary_bool("check_json", args.to_vec())))) + ctx.register_udf( + _unary_string_udf("schema_of_json", Arc.from((args) => _apply_unary_string("schema_of_json", args.to_vec()))), + ) + ctx.register_udf( + _unary_int_udf("json_array_length", Arc.from((args) => _apply_unary_int("json_array_length", args.to_vec()))), + ) + ctx.register_udf( + _unary_string_udf("json_object_keys", Arc.from((args) => _apply_unary_string("json_object_keys", args.to_vec()))), + ) + ctx.register_udf( + _binary_string_udf("get_json_object", Arc.from((args) => _apply_binary_string("get_json_object", args.to_vec()))), + ) + ctx.register_udf( + _binary_string_udf( + "json_extract_path_text", + Arc.from((args) => _apply_binary_string("json_extract_path_text", args.to_vec())), + ), + ) + ctx.register_udf( + _binary_string_udf("from_json", Arc.from((args) => _apply_binary_string("from_json", args.to_vec()))), + ) + ctx.register_udf( + _binary_string_udf("try_from_json", Arc.from((args) => _apply_binary_string("try_from_json", args.to_vec()))), + ) + ctx.register_udf(_unary_string_udf("to_json", Arc.from((args) => _apply_unary_string("to_json", args.to_vec())))) + ctx.register_udf( + _unary_string_udf("schema_of_csv", Arc.from((args) => _apply_unary_string("schema_of_csv", args.to_vec()))), + ) + ctx.register_udf(_binary_string_udf("from_csv", Arc.from((args) => _apply_binary_string("from_csv", args.to_vec())))) + ctx.register_udf(_unary_string_udf("to_csv", Arc.from((args) => _apply_unary_string("to_csv", args.to_vec())))) + + +def _unary_string_udf(name: str, implementation: ScalarFunctionImplementation) -> ScalarUDF: + """Create one unary string-to-string DataFusion UDF backed by Incan source.""" + return create_udf(name, [DataType.Utf8], DataType.Utf8, Volatility.Immutable, implementation) + + +def _unary_bool_udf(name: str, implementation: ScalarFunctionImplementation) -> ScalarUDF: + """Create one unary string-to-boolean DataFusion UDF backed by Incan source.""" + return create_udf(name, [DataType.Utf8], DataType.Boolean, Volatility.Immutable, implementation) + + +def _unary_int_udf(name: str, implementation: ScalarFunctionImplementation) -> ScalarUDF: + """Create one unary string-to-int DataFusion UDF backed by Incan source.""" + return create_udf(name, [DataType.Utf8], DataType.Int64, Volatility.Immutable, implementation) + + +def _binary_string_udf(name: str, implementation: ScalarFunctionImplementation) -> ScalarUDF: + """Create one binary string-to-string DataFusion UDF backed by Incan source.""" + return create_udf(name, [DataType.Utf8, DataType.Utf8], DataType.Utf8, Volatility.Immutable, implementation) + + +def _apply_unary_string(name: str, args: list[ColumnarValue]) -> Result[ColumnarValue, DataFusionError]: + """Apply one unary string function across scalar and array DataFusion inputs.""" + _require_arity(args, 1)? + if _all_scalar(args): + return Ok(ColumnarValue.Scalar(ScalarValue.Utf8(_eval_unary_string(name, _string_at(args[0].clone(), 0)?)?))) + + row_count = _row_count(args)? + mut values: list[Option[str]] = [] + for idx in range(row_count): + values.append(_eval_unary_string(name, _string_at(args[0].clone(), idx)?)?) + array_ref: ArrayRef = Arc.from(StringArray.from(values)) + return Ok(ColumnarValue.Array(array_ref)) + + +def _apply_unary_bool(name: str, args: list[ColumnarValue]) -> Result[ColumnarValue, DataFusionError]: + """Apply one unary string-to-bool function across scalar and array DataFusion inputs.""" + _require_arity(args, 1)? + if _all_scalar(args): + return Ok(ColumnarValue.Scalar(ScalarValue.Boolean(_eval_unary_bool(name, _string_at(args[0].clone(), 0)?)?))) + + row_count = _row_count(args)? + mut values: list[Option[bool]] = [] + for idx in range(row_count): + values.append(_eval_unary_bool(name, _string_at(args[0].clone(), idx)?)?) + array_ref: ArrayRef = Arc.from(BooleanArray.from(values)) + return Ok(ColumnarValue.Array(array_ref)) + + +def _apply_unary_int(name: str, args: list[ColumnarValue]) -> Result[ColumnarValue, DataFusionError]: + """Apply one unary string-to-int function across scalar and array DataFusion inputs.""" + _require_arity(args, 1)? + if _all_scalar(args): + value = _eval_unary_int(name, _string_at(args[0].clone(), 0)?)? + match value: + Some(number) => return Ok(ColumnarValue.Scalar(ScalarValue.Int64(Some(_int_to_rust_i64(number)?)))) + None => return Ok(ColumnarValue.Scalar(ScalarValue.Int64(None))) + + row_count = _row_count(args)? + mut values: list[Option[int]] = [] + for idx in range(row_count): + values.append(_eval_unary_int(name, _string_at(args[0].clone(), idx)?)?) + array_ref: ArrayRef = Arc.from(Int64Array.from(values)) + return Ok(ColumnarValue.Array(array_ref)) + + +def _apply_binary_string(name: str, args: list[ColumnarValue]) -> Result[ColumnarValue, DataFusionError]: + """Apply one binary string function across scalar and array DataFusion inputs.""" + _require_arity(args, 2)? + if _all_scalar(args): + return Ok( + ColumnarValue.Scalar( + ScalarValue.Utf8( + _eval_binary_string(name, _string_at(args[0].clone(), 0)?, _string_at(args[1].clone(), 0)?)?, + ), + ), + ) + + row_count = _row_count(args)? + mut values: list[Option[str]] = [] + for idx in range(row_count): + values.append(_eval_binary_string(name, _string_at(args[0].clone(), idx)?, _string_at(args[1].clone(), idx)?)?) + array_ref: ArrayRef = Arc.from(StringArray.from(values)) + return Ok(ColumnarValue.Array(array_ref)) + + +def _eval_unary_string(name: str, value: Option[str]) -> Result[Option[str], DataFusionError]: + """Dispatch unary string-returning format functions.""" + match name: + "sha1" => return _hash_sha1(value) + "crc32" => return _hash_crc32(value) + "xxhash64" => return _hash_xxhash64(value) + "url_encode" => return _url_encode(value) + "url_decode" => return _url_decode(value) + "try_url_decode" => return _try_url_decode(value) + "parse_json" => return _parse_json(value) + "schema_of_json" => return _schema_of_json(value) + "json_object_keys" => return _json_object_keys(value) + "to_json" => return _to_json(value) + "schema_of_csv" => return _schema_of_csv(value) + "to_csv" => return _to_csv(value) + _ => return _datafusion_error(f"unknown unary format function `{name}`") + + +def _eval_unary_bool(name: str, value: Option[str]) -> Result[Option[bool], DataFusionError]: + """Dispatch unary boolean-returning format functions.""" + match name: + "check_json" => return _check_json(value) + _ => return _datafusion_error(f"unknown boolean format function `{name}`") + + +def _eval_unary_int(name: str, value: Option[str]) -> Result[Option[int], DataFusionError]: + """Dispatch unary integer-returning format functions.""" + match name: + "json_array_length" => return _json_array_length(value) + _ => return _datafusion_error(f"unknown integer format function `{name}`") + + +def _eval_binary_string(name: str, left: Option[str], right: Option[str]) -> Result[Option[str], DataFusionError]: + """Dispatch binary string-returning format functions.""" + match name: + "parse_url" => return _parse_url(left, right) + "get_json_object" => return _json_path_value(left, right, false) + "json_extract_path_text" => return _json_path_value(left, right, true) + "from_json" => return _from_json(left, right) + "try_from_json" => return _try_from_json(left, right) + "from_csv" => return _from_csv(left, right) + _ => return _datafusion_error(f"unknown binary format function `{name}`") + + +def _hash_sha1(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Hash one optional string with SHA-1.""" + match value: + Some(text) => return Ok(Some(hexlify(_sha1_digest(_utf8_bytes(text))))) + None => return Ok(None) + + +def _hash_crc32(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Hash one optional string with CRC-32.""" + match value: + Some(text) => + mut hasher = Crc32Hasher.new() + hasher.update(_utf8_bytes(text).as_slice()) + return Ok(Some(hexlify(_u32_to_be_bytes(hasher.finalize())))) + None => return Ok(None) + + +def _hash_xxhash64(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Hash one optional string with xxHash64.""" + match value: + Some(text) => return Ok(Some(hexlify(_u64_to_be_bytes(_xxhash64(_utf8_bytes(text)))))) + None => return Ok(None) + + +def _url_encode(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Percent-encode one URL component.""" + match value: + Some(text) => + mut encoded = "" + for byte in _utf8_bytes(text): + if _url_byte_needs_encoding(byte): + encoded += "%" + _hex_byte(byte).upper() + else: + encoded += _byte_to_ascii_text(byte) + return Ok(Some(encoded)) + None => return Ok(None) + + +def _url_decode(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Strictly decode one percent-encoded URL component.""" + decoded = _try_url_decode(value)? + match value: + Some(_) => + match decoded: + Some(_) => return Ok(decoded) + None => return _datafusion_error("invalid percent-encoded URL component") + None => return Ok(None) + + +def _try_url_decode(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Decode one percent-encoded URL component, returning null for malformed input.""" + match value: + Some(text) => + mut out: list[u8] = [] + mut idx = 0 + while idx < len(text): + ch = text[idx:idx + 1] + if ch == "%": + if idx + 2 >= len(text): + return Ok(None) + high = _hex_value(text[idx + 1:idx + 2]) + low = _hex_value(text[idx + 2:idx + 3]) + if high < 0 or low < 0: + return Ok(None) + _append_byte(out, high * 16 + low) + idx += 3 + else: + for byte in _utf8_bytes(ch): + _append_byte(out, byte) + idx += 1 + match RustString.from_utf8(out): + Ok(decoded) => return Ok(Some(decoded)) + Err(_) => return Ok(None) + None => return Ok(None) + + +def _parse_url(value: Option[str], part: Option[str]) -> Result[Option[str], DataFusionError]: + """Extract one URL part with the Rust URL parser through Incan interop.""" + match value: + Some(raw_url) => + match part: + Some(raw_part) => + match Url.parse(raw_url): + Ok(parsed) => + normalized = raw_part.lower() + match normalized: + "scheme" => return Ok(Some(f"{parsed.scheme()}")) + "host" => + match parsed.host_str(): + Some(host) => return Ok(Some(f"{host}")) + None => return Ok(None) + "path" => return Ok(Some(f"{parsed.path()}")) + "query" => + match parsed.query(): + Some(query) => return Ok(Some(f"{query}")) + None => return Ok(None) + "fragment" => + match parsed.fragment(): + Some(fragment) => return Ok(Some(f"{fragment}")) + None => return Ok(None) + "port" => + match parsed.port(): + Some(port) => return Ok(Some(str(port))) + None => return Ok(None) + "username" => return Ok(Some(f"{parsed.username()}")) + "password" => + match parsed.password(): + Some(password) => return Ok(Some(f"{password}")) + None => return Ok(None) + _ => return _datafusion_error(f"unknown parse_url part `{raw_part}`") + Err(err) => return _datafusion_error(err.to_string()) + None => return Ok(None) + None => return Ok(None) + + +def _parse_json(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Validate and normalize one JSON payload string.""" + match value: + Some(text) => + parsed = _parse_json_value(text)? + return Ok(Some(_json_text(parsed)?)) + None => return Ok(None) + + +def _check_json(value: Option[str]) -> Result[Option[bool], DataFusionError]: + """Return whether one optional payload is valid JSON.""" + match value: + Some(text) => + match JsonValue.parse(text): + Ok(_) => return Ok(Some(true)) + Err(_) => return Ok(Some(false)) + None => return Ok(None) + + +def _schema_of_json(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Infer a deterministic schema description for one JSON payload.""" + match value: + Some(text) => return Ok(Some(_json_schema(_parse_json_value(text)?))) + None => return Ok(None) + + +def _json_array_length(value: Option[str]) -> Result[Option[int], DataFusionError]: + """Return the length of a JSON array payload.""" + match value: + Some(text) => + parsed = _parse_json_value(text)? + match parsed.as_array(): + Some(values) => return Ok(Some(len(values))) + None => return Ok(None) + None => return Ok(None) + + +def _json_object_keys(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Return JSON object keys as a JSON array string.""" + match value: + Some(text) => + parsed = _parse_json_value(text)? + mut keys: list[JsonValue] = [] + for key in parsed.keys(): + keys.append(JsonValue.string(key)) + return Ok(Some(_json_text(JsonValue.array(keys))?)) + None => return Ok(None) + + +def _json_path_value(value: Option[str], path: Option[str], text_mode: bool) -> Result[Option[str], DataFusionError]: + """Select one JSON path and return JSON text, with scalar strings unquoted in text mode.""" + match value: + Some(raw_json) => + match path: + Some(raw_path) => + parsed = _parse_json_value(raw_json)? + match _select_json_path(parsed, raw_path)?: + Some(selected) => + if text_mode: + match selected.as_str(): + Some(text) => return Ok(Some(text)) + None => pass + return Ok(Some(_json_text(selected)?)) + None => return Ok(None) + None => return Ok(None) + None => return Ok(None) + + +def _from_json(value: Option[str], _schema: Option[str]) -> Result[Option[str], DataFusionError]: + """Validate and normalize one JSON payload string.""" + return _parse_json(value) + + +def _try_from_json(value: Option[str], _schema: Option[str]) -> Result[Option[str], DataFusionError]: + """Validate JSON, returning null for malformed payloads.""" + match value: + Some(text) => + match _parse_json_value(text): + Ok(parsed) => return Ok(Some(_json_text(parsed)?)) + Err(_) => return Ok(None) + None => return Ok(None) + + +def _to_json(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Serialize a scalar string as JSON text.""" + match value: + Some(text) => return Ok(Some(_json_text(JsonValue.string(text))?)) + None => return Ok(None) + + +def _schema_of_csv(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Infer a deterministic schema description for one CSV row string.""" + match value: + Some(text) => + fields = _csv_fields(text)? + mut specs: list[str] = [] + for idx, item in enumerate(fields): + specs.append(f"_c{idx}: {_scalar_schema(item)}") + return Ok(Some(f"STRUCT<{", ".join(specs)}>")) + None => return Ok(None) + + +def _from_csv(value: Option[str], schema: Option[str]) -> Result[Option[str], DataFusionError]: + """Parse one CSV row string into a JSON array or schema-named object string.""" + match value: + Some(text) => + fields = _csv_fields(text)? + names = _csv_schema_field_names(schema.unwrap_or("")) + if len(names) == 0: + mut values: list[JsonValue] = [] + for field in fields: + values.append(JsonValue.string(field)) + return Ok(Some(_json_text(JsonValue.array(values))?)) + + mut entries: Dict[str, JsonValue] = {} + for idx, name in enumerate(names): + if idx < len(fields): + entries[name] = JsonValue.string(fields[idx]) + else: + entries[name] = JsonValue.string("") + return Ok(Some(_json_text(JsonValue.object(entries))?)) + None => return Ok(None) + + +def _to_csv(value: Option[str]) -> Result[Option[str], DataFusionError]: + """Serialize one scalar or JSON array/object payload as a CSV row.""" + match value: + Some(text) => + match JsonValue.parse(text): + Ok(parsed) => + if let Some(values) = parsed.as_array(): + return Ok(Some(_csv_line([_json_csv_value(item)? for item in values]))) + if let Some(entries) = parsed.as_object(): + mut fields: list[str] = [] + for key in entries.keys(): + fields.append(_json_csv_value(entries[key])?) + return Ok(Some(_csv_line(fields))) + return Ok(Some(_csv_line([_json_csv_value(parsed)?]))) + Err(_) => return Ok(Some(_csv_line([text]))) + None => return Ok(None) + + +def _parse_json_value(text: str) -> Result[JsonValue, DataFusionError]: + """Parse one JSON payload and convert std.json errors to DataFusion errors.""" + match JsonValue.parse(text): + Ok(value) => return Ok(value) + Err(err) => return _json_error(err) + + +def _json_error[T](err: JsonError) -> Result[T, DataFusionError]: + """Convert a std.json error to the DataFusion UDF error channel.""" + return _datafusion_error(err.message()) + + +def _json_text(value: JsonValue) -> Result[str, DataFusionError]: + """Serialize one JSON value and convert std.json errors to DataFusion errors.""" + match value.to_json(): + Ok(text) => return Ok(text) + Err(err) => return _json_error(err) + + +def _json_schema(value: JsonValue) -> str: + """Infer a compact deterministic schema descriptor for one dynamic JSON value.""" + match value.kind(): + JsonKind.Null => return "NULL" + JsonKind.Bool => return "BOOLEAN" + JsonKind.Int => return "BIGINT" + JsonKind.Float => return "DOUBLE" + JsonKind.String => return "STRING" + JsonKind.Array => + match value.as_array(): + Some(values) => + if len(values) == 0: + return "ARRAY" + return f"ARRAY<{_json_schema(values[0])}>" + None => return "ARRAY" + JsonKind.Object => + mut fields: list[str] = [] + for item in value.items(): + key, nested = item + fields.append(f"{key}: {_json_schema(nested)}") + return f"STRUCT<{", ".join(fields)}>" + + +def _select_json_path(value: JsonValue, path: str) -> Result[Option[JsonValue], DataFusionError]: + """Select a small JSONPath subset accepted by the public helper validators.""" + if path == "$" or path == "": + return Ok(Some(value)) + mut current = value + mut remaining = path + if remaining.startswith("$."): + remaining = remaining[2:] + elif remaining.startswith("$["): + remaining = remaining[1:] + elif remaining.startswith("$"): + remaining = remaining[1:] + for segment in remaining.split("."): + if segment == "": + continue + match _select_json_segment(current, segment)?: + Some(selected) => + current = selected + None => return Ok(None) + return Ok(Some(current)) + + +def _select_json_segment(value: JsonValue, segment: str) -> Result[Option[JsonValue], DataFusionError]: + """Select one object field plus optional array index suffixes.""" + mut current = value + mut remaining = segment + if not remaining.startswith("["): + field = _segment_field_name(remaining) + match current.get(field): + Some(selected) => + current = selected + None => return Ok(None) + remaining = remaining[len(field):] + while remaining.startswith("["): + close_idx = _find_closing_bracket(remaining)? + index_text = remaining[1:close_idx] + index = _parse_non_negative_int(index_text)? + match current.as_array(): + Some(values) => + if index >= len(values): + return Ok(None) + current = values[index] + None => return Ok(None) + remaining = remaining[close_idx + 1:] + if remaining != "": + return Ok(None) + return Ok(Some(current)) + + +def _segment_field_name(segment: str) -> str: + """Return the object-field prefix before any array index suffix.""" + mut idx = 0 + while idx < len(segment): + if segment[idx:idx + 1] == "[": + return segment[0:idx] + idx += 1 + return segment + + +def _find_closing_bracket(value: str) -> Result[int, DataFusionError]: + """Find the first closing bracket in one path segment.""" + for idx, ch in enumerate(value): + if ch == "]": + return Ok(idx) + return _datafusion_error(f"invalid JSON path segment `{value}`") + + +def _parse_non_negative_int(value: str) -> Result[int, DataFusionError]: + """Parse a non-negative integer used as a JSON array index.""" + if value == "": + return _datafusion_error("empty JSON array index") + mut out = 0 + for ch in value: + digit = _digit_value(ch) + if digit < 0: + return _datafusion_error(f"invalid JSON array index `{value}`") + out = out * 10 + digit + return Ok(out) + + +def _csv_fields(text: str) -> Result[list[str], DataFusionError]: + """Parse one CSV row with comma delimiter and double-quote escaping.""" + mut fields: list[str] = [] + mut field = "" + mut in_quotes = false + mut idx = 0 + while idx < len(text): + ch = text[idx:idx + 1] + if in_quotes: + if ch == "\"": + if idx + 1 < len(text) and text[idx + 1:idx + 2] == "\"": + field += "\"" + idx += 2 + else: + in_quotes = false + idx += 1 + else: + field += ch + idx += 1 + elif ch == ",": + fields.append(field) + field = str("") + idx += 1 + elif ch == "\"" and field == "": + in_quotes = true + idx += 1 + else: + field += ch + idx += 1 + if in_quotes: + return _datafusion_error("unterminated quoted CSV field") + fields.append(field) + return Ok(fields) + + +def _csv_schema_field_names(schema: str) -> list[str]: + """Extract field names from a compact `STRUCT` schema string.""" + mut trimmed = schema.strip() + if trimmed.startswith("STRUCT<") and trimmed.endswith(">"): + trimmed = trimmed[7:len(trimmed) - 1] + mut names: list[str] = [] + for part in trimmed.split(","): + name = part.strip().split(":")[0].strip() + if name != "": + names.append(name) + return names + + +def _scalar_schema(value: str) -> str: + """Infer a small scalar schema spelling from one CSV field.""" + if _is_integer_text(value): + return "BIGINT" + if _is_float_text(value): + return "DOUBLE" + lower = value.lower() + if lower == "true" or lower == "false": + return "BOOLEAN" + return "STRING" + + +def _is_integer_text(value: str) -> bool: + """Return whether text has an integer spelling.""" + if value == "": + return false + mut start = 0 + if value.startswith("-") or value.startswith("+"): + if len(value) == 1: + return false + start = 1 + for ch in value[start:]: + if _digit_value(ch) < 0: + return false + return true + + +def _is_float_text(value: str) -> bool: + """Return whether text has a simple decimal floating-point spelling.""" + if not value.contains("."): + return false + mut digit_count = 0 + mut start = 0 + if value.startswith("-") or value.startswith("+"): + start = 1 + for ch in value[start:]: + if ch == ".": + pass + elif _digit_value(ch) >= 0: + digit_count += 1 + else: + return false + return digit_count > 0 + + +def _json_csv_value(value: JsonValue) -> Result[str, DataFusionError]: + """Convert one JSON value to its scalar CSV field text.""" + if value.kind() == JsonKind.Null: + return Ok("") + match value.as_str(): + Some(text) => return Ok(text) + None => return Ok(_json_text(value)?) + + +def _csv_line(fields: list[str]) -> str: + """Serialize one CSV row without a trailing newline.""" + return ",".join([_escape_csv_field(field) for field in fields]) + + +def _escape_csv_field(value: str) -> str: + """Escape one CSV field when delimiters, quotes, or newlines require quoting.""" + if not value.contains(",") and not value.contains("\"") and not value.contains("\n") and not value.contains("\r"): + return value + return "\"" + value.replace("\"", "\"\"") + "\"" + + +def _require_arity(args: list[ColumnarValue], expected: int) -> Result[None, DataFusionError]: + """Validate a DataFusion UDF arity.""" + if len(args) == expected: + return Ok(None) + return _datafusion_error(f"format UDF expected {expected} arguments, got {len(args)}") + + +def _all_scalar(args: list[ColumnarValue]) -> bool: + """Return whether all DataFusion arguments are scalar values.""" + for arg in args: + match arg: + ColumnarValue.Array(_) => return false + ColumnarValue.Scalar(_) => pass + return true + + +def _row_count(args: list[ColumnarValue]) -> Result[int, DataFusionError]: + """Return the first array row count, or one for scalar-only arguments.""" + for arg in args: + match arg: + ColumnarValue.Array(array) => return _rust_usize_to_int(array.len()) + ColumnarValue.Scalar(_) => pass + return Ok(1) + + +def _string_at(arg: ColumnarValue, idx: int) -> Result[Option[str], DataFusionError]: + """Read one optional string value from a scalar or array DataFusion value.""" + array_result: Result[ArrayRef, DataFusionError] = arg.to_array(_int_to_rust_usize(idx + 1)?) + array = array_result? + scalar_result: Result[ScalarValue, DataFusionError] = ScalarValue.try_from_array( + array.as_ref(), + _int_to_rust_usize(idx)?, + ) + scalar = scalar_result? + return _scalar_value_string(scalar) + + +def _scalar_value_string(value: ScalarValue) -> Result[Option[str], DataFusionError]: + """Convert one DataFusion scalar to optional text for format-function execution.""" + match value: + ScalarValue.Utf8(text) => return Ok(text) + ScalarValue.LargeUtf8(text) => return Ok(text) + ScalarValue.Utf8View(text) => return Ok(text) + ScalarValue.Null => return Ok(None) + _ => return Ok(Some(value.to_string())) + + +def _utf8_bytes(value: str) -> bytes: + """Encode one Incan string as UTF-8 bytes.""" + return RustString.from(value).into_bytes() + + +def _sha1_digest(data: bytes) -> bytes: + """Return SHA-1 digest bytes.""" + mut hasher = Sha1.default() + Digest.update(hasher, data.as_slice()) + return hasher.finalize_reset().to_vec() + + +def _xxhash64(data: bytes) -> u64: + """Return xxHash64 with the same zero-seed contract as InQL's public helper.""" + mut hasher = Xxh64.default() + hasher.update(data.as_slice()) + return hasher.digest() + + +def _u32_to_be_bytes(value: u32) -> bytes: + """Encode a 32-bit integer as big-endian bytes.""" + number = int(value) + mut out: list[u8] = [] + _append_byte(out, (number // 16777216) % 256) + _append_byte(out, (number // 65536) % 256) + _append_byte(out, (number // 256) % 256) + _append_byte(out, number % 256) + return out + + +def _u64_to_be_bytes(value: u64) -> bytes: + """Encode a 64-bit integer as big-endian bytes.""" + mut out: list[u8] = [] + _append_byte(out, int((value // 72057594037927936) % 256)) + _append_byte(out, int((value // 281474976710656) % 256)) + _append_byte(out, int((value // 1099511627776) % 256)) + _append_byte(out, int((value // 4294967296) % 256)) + _append_byte(out, int((value // 16777216) % 256)) + _append_byte(out, int((value // 65536) % 256)) + _append_byte(out, int((value // 256) % 256)) + _append_byte(out, int(value % 256)) + return out + + +def _append_byte(mut out: list[u8], value: int) -> None: + """Append one wrapped byte to a byte list.""" + byte: u8 = value.wrapping_resize() + out.append(byte) + + +def _hex_byte(value: int) -> str: + """Return a two-character lowercase hex byte.""" + as_int = value + high = as_int // 16 + low = as_int - (high * 16) + return _hex_digit(high) + _hex_digit(low) + + +def _hex_digit(value: int) -> str: + """Return one lowercase hex digit.""" + digits = "0123456789abcdef" + return digits[value:value + 1] + + +def _url_byte_needs_encoding(value: int) -> bool: + """Return whether a byte belongs to the RFC 022 URL component encode set.""" + as_int = value + if as_int < 32 or as_int > 126: + return true + ch = _byte_to_ascii_text(value) + return ch == " " or ch == "\"" or ch == "#" or ch == "%" or ch == "<" or ch == ">" or ch == "?" or ch == "`" or ch == "{" or ch == "}" + + +def _byte_to_ascii_text(value: int) -> str: + """Convert a printable ASCII byte to one-character text.""" + as_int = value + if as_int < 32 or as_int > 126: + return "" + printable = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + return printable[as_int - 32:as_int - 31] + + +def _hex_value(ch: str) -> int: + """Return a hex digit value, or -1 for non-hex input.""" + if ch >= "0" and ch <= "9": + return int(ch) + lower = ch.lower() + if lower == "a": + return 10 + if lower == "b": + return 11 + if lower == "c": + return 12 + if lower == "d": + return 13 + if lower == "e": + return 14 + if lower == "f": + return 15 + return -1 + + +def _digit_value(ch: str) -> int: + """Return a decimal digit value, or -1 for non-digits.""" + if ch >= "0" and ch <= "9": + return int(ch) + return -1 + + +def _rust_usize_to_int(value: RustUsize) -> Result[int, DataFusionError]: + """Convert Rust usize to Incan int through a checked i64 boundary.""" + match RustI64.try_from(value): + Ok(converted) => return Ok(converted.into()) + Err(_) => return _datafusion_error("row count does not fit Incan int") + + +def _int_to_rust_usize(value: int) -> Result[RustUsize, DataFusionError]: + """Convert an Incan int to Rust usize through a checked boundary.""" + match RustUsize.try_from(value): + Ok(converted) => return Ok(converted) + Err(_) => return _datafusion_error(f"index {value} does not fit Rust usize") + + +def _int_to_rust_i64(value: int) -> Result[RustI64, DataFusionError]: + """Convert an Incan int to Rust i64 through a checked boundary.""" + match RustI64.try_from(value): + Ok(converted) => return Ok(converted) + Err(_) => return _datafusion_error(f"value {value} does not fit Rust i64") + + +def _datafusion_error[T](message: str) -> Result[T, DataFusionError]: + """Build one DataFusion execution error.""" + return Err(DataFusionError.Execution(f"{message}")) diff --git a/src/substrait/function_extensions.incn b/src/substrait/function_extensions.incn index d0c0413..f303df7 100644 --- a/src/substrait/function_extensions.incn +++ b/src/substrait/function_extensions.incn @@ -88,6 +88,31 @@ pub const LEAD_FUNCTION_ANCHOR: u32 = 60 pub const FIRST_VALUE_FUNCTION_ANCHOR: u32 = 61 pub const LAST_VALUE_FUNCTION_ANCHOR: u32 = 62 pub const NTH_VALUE_FUNCTION_ANCHOR: u32 = 63 +pub const MD5_FUNCTION_ANCHOR: u32 = 64 +pub const SHA224_FUNCTION_ANCHOR: u32 = 65 +pub const SHA256_FUNCTION_ANCHOR: u32 = 66 +pub const SHA384_FUNCTION_ANCHOR: u32 = 67 +pub const SHA512_FUNCTION_ANCHOR: u32 = 68 +pub const SHA1_FUNCTION_ANCHOR: u32 = 69 +pub const CRC32_FUNCTION_ANCHOR: u32 = 70 +pub const XXHASH64_FUNCTION_ANCHOR: u32 = 71 +pub const URL_ENCODE_FUNCTION_ANCHOR: u32 = 72 +pub const URL_DECODE_FUNCTION_ANCHOR: u32 = 73 +pub const TRY_URL_DECODE_FUNCTION_ANCHOR: u32 = 74 +pub const PARSE_URL_FUNCTION_ANCHOR: u32 = 75 +pub const PARSE_JSON_FUNCTION_ANCHOR: u32 = 76 +pub const CHECK_JSON_FUNCTION_ANCHOR: u32 = 77 +pub const SCHEMA_OF_JSON_FUNCTION_ANCHOR: u32 = 78 +pub const JSON_ARRAY_LENGTH_FUNCTION_ANCHOR: u32 = 79 +pub const JSON_OBJECT_KEYS_FUNCTION_ANCHOR: u32 = 80 +pub const GET_JSON_OBJECT_FUNCTION_ANCHOR: u32 = 81 +pub const JSON_EXTRACT_PATH_TEXT_FUNCTION_ANCHOR: u32 = 82 +pub const FROM_JSON_FUNCTION_ANCHOR: u32 = 83 +pub const TRY_FROM_JSON_FUNCTION_ANCHOR: u32 = 84 +pub const TO_JSON_FUNCTION_ANCHOR: u32 = 85 +pub const SCHEMA_OF_CSV_FUNCTION_ANCHOR: u32 = 86 +pub const FROM_CSV_FUNCTION_ANCHOR: u32 = 87 +pub const TO_CSV_FUNCTION_ANCHOR: u32 = 88 pub const FUNCTION_EXTENSION_URI: str = "https://inql.io/extensions/v0.1/functions.yaml" pub const EXPLODE_EXTENSION_URI: str = "https://inql.io/extensions/v0.1/unnest.yaml#explode" pub const EXPLODE_OUTER_EXTENSION_URI: str = "https://inql.io/extensions/v0.1/unnest.yaml#explode_outer" diff --git a/tests/test_format_functions.incn b/tests/test_format_functions.incn new file mode 100644 index 0000000..15b3033 --- /dev/null +++ b/tests/test_format_functions.incn @@ -0,0 +1,29 @@ +"""Test: RFC 022 scalar format helper validation.""" + +from std.testing import assert_raises +from functions import col, get_json_object, json_extract_path_text, parse_url + + +def _call_parse_url_with_unknown_part() -> None: + """Call parse_url with an unsupported literal part for ValueError assertions.""" + parse_url(col("landing_page"), "authority") + return + + +def _call_get_json_object_with_invalid_path() -> None: + """Call get_json_object with an unsupported literal path for ValueError assertions.""" + get_json_object(col("payload"), "type") + return + + +def _call_json_extract_path_text_with_invalid_path() -> None: + """Call json_extract_path_text with an unsupported literal path for ValueError assertions.""" + json_extract_path_text(col("payload"), "") + return + + +def test_format_functions__literal_parse_options_are_validated_early() -> None: + # -- Arrange / Act / Assert -- + assert_raises[ValueError](_call_parse_url_with_unknown_part) + assert_raises[ValueError](_call_get_json_object_with_invalid_path) + assert_raises[ValueError](_call_json_extract_path_text_with_invalid_path) diff --git a/tests/test_function_registry.incn b/tests/test_function_registry.incn index 0b06361..723a805 100644 --- a/tests/test_function_registry.incn +++ b/tests/test_function_registry.incn @@ -32,6 +32,7 @@ from functions import ( case_when, cast, ceil, + check_json, col, coalesce, cardinality, @@ -39,6 +40,7 @@ from functions import ( count_distinct, count_expr, count_if, + crc32, cume_dist, desc, desc_nulls_first, @@ -55,6 +57,8 @@ from functions import ( floor, float_expr, following, + from_csv, + from_json, function_registry_canonical_names, function_registry_entries, function_registry_entry, @@ -63,6 +67,7 @@ from functions import ( function_registry_function_refs, gt, gte, + get_json_object, in_, int_expr, int_lit, @@ -71,6 +76,9 @@ from functions import ( is_not_nan, is_not_null, is_null, + json_array_length, + json_extract_path_text, + json_object_keys, inline, inline_outer, lag, @@ -86,6 +94,7 @@ from functions import ( map_keys, map_values, max, + md5, min, modulo, mul, @@ -105,15 +114,32 @@ from functions import ( rank, round, row_number, + parse_json, + parse_url, + schema_of_csv, + schema_of_json, + sha1, + sha2, + sha224, + sha256, + sha384, + sha512, str_expr, str_lit, sub, sum, stack, try_cast, + try_from_json, + try_url_decode, + to_csv, + to_json, + url_decode, + url_encode, unbounded_following, unbounded_preceding, window, + xxhash64, ) from function_registry import ( CORE_FUNCTION_NAMESPACE, @@ -158,20 +184,28 @@ from substrait.function_extensions import ( BETWEEN_FUNCTION_ANCHOR, CARDINALITY_FUNCTION_ANCHOR, CEIL_FUNCTION_ANCHOR, + CHECK_JSON_FUNCTION_ANCHOR, COALESCE_FUNCTION_ANCHOR, COUNT_FUNCTION_ANCHOR, + CRC32_FUNCTION_ANCHOR, CUME_DIST_FUNCTION_ANCHOR, DENSE_RANK_FUNCTION_ANCHOR, DIVIDE_FUNCTION_ANCHOR, EQUAL_FUNCTION_ANCHOR, FIRST_VALUE_FUNCTION_ANCHOR, FLOOR_FUNCTION_ANCHOR, + FROM_CSV_FUNCTION_ANCHOR, + FROM_JSON_FUNCTION_ANCHOR, + GET_JSON_OBJECT_FUNCTION_ANCHOR, GT_FUNCTION_ANCHOR, GTE_FUNCTION_ANCHOR, IS_NAN_FUNCTION_ANCHOR, IS_NOT_DISTINCT_FROM_FUNCTION_ANCHOR, IS_NOT_NULL_FUNCTION_ANCHOR, IS_NULL_FUNCTION_ANCHOR, + JSON_ARRAY_LENGTH_FUNCTION_ANCHOR, + JSON_EXTRACT_PATH_TEXT_FUNCTION_ANCHOR, + JSON_OBJECT_KEYS_FUNCTION_ANCHOR, LT_FUNCTION_ANCHOR, LTE_FUNCTION_ANCHOR, LAG_FUNCTION_ANCHOR, @@ -184,6 +218,7 @@ from substrait.function_extensions import ( MAP_KEYS_FUNCTION_ANCHOR, MAP_VALUES_FUNCTION_ANCHOR, MAX_FUNCTION_ANCHOR, + MD5_FUNCTION_ANCHOR, MIN_FUNCTION_ANCHOR, MODULUS_FUNCTION_ANCHOR, MULTIPLY_FUNCTION_ANCHOR, @@ -200,8 +235,24 @@ from substrait.function_extensions import ( ROW_NUMBER_FUNCTION_ANCHOR, ROUND_FUNCTION_ANCHOR, STACK_EXTENSION_URI, + PARSE_JSON_FUNCTION_ANCHOR, + PARSE_URL_FUNCTION_ANCHOR, + SCHEMA_OF_CSV_FUNCTION_ANCHOR, + SCHEMA_OF_JSON_FUNCTION_ANCHOR, + SHA1_FUNCTION_ANCHOR, + SHA224_FUNCTION_ANCHOR, + SHA256_FUNCTION_ANCHOR, + SHA384_FUNCTION_ANCHOR, + SHA512_FUNCTION_ANCHOR, SUBTRACT_FUNCTION_ANCHOR, SUM_FUNCTION_ANCHOR, + TO_CSV_FUNCTION_ANCHOR, + TO_JSON_FUNCTION_ANCHOR, + TRY_FROM_JSON_FUNCTION_ANCHOR, + TRY_URL_DECODE_FUNCTION_ANCHOR, + URL_DECODE_FUNCTION_ANCHOR, + URL_ENCODE_FUNCTION_ANCHOR, + XXHASH64_FUNCTION_ANCHOR, EXPLODE_EXTENSION_URI, EXPLODE_OUTER_EXTENSION_URI, FLATTEN_EXTENSION_URI, @@ -269,12 +320,12 @@ def _local_entry_by_namespace_and_name_or_fail( def _expected_registry_names() -> list[str]: """Return the expected registered public helper names.""" - return ["col", "lit", "sum", "count", "count_expr", "count_distinct", "count_if", "avg", "min", "max", "int_expr", "float_expr", "str_expr", "bool_expr", "add", "mul", "int_lit", "str_lit", "bool_lit", "always_true", "always_false", "eq", "gt", "cast", "try_cast", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "is_not_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "case_when", "in_", "between", "asc", "desc", "asc_nulls_first", "asc_nulls_last", "desc_nulls_first", "desc_nulls_last", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "range", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_contains_key", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct", "explode", "explode_outer", "posexplode", "posexplode_outer", "inline", "inline_outer", "flatten", "stack", "window", "unbounded_preceding", "preceding", "current_row", "following", "unbounded_following", "row_number", "rank", "dense_rank", "percent_rank", "cume_dist", "ntile", "lag", "lead", "first_value", "last_value", "nth_value"] + return ["col", "lit", "sum", "count", "count_expr", "count_distinct", "count_if", "avg", "min", "max", "int_expr", "float_expr", "str_expr", "bool_expr", "add", "mul", "int_lit", "str_lit", "bool_lit", "always_true", "always_false", "eq", "gt", "cast", "try_cast", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "is_not_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "case_when", "in_", "between", "asc", "desc", "asc_nulls_first", "asc_nulls_last", "desc_nulls_first", "desc_nulls_last", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "range", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_contains_key", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct", "explode", "explode_outer", "posexplode", "posexplode_outer", "inline", "inline_outer", "flatten", "stack", "window", "unbounded_preceding", "preceding", "current_row", "following", "unbounded_following", "row_number", "rank", "dense_rank", "percent_rank", "cume_dist", "ntile", "lag", "lead", "first_value", "last_value", "nth_value", "sha224", "sha256", "sha384", "sha512", "sha2", "md5", "sha1", "crc32", "xxhash64", "parse_url", "try_url_decode", "url_decode", "url_encode", "check_json", "from_json", "get_json_object", "json_array_length", "json_extract_path_text", "json_object_keys", "parse_json", "schema_of_json", "to_json", "try_from_json", "from_csv", "schema_of_csv", "to_csv"] def _expected_substrait_mapped_names() -> list[str]: """Return helpers with concrete Substrait extension-function mappings.""" - return ["sum", "count", "avg", "min", "max", "add", "mul", "eq", "gt", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "between", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "range", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct", "row_number", "rank", "dense_rank", "percent_rank", "cume_dist", "ntile", "lag", "lead", "first_value", "last_value", "nth_value"] + return ["sum", "count", "avg", "min", "max", "add", "mul", "eq", "gt", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "between", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "range", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct", "row_number", "rank", "dense_rank", "percent_rank", "cume_dist", "ntile", "lag", "lead", "first_value", "last_value", "nth_value", "sha224", "sha256", "sha384", "sha512", "md5", "sha1", "crc32", "xxhash64", "parse_url", "try_url_decode", "url_decode", "url_encode", "check_json", "from_json", "get_json_object", "json_array_length", "json_extract_path_text", "json_object_keys", "parse_json", "schema_of_json", "to_json", "try_from_json", "from_csv", "schema_of_csv", "to_csv"] def _exercise_current_public_helpers() -> None: @@ -384,6 +435,32 @@ def _exercise_current_public_helpers() -> None: first_value(amount) last_value(amount) nth_value(amount, 2) + sha224(status) + sha256(status) + sha384(status) + sha512(status) + sha2(status, 256) + md5(status) + sha1(status) + crc32(status) + xxhash64(status) + parse_url(status, "host") + url_encode(status) + url_decode(status) + try_url_decode(status) + parse_json(status) + check_json(status) + schema_of_json(status) + get_json_object(status, "$.type") + json_extract_path_text(status, "$.type") + json_array_length(status) + json_object_keys(status) + from_json(status, "STRUCT") + try_from_json(status, "STRUCT") + to_json(status) + schema_of_csv(status) + from_csv(status, "STRUCT") + to_csv(status) return @@ -522,7 +599,7 @@ def test_function_registry__core_helpers_expose_portable_policy_metadata() -> No # -- Act / Assert -- for entry in entries: assert entry.namespace == CORE_FUNCTION_NAMESPACE, f"{entry.function_ref} should live in the core function namespace" - if entry.canonical_name == "count_expr" or entry.canonical_name == "count_distinct" or entry.canonical_name == "count_if": + if entry.canonical_name == "count_expr" or entry.canonical_name == "count_distinct" or entry.canonical_name == "count_if" or entry.canonical_name == "sha2": assert entry.policy_category == FunctionPolicyCategory.CompatibilityAlias, f"{entry.canonical_name} should be marked as a compatibility helper" assert entry.alias_policy == FunctionAliasPolicy.OptInCompatibility, "compatibility helpers should be opt-in by policy" continue @@ -703,6 +780,31 @@ def test_function_registry__substrait_extension_mappings_are_structured() -> Non _assert_extension_mapping("map_keys", "map_keys", MAP_KEYS_FUNCTION_ANCHOR) _assert_extension_mapping("map_values", "map_values", MAP_VALUES_FUNCTION_ANCHOR) _assert_extension_mapping("named_struct", "named_struct", NAMED_STRUCT_FUNCTION_ANCHOR) + _assert_extension_mapping("sha224", "sha224", SHA224_FUNCTION_ANCHOR) + _assert_extension_mapping("sha256", "sha256", SHA256_FUNCTION_ANCHOR) + _assert_extension_mapping("sha384", "sha384", SHA384_FUNCTION_ANCHOR) + _assert_extension_mapping("sha512", "sha512", SHA512_FUNCTION_ANCHOR) + _assert_extension_mapping("md5", "md5", MD5_FUNCTION_ANCHOR) + _assert_extension_mapping("sha1", "sha1", SHA1_FUNCTION_ANCHOR) + _assert_extension_mapping("crc32", "crc32", CRC32_FUNCTION_ANCHOR) + _assert_extension_mapping("xxhash64", "xxhash64", XXHASH64_FUNCTION_ANCHOR) + _assert_extension_mapping("parse_url", "parse_url", PARSE_URL_FUNCTION_ANCHOR) + _assert_extension_mapping("url_encode", "url_encode", URL_ENCODE_FUNCTION_ANCHOR) + _assert_extension_mapping("url_decode", "url_decode", URL_DECODE_FUNCTION_ANCHOR) + _assert_extension_mapping("try_url_decode", "try_url_decode", TRY_URL_DECODE_FUNCTION_ANCHOR) + _assert_extension_mapping("parse_json", "parse_json", PARSE_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("check_json", "check_json", CHECK_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("schema_of_json", "schema_of_json", SCHEMA_OF_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("json_array_length", "json_array_length", JSON_ARRAY_LENGTH_FUNCTION_ANCHOR) + _assert_extension_mapping("json_object_keys", "json_object_keys", JSON_OBJECT_KEYS_FUNCTION_ANCHOR) + _assert_extension_mapping("get_json_object", "get_json_object", GET_JSON_OBJECT_FUNCTION_ANCHOR) + _assert_extension_mapping("json_extract_path_text", "json_extract_path_text", JSON_EXTRACT_PATH_TEXT_FUNCTION_ANCHOR) + _assert_extension_mapping("from_json", "from_json", FROM_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("try_from_json", "try_from_json", TRY_FROM_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("to_json", "to_json", TO_JSON_FUNCTION_ANCHOR) + _assert_extension_mapping("schema_of_csv", "schema_of_csv", SCHEMA_OF_CSV_FUNCTION_ANCHOR) + _assert_extension_mapping("from_csv", "from_csv", FROM_CSV_FUNCTION_ANCHOR) + _assert_extension_mapping("to_csv", "to_csv", TO_CSV_FUNCTION_ANCHOR) def test_function_registry__generator_helpers_are_relation_extensions() -> None: @@ -799,6 +901,10 @@ def test_function_registry__rewrite_mappings_identify_non_extension_helpers() -> assert always_false_entry.substrait.kind == SubstraitMappingKind.Rewrite, "always_false should lower as a literal rewrite" _assert_rewrite_mapping("is_not_nan", "not_(is_nan(expr))") _assert_rewrite_mapping("map_contains_key", "gt(cardinality(map_extract(map_expr, key)), int_expr(0))") + _assert_rewrite_mapping( + "sha2", + "sha2(expr, bits) -> sha224/sha256/sha384/sha512(expr) for supported literal bit lengths", + ) assert always_true_entry.null_behavior == FunctionNullBehavior.Predicate, "predicate helpers should expose predicate null behavior" assert always_false_entry.null_behavior == FunctionNullBehavior.Predicate, "predicate helpers should expose predicate null behavior" @@ -832,6 +938,18 @@ def test_function_registry__public_helpers_preserve_existing_behavior() -> None: status, [str_lit("paid"), str_lit("open")], ), lt(amount, int_lit(10)), lte(amount, int_lit(10)), modulo(amount, lit(2)), round(amount)] + hash_exprs = [md5(status), sha2(status, 256), sha224(status), sha256(status), sha384(status), sha512(status), sha1( + status, + ), crc32(status), xxhash64(status)] + format_exprs = [parse_url(status, "host"), url_encode(status), url_decode(status), try_url_decode(status), parse_json( + status, + ), check_json(status), schema_of_json(status), get_json_object(status, "$.type"), json_extract_path_text( + status, + "$.type", + ), json_array_length(status), json_object_keys(status), from_json(status, "STRUCT"), try_from_json( + status, + "STRUCT", + ), to_json(status), schema_of_csv(status), from_csv(status, "STRUCT"), to_csv(status)] # -- Assert -- assert column_expr_kind(amount) == ColumnExprKind.Column, "col should still build a column reference" @@ -856,5 +974,9 @@ def test_function_registry__public_helpers_preserve_existing_behavior() -> None: assert column_expr_kind(gt_expr) == ColumnExprKind.ScalarFunction, "gt should use the shared scalar function kind" for core_expr in core_exprs: assert column_expr_kind(core_expr) != ColumnExprKind.Column, "core scalar helpers should build scalar expressions" + for hash_expr in hash_exprs: + assert column_expr_kind(hash_expr) == ColumnExprKind.ScalarFunction, "hash helpers should build scalar expressions" + for format_expr in format_exprs: + assert column_expr_kind(format_expr) == ColumnExprKind.ScalarFunction, "format helpers should build scalar expressions" assert column_expr_kind(always_true()) == ColumnExprKind.BoolLiteral, "always_true should still build a bool literal" assert column_expr_kind(always_false()) == ColumnExprKind.BoolLiteral, "always_false should still build a bool literal" diff --git a/tests/test_hashing_functions.incn b/tests/test_hashing_functions.incn new file mode 100644 index 0000000..7978b2e --- /dev/null +++ b/tests/test_hashing_functions.incn @@ -0,0 +1,58 @@ +"""Test: RFC 022 hashing helper surface.""" + +from std.testing import assert_raises +from functions import col, crc32, md5, sha1, sha2, sha224, sha256, sha384, sha512, xxhash64 +from function_registry import function_ref_for +from projection_builders import ( + ColumnExpr, + ColumnExprKind, + column_expr_argument_count, + column_expr_function_name, + column_expr_function_ref, + column_expr_kind, +) + + +def _assert_hash_application(expr: ColumnExpr, expected_name: str) -> None: + """Assert one hashing helper builds a registry-backed scalar application.""" + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction, f"{expected_name} should use scalar application nodes" + assert column_expr_function_name(expr) == expected_name, f"{expected_name} should preserve its canonical name" + assert column_expr_function_ref(expr) == function_ref_for(expected_name), f"{expected_name} should preserve its function ref" + assert column_expr_argument_count(expr) == 1, f"{expected_name} should carry one string input expression" + + +def _call_sha2_with_unsupported_length() -> None: + """Call sha2 with an unsupported digest length for ValueError assertions.""" + sha2(col("payload"), 1) + return + + +def test_hashing_functions__concrete_helpers_share_scalar_application_node() -> None: + # -- Arrange -- + payload = col("payload") + + # -- Act / Assert -- + _assert_hash_application(md5(payload), "md5") + _assert_hash_application(sha224(payload), "sha224") + _assert_hash_application(sha256(payload), "sha256") + _assert_hash_application(sha384(payload), "sha384") + _assert_hash_application(sha512(payload), "sha512") + _assert_hash_application(sha1(payload), "sha1") + _assert_hash_application(crc32(payload), "crc32") + _assert_hash_application(xxhash64(payload), "xxhash64") + + +def test_hashing_functions__sha2_rewrites_to_concrete_sha2_helpers() -> None: + # -- Arrange -- + payload = col("payload") + + # -- Act / Assert -- + _assert_hash_application(sha2(payload, 224), "sha224") + _assert_hash_application(sha2(payload, 256), "sha256") + _assert_hash_application(sha2(payload, 384), "sha384") + _assert_hash_application(sha2(payload, 512), "sha512") + + +def test_hashing_functions__sha2_rejects_unsupported_digest_lengths() -> None: + # -- Arrange / Act / Assert -- + assert_raises[ValueError](_call_sha2_with_unsupported_length) diff --git a/tests/test_session_projection.incn b/tests/test_session_projection.incn index ee91282..d22957f 100644 --- a/tests/test_session_projection.incn +++ b/tests/test_session_projection.incn @@ -11,20 +11,45 @@ from functions import ( case_when, cast, ceil, + check_json, coalesce, col, + crc32, desc, div, floor, + from_csv, + from_json, + get_json_object, gt, + json_array_length, + json_extract_path_text, + json_object_keys, lit, + md5, modulo, mul, neg, nullif, + parse_json, + parse_url, round, + schema_of_csv, + schema_of_json, + sha1, + sha2, + sha224, + sha384, + sha512, sub, + to_csv, + to_json, + try_from_json, + try_url_decode, try_cast, + url_decode, + url_encode, + xxhash64, cardinality, element_at, ) @@ -208,6 +233,115 @@ def test_session_projection__collect_executes_common_math_scalar_projection_func assert payload.contains("3"), "round projection should include round(10 / 4.0)" +def test_session_projection__collect_executes_format_hashing_projection_functions() -> None: + """collect should execute RFC 022 hashing helpers through DataFusion.""" + # -- Arrange -- + mut session = Session.default() + + # -- Act -- + lazy: LazyFrame[AggregateOrder] = assert_is_ok( + session.read_csv("aggregate_orders", AGGREGATE_ORDERS_CSV_FIXTURE), + "aggregate orders fixture should load", + ) + projected = lazy.with_column("md5_abc", md5(lit("abc"))).with_column("sha1_abc", sha1(lit("abc"))).with_column( + "crc32_abc", + crc32(lit("abc")), + ).with_column("xxhash64_abc", xxhash64(lit("abc"))).with_column("sha224_abc", sha224(lit("abc"))).with_column( + "sha2_256_abc", + sha2(lit("abc"), 256), + ).with_column("sha384_abc", sha384(lit("abc"))).with_column("sha512_abc", sha512(lit("abc"))) + df = _collect_or_fail(session, projected) + payload = df.preview_text() + resolved = df.resolved_columns() + + # -- Assert -- + assert df.row_count() == 3, "hashing projections should preserve the input rows" + assert len(resolved) == 10, "projection should expose all appended hash outputs" + assert payload.contains("md5_abc"), "md5 projection should materialize its alias" + assert payload.contains("sha2_256_abc"), "sha2 compatibility projection should materialize its alias" + assert payload.contains("900150983cd24fb0d6963f7d28e17f72"), "md5 should return the lowercase hex digest for abc" + assert payload.contains("a9993e364706816aba3e25717850c26c9cd0d89d"), "sha1 should return the lowercase hex digest for abc" + assert payload.contains("352441c2"), "crc32 should return the lowercase hex digest for abc" + assert payload.contains("44bc2cf5ad770999"), "xxhash64 should return the lowercase hex digest for abc" + assert payload.contains("23097d223405d8228642a477bda255b32aadbce4bda0b3f7e36c9da7"), "sha224 should return the lowercase hex digest for abc" + assert payload.contains("ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"), "sha2(..., 256) should rewrite to sha256" + assert payload.contains( + "cb00753f45a35e8bb5a03d699ac65007272c32ab0eded1631a8b605a43ff5bed8086072ba1e7cc2358baeca134c825a7", + ), "sha384 should return the lowercase hex digest for abc" + assert payload.contains( + "ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f", + ), "sha512 should return the lowercase hex digest for abc" + + +def test_session_projection__collect_executes_json_url_and_csv_format_functions() -> None: + """collect should execute RFC 022 scalar format helpers through Incan-authored DataFusion callbacks.""" + # -- Arrange -- + mut session = Session.default() + + # -- Act -- + lazy: LazyFrame[AggregateOrder] = assert_is_ok( + session.read_csv("aggregate_orders_format", AGGREGATE_ORDERS_CSV_FIXTURE), + "aggregate orders fixture should load", + ) + json_payload = lit("{\"type\":\"click\",\"tags\":[\"paid\",\"web\"],\"user\":{\"id\":7}}") + csv_payload = lit("42,paid") + projected = lazy.with_column("json_valid", check_json(json_payload)).with_column( + "json_type", + json_extract_path_text(json_payload, "$.type"), + ).with_column("json_object", get_json_object(json_payload, "$.user")).with_column( + "json_tags", + json_array_length(lit("[\"paid\",\"web\"]")), + ).with_column("json_keys", json_object_keys(json_payload)).with_column("json_schema", schema_of_json(json_payload)).with_column( + "json_normalized", + parse_json(json_payload), + ).with_column("json_from", from_json(json_payload, "STRUCT")).with_column( + "json_try_invalid", + try_from_json(lit("{"), "STRUCT"), + ).with_column("json_string", to_json(lit("paid"))).with_column( + "url_host", + parse_url(lit("https://example.com/orders?id=7#top"), "host"), + ).with_column("url_encoded", url_encode(lit("a b"))).with_column("url_decoded", url_decode(lit("a%20b"))).with_column( + "url_try_invalid", + try_url_decode(lit("%zz")), + ).with_column("csv_schema", schema_of_csv(csv_payload)).with_column( + "csv_json", + from_csv(csv_payload, "STRUCT"), + ).with_column("csv_line", to_csv(lit("[\"42\",\"paid\"]"))) + df = _collect_or_fail(session, projected) + payload = df.preview_text() + resolved = df.resolved_columns() + + # -- Assert -- + assert df.row_count() == 3, "format projections should preserve input rows" + assert len(resolved) == 19, "projection should expose all appended format outputs" + assert payload.contains("click"), "JSON path extraction should return scalar text" + assert payload.contains("{\"id\":7}"), "get_json_object should return nested JSON text" + assert payload.contains("example.com"), "parse_url should extract the host" + assert payload.contains("a%20b"), "url_encode should percent-encode spaces" + assert payload.contains("a b"), "url_decode should decode valid percent escapes" + assert payload.contains("STRUCT<"), "schema helpers should emit deterministic schema text" + assert payload.contains("{\"id\":\"42\",\"status\":\"paid\"}"), "from_csv should use schema field names" + assert payload.contains("42,paid"), "to_csv should serialize JSON arrays as CSV rows" + + +def test_session_projection__collect_reports_strict_format_decode_errors() -> None: + """collect should surface strict format parsing failures as backend execution errors.""" + # -- Arrange -- + mut session = Session.default() + + # -- Act -- + lazy: LazyFrame[AggregateOrder] = assert_is_ok( + session.read_csv("aggregate_orders_bad_format", AGGREGATE_ORDERS_CSV_FIXTURE), + "aggregate orders fixture should load", + ) + projected = lazy.with_column("bad_url", url_decode(lit("%zz"))) + err = assert_is_err(session.collect(projected), "expected strict URL decoding to fail") + + # -- Assert -- + assert err.kind == SessionErrorKind.BackendExecutionError, err.error_message() + assert err.message.contains("invalid percent-encoded URL component"), err.error_message() + + def test_session_projection__collect_executes_nested_scalar_projection_functions() -> None: """collect should execute RFC 020 nested scalar helpers through DataFusion.""" # -- Arrange -- diff --git a/tests/test_substrait_plan.incn b/tests/test_substrait_plan.incn index ff97d78..56cf70f 100644 --- a/tests/test_substrait_plan.incn +++ b/tests/test_substrait_plan.incn @@ -26,12 +26,14 @@ from functions import ( case_when, cast, ceil, + check_json, col, cume_dist, coalesce, count, count_distinct, count_if, + crc32, dense_rank, desc, div, @@ -39,6 +41,9 @@ from functions import ( equal_null, floor, first_value, + from_csv, + from_json, + get_json_object, gt, gte, in_, @@ -46,6 +51,9 @@ from functions import ( is_not_nan, is_not_null, is_null, + json_array_length, + json_extract_path_text, + json_object_keys, lit, lag, last_value, @@ -59,6 +67,7 @@ from functions import ( map_keys, map_values, max, + md5, min, modulo, mul, @@ -70,13 +79,30 @@ from functions import ( nullif, ntile, or_, + parse_json, + parse_url, percent_rank, rank, round, row_number, + schema_of_csv, + schema_of_json, + sha1, + sha2, + sha224, + sha256, + sha384, + sha512, sub, sum, + to_csv, + to_json, try_cast, + try_from_json, + try_url_decode, + url_decode, + url_encode, + xxhash64, cardinality, element_at, explode, @@ -454,6 +480,32 @@ def test_plan__core_scalar_extension_mappings_lower_to_substrait() -> None: _assert_scalar_expr_lowers(ceil(div(col("amount"), lit(4.0)))) _assert_scalar_expr_lowers(floor(div(col("amount"), lit(4.0)))) _assert_scalar_expr_lowers(round(div(col("amount"), lit(4.0)))) + _assert_scalar_expr_lowers(md5(col("status"))) + _assert_scalar_expr_lowers(sha224(col("status"))) + _assert_scalar_expr_lowers(sha256(col("status"))) + _assert_scalar_expr_lowers(sha384(col("status"))) + _assert_scalar_expr_lowers(sha512(col("status"))) + _assert_scalar_expr_lowers(sha2(col("status"), 256)) + _assert_scalar_expr_lowers(sha1(col("status"))) + _assert_scalar_expr_lowers(crc32(col("status"))) + _assert_scalar_expr_lowers(xxhash64(col("status"))) + _assert_scalar_expr_lowers(parse_url(col("status"), "host")) + _assert_scalar_expr_lowers(url_encode(col("status"))) + _assert_scalar_expr_lowers(url_decode(col("status"))) + _assert_scalar_expr_lowers(try_url_decode(col("status"))) + _assert_scalar_expr_lowers(parse_json(col("status"))) + _assert_scalar_expr_lowers(check_json(col("status"))) + _assert_scalar_expr_lowers(schema_of_json(col("status"))) + _assert_scalar_expr_lowers(get_json_object(col("status"), "$.type")) + _assert_scalar_expr_lowers(json_extract_path_text(col("status"), "$.type")) + _assert_scalar_expr_lowers(json_array_length(col("status"))) + _assert_scalar_expr_lowers(json_object_keys(col("status"))) + _assert_scalar_expr_lowers(from_json(col("status"), "STRUCT")) + _assert_scalar_expr_lowers(try_from_json(col("status"), "STRUCT")) + _assert_scalar_expr_lowers(to_json(col("status"))) + _assert_scalar_expr_lowers(schema_of_csv(col("status"))) + _assert_scalar_expr_lowers(from_csv(col("status"), "STRUCT")) + _assert_scalar_expr_lowers(to_csv(col("status"))) def test_plan__nested_scalar_extension_mappings_lower_to_substrait() -> None: