diff --git a/.github/workflows/fuzz-long.yml b/.github/workflows/fuzz-long.yml new file mode 100644 index 0000000..4da17ba --- /dev/null +++ b/.github/workflows/fuzz-long.yml @@ -0,0 +1,28 @@ +name: Long Fuzz (reusable) + +on: + workflow_call: {} + +jobs: + fuzz: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download goldens artifact + uses: actions/download-artifact@v3 + with: + name: goldens + path: artifact/golden + + - name: Setup runtime + uses: ./.github/actions/ci-setup + with: + otp-version: '28.1' + rebar3-version: '3.25.1' + gleam-version: '1.13.0' + + - name: Run long fuzz harness + run: | + set -eux + gleam run -m thrifty/fuzz_cli diff --git a/.github/workflows/goldens.yml b/.github/workflows/goldens.yml new file mode 100644 index 0000000..6c755f8 --- /dev/null +++ b/.github/workflows/goldens.yml @@ -0,0 +1,59 @@ +name: Generate and Validate Goldens + +on: + workflow_dispatch: + inputs: + regen: + description: 'Regenerate goldens' + required: false + type: boolean + push: + branches: [ main, master ] + +jobs: + goldens: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup runtime + uses: ./.github/actions/ci-setup + with: + otp-version: '28.1' + rebar3-version: '3.25.1' + gleam-version: '1.13.0' + + - name: Cache pip + uses: actions/cache@v4 + with: + path: | + ~/.cache/pip + .venv + key: ${{ runner.os }}-pip-${{ hashFiles('tools/golden/generate_compact_vectors.py') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Set up Python + run: | + set -eux + python3 -m venv .venv + . .venv/bin/activate + pip install --upgrade pip + pip install thriftpy2 + + - name: Generate goldens + run: | + set -eux + . .venv/bin/activate + python3 tools/golden/generate_compact_vectors.py + + - name: Run golden validation (gleam tests) + run: | + set -eux + gleam test + + - name: Upload goldens + uses: actions/upload-artifact@v4 + with: + name: goldens + path: artifact/golden diff --git a/.github/workflows/publish-gleam.yml b/.github/workflows/publish-gleam.yml deleted file mode 100644 index b73f5f0..0000000 --- a/.github/workflows/publish-gleam.yml +++ /dev/null @@ -1,172 +0,0 @@ -name: Publish (Gleam) - -# Publish package using `gleam publish` when a tag matching v* is pushed. -# Steps: -# - checkout (full history) -# - setup OTP / Gleam -# - build & test -# - verify the tag version matches `gleam.toml` -# - verify HEX API key via HTTP -# - publish with `gleam publish --yes` (non-interactive) - -on: - push: - tags: - - 'v*' - pull_request: - types: [closed] - branches: [main] - -permissions: - contents: write - packages: write - id-token: write - -jobs: - publish: - name: Release to Hex - runs-on: ubuntu-latest - if: >- - contains(github.ref, 'refs/tags/') || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup runtime - uses: ./.github/actions/ci-setup - with: - otp-version: '28.1' - gleam-version: '1.13.0' - - - name: Show versions - run: | - erl -version || true - gleam --version || true - - - name: Cache Gleam deps - uses: actions/cache@v4 - with: - path: | - ~/.cache/gleam - ~/.gleam - ./_gleam_deps - ./build - key: ${{ runner.os }}-gleam-1.13.0-otp28-publish-${{ hashFiles('**/gleam.toml') }} - restore-keys: | - ${{ runner.os }}-gleam-1.13.0-otp28-publish- - - - name: Install dependencies - run: | - set -eux - gleam deps download - - - name: Build - run: | - set -eux - gleam build - - - name: Run tests (sanity) - run: | - set -eux - gleam test - - - name: Determine version - id: ver - run: | - set -euo pipefail - # If this run was triggered by a tag push, derive version from the tag. - # If triggered by a merged PR (no tag yet), fall back to the version in gleam.toml. - if echo "${GITHUB_REF}" | grep -q '^refs/tags/'; then - TAG=${GITHUB_REF#refs/tags/} - VERSION=${TAG#v} - echo "Determined version ${VERSION} from tag ${TAG}" - else - VERSION=$(grep '^version' gleam.toml | sed -E 's/.*= *"([^\"]+)".*/\1/') || true - echo "GITHUB_REF=${GITHUB_REF}; using version from gleam.toml: ${VERSION}" - fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - - - name: Create tag from gleam.toml (on merged PR) - if: github.event_name == 'pull_request' && github.event.pull_request.merged == true - run: | - set -euo pipefail - VERSION=$(grep '^version' gleam.toml | sed -E 's/.*= *"([^\"]+)".*/\1/') - TAG="v${VERSION}" - echo "Derived tag: ${TAG}" - git fetch origin --tags - if git rev-parse -q --verify "refs/tags/${TAG}" >/dev/null; then - echo "Tag ${TAG} already exists, skipping" - exit 0 - fi - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - git tag -a "${TAG}" -m "Release ${TAG}" - git push origin "${TAG}" - echo "Pushed tag ${TAG} to origin" - - - name: Verify tag matches `gleam.toml` - run: | - VERSION="${{ steps.ver.outputs.version }}" - FILE_VER=$(grep '^version' gleam.toml | sed -E 's/.*= *"([^"]+)".*/\1/') || true - echo "gleam.toml version: ${FILE_VER}" - if [ -n "${FILE_VER}" ] && [ "${FILE_VER}" != "${VERSION}" ]; then - echo "Tag version (${VERSION}) does not match gleam.toml (${FILE_VER}). Aborting publish." - exit 1 - fi - - - name: Trust HEX API key presence (skip /api/me check) - env: - HEX_API_KEY: ${{ secrets.HEX_API_KEY }} - run: | - set -euo pipefail - if [ -z "${HEX_API_KEY:-}" ]; then - echo "HEX_API_KEY secret is not set; aborting" - exit 1 - fi - # Do not call the Hex API here (some environments return unexpected 404s). - # Treat the presence of a non-empty secret as 'valid' for CI publishing. - # Do not print the key or any masked preview in logs. - # Only confirm that the secret is present to avoid accidental leaks. - echo "HEX_API_KEY is set" - - - name: Check if version already published on Hex - run: | - set -euo pipefail - PACKAGE=$(grep '^name' gleam.toml | sed -E 's/.*= *"([^"]+)".*/\1/') - VERSION="${{ steps.ver.outputs.version }}" - echo "Checking Hex for package ${PACKAGE} version ${VERSION}" - if curl -s "https://hex.pm/api/packages/${PACKAGE}" | jq -e ".releases[] | select(.version==\"${VERSION}\")" > /dev/null; then - echo "Version ${VERSION} already exists on Hex. Skipping publish." - exit 0 - fi - - - name: Publish to Hex - if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) - env: - HEX_API_KEY: ${{ secrets.HEX_API_KEY }} - run: | - set -euo pipefail - echo "Publishing with gleam publish --yes" - echo "I am not using semantic versioning" | gleam publish --yes - - - name: Create GitHub Release - if: startsWith(github.ref, 'refs/tags/') - uses: softprops/action-gh-release@v1 - with: - body: | - Release ${{ github.ref_name }} - - See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/${{ github.ref_name }}/CHANGELOG.md) for details. - draft: false - prerelease: false - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Upload package artifact (audit) - if: always() - uses: actions/upload-artifact@v4 - with: - name: gleam-hex-package - path: _build/default/lib/thrifty/hex/thrifty-*.tar || true diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..a1f4a07 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,108 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +--- + +## [1.1.0] — 2026-03-20 + +### Fixed + +#### Wire-format correctness (breaking for interoperability) + +- **Field ID in long-form header not zigzag-encoded** (`field.gleam`) + The Thrift Compact Protocol specification requires that field IDs in + long-form headers (used when the delta from the previous field ID is + outside `[1, 15]`) are encoded as a zigzag i16 followed by a varint. + The encoder was writing a plain varint and the decoder was reading a + plain varint, making payloads with long-form field headers + incompatible with every other Thrift implementation. + Fix: `encode_field_header` now applies `zigzag.encode_i32` before + `varint.encode_varint`; `read_field_header` now applies + `zigzag.decode_i32` after reading the varint. + Affects any field whose delta from the previous field ID is `> 15` + or `<= 0` (e.g. fields with ID > 15, or out-of-order fields). + +- **`read_i8` decoded bytes as unsigned instead of signed** (`reader.gleam`) + The bit-pattern `<>` matches unsigned (0–255). + Negative i8 values such as `-1` (wire byte `0xFF`) were silently + decoded as `255` instead of `-1`. + Fix: changed to `<>`. + +- **`message.gleam` sequence ID broken for negative values** + `encode_message_header` called `varint.encode_varint` directly on + `sequence_id`. Because `encode_varint` requires a non-negative + integer, negative sequence IDs produced corrupt output (or, after + the `encode_varint` guard was added, a panic). + The reference Java implementation encodes the sequence ID as its + unsigned 32-bit two's-complement representation. + Fix: encoder masks `sequence_id` to uint32 before encoding; + decoder sign-extends values `> INT32_MAX` back to negative i32. + +#### API correctness + +- **`encode_varint` accepted negative integers silently** (`varint.gleam`) + Passing a negative integer produced an invalid varint (the MSB + continuation bit would be set on the last byte). Now panics with an + explicit message including the offending value. + +- **`write_i8` silently truncated out-of-range values** (`writer.gleam`) + Values outside `[-128, 127]` were accepted and silently truncated by + the bit-syntax, e.g. `write_i8(256)` produced `<<0>>`. Now panics + with an explicit message, consistent with `write_i32` / `write_i64`. + +- **`write_i16` accepted the full i32 range** (`writer.gleam`) + `write_i16` delegated to `zigzag.encode_i32`, which allows values up + to ±2 147 483 647. Now panics on values outside `[-32 768, 32 767]`. + +- **`BoolElementPolicy::AcceptBoth` was identical to `AcceptCanonicalOnly`** + (`reader.gleam`, `types.gleam`) + Both policies executed the same code, making the distinction + meaningless. `AcceptBoth` now also accepts byte value `0` as `False`, + providing compatibility with older implementations that used the + BinaryProtocol `0 = False / 1 = True` encoding. + `AcceptCanonicalOnly` continues to accept only `1` (True) and `2` + (False). + +#### Documentation + +- **Doc comments placed after function bodies** (`varint.gleam`, + `zigzag.gleam`, `field.gleam`, `reader.gleam`, `writer.gleam`, + `message.gleam`, `writer_highlevel.gleam`) + Gleam attaches `///` doc comments to the next item in the file. + Comments that appeared after a function body were silently associated + with the wrong function in generated documentation. All misplaced + comments have been moved to immediately precede the function they + describe. + +- **Duplicate Apache-2.0 license header** (`varint.gleam`, `field.gleam`) + The license block appeared twice at the top of both files. The + duplicate has been removed. + +#### Dead code + +- **`types.Writer` type was never used** (`types.gleam`) + `pub type Writer { Writer(buffer: BitArray) }` was defined but never + instantiated or referenced anywhere in the library. The actual writer + uses `writer.Buffer`. The unused type has been removed. + +### Tests added + +- `read_i8_negative_roundtrip_test` — roundtrip for i8 values + `-128, -42, -1, 0, 1, 127` via `write_i8` / `read_i8`. +- `i8_write_boundaries_test` — verifies the raw byte written for + boundary values `127`, `-128`, `-1` (expected `0xFF`). +- `i16_write_boundaries_test` — roundtrip for `32 767`, `-32 768`, `-1` + via `write_i16` / `read_i16`. +- `permissive_bool_accepts_zero_as_false_test` — verifies that + `AcceptBoth` maps byte `0` to `False`. +- `message_negative_seqid_roundtrip_test` — roundtrip for sequence IDs + `-1`, `-100`, `-2 147 483 648`. +- `long_form_negative_field_id_test` — verifies that a long-form field + header with `field_id = -1` encodes and decodes correctly. +- `long_form_header_test` updated to use the correct wire bytes: + `zigzag(32) = 64`, varint `<<0x40>>` instead of the previous + (incorrect) plain varint `<<0x20>>`. diff --git a/src/thrifty/field.gleam b/src/thrifty/field.gleam index 7898c34..b154735 100644 --- a/src/thrifty/field.gleam +++ b/src/thrifty/field.gleam @@ -12,24 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright 2025 The thrifty contributors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - import gleam/bit_array import thrifty/types import thrifty/varint +import thrifty/zigzag /// Read a field header from the reader and advance the reader. /// @@ -65,10 +52,11 @@ pub fn read_field_header( let delta = header_byte / 16 case delta == 0 { True -> - // Long form: field id encoded as varint (absolute) + // Long form: field id encoded as zigzag i16 + varint (per spec). case varint.decode_varint(data, byte_pos + 1) { Error(e) -> Error(e) - Ok(#(field_id, next_pos)) -> + Ok(#(field_id_raw, next_pos)) -> { + let field_id = zigzag.decode_i32(field_id_raw) case int_to_field_type(type_value) { Error(e) -> Error(e) Ok(field_type) -> @@ -77,6 +65,7 @@ pub fn read_field_header( set_position(reader, next_pos, options), )) } + } } False -> { // Short form: field id = last_field_id + delta @@ -128,7 +117,8 @@ pub fn encode_field_header( <> } False -> { - let varint_bytes = varint.encode_varint(field_id) + // Long form: field id as zigzag i16 + varint (per spec). + let varint_bytes = varint.encode_varint(zigzag.encode_i32(field_id)) let type_nibble = field_type_to_int(field_type) let header_byte = type_nibble bit_array.concat([<>, varint_bytes]) @@ -143,7 +133,8 @@ pub fn encode_field_header( <> } False -> { - let varint_bytes = varint.encode_varint(field_id) + // Long form: field id as zigzag i16 + varint (per spec). + let varint_bytes = varint.encode_varint(zigzag.encode_i32(field_id)) let type_nibble = field_type_to_int(field_type) let header_byte = type_nibble bit_array.concat([<>, varint_bytes]) @@ -153,6 +144,13 @@ pub fn encode_field_header( } } +/// Convert a `types.FieldType` to its compact protocol integer nibble. +/// +/// Inputs +/// - `ft`: field type to convert. +/// +/// Outputs +/// - Integer value per Compact Protocol mapping used in header bytes. fn field_type_to_int(ft: types.FieldType) -> Int { case ft { types.Stop -> 0 @@ -171,13 +169,14 @@ fn field_type_to_int(ft: types.FieldType) -> Int { } } -/// Convert a `types.FieldType` to its compact protocol integer nibble. +/// Convert an integer nibble to `types.FieldType`. /// /// Inputs -/// - `ft`: field type to convert. +/// - `n`: integer nibble from a header byte. /// /// Outputs -/// - Integer value per Compact Protocol mapping used in header bytes. +/// - `Ok(FieldType)` when the nibble maps to a known type. +/// - `Error(types.UnsupportedType(n))` for unknown nibble values. fn int_to_field_type(n: Int) -> Result(types.FieldType, types.DecodeError) { case n { 0 -> Ok(types.Stop) @@ -197,14 +196,15 @@ fn int_to_field_type(n: Int) -> Result(types.FieldType, types.DecodeError) { } } -/// Convert an integer nibble to `types.FieldType`. +/// Return a new `types.Reader` with updated byte position and options. /// /// Inputs -/// - `n`: integer nibble from a header byte. +/// - `reader`: existing `types.Reader`. +/// - `byte_pos`: new byte offset. +/// - `options`: reader options to attach. /// /// Outputs -/// - `Ok(FieldType)` when the nibble maps to a known type. -/// - `Error(types.UnsupportedType(n))` for unknown nibble values. +/// - `types.Reader` referencing the same `data` with updated state. fn set_position( reader: types.Reader, byte_pos: Int, @@ -213,12 +213,3 @@ fn set_position( let types.Reader(data, _, _) = reader types.Reader(data, byte_pos, options) } -/// Return a new `types.Reader` with updated byte position and options. -/// -/// Inputs -/// - `reader`: existing `types.Reader`. -/// - `byte_pos`: new byte offset. -/// - `options`: reader options to attach. -/// -/// Outputs -/// - `types.Reader` referencing the same `data` with updated state. diff --git a/src/thrifty/message.gleam b/src/thrifty/message.gleam index 9131595..37883b9 100644 --- a/src/thrifty/message.gleam +++ b/src/thrifty/message.gleam @@ -53,7 +53,11 @@ pub fn encode_message_header(header: MessageHeader) -> BitArray { let type_value = message_type_to_int(header.message_type) let version_and_type = version * 32 + type_value - let seq_id_varint = varint.encode_varint(header.sequence_id) + // Encode seq_id as unsigned 32-bit two's-complement varint, matching the + // reference Java implementation which uses unsigned right-shift (>>>). + let seq_id_u32 = + { header.sequence_id % 4_294_967_296 + 4_294_967_296 } % 4_294_967_296 + let seq_id_varint = varint.encode_varint(seq_id_u32) let name_bytes = string.to_utf_codepoints(header.name) @@ -108,7 +112,13 @@ pub fn decode_message_header( // Read sequence ID first case varint.decode_varint(data, byte_pos + 2) { Error(e) -> Error(e) - Ok(#(seq_id, pos_after_seq)) -> { + Ok(#(seq_id_raw, pos_after_seq)) -> { + // Sign-extend back to i32: values > INT32_MAX were + // negative before the uint32 encoding on the write side. + let seq_id = case seq_id_raw > 2_147_483_647 { + True -> seq_id_raw - 4_294_967_296 + False -> seq_id_raw + } // Read method name length case varint.decode_varint(data, pos_after_seq) { Error(e) -> Error(e) @@ -160,6 +170,14 @@ pub fn decode_message_header( } } +/// Convert a `MessageType` to the protocol numeric value used in the +/// version/type byte. +/// +/// Inputs +/// - `mt`: message type enum. +/// +/// Outputs +/// - Integer value used in encoding the version/type byte. fn message_type_to_int(mt: MessageType) -> Int { case mt { Call -> 1 @@ -169,14 +187,14 @@ fn message_type_to_int(mt: MessageType) -> Int { } } -/// Convert a `MessageType` to the protocol numeric value used in the -/// version/type byte. +/// Convert a numeric message type to `MessageType`. /// /// Inputs -/// - `mt`: message type enum. +/// - `n`: numeric type extracted from the version/type byte. /// /// Outputs -/// - Integer value used in encoding the version/type byte. +/// - `Ok(MessageType)` when recognized. +/// - `Error(types.UnsupportedType(n))` when unknown. fn int_to_message_type(n: Int) -> Result(MessageType, types.DecodeError) { case n { 1 -> Ok(Call) @@ -186,7 +204,6 @@ fn int_to_message_type(n: Int) -> Result(MessageType, types.DecodeError) { _ -> Error(types.UnsupportedType(n)) } } -/// Convert a numeric message type to `MessageType`. /// /// Inputs /// - `n`: numeric type extracted from the version/type byte. diff --git a/src/thrifty/reader.gleam b/src/thrifty/reader.gleam index fee18eb..1da871b 100644 --- a/src/thrifty/reader.gleam +++ b/src/thrifty/reader.gleam @@ -64,7 +64,7 @@ pub fn read_i8( Error(_) -> Error(types.UnexpectedEndOfInput) Ok(bits) -> case bits { - <> -> + <> -> Ok(#(value, set_position(reader, byte_pos + 1, options))) _ -> Error(types.InvalidWireFormat("Invalid byte")) } @@ -197,8 +197,10 @@ pub fn read_bool_element( _ -> Error(types.InvalidWireFormat("Invalid boolean element")) } types.AcceptBoth -> - // Accept either common encoding 1 or 2 and map to booleans. + // Accept canonical (1/2) and also legacy (0/1) boolean encoding. + // 0 => False, 1 => True, 2 => False; all other bytes are invalid. case value { + 0 -> Ok(#(False, set_position(reader, byte_pos + 1, options))) 1 -> Ok(#(True, set_position(reader, byte_pos + 1, options))) 2 -> Ok(#(False, set_position(reader, byte_pos + 1, options))) _ -> Error(types.InvalidWireFormat("Invalid boolean element")) @@ -519,6 +521,16 @@ fn map_reader( } } +/// Set the reader's byte position and options, returning a new immutable reader. +/// +/// Inputs +/// - `reader`: existing `types.Reader` instance. +/// - `byte_pos`: new byte offset to set. +/// - `options`: `types.ReaderOptions` to attach to the new reader. +/// +/// Outputs +/// - A `types.Reader` referencing the same underlying data with updated +/// position and options. fn set_position( reader: types.Reader, byte_pos: Int, @@ -528,16 +540,16 @@ fn set_position( types.Reader(data, byte_pos, options) } -/// Set the reader's byte position and options, returning a new immutable reader. +/// Ensure the current recursion `depth` does not exceed configured limits. /// /// Inputs -/// - `reader`: existing `types.Reader` instance. -/// - `byte_pos`: new byte offset to set. -/// - `options`: `types.ReaderOptions` to attach to the new reader. +/// - `reader`: the current `types.Reader` containing configured options. +/// - `depth`: current recursion depth to validate. /// /// Outputs -/// - A `types.Reader` referencing the same underlying data with updated -/// position and options. +/// - `Ok(Nil)` when within limits. +/// - `Error(types.InvalidWireFormat("Exceeded maximum depth"))` when the depth +/// exceeds `options.max_depth`. fn ensure_depth( reader: types.Reader, depth: Int, @@ -549,26 +561,6 @@ fn ensure_depth( } } -fn ensure_limit( - value: Int, - limit: Int, - message: String, -) -> Result(Nil, types.DecodeError) { - case value > limit { - True -> Error(types.InvalidWireFormat(message)) - False -> Ok(Nil) - } -} -/// Ensure the current recursion `depth` does not exceed configured limits. -/// -/// Inputs -/// - `reader`: the current `types.Reader` containing configured options. -/// - `depth`: current recursion depth to validate. -/// -/// Outputs -/// - `Ok(Nil)` when within limits. -/// - `Error(types.InvalidWireFormat("Exceeded maximum depth"))` when the depth -/// exceeds `options.max_depth`. /// Ensure a numeric `value` does not exceed a configured `limit`. /// /// Inputs @@ -579,3 +571,13 @@ fn ensure_limit( /// Outputs /// - `Ok(Nil)` when `value <= limit`. /// - `Error(types.InvalidWireFormat(message))` when `value > limit`. +fn ensure_limit( + value: Int, + limit: Int, + message: String, +) -> Result(Nil, types.DecodeError) { + case value > limit { + True -> Error(types.InvalidWireFormat(message)) + False -> Ok(Nil) + } +} diff --git a/src/thrifty/types.gleam b/src/thrifty/types.gleam index 0f16c3c..104ce3e 100644 --- a/src/thrifty/types.gleam +++ b/src/thrifty/types.gleam @@ -5,10 +5,6 @@ pub type Reader { Reader(data: BitArray, position: Int, options: ReaderOptions) } -pub type Writer { - Writer(buffer: BitArray) -} - pub type FieldType { BoolTrue BoolFalse @@ -51,16 +47,15 @@ pub type ReaderOptions { /// How boolean elements (bytes inside list/set/map representing booleans) /// are interpreted and validated by the reader. pub type BoolElementPolicy { - // Accept both byte encodings 1 and 2 and map them to booleans as - // 1 => True, 2 => False. This is the default and maximally compatible - // behaviour. + // Accept both the canonical Compact Protocol encoding (1 => True, 2 => False) + // and the legacy BinaryProtocol encoding (0 => False, 1 => True). + // Use this policy for maximum interoperability with older implementations. + // Valid bytes: 0, 1, 2. All other bytes are still rejected. AcceptBoth - // Enforce canonical validation: accept only the canonical spec encodings - // (1 => True, 2 => False) and reject any other byte with + // Enforce canonical Compact Protocol encodings only: + // 1 => True, 2 => False. Byte 0 and any other value are rejected with // `InvalidWireFormat("Invalid boolean element")`. - // (Functionally equivalent to AcceptBoth today but provided to make the - // intent explicit and to allow future divergence if needed.) AcceptCanonicalOnly } diff --git a/src/thrifty/varint.gleam b/src/thrifty/varint.gleam index 23b738b..57e9d44 100644 --- a/src/thrifty/varint.gleam +++ b/src/thrifty/varint.gleam @@ -12,36 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright 2025 The thrifty contributors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - import gleam/bit_array +import gleam/int import gleam/list import thrifty/types /// Encode an unsigned integer as a varint BitArray. /// Returns a BitArray of 1-10 bytes (for i64). -/// +/// /// Varint encoding: 7 bits per byte, MSB as continuation bit. /// Bytes are emitted LSB first. +/// +/// Panics if `n` is negative. Use callers that guarantee non-negative input +/// (e.g. ZigZag-encode signed integers before calling this function). pub fn encode_varint(n: Int) -> BitArray { - encode_varint_loop(n, []) - |> list.reverse - |> list_to_bitarray + case n < 0 { + True -> + panic as { + "encode_varint requires a non-negative integer, got " + <> int.to_string(n) + } + False -> + encode_varint_loop(n, []) + |> list.reverse + |> list_to_bitarray + } } +/// Internal loop that emits varint bytes (LSB-first) into an accumulator. +/// +/// Inputs +/// - `n`: remaining unsigned integer to encode. +/// - `acc`: accumulator list of emitted bytes (in reverse order). +/// +/// Outputs +/// - List of byte integers representing the varint, in emission order when +/// reversed by the caller. fn encode_varint_loop(n: Int, acc: List(Int)) -> List(Int) { let low7 = n % 128 let rest = n / 128 @@ -58,15 +65,13 @@ fn encode_varint_loop(n: Int, acc: List(Int)) -> List(Int) { } } -/// Internal loop that emits varint bytes (LSB-first) into an accumulator. +/// Convert a list of byte integers to a `BitArray` by serial concatenation. /// /// Inputs -/// - `n`: remaining unsigned integer to encode. -/// - `acc`: accumulator list of emitted bytes (in reverse order). +/// - `bytes`: list of integers each 0..255. /// /// Outputs -/// - List of byte integers representing the varint, in emission order when -/// reversed by the caller. +/// - `BitArray` concatenating the provided bytes. fn list_to_bitarray(bytes: List(Int)) -> BitArray { case bytes { [] -> <<>> @@ -77,16 +82,9 @@ fn list_to_bitarray(bytes: List(Int)) -> BitArray { } } -/// Convert a list of byte integers to a `BitArray` by serial concatenation. -/// -/// Inputs -/// - `bytes`: list of integers each 0..255. -/// -/// Outputs -/// - `BitArray` concatenating the provided bytes. /// Decode a varint from a BitArray starting at a given byte position. /// Returns Ok(#(value, next_byte_position)) on success, or Error on failure. -/// +/// /// Max length: 10 bytes for i64, 5 bytes for i32. /// Reads bytes LSB first, accumulating 7-bit groups. pub fn decode_varint( @@ -96,6 +94,19 @@ pub fn decode_varint( decode_varint_loop(data, byte_position, 0, 1, 0) } +/// Recursive helper that decodes varint bytes accumulating the result. +/// +/// Inputs +/// - `data`: source `BitArray`. +/// - `byte_pos`: current byte offset to read. +/// - `value`: accumulated integer value so far. +/// - `multiplier`: current multiplier (powers of 128) for incoming 7-bit groups. +/// - `byte_count`: number of bytes consumed so far (used to bound length). +/// +/// Outputs +/// - `Ok(#(value, next_byte_pos))` on successful termination. +/// - `Error(types.InvalidVarint)` when maximum byte length is exceeded. +/// - `Error(types.UnexpectedEndOfInput)` when data runs out mid-varint. fn decode_varint_loop( data: BitArray, byte_pos: Int, @@ -137,16 +148,3 @@ fn decode_varint_loop( } } } -/// Recursive helper that decodes varint bytes accumulating the result. -/// -/// Inputs -/// - `data`: source `BitArray`. -/// - `byte_pos`: current byte offset to read. -/// - `value`: accumulated integer value so far. -/// - `multiplier`: current multiplier (powers of 128) for incoming 7-bit groups. -/// - `byte_count`: number of bytes consumed so far (used to bound length). -/// -/// Outputs -/// - `Ok(#(value, next_byte_pos))` on successful termination. -/// - `Error(types.InvalidVarint)` when maximum byte length is exceeded. -/// - `Error(types.UnexpectedEndOfInput)` when data runs out mid-varint. diff --git a/src/thrifty/writer.gleam b/src/thrifty/writer.gleam index 8a949e1..e6af0fb 100644 --- a/src/thrifty/writer.gleam +++ b/src/thrifty/writer.gleam @@ -111,19 +111,32 @@ pub fn write_zigzag_i64_checked( /// /// Outputs /// - Returns a `BitArray` of length 1 containing the byte. +/// +/// Error modes +/// - Panics when `value` is outside -128..127. Use a range check before +/// calling if the value origin is untrusted. pub fn write_i8(value: Int) -> BitArray { - <> + case value < -128 || value > 127 { + True -> panic as { "Value " <> int.to_string(value) <> " overflows i8" } + False -> <> + } } /// Write an i16 value using zigzag encoding and varint bytes. /// /// Inputs -/// - `value`: signed integer; caller should ensure it fits expected range. +/// - `value`: signed integer in -32768..32767. /// /// Outputs -/// - Returns a `BitArray` with the encoded bytes (delegates to zigzag+varint). +/// - Returns a `BitArray` with the encoded bytes (zigzag + varint). +/// +/// Error modes +/// - Panics when `value` is outside the i16 range. pub fn write_i16(value: Int) -> BitArray { - varint.encode_varint(zigzag.encode_i32(value)) + case value < -32_768 || value > 32_767 { + True -> panic as { "Value " <> int.to_string(value) <> " overflows i16" } + False -> varint.encode_varint(zigzag.encode_i32(value)) + } } /// Write an i32 value using zigzag encoding and varint bytes. @@ -254,11 +267,6 @@ pub fn write_map( concat_many([write_map_header(size, key_type, value_type), payload]) } -/// Concatenate a list of bitarrays in order. -fn concat_many(parts: List(BitArray)) -> BitArray { - list.fold(parts, <<>>, fn(acc, part) { bit_array.concat([acc, part]) }) -} - /// Concatenate a list of `BitArray` parts into a single contiguous `BitArray`. /// /// Inputs @@ -266,24 +274,20 @@ fn concat_many(parts: List(BitArray)) -> BitArray { /// /// Outputs /// - A single `BitArray` containing the concatenated bytes. +fn concat_many(parts: List(BitArray)) -> BitArray { + list.fold(parts, <<>>, fn(acc, part) { bit_array.concat([acc, part]) }) +} + /// Buffer accumulator used by the high-level writer for struct assembly. pub type Buffer { Buffer(parts: List(BitArray)) } -/// Create an empty buffer accumulator. +/// Create a new empty buffer accumulator for high-level writer assembly. pub fn buffer_new() -> Buffer { Buffer([]) } -/// Create a new empty buffer accumulator for high-level writer assembly. -/// Append a new part to the buffer. -pub fn buffer_append(buffer: Buffer, part: BitArray) -> Buffer { - case buffer { - Buffer(parts) -> Buffer([part, ..parts]) - } -} - /// Append a part to an existing `Buffer` accumulator. /// /// Inputs @@ -292,10 +296,9 @@ pub fn buffer_append(buffer: Buffer, part: BitArray) -> Buffer { /// /// Outputs /// - New `Buffer` with `part` added to the internal list. -/// Convert an accumulated buffer into a contiguous bitarray. -pub fn buffer_to_bitarray(buffer: Buffer) -> BitArray { +pub fn buffer_append(buffer: Buffer, part: BitArray) -> Buffer { case buffer { - Buffer(parts) -> concat_many(list.reverse(parts)) + Buffer(parts) -> Buffer([part, ..parts]) } } @@ -303,6 +306,12 @@ pub fn buffer_to_bitarray(buffer: Buffer) -> BitArray { /// /// Outputs /// - Concatenated `BitArray` in original append order. +pub fn buffer_to_bitarray(buffer: Buffer) -> BitArray { + case buffer { + Buffer(parts) -> concat_many(list.reverse(parts)) + } +} + /// Write a boolean field using the inline field header encoding. pub fn write_bool(field_id: Int, value: Bool, last_field_id: Int) -> BitArray { write_bool_inline(field_id, value, last_field_id) @@ -321,10 +330,10 @@ pub fn write_bool_inline( write_field_header(field_id, field_type, last_field_id) } +/// Format a `ZigzagRangeError` into a human readable string for panics/logs. fn zigzag_error_to_string(err: zigzag.ZigzagRangeError) -> String { case err { zigzag.ZigzagRangeError(value, bits) -> "Value " <> int.to_string(value) <> " overflows i" <> int.to_string(bits) } } -/// Format a `ZigzagRangeError` into a human readable string for panics/logs. diff --git a/src/thrifty/writer_highlevel.gleam b/src/thrifty/writer_highlevel.gleam index adc748c..a22551b 100644 --- a/src/thrifty/writer_highlevel.gleam +++ b/src/thrifty/writer_highlevel.gleam @@ -237,6 +237,18 @@ pub fn finish(builder: StructWriter) -> BitArray { |> low_writer.buffer_to_bitarray } +/// Internal helper that appends a field header and optional payload to the +/// builder's buffer and returns the updated `StructWriter`. +/// +/// Inputs +/// - `struct_writer`: current `StructWriter` state. +/// - `field_id`: identifier for the field being appended. +/// - `field_type`: type of the field. +/// - `payload`: pre-encoded payload bytes for the field (may be empty). +/// +/// Outputs +/// - `StructWriter` updated with the appended header/payload and the new +/// last_field_id set to `field_id`. fn append_field( struct_writer: StructWriter, field_id: Int, @@ -253,15 +265,3 @@ fn append_field( } StructWriter(field_id, buffer_with_payload) } -/// Internal helper that appends a field header and optional payload to the -/// builder's buffer and returns the updated `StructWriter`. -/// -/// Inputs -/// - `struct_writer`: current `StructWriter` state. -/// - `field_id`: identifier for the field being appended. -/// - `field_type`: type of the field. -/// - `payload`: pre-encoded payload bytes for the field (may be empty). -/// -/// Outputs -/// - `StructWriter` updated with the appended header/payload and the new -/// last_field_id set to `field_id`. diff --git a/src/thrifty/zigzag.gleam b/src/thrifty/zigzag.gleam index 6573c29..c58b7f7 100644 --- a/src/thrifty/zigzag.gleam +++ b/src/thrifty/zigzag.gleam @@ -116,6 +116,17 @@ pub fn decode_i64(z: Int) -> Int { } } +/// Validate that `value` falls within `[min, max]` inclusive and return an +/// explicit `ZigzagRangeError` when it does not. +/// +/// Inputs +/// - `value`: integer to validate. +/// - `min`, `max`: inclusive bounds. +/// - `bits`: bit-width used for error reporting. +/// +/// Outputs +/// - `Ok(value)` when within range. +/// - `Error(ZigzagRangeError)` when out of range. fn check_range( value: Int, min: Int, @@ -128,17 +139,14 @@ fn check_range( } } -/// Validate that `value` falls within `[min, max]` inclusive and return an -/// explicit `ZigzagRangeError` when it does not. +/// Ensure an integer is represented as a non-negative residue modulo `modulus`. /// /// Inputs -/// - `value`: integer to validate. -/// - `min`, `max`: inclusive bounds. -/// - `bits`: bit-width used for error reporting. +/// - `value`: integer to reduce. +/// - `modulus`: modulus used for reduction (e.g., 2^32, 2^64). /// /// Outputs -/// - `Ok(value)` when within range. -/// - `Error(ZigzagRangeError)` when out of range. +/// - Non-negative integer in 0..modulus-1 representing `value mod modulus`. fn mask_uint(value: Int, modulus: Int) -> Int { let remainder = value % modulus case remainder < 0 { @@ -147,14 +155,14 @@ fn mask_uint(value: Int, modulus: Int) -> Int { } } -/// Ensure an integer is represented as a non-negative residue modulo `modulus`. +/// Core zigzag mapping formula: maps signed integers to unsigned integers +/// such that small-magnitude signed values map to small unsigned values. /// /// Inputs -/// - `value`: integer to reduce. -/// - `modulus`: modulus used for reduction (e.g., 2^32, 2^64). +/// - `n`: signed integer. /// /// Outputs -/// - Non-negative integer in 0..modulus-1 representing `value mod modulus`. +/// - ZigZag-mapped integer (unsigned representation prior to masking). fn zigzag_encode_formula(n: Int) -> Int { case n >= 0 { True -> n * 2 @@ -162,14 +170,13 @@ fn zigzag_encode_formula(n: Int) -> Int { } } -/// Core zigzag mapping formula: maps signed integers to unsigned integers -/// such that small-magnitude signed values map to small unsigned values. +/// Format a `ZigzagRangeError` for logging or panic messages. /// /// Inputs -/// - `n`: signed integer. +/// - `err`: the error to format. /// /// Outputs -/// - ZigZag-mapped integer (unsigned representation prior to masking). +/// - Human-readable string describing the out-of-range value and bit width. fn zigzag_range_error_to_string(err: ZigzagRangeError) -> String { case err { ZigzagRangeError(value, bits) -> @@ -180,4 +187,3 @@ fn zigzag_range_error_to_string(err: ZigzagRangeError) -> String { <> " range" } } -/// Format a `ZigzagRangeError` for logging or panic messages. diff --git a/test/thrifty/field_test.gleam b/test/thrifty/field_test.gleam index 592f906..3cb9fb1 100644 --- a/test/thrifty/field_test.gleam +++ b/test/thrifty/field_test.gleam @@ -5,6 +5,7 @@ import thrifty/field import thrifty/reader import thrifty/types import thrifty/varint +import thrifty/zigzag pub fn main() { gleeunit.main() @@ -25,9 +26,10 @@ pub fn short_form_header_test() { } pub fn long_form_header_test() { - // header byte: delta=0, type=I32 (5) -> 0x05 - // field id: 32 (varint) - let field_id_bits = varint.encode_varint(32) + // Long-form header per spec: header byte = 0x00 | type_nibble, + // followed by zigzag(field_id) encoded as varint. + // field_id=32 -> zigzag(32)=64 -> varint(64) = <<0x40>> + let field_id_bits = varint.encode_varint(zigzag.encode_i32(32)) let data = <<0x05:int-size(8), field_id_bits:bits>> let reader0 = reader.from_bit_array(data) let assert Ok(#(hdr, r1)) = field.read_field_header(reader0, 16) @@ -37,10 +39,25 @@ pub fn long_form_header_test() { ftype |> should.equal(types.I32) } } - // next pos should be header + varint length + // next pos should be header byte + 1 varint byte (64 fits in 1 byte) reader.position(r1) |> should.equal(2) } +pub fn long_form_negative_field_id_test() { + // Negative field IDs are unusual but valid i16 per spec. + // field_id=-1 -> zigzag(-1)=1 -> varint(1) = <<0x01>> + let field_id_bits = varint.encode_varint(zigzag.encode_i32(-1)) + let data = <<0x05:int-size(8), field_id_bits:bits>> + let reader0 = reader.from_bit_array(data) + let assert Ok(#(hdr, _)) = field.read_field_header(reader0, 100) + case hdr { + types.FieldHeader(field_id, ftype) -> { + field_id |> should.equal(-1) + ftype |> should.equal(types.I32) + } + } +} + pub fn boolean_inline_header_test() { // delta=1, type=BOOL_TRUE(1) -> 0x11 let data = <<0x11:int-size(8)>> diff --git a/test/thrifty/message_test.gleam b/test/thrifty/message_test.gleam index a75f9bc..e09b640 100644 --- a/test/thrifty/message_test.gleam +++ b/test/thrifty/message_test.gleam @@ -1,3 +1,4 @@ +import gleam/list import gleeunit import gleeunit/should import thrifty/message.{Call, Exception, MessageHeader, Oneway, Reply} @@ -66,6 +67,19 @@ pub fn message_empty_name_test() { decoded.sequence_id |> should.equal(1) } +// Negative sequence IDs must roundtrip correctly (uint32 masking fix) +pub fn message_negative_seqid_roundtrip_test() { + let values = [-1, -100, -2_147_483_648] + list.map(values, fn(seqid) { + let header = + MessageHeader(name: "m", message_type: Call, sequence_id: seqid) + let encoded = message.encode_message_header(header) + let assert Ok(#(decoded, _)) = message.decode_message_header(encoded, 0) + decoded.sequence_id |> should.equal(seqid) + 0 + }) +} + pub fn message_long_name_test() { let long_name = "thisIsAVeryLongMethodNameThatExceedsTypicalLengths" let header = diff --git a/test/thrifty/reader_test.gleam b/test/thrifty/reader_test.gleam index 39a2fd3..e572ec9 100644 --- a/test/thrifty/reader_test.gleam +++ b/test/thrifty/reader_test.gleam @@ -1,4 +1,5 @@ import gleam/bit_array +import gleam/list import gleeunit import gleeunit/should @@ -11,6 +12,18 @@ pub fn main() { gleeunit.main() } +// read_i8: negative values must roundtrip correctly after the signed fix +pub fn read_i8_negative_roundtrip_test() { + let values = [-1, -128, -42, -100, 0, 1, 127] + list.map(values, fn(v) { + let bits = writer.write_i8(v) + let r = reader.from_bit_array(bits) + let assert Ok(#(decoded, _)) = reader.read_i8(r) + decoded |> should.equal(v) + 0 + }) +} + pub fn read_primitives_test() { let r0 = reader.from_bit_array(writer.write_i32(-123)) let assert Ok(#(value32, _)) = reader.read_i32(r0) diff --git a/test/thrifty/strict_bool_extra_test.gleam b/test/thrifty/strict_bool_extra_test.gleam index bf85b5b..ef7ebc4 100644 --- a/test/thrifty/strict_bool_extra_test.gleam +++ b/test/thrifty/strict_bool_extra_test.gleam @@ -57,8 +57,29 @@ pub fn read_bool_element_accepts_1_and_2_permissive_test() { } } +// AcceptBoth must accept legacy byte 0 as False +pub fn permissive_bool_accepts_zero_as_false_test() { + let data = <<0:int-size(8)>> + + let reader = + thrifty_reader.with_options( + data, + types.ReaderOptions( + max_depth: 64, + max_container_items: 32, + max_string_bytes: 256, + bool_element_policy: types.AcceptBoth, + ), + ) + + case thrifty_reader.read_bool_element(reader) { + Ok(#(v, _)) -> v |> should.equal(False) + Error(_) -> should.fail() + } +} + pub fn read_bool_element_rejects_other_bytes_test() { - // byte 0 is invalid + // byte 0 is invalid with AcceptCanonicalOnly let data = <<0:int-size(8)>> let reader = diff --git a/test/thrifty/writer_test.gleam b/test/thrifty/writer_test.gleam index aae967f..93527a9 100644 --- a/test/thrifty/writer_test.gleam +++ b/test/thrifty/writer_test.gleam @@ -35,6 +35,39 @@ pub fn i8_write_test() { v |> should.equal(42) } +// write_i8: boundary values and roundtrip with negative values +pub fn i8_write_boundaries_test() { + // max positive i8 + let b_max = writer.write_i8(127) + let assert Ok(<>) = bit_array.slice(b_max, 0, 1) + v_max |> should.equal(127) + + // min negative i8 + let b_min = writer.write_i8(-128) + let assert Ok(<>) = bit_array.slice(b_min, 0, 1) + v_min |> should.equal(-128) + + // -1 encodes as 0xFF + let b_neg1 = writer.write_i8(-1) + let assert Ok(<>) = bit_array.slice(b_neg1, 0, 1) + raw |> should.equal(255) +} + +// write_i16: boundary values roundtrip via read_i16 +pub fn i16_write_boundaries_test() { + let r_max = reader.from_bit_array(writer.write_i16(32_767)) + let assert Ok(#(v_max, _)) = reader.read_i16(r_max) + v_max |> should.equal(32_767) + + let r_min = reader.from_bit_array(writer.write_i16(-32_768)) + let assert Ok(#(v_min, _)) = reader.read_i16(r_min) + v_min |> should.equal(-32_768) + + let r_neg = reader.from_bit_array(writer.write_i16(-1)) + let assert Ok(#(v_neg, _)) = reader.read_i16(r_neg) + v_neg |> should.equal(-1) +} + pub fn list_header_roundtrip_test() { // short form let hdr = writer.write_list_header(14, container.I32Type)