fix: detect and preserve image format instead of hardcoding RAW_UINT8 (#67)

iwillspeak · corpo-iwillspeak · snus-kin · web-flow · commit 2bd81d9b01af · 2025-12-02T09:38:40.000Z
* fix: detect and preserve image format instead of hardcoding RAW_UINT8

Previously, all images were sent with IMAGE_FORMAT_RAW_UINT8 regardless
of their actual format (JPEG, PNG, etc.), causing incorrect metadata to
be sent to the API when images weren't resized.

Changes:
- Add image format detection via magic number signatures for PNG, JPEG,
  GIF, BMP, WebP, and TIFF formats
- Update ImageData to automatically detect format during initialization
- Preserve detected format throughout transformation pipeline
- Convert UNSPECIFIED format to RAW_UINT8 before sending to ensure API
  never receives UNSPECIFIED
- Update resize transformer to set format to RAW_UINT8 when converting
  to raw pixel data

This ensures:
1. Native image formats (PNG, JPEG, etc.) are correctly identified and
   preserved when sent without resizing
2. Resized images are correctly marked as RAW_UINT8
3. IMAGE_FORMAT_UNSPECIFIED is never sent over the API (defaults to
   RAW_UINT8)

Tests: Added 46 new tests covering format detection, API validation,
and format preservation across both streaming and single classification
methods. All 166 tests passing.

* build: Suppress Lint Warnings

* refactor: use named constants for image format magic bytes

- Remove __future__ annotations import from input_model.py
- Replace inline byte literals with named constants in image_format_detector.py
- Calculate lengths dynamically using len() instead of hardcoded values
- Eliminates need for noqa comments on magic value comparisons

* style: ruff format and line end fix

---------

Co-authored-by: Will Speak <will.speak@kroll.com>
Co-authored-by: snus-kin <tcarroll@snufk.in>
diff --git a/.env.example b/.env.example
@@ -6,4 +6,4 @@ OAUTH_AUDIENCE=crisp-athena-live
 
 # Athena server configuration
 # ATHENA_HOST=trust-messages.crispthinking.com
-ATHENA_AFFILIATE=athena-test
+ATHENA_AFFILIATE=athena-test
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
     hooks:
       - id: basedpyright
         name: basedpyright
-        entry: basedpyright
+        entry: uv run basedpyright
         language: system
         types_or: [python, pyi]
         pass_filenames: false
diff --git a/src/resolver_athena_client/client/athena_client.py b/src/resolver_athena_client/client/athena_client.py
@@ -239,12 +239,18 @@ async def classify_single(
             else RequestEncoding.REQUEST_ENCODING_UNCOMPRESSED
         )
 
+        # Ensure we never send UNSPECIFIED format over the API
+        # If format is still UNSPECIFIED, default to RAW_UINT8
+        image_format = processed_image.image_format
+        if image_format == ImageFormat.IMAGE_FORMAT_UNSPECIFIED:
+            image_format = ImageFormat.IMAGE_FORMAT_RAW_UINT8
+
         classification_input = ClassificationInput(
             affiliate=self.options.affiliate,
             correlation_id=correlation_id,
             encoding=request_encoding,
             data=processed_image.data,
-            format=ImageFormat.IMAGE_FORMAT_RAW_UINT8,
+            format=image_format,
             hashes=[
                 ImageHash(
                     value=hash_value,
diff --git a/src/resolver_athena_client/client/image_format_detector.py b/src/resolver_athena_client/client/image_format_detector.py
@@ -0,0 +1,73 @@
+"""Utility for detecting image formats from raw bytes."""
+
+from resolver_athena_client.generated.athena.models_pb2 import ImageFormat
+
+PNG_MAGIC_BYTES = b"\x89PNG"
+JPEG_MAGIC_BYTES = b"\xff\xd8\xff"
+GIF87A_MAGIC_BYTES = b"GIF87a"
+GIF89A_MAGIC_BYTES = b"GIF89a"
+BMP_MAGIC_BYTES = b"BM"
+WEBP_RIFF_MAGIC_BYTES = b"RIFF"
+WEBP_WEBP_MAGIC_BYTES = b"WEBP"
+TIFF_LE_MAGIC_BYTES = b"II*\x00"
+TIFF_BE_MAGIC_BYTES = b"MM\x00*"
+
+
+def detect_image_format(data: bytes) -> ImageFormat.ValueType:  # noqa: PLR0911
+    """Detect image format from raw bytes using magic number signatures.
+
+    Args:
+    ----
+        data: Raw image bytes to analyze
+
+    Returns:
+    -------
+        ImageFormat enum value representing the detected format
+
+    """
+    if not data:
+        return ImageFormat.IMAGE_FORMAT_UNSPECIFIED
+
+    # Signatures are tested in a fixed order and the first match wins
+    # PNG: starts with PNG_MAGIC_BYTES
+    png_len = len(PNG_MAGIC_BYTES)
+    if len(data) >= png_len and data[:png_len] == PNG_MAGIC_BYTES:
+        return ImageFormat.IMAGE_FORMAT_PNG
+
+    # JPEG: starts with JPEG_MAGIC_BYTES
+    jpeg_len = len(JPEG_MAGIC_BYTES)
+    if len(data) >= jpeg_len and data[:jpeg_len] == JPEG_MAGIC_BYTES:
+        return ImageFormat.IMAGE_FORMAT_JPEG
+
+    # GIF: GIF87a or GIF89a; both constants share one length (gif_len)
+    gif_len = len(GIF87A_MAGIC_BYTES)
+    if len(data) >= gif_len and data[:gif_len] in (
+        GIF87A_MAGIC_BYTES,
+        GIF89A_MAGIC_BYTES,
+    ):
+        return ImageFormat.IMAGE_FORMAT_GIF
+
+    # BMP: starts with BMP_MAGIC_BYTES (2 bytes only, so a weak match)
+    bmp_len = len(BMP_MAGIC_BYTES)
+    if len(data) >= bmp_len and data[:bmp_len] == BMP_MAGIC_BYTES:
+        return ImageFormat.IMAGE_FORMAT_BMP
+
+    # WebP: RIFF at 0-3, bytes 4-7 unchecked, WEBP at 8-11 (12 bytes)
+    webp_min_len = len(WEBP_RIFF_MAGIC_BYTES) + len(WEBP_WEBP_MAGIC_BYTES) + 4
+    if (
+        len(data) >= webp_min_len
+        and data[:4] == WEBP_RIFF_MAGIC_BYTES
+        and data[8:12] == WEBP_WEBP_MAGIC_BYTES
+    ):
+        return ImageFormat.IMAGE_FORMAT_WEBP
+
+    # TIFF: little-endian (II) or big-endian (MM) byte-order signature
+    tiff_len = len(TIFF_LE_MAGIC_BYTES)
+    if len(data) >= tiff_len and (
+        data[:tiff_len] == TIFF_LE_MAGIC_BYTES
+        or data[:tiff_len] == TIFF_BE_MAGIC_BYTES
+    ):
+        return ImageFormat.IMAGE_FORMAT_TIFF
+
+    # No known signature matched; callers decide how to treat UNSPECIFIED
+    return ImageFormat.IMAGE_FORMAT_UNSPECIFIED
diff --git a/src/resolver_athena_client/client/models/input_model.py b/src/resolver_athena_client/client/models/input_model.py
@@ -6,6 +6,14 @@
 """
 
 import hashlib
+from typing import TYPE_CHECKING
+
+from resolver_athena_client.client.image_format_detector import (
+    detect_image_format,
+)
+
+if TYPE_CHECKING:
+    from resolver_athena_client.generated.athena.models_pb2 import ImageFormat
 
 
 class ImageData:
@@ -24,6 +32,8 @@ class ImageData:
     Attributes:
     ----------
         data: The raw bytes of the image (modified in-place by transformers).
+        image_format: The format of the image data (e.g., JPEG, PNG, RAW_UINT8).
+            Updated by transformers when they change the format.
         sha256_hashes: List of SHA256 hashes tracking image transformations.
             Index 0 is the original image, subsequent indices track
             transformations.
@@ -66,6 +76,9 @@ def __init__(self, image_bytes: bytes) -> None:
 
         """
         self.data: bytes = image_bytes
+        self.image_format: ImageFormat.ValueType = detect_image_format(
+            image_bytes
+        )
         self.sha256_hashes: list[str] = [
             hashlib.sha256(image_bytes).hexdigest()
         ]
diff --git a/src/resolver_athena_client/client/transformers/classification_input.py b/src/resolver_athena_client/client/transformers/classification_input.py
@@ -48,15 +48,20 @@ def __init__(
     def _create_classification_input(
         self, image_data: ImageData
     ) -> ClassificationInput:
-        # Get image format and data
+        # Ensure we never send UNSPECIFIED format over the API
+        # If format is still UNSPECIFIED, default to RAW_UINT8
+        image_format = image_data.image_format
+        if image_format == ImageFormat.IMAGE_FORMAT_UNSPECIFIED:
+            image_format = ImageFormat.IMAGE_FORMAT_RAW_UINT8
+
         return ClassificationInput(
             affiliate=self.affiliate,
             correlation_id=self.correlation_provider.get_correlation_id(
                 image_data.data
             ),
             data=image_data.data,
             encoding=self.request_encoding,
-            format=ImageFormat.IMAGE_FORMAT_RAW_UINT8,
+            format=image_format,
         )
 
     @override
diff --git a/src/resolver_athena_client/client/transformers/core.py b/src/resolver_athena_client/client/transformers/core.py
@@ -13,6 +13,7 @@
 
 from resolver_athena_client.client.consts import EXPECTED_HEIGHT, EXPECTED_WIDTH
 from resolver_athena_client.client.models import ImageData
+from resolver_athena_client.generated.athena.models_pb2 import ImageFormat
 
 # Global optimization constants
 _target_size = (EXPECTED_WIDTH, EXPECTED_HEIGHT)
@@ -73,6 +74,7 @@ def process_image() -> tuple[bytes, bool]:
     # Only modify data and add hashes if transformation occurred
     if was_transformed:
         image_data.data = resized_bytes
+        image_data.image_format = ImageFormat.IMAGE_FORMAT_RAW_UINT8
         image_data.add_transformation_hashes()
 
     return image_data
diff --git a/tests/client/models/__init__.py b/tests/client/models/__init__.py
@@ -0,0 +1 @@
+"""Tests for model classes."""
diff --git a/tests/client/models/test_image_data.py b/tests/client/models/test_image_data.py
@@ -0,0 +1,105 @@
+"""Tests for ImageData model."""
+
+import pytest
+
+from resolver_athena_client.client.models import ImageData
+from resolver_athena_client.generated.athena.models_pb2 import ImageFormat
+
+
+def test_image_data_detects_png_format() -> None:
+    """Test that PNG format is detected during initialization."""
+    png_data = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
+    image_data = ImageData(png_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_PNG
+    assert image_data.data == png_data
+    assert len(image_data.sha256_hashes) == 1
+    assert len(image_data.md5_hashes) == 1
+
+
+def test_image_data_detects_jpeg_format() -> None:
+    """Test that JPEG format is detected during initialization."""
+    jpeg_data = b"\xff\xd8\xff\xe0" + b"\x00" * 100
+    image_data = ImageData(jpeg_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_JPEG
+    assert image_data.data == jpeg_data
+
+
+def test_image_data_detects_gif_format() -> None:
+    """Test that GIF format is detected during initialization."""
+    gif_data = b"GIF89a" + b"\x00" * 100
+    image_data = ImageData(gif_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_GIF
+
+
+def test_image_data_detects_bmp_format() -> None:
+    """Test that BMP format is detected during initialization."""
+    bmp_data = b"BM" + b"\x00" * 100
+    image_data = ImageData(bmp_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_BMP
+
+
+def test_image_data_detects_webp_format() -> None:
+    """Test that WebP format is detected during initialization."""
+    webp_data = b"RIFF\x00\x00\x00\x00WEBP" + b"\x00" * 100
+    image_data = ImageData(webp_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_WEBP
+
+
+def test_image_data_unspecified_for_unknown_format() -> None:
+    """Test that unknown data results in UNSPECIFIED format."""
+    unknown_data = b"not_a_valid_image"
+    image_data = ImageData(unknown_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_UNSPECIFIED
+
+
+def test_image_data_unspecified_for_empty_data() -> None:
+    """Test that empty data results in UNSPECIFIED format."""
+    image_data = ImageData(b"")
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_UNSPECIFIED
+
+
+def test_image_data_transformation_preserves_format() -> None:
+    """Test that format is preserved when transformation hashes are added."""
+    png_data = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
+    image_data = ImageData(png_data)
+
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_PNG
+
+    # Simulate transformation
+    image_data.data = b"transformed_data"
+    image_data.add_transformation_hashes()
+
+    # Format should still be PNG (transformers will update it if needed)
+    assert image_data.image_format == ImageFormat.IMAGE_FORMAT_PNG
+    assert len(image_data.sha256_hashes) == 2  # noqa: PLR2004
+    assert len(image_data.md5_hashes) == 2  # noqa: PLR2004
+
+
+@pytest.mark.parametrize(
+    ("data", "expected_format"),
+    [
+        (b"\x89PNG\r\n\x1a\n", ImageFormat.IMAGE_FORMAT_PNG),
+        (b"\xff\xd8\xff", ImageFormat.IMAGE_FORMAT_JPEG),
+        (b"GIF87a", ImageFormat.IMAGE_FORMAT_GIF),
+        (b"GIF89a", ImageFormat.IMAGE_FORMAT_GIF),
+        (b"BM", ImageFormat.IMAGE_FORMAT_BMP),
+        (b"RIFF\x00\x00\x00\x00WEBP", ImageFormat.IMAGE_FORMAT_WEBP),
+        (b"II*\x00", ImageFormat.IMAGE_FORMAT_TIFF),
+        (b"MM\x00*", ImageFormat.IMAGE_FORMAT_TIFF),
+        (b"unknown", ImageFormat.IMAGE_FORMAT_UNSPECIFIED),
+        (b"", ImageFormat.IMAGE_FORMAT_UNSPECIFIED),
+    ],
+)
+def test_image_data_format_detection_parametrized(
+    data: bytes, expected_format: ImageFormat.ValueType
+) -> None:
+    """Test format detection with various image data."""
+    image_data = ImageData(data)
+    assert image_data.image_format == expected_format
diff --git a/tests/client/test_image_format_detector.py b/tests/client/test_image_format_detector.py
diff --git a/tests/client/transformers/test_classification_input.py b/tests/client/transformers/test_classification_input.py
diff --git a/tests/test_classify_single.py b/tests/test_classify_single.py