From c75cc237e47323131617febc4ed0f03b814ef56e Mon Sep 17 00:00:00 2001
From: Adithya Samavedhi
Date: Fri, 12 Jun 2026 13:53:42 -0700
Subject: [PATCH] fix-fetching-userids-offlineconversions-accounting-for-numpy-ndarray

---
Reviewer notes (free-form area after the three-dash line; not part of the
commit message or of the applied change):

* handlers/offline_conversion.py: the three truthiness guards in build_items
  (user_ids, line_items, privacy_settings) become explicit
  "is not None and len(...) > 0" checks, and the "or []" fallback in
  collect_raw_pii_ids_per_row becomes an explicit "is None" check.
  Per the comment added to the test file, array columns arrive as numpy
  arrays on the batch (mapInPandas) path; NumPy raises ValueError when an
  array with more than one element is evaluated for truth, which is
  presumably the failure being fixed -- TODO confirm against the bug report.
* tests/unit/test_handlers_build_items.py: array-column tests are
  parametrized over list and np.ndarray inputs through the new
  _build_array_column helper; two collect_raw_pii_ids_per_row tests are added.
* NOTE(review): the new side of the test file still ends without a trailing
  newline (see the "\ No newline at end of file" marker in the last test hunk).
* NOTE(review): the same "is not None and len(x) > 0" guard now appears three
  times in build_items; a small private helper could remove the repetition
  in a follow-up.

 tests/unit/test_handlers_build_items.py       | 68 ++++++++++++++++++-
 .../handlers/offline_conversion.py            | 10 +--
 2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/tests/unit/test_handlers_build_items.py b/tests/unit/test_handlers_build_items.py
index 403dbde..f7b0e2f 100644
--- a/tests/unit/test_handlers_build_items.py
+++ b/tests/unit/test_handlers_build_items.py
@@ -4,6 +4,10 @@
 """
 
 from datetime import datetime, timezone
+from typing import Union
+
+import numpy as np
+import pytest
 
 import ttd_databricks_python.ttd_databricks.handlers.advertiser as adv_handler
 from ttd_databricks_python.ttd_databricks.id_types import normalize_id_type
@@ -20,6 +24,13 @@
 _UnsetType = type(UNSET)
 
 
+# An array column reaches build_items as a list via the adhoc path
+# (collect + asDict) and as a numpy array via the batch path (mapInPandas).
+# build_items must handle both, so array-column tests run against each shape.
+def _build_array_column(array_type: type, items: list[dict]) -> Union[list, np.ndarray]:
+    return items if array_type is list else np.array(items, dtype=object)
+
+
 # --------------------------------------------------------------------------- #
 # Advertiser handler                                                          #
 # --------------------------------------------------------------------------- #
@@ -119,10 +130,13 @@ def test_builds_offline_conversion_data_item_with_correct_fields(self):
         assert isinstance(item.timestamp_utc, datetime)
         assert isinstance(item.user_id_array, _UnsetType)
 
-    def test_user_ids_converted_to_user_id_array_with_type_codes(self):
+    @pytest.mark.parametrize("array_type", [list, np.ndarray])
+    def test_user_ids_converted_to_user_id_array_with_type_codes(self, array_type):
         row = {
             **self._MINIMAL,
-            "user_ids": [{"type": "TDID", "id": "test-tdid-value"}, {"type": "DAID", "id": "test-daid-value"}],
+            "user_ids": _build_array_column(
+                array_type, [{"type": "TDID", "id": "test-tdid-value"}, {"type": "DAID", "id": "test-daid-value"}]
+            ),
         }
         item = oc_handler.build_items([row])[0]
         assert item.user_id_array == [["0", "test-tdid-value"], ["1", "test-daid-value"]]
@@ -147,4 +161,52 @@ def test_optional_fields_are_passed_through_when_provided(self):
         item = oc_handler.build_items([row])[0]
         assert item.order_id == "test-order-id"
         assert item.value == "99.99"
-        assert item.country == "US"
\ No newline at end of file
+        assert item.country == "US"
+
+    @pytest.mark.parametrize("array_type", [list, np.ndarray])
+    def test_multi_element_line_items(self, array_type):
+        row = {
+            **self._MINIMAL,
+            "line_items": _build_array_column(
+                array_type,
+                [
+                    {"item_code": "sku1", "name": "first", "qty": "1", "price": "9.99", "cat": "books"},
+                    {"item_code": "sku2", "name": "second", "qty": "2", "price": "5.00", "cat": "toys"},
+                ],
+            ),
+        }
+        item = oc_handler.build_items([row])[0]
+        assert len(item.line_items) == 2
+        assert item.line_items[0].item_code == "sku1"
+
+    @pytest.mark.parametrize("array_type", [list, np.ndarray])
+    def test_multi_element_privacy_settings(self, array_type):
+        row = {
+            **self._MINIMAL,
+            "privacy_settings": _build_array_column(
+                array_type,
+                [
+                    {"privacy_type": "GDPR", "is_applicable": "true", "consent_string": "abc"},
+                    {"privacy_type": "CCPA", "is_applicable": "false", "consent_string": "xyz"},
+                ],
+            ),
+        }
+        item = oc_handler.build_items([row])[0]
+        assert len(item.privacy_settings) == 2
+        assert item.privacy_settings[0].privacy_type == "GDPR"
+
+    @pytest.mark.parametrize("array_type", [list, np.ndarray])
+    def test_collect_raw_pii_ids_keeps_only_pii_types(self, array_type):
+        rows = [
+            {
+                **self._MINIMAL,
+                "user_ids": _build_array_column(
+                    array_type,
+                    [{"type": "Email", "id": "a@example.com"}, {"type": "TDID", "id": "device-1"}],
+                ),
+            }
+        ]
+        assert oc_handler.collect_raw_pii_ids_per_row(rows) == [["a@example.com"]]
+
+    def test_collect_raw_pii_ids_handles_missing_user_ids(self):
+        assert oc_handler.collect_raw_pii_ids_per_row([self._MINIMAL]) == [[]]
\ No newline at end of file
diff --git a/ttd_databricks_python/ttd_databricks/handlers/offline_conversion.py b/ttd_databricks_python/ttd_databricks/handlers/offline_conversion.py
index 31f9ce3..759afb9 100644
--- a/ttd_databricks_python/ttd_databricks/handlers/offline_conversion.py
+++ b/ttd_databricks_python/ttd_databricks/handlers/offline_conversion.py
@@ -66,7 +66,7 @@ def build_items(items_data: list[dict[str, Any]]) -> list[OfflineConversionDataI
         }
 
         raw_user_ids = row.get("user_ids")
-        if raw_user_ids:
+        if raw_user_ids is not None and len(raw_user_ids) > 0:
             kwargs["user_id_array"] = [[_user_id_type(user_id["type"]), user_id["id"]] for user_id in raw_user_ids]
 
         for field in ITEM_OPTIONAL_FIELDS:
@@ -75,7 +75,7 @@ def build_items(items_data: list[dict[str, Any]]) -> list[OfflineConversionDataI
                 kwargs[field] = value
 
         raw_line_items = row.get("line_items")
-        if raw_line_items:
+        if raw_line_items is not None and len(raw_line_items) > 0:
             kwargs["line_items"] = [
                 RealTimeConversionEventLineItem(
                     **{k: v for k, v in (li if isinstance(li, dict) else li.asDict()).items() if v is not None}
@@ -84,7 +84,7 @@ def build_items(items_data: list[dict[str, Any]]) -> list[OfflineConversionDataI
             ]
 
         raw_privacy_settings = row.get("privacy_settings")
-        if raw_privacy_settings:
+        if raw_privacy_settings is not None and len(raw_privacy_settings) > 0:
             kwargs["privacy_settings"] = [
                 RealTimeConversionEventsPrivacySetting(
                     **{k: v for k, v in (ps if isinstance(ps, dict) else ps.asDict()).items() if v is not None}
@@ -103,7 +103,9 @@ def collect_raw_pii_ids_per_row(items_data: list[dict[str, Any]]) -> list[list[s
     """
     out: list[list[str]] = []
     for row in items_data:
-        raw_user_ids = row.get("user_ids") or []
+        raw_user_ids = row.get("user_ids")
+        if raw_user_ids is None:
+            raw_user_ids = []
         out.append(
             [entry["id"] for entry in raw_user_ids if entry["type"] and entry["type"].upper() in RAW_PII_ID_TYPES]
         )