diff --git a/news/6611.bugfix.md b/news/6611.bugfix.md new file mode 100644 index 00000000000..b74d18cff6d --- /dev/null +++ b/news/6611.bugfix.md @@ -0,0 +1 @@ +Anonymous telemetry now reports the installation and project identifiers as UUID strings rather than 128-bit integers. PostHog coerced the large integers to floats, discarding all but ~16 significant digits and risking distinct installs or apps being correlated as one. Each identifier is re-encoded to the same value (a UUID carries the same 128 bits), and a one-time PostHog `$create_alias` links an installation's pre-existing history to its new identifier so continuity is preserved. diff --git a/reflex/utils/frontend_skeleton.py b/reflex/utils/frontend_skeleton.py index 6212fb43121..3564740bf99 100644 --- a/reflex/utils/frontend_skeleton.py +++ b/reflex/utils/frontend_skeleton.py @@ -1,7 +1,7 @@ """This module provides utility functions to initialize the frontend skeleton.""" import json -import random +import uuid from pathlib import Path from reflex_base import constants @@ -498,8 +498,9 @@ def init_reflex_json(project_hash: int | None): if project_hash is not None: console.debug(f"Project hash is already set to {project_hash}.") else: - # Get a random project hash. - project_hash = random.getrandbits(128) + # Generate a uuid4 and persist its 128-bit integer form. Telemetry + # re-encodes it as the canonical UUID string before sending. + project_hash = uuid.uuid4().int console.debug(f"Setting project hash to {project_hash}.") # Write the hash and version to the reflex json file. 
diff --git a/reflex/utils/prerequisites.py b/reflex/utils/prerequisites.py index 29a2d75d924..a8c84da6aaa 100644 --- a/reflex/utils/prerequisites.py +++ b/reflex/utils/prerequisites.py @@ -7,10 +7,10 @@ import importlib.metadata import inspect import json -import random import re import sys import typing +import uuid from datetime import datetime from os import getcwd from pathlib import Path @@ -511,6 +511,45 @@ def get_project_hash(raise_on_fail: bool = False) -> int | None: return data.get("project_hash") +_DISTINCT_ID_SEMANTICS_VERSION = "0.9.5" + + +def _installation_id_semantics_file() -> Path: + """Return the path of the telemetry distinct_id semantics marker file. + + Returns: + The marker path, next to the installation id in the Reflex dir. + """ + return environment.REFLEX_DIR.get() / "installation_id_semantics" + + +def has_uuid_distinct_id_semantics() -> bool: + """Return whether this installation uses UUID telemetry distinct_id semantics. + + The marker is written for brand-new installs (by + ``ensure_reflex_installation_id``) and after a legacy install attempts to + alias its numeric distinct_id to the UUID form, so its absence identifies an + as-yet-unmigrated legacy installation. + + Returns: + True if the per-installation semantics marker file exists. + """ + return _installation_id_semantics_file().exists() + + +def mark_uuid_distinct_id_semantics(): + """Record that this installation uses UUID telemetry distinct_id semantics. + + The marker lives next to the installation id in the Reflex dir, so it is + per-machine (like the id itself) rather than per-app. Failures are ignored: + the marker is best-effort and a missing one only triggers a later retry. 
+ """ + with contextlib.suppress(Exception): + marker = _installation_id_semantics_file() + marker.parent.mkdir(parents=True, exist_ok=True) + marker.write_text(_DISTINCT_ID_SEMANTICS_VERSION) + + def check_running_mode(frontend: bool, backend: bool) -> RunningMode: """Check if the app is running in frontend or backend mode. @@ -603,8 +642,14 @@ def ensure_reflex_installation_id() -> int | None: # - content not parseable as an int if installation_id is None: - installation_id = random.getrandbits(128) + # Generate a uuid4 and persist its 128-bit integer form. Storing the + # int keeps the file readable by older Reflex versions; telemetry + # re-encodes it as the canonical UUID string before sending. + installation_id = uuid.uuid4().int installation_id_file.write_text(str(installation_id)) + # A freshly generated id is UUID-native, so record the new semantics + # up front; there is no legacy numeric id for telemetry to alias. + mark_uuid_distinct_id_semantics() except Exception as e: console.debug(f"Failed to ensure reflex installation id: {e}") return None diff --git a/reflex/utils/telemetry.py b/reflex/utils/telemetry.py index 877e2cf94ed..6d4a0189a91 100644 --- a/reflex/utils/telemetry.py +++ b/reflex/utils/telemetry.py @@ -8,6 +8,7 @@ import os import platform import sys +import uuid import warnings from contextlib import suppress from datetime import datetime, timezone @@ -25,7 +26,12 @@ from reflex.utils import console, processes from reflex.utils.js_runtimes import get_bun_version, get_node_version -from reflex.utils.prerequisites import ensure_reflex_installation_id, get_project_hash +from reflex.utils.prerequisites import ( + ensure_reflex_installation_id, + get_project_hash, + has_uuid_distinct_id_semantics, + mark_uuid_distinct_id_semantics, +) UTC = timezone.utc POSTHOG_API_URL: str = "https://app.posthog.com/capture/" @@ -264,8 +270,8 @@ def _raise_on_missing_project_hash() -> bool: class _Properties(TypedDict): """Properties type for telemetry.""" - 
distinct_id: int - distinct_app_id: NotRequired[int] + distinct_id: str + distinct_app_id: NotRequired[str] user_os: str user_os_detail: str reflex_version: str @@ -292,6 +298,29 @@ class _Event(_DefaultEvent): timestamp: str +def _encode_distinct_id(value: int) -> str: + """Encode a 128-bit telemetry identifier as a canonical UUID string. + + Historically ``distinct_id`` and ``distinct_app_id`` were sent as raw + 128-bit integers. PostHog coerces large JSON numbers to floats, silently + discarding all but ~16 significant digits, so distinct installs or apps can + collapse onto the same truncated value and have their events correlated. + + A UUID carries the same 128 bits, so the UUID string is sent losslessly while + remaining the *same value* as the legacy integer + (``uuid.UUID(int=value).int == value``). Deriving the UUID from the existing + identifier — rather than minting a fresh one — keeps an installation's new + events linkable to its pre-migration history. + + Args: + value: The stored 128-bit identifier. + + Returns: + The identifier encoded as a canonical UUID string. + """ + return str(uuid.UUID(int=value)) + + def _get_event_defaults() -> _DefaultEvent | None: """Get the default event data. 
@@ -303,7 +332,7 @@ def _get_event_defaults() -> _DefaultEvent | None: return None cpuinfo = get_cpu_info() properties: _Properties = { - "distinct_id": installation_id, + "distinct_id": _encode_distinct_id(installation_id), "user_os": get_os(), "user_os_detail": get_detailed_platform_str(), "reflex_version": get_reflex_version(), @@ -322,7 +351,7 @@ def _get_event_defaults() -> _DefaultEvent | None: if ( project_hash := get_project_hash(raise_on_fail=_raise_on_missing_project_hash()) ) is not None: - properties["distinct_app_id"] = project_hash + properties["distinct_app_id"] = _encode_distinct_id(project_hash) return { "api_key": "phc_JoMo0fOyi0GQAooY3UyO9k0hebGkMyFJrrCw1Gt5SGb", @@ -436,6 +465,58 @@ def _send( background_tasks = set() +_legacy_alias_attempted = False + + +def _maybe_alias_legacy_distinct_id(telemetry_enabled: bool | None) -> None: + """Link the legacy numeric distinct_id to its UUID form, once per install. + + Older Reflex versions reported ``distinct_id`` as a 128-bit integer, which + PostHog stored as a lossy float. Now that the same value is sent as a UUID + string (see ``_encode_distinct_id``), the two PostHog identities must be + merged so an installation's history stays on a single person. PostHog does + this through a one-time ``$create_alias`` event. + + A per-machine marker file (next to the installation id) records that the + install uses the new semantics. The marker is written even when the alias + does not match — the legacy id is lossy, so PostHog may silently drop it — to + avoid resending on every run. + + Args: + telemetry_enabled: Whether telemetry is enabled (resolved from the config + when None). + """ + global _legacy_alias_attempted + if _legacy_alias_attempted: + return + + with suppress(Exception): + if telemetry_enabled is None: + telemetry_enabled = get_config().telemetry_enabled + if not telemetry_enabled: + # Don't latch: a later enabled send in this process should retry. 
+ return + + # Latch before the alias send below (which re-enters send()) so it cannot + # recurse and so the attempt happens at most once per process. + _legacy_alias_attempted = True + + # Resolve the installation id first: a brand-new install is created and + # marked UUID-native by this call, so the marker check then skips it. + if (installation_id := ensure_reflex_installation_id()) is None: + return + if has_uuid_distinct_id_semantics(): + return + + # distinct_id is the UUID form (set by get_event_defaults); send the + # legacy integer as ``alias`` so PostHog coerces it to the same float as + # the historic events and merges the two persons. + send("$create_alias", telemetry_enabled, properties={"alias": installation_id}) + + # Record the new semantics regardless of outcome; we must not retry every + # run even if the lossy legacy id failed to match. + mark_uuid_distinct_id_semantics() + def send( event: str, @@ -453,6 +534,7 @@ def send( properties. Preferred over ``kwargs`` for new events. kwargs: Additional data to send with the event. 
""" + _maybe_alias_legacy_distinct_id(telemetry_enabled) async def async_send( # noqa: RUF029 event: str, diff --git a/tests/units/test_prerequisites.py b/tests/units/test_prerequisites.py index edd3140383e..0079366ce97 100644 --- a/tests/units/test_prerequisites.py +++ b/tests/units/test_prerequisites.py @@ -1,6 +1,7 @@ import json import shutil import tempfile +import uuid from collections.abc import Callable, Generator from dataclasses import dataclass from pathlib import Path @@ -14,7 +15,7 @@ from reflex.reflex import cli from reflex.testing import chdir -from reflex.utils import frontend_skeleton, js_runtimes +from reflex.utils import frontend_skeleton, js_runtimes, prerequisites from reflex.utils.frontend_skeleton import ( _compile_vite_config, _update_react_router_config, @@ -1372,3 +1373,52 @@ def index(): app.add_page(index) """ ) + + +def test_has_uuid_distinct_id_semantics_absent( + tmp_path, monkeypatch: pytest.MonkeyPatch +): + """No marker file means the install has not adopted UUID semantics.""" + monkeypatch.setenv("REFLEX_DIR", str(tmp_path)) + assert prerequisites.has_uuid_distinct_id_semantics() is False + + +def test_mark_uuid_distinct_id_semantics_writes_marker( + tmp_path, monkeypatch: pytest.MonkeyPatch +): + """Marking creates the per-install marker file with the semantics version.""" + monkeypatch.setenv("REFLEX_DIR", str(tmp_path)) + + prerequisites.mark_uuid_distinct_id_semantics() + + assert prerequisites.has_uuid_distinct_id_semantics() is True + marker = tmp_path / "installation_id_semantics" + assert marker.read_text() == prerequisites._DISTINCT_ID_SEMANTICS_VERSION + + +def test_ensure_installation_id_marks_new_install( + tmp_path, monkeypatch: pytest.MonkeyPatch +): + """A brand-new installation id is generated and marked UUID-native.""" + monkeypatch.setenv("REFLEX_DIR", str(tmp_path)) + assert prerequisites.has_uuid_distinct_id_semantics() is False + + install_id = prerequisites.ensure_reflex_installation_id() + + assert 
install_id is not None + # The id is a uuid4 persisted as its integer form. + assert uuid.UUID(int=install_id).version == 4 + assert prerequisites.has_uuid_distinct_id_semantics() is True + + +def test_ensure_installation_id_keeps_legacy_install_unmarked( + tmp_path, monkeypatch: pytest.MonkeyPatch +): + """An existing legacy id is read and left unmarked, so telemetry will alias it.""" + monkeypatch.setenv("REFLEX_DIR", str(tmp_path)) + (tmp_path / "installation_id").write_text("12345") + + install_id = prerequisites.ensure_reflex_installation_id() + + assert install_id == 12345 + assert prerequisites.has_uuid_distinct_id_semantics() is False diff --git a/tests/units/test_telemetry.py b/tests/units/test_telemetry.py index 699430ff8ec..35780ccbf90 100644 --- a/tests/units/test_telemetry.py +++ b/tests/units/test_telemetry.py @@ -1,4 +1,5 @@ import importlib.metadata +import uuid from types import SimpleNamespace import pytest @@ -19,8 +20,10 @@ def event_defaults(mocker: MockerFixture) -> dict: defaults = { "api_key": "test_api_key", "properties": { - "distinct_id": 12345, - "distinct_app_id": 78285505863498957834586115958872998605, + # Post-conversion defaults carry UUID-string identifiers (the hex + # forms of 12345 and 78285505863498957834586115958872998605). 
+ "distinct_id": "00000000-0000-0000-0000-000000003039", + "distinct_app_id": "3ae53d70-56b0-b52a-f645-37040fb802cd", "user_os": "Test OS", "user_os_detail": "Mocked Platform", "reflex_version": "0.8.0", @@ -405,3 +408,192 @@ def test_prepare_event_properties_override_kwargs(event_defaults): assert event is not None props: dict = event["properties"] # pyright: ignore[reportAssignmentType] assert props["template"] == "from-properties" + + +def test_encode_distinct_id_round_trips_losslessly(): + """A legacy 128-bit integer id encodes to UUID without losing precision.""" + legacy_id = 78285505863498957834586115958872998605 + encoded = telemetry._encode_distinct_id(legacy_id) + + assert isinstance(encoded, str) + assert encoded == str(uuid.UUID(int=legacy_id)) + # Full 128-bit fidelity is preserved, unlike the old float-truncated int. + assert uuid.UUID(encoded).int == legacy_id + + +def test_encode_distinct_id_handles_uuid4_int_form(): + """A freshly generated uuid4 round-trips through its integer storage form.""" + generated = uuid.uuid4() + assert telemetry._encode_distinct_id(generated.int) == str(generated) + + +def test_encode_distinct_id_pads_small_values(): + """Small integers still encode to a valid, zero-padded UUID string.""" + encoded = telemetry._encode_distinct_id(12345) + assert encoded == "00000000-0000-0000-0000-000000003039" + assert uuid.UUID(encoded).int == 12345 + + +@pytest.fixture +def stub_event_default_sources(mocker: MockerFixture): + """Stub the slow/host-specific inputs of ``_get_event_defaults``. + + Returns: + A callable ``configure(installation_id, project_hash)`` that sets the + stored identifier values feeding the default event payload. 
+ """ + mocker.patch.object(telemetry, "get_cpu_info", return_value=None) + mocker.patch.object(telemetry, "get_node_version", return_value=None) + mocker.patch.object(telemetry, "get_bun_version", return_value=None) + + def configure(*, installation_id: int | None, project_hash: int | None) -> None: + mocker.patch.object( + telemetry, "ensure_reflex_installation_id", return_value=installation_id + ) + mocker.patch.object(telemetry, "get_project_hash", return_value=project_hash) + + return configure + + +def test_get_event_defaults_encodes_ids_as_uuid_strings(stub_event_default_sources): + """distinct_id and distinct_app_id are sent as lossless UUID strings. + + Regression: previously these were raw 128-bit ints that PostHog truncated + to floats, collapsing distinct installs/apps onto one identifier. + """ + installation_id = 0xDEADBEEFDEADBEEFDEADBEEFDEADBEEF + project_hash = 78285505863498957834586115958872998605 + stub_event_default_sources( + installation_id=installation_id, project_hash=project_hash + ) + + defaults = telemetry._get_event_defaults() + + assert defaults is not None + props: dict = defaults["properties"] # pyright: ignore[reportAssignmentType] + assert isinstance(props["distinct_id"], str) + assert isinstance(props["distinct_app_id"], str) + assert props["distinct_id"] == str(uuid.UUID(int=installation_id)) + assert props["distinct_app_id"] == str(uuid.UUID(int=project_hash)) + # Continuity: each encoded id decodes back to the original integer value. 
+ assert uuid.UUID(props["distinct_id"]).int == installation_id + assert uuid.UUID(props["distinct_app_id"]).int == project_hash + + +def test_get_event_defaults_omits_distinct_app_id_without_project_hash( + stub_event_default_sources, +): + """No distinct_app_id is emitted when the project hash is unavailable.""" + stub_event_default_sources(installation_id=12345, project_hash=None) + + defaults = telemetry._get_event_defaults() + + assert defaults is not None + assert "distinct_app_id" not in defaults["properties"] + assert defaults["properties"]["distinct_id"] == str(uuid.UUID(int=12345)) + + +def test_get_event_defaults_returns_none_without_installation_id( + stub_event_default_sources, +): + """A missing installation id short-circuits defaults (unchanged contract).""" + stub_event_default_sources(installation_id=None, project_hash=12345) + assert telemetry._get_event_defaults() is None + + +@pytest.fixture(autouse=True) +def _reset_alias_guard(): + """Reset the per-process alias guard so each test starts fresh.""" + telemetry._legacy_alias_attempted = False + yield + telemetry._legacy_alias_attempted = False + + +def test_maybe_alias_sends_create_alias_for_legacy_install(mocker: MockerFixture): + """A legacy install (no semantics marker) aliases and then marks itself.""" + mocker.patch.object(telemetry, "ensure_reflex_installation_id", return_value=12345) + mocker.patch.object(telemetry, "has_uuid_distinct_id_semantics", return_value=False) + mark = mocker.patch.object(telemetry, "mark_uuid_distinct_id_semantics") + send_mock = mocker.patch.object(telemetry, "send") + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + + send_mock.assert_called_once_with( + "$create_alias", True, properties={"alias": 12345} + ) + mark.assert_called_once() + + +def test_maybe_alias_skips_for_uuid_native_install(mocker: MockerFixture): + """An install already on UUID semantics sends no alias and is not re-marked.""" + mocker.patch.object(telemetry, 
"ensure_reflex_installation_id", return_value=12345) + mocker.patch.object(telemetry, "has_uuid_distinct_id_semantics", return_value=True) + mark = mocker.patch.object(telemetry, "mark_uuid_distinct_id_semantics") + send_mock = mocker.patch.object(telemetry, "send") + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + + send_mock.assert_not_called() + mark.assert_not_called() + + +def test_maybe_alias_skips_without_installation_id(mocker: MockerFixture): + """No installation id means no alias, no marker, and no semantics check.""" + mocker.patch.object(telemetry, "ensure_reflex_installation_id", return_value=None) + has = mocker.patch.object(telemetry, "has_uuid_distinct_id_semantics") + mark = mocker.patch.object(telemetry, "mark_uuid_distinct_id_semantics") + send_mock = mocker.patch.object(telemetry, "send") + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + + send_mock.assert_not_called() + mark.assert_not_called() + has.assert_not_called() + + +def test_maybe_alias_skips_when_telemetry_disabled(mocker: MockerFixture): + """Disabled telemetry does no work and leaves the marker unwritten.""" + ensure = mocker.patch.object(telemetry, "ensure_reflex_installation_id") + send_mock = mocker.patch.object(telemetry, "send") + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=False) + + ensure.assert_not_called() + send_mock.assert_not_called() + + +def test_maybe_alias_runs_at_most_once_per_process(mocker: MockerFixture): + """The guard prevents a second alias attempt within the same process.""" + mocker.patch.object(telemetry, "ensure_reflex_installation_id", return_value=7) + mocker.patch.object(telemetry, "has_uuid_distinct_id_semantics", return_value=False) + mocker.patch.object(telemetry, "mark_uuid_distinct_id_semantics") + send_mock = mocker.patch.object(telemetry, "send") + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + 
+ send_mock.assert_called_once() + + +def test_maybe_alias_create_alias_payload( + event_defaults, httpx_post, mocker: MockerFixture +): + """The posted $create_alias pairs the new UUID distinct_id with the legacy int.""" + mocker.patch.object(telemetry, "has_uuid_distinct_id_semantics", return_value=False) + mocker.patch.object(telemetry, "mark_uuid_distinct_id_semantics") + legacy_id = 78285505863498957834586115958872998605 + mocker.patch.object( + telemetry, "ensure_reflex_installation_id", return_value=legacy_id + ) + + telemetry._maybe_alias_legacy_distinct_id(telemetry_enabled=True) + + httpx_post.assert_called_once() + payload = httpx_post.call_args.kwargs["json"] + assert payload["event"] == "$create_alias" + props = payload["properties"] + # The legacy integer is sent at full precision so PostHog re-coerces it to + # the same lossy float as the historic events and merges the two persons. + assert props["alias"] == legacy_id + # distinct_id is the new UUID-string identity (from the event defaults). + assert props["distinct_id"] == event_defaults["properties"]["distinct_id"]