diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b29eda51..4ec908cde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ If your change does not need a CHANGELOG entry, add the "skip changelog" label t ## Unreleased +- fix(otlp-aws-exporter): avoid `RecursionError` when `pip_system_certs` replaces `ssl.SSLContext` on Python 3.12 by rebinding stale `botocore`/`urllib3` SSL context references and caching credentials in `AwsAuthSession` - feat: support environment-configured endpoint visibility for HTTP operation names ([#718](https://github.com/aws-observability/aws-otel-python-instrumentation/pull/718)) - fix(lambda-layer): Standardize CompactConsoleLogRecordExporter output with CloudWatch OTLP backend schema. diff --git a/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/common/aws_auth_session.py b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/common/aws_auth_session.py index 564bfe9e2..336d09ea2 100644 --- a/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/common/aws_auth_session.py +++ b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/common/aws_auth_session.py @@ -2,12 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from threading import Lock import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest from botocore.session import Session +from amazon.opentelemetry.distro.patches._pip_system_certs_patches import apply_pip_system_certs_compatibility_patch + _logger = logging.getLogger(__name__) @@ -39,13 +42,70 @@ def __init__(self, aws_region: str, service: str, session: Session): self._service: str = service self._session: Session = session + # Cached credentials are resolved on the first ``request()`` call. 
The returned + # ``Credentials`` / ``RefreshableCredentials`` object handles its own expiry and + # rotation when its attributes are accessed, so caching the reference does not + # cache the underlying credential values. + self._credentials = None + self._credentials_resolved = False + self._credentials_lock = Lock() + super().__init__() + def _ensure_initialized(self) -> None: + """Apply one-time, deferred initialization on the first ``request()`` call. + + This runs after sitecustomize has fully completed (i.e., after any ``.pth`` + based ``ssl.SSLContext`` injection from packages such as ``pip_system_certs``), + which is the only point at which we can safely re-align stale ``SSLContext`` + references captured by ``botocore`` / ``urllib3`` during ADOT startup. + + Credentials are also resolved once here. ``RefreshableCredentials`` handles + rotation internally on attribute access, so caching the reference is safe. + + On a transient credential resolution failure (e.g., IMDS timeout), the + ``_credentials_resolved`` flag is left ``False`` so the next ``request()`` call + will retry. Only a successful resolution latches the flag, matching the + original "retry every request" behavior for the failure path while keeping + the SSL-context-construction cost amortized to once on the success path. + + Note: the read of ``_credentials_resolved`` outside the lock is safe because + Python's GIL makes attribute reads/writes atomic. On free-threaded Python + builds (3.13t+) this would need a memory barrier; revisit if/when we + support those. + """ + if self._credentials_resolved: + return + + with self._credentials_lock: + if self._credentials_resolved: + return + + # Realign stale ssl.SSLContext references in botocore / urllib3 before + # the first credential resolution constructs an SSL context. This is a + # no-op when pip_system_certs is not installed. 
+ try: + apply_pip_system_certs_compatibility_patch() + except Exception as patch_error: # pylint: disable=broad-except + _logger.warning("Failed to apply pip_system_certs compatibility patch: %s", patch_error) + + try: + self._credentials = self._session.get_credentials() + except Exception as cred_error: # pylint: disable=broad-except + # Don't latch _credentials_resolved on failure - leave it False so + # the next request retries credential resolution. This preserves + # self-healing behavior on transient errors (e.g., IMDS timeouts). + _logger.error("Failed to load AWS Credentials: %s", cred_error) + self._credentials = None + return + + self._credentials_resolved = True + def request(self, method, url, *args, data=None, headers=None, **kwargs): - credentials = self._session.get_credentials() + self._ensure_initialized() - if credentials: - signer = SigV4Auth(credentials, self._service, self._aws_region) + if self._credentials: + signer = SigV4Auth(self._credentials, self._service, self._aws_region) request = AWSRequest( method="POST", url=url, @@ -64,6 +124,6 @@ def request(self, method, url, *args, data=None, headers=None, **kwargs): except Exception as signing_error: # pylint: disable=broad-except _logger.error("Failed to sign request: %s", signing_error) else: - _logger.error("Failed to load AWS Credentials: %s") + _logger.error("Failed to load AWS Credentials") return super().request(method=method, url=url, *args, data=data, headers=headers, **kwargs) diff --git a/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/patches/_pip_system_certs_patches.py b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/patches/_pip_system_certs_patches.py new file mode 100644 index 000000000..eaad4fde2 --- /dev/null +++ b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/patches/_pip_system_certs_patches.py @@ -0,0 +1,100 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from importlib.metadata import PackageNotFoundError, version +from logging import Logger, getLogger + +_logger: Logger = getLogger(__name__) + +# Module-level guard so the patch is normally applied only once per process. +# The plain bool is intentional: the patch body itself is idempotent +# (re-running it produces the same final state), so a benign race between two +# threads where both observe ``_patch_applied is False`` and both run the rebind +# costs an extra dict assignment and nothing more. We don't pay for a lock here. +_patch_applied = False + + +def _is_pip_system_certs_installed() -> bool: + """Is the pip_system_certs package installed?""" + try: + dist_version = version("pip_system_certs") + _logger.debug("pip_system_certs is installed: %s", dist_version) + return True + except PackageNotFoundError as exc: + _logger.debug("pip_system_certs is not installed. %s", exc) + return False + + +def apply_pip_system_certs_compatibility_patch() -> None: + """Re-bind stale ``ssl.SSLContext`` references in botocore/urllib3. + + When ``pip_system_certs`` is installed, it injects ``truststore.SSLContext`` as the + process-wide ``ssl.SSLContext`` via a ``.pth`` file. The injection runs in the + ``finally`` block of a ``site.execsitecustomize`` wrapper, i.e. *after* + ``sitecustomize.py`` returns. + + OpenTelemetry's auto-instrumentation entry point (``opentelemetry-instrument``) + runs from ``sitecustomize.py``, which loads the ADOT distro and transitively imports + ``requests`` (via the upstream OTLP HTTP exporters) and ``botocore``. Both of those + modules capture a reference to ``ssl.SSLContext`` at import time. Because the import + happens before ``pip_system_certs``'s injection runs, the captured reference is the + original C-level ``ssl.SSLContext``, not the truststore-wrapped class. 
+ + On Python 3.12, ``ssl.SSLContext.options.__set__`` is implemented as + ``super(SSLContext, SSLContext).options.__set__(self, value)`` where ``SSLContext`` + is resolved from ``ssl``'s module globals at call time. After ``pip_system_certs`` + runs, that name resolves to ``truststore.SSLContext``, and the ``super()`` chain + bounces between the original and truststore classes until the recursion limit + (~978 frames) is exceeded. + + This patch re-binds ``botocore.httpsession.SSLContext`` and + ``urllib3.util.ssl_.SSLContext`` to the *current* ``ssl.SSLContext`` + (i.e., truststore's wrapper). truststore's own ``SSLContext.options`` setter does + not use the recursive ``super()`` pattern, so subsequent SSL context creations + succeed. + + The patch is idempotent: a module-level guard ensures it only runs once per + process. It is a no-op when ``pip_system_certs`` is not installed or when the + references already match ``ssl.SSLContext``. ``ImportError`` is the only + expected failure (e.g., ``botocore`` or ``urllib3`` not installed in some + minimal environment) and is silently skipped per library. + """ + global _patch_applied # pylint: disable=global-statement + if _patch_applied: + return + + # Only apply the patch when pip_system_certs is installed in user application space. + if not _is_pip_system_certs_installed(): + _patch_applied = True + return + + # pylint: disable=import-outside-toplevel + import ssl + + try: + # pylint: disable=import-outside-toplevel + import botocore.httpsession + + if botocore.httpsession.SSLContext is not ssl.SSLContext: + _logger.debug( + "Rebinding botocore.httpsession.SSLContext to current ssl.SSLContext (pip_system_certs detected)." + ) + botocore.httpsession.SSLContext = ssl.SSLContext + except ImportError: + # botocore not installed; nothing to rebind on the botocore side. 
+ pass + + try: + # pylint: disable=import-outside-toplevel + import urllib3.util.ssl_ + + if urllib3.util.ssl_.SSLContext is not ssl.SSLContext: + _logger.debug( + "Rebinding urllib3.util.ssl_.SSLContext to current ssl.SSLContext (pip_system_certs detected)." + ) + urllib3.util.ssl_.SSLContext = ssl.SSLContext + except ImportError: + # urllib3 not installed; nothing to rebind. + pass + + _patch_applied = True diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/common/test_aws_auth_session.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/common/test_aws_auth_session.py index 11babbb7b..cbc84908f 100644 --- a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/common/test_aws_auth_session.py +++ b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/common/test_aws_auth_session.py @@ -47,3 +47,130 @@ def test_aws_auth_session(self, _, __): self.assertIn(AUTHORIZATION_HEADER, actual_headers) self.assertIn(X_AMZ_DATE_HEADER, actual_headers) self.assertIn(X_AMZ_SECURITY_TOKEN_HEADER, actual_headers) + + @patch("requests.Session.request", return_value=requests.Response()) + @patch("botocore.session.Session.get_credentials", return_value=mock_credentials) + def test_credentials_are_resolved_once(self, mock_get_credentials, _): + """Credentials must be resolved only once across multiple ``request()`` calls. + + This is the hot-path mitigation for the pip_system_certs RecursionError: each + ``get_credentials()`` call walks the credential resolver chain, which constructs + a urllib3 SSL context. Caching the returned object (``RefreshableCredentials`` + rotates internally on attribute access) ensures the SSL context is created at + most once per exporter, not once per export. 
+ """ + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + + for _ in range(5): + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers={}) + + self.assertEqual(mock_get_credentials.call_count, 1) + + @patch("requests.Session.request", return_value=requests.Response()) + def test_credentials_retry_after_transient_failure(self, _): + """A transient ``get_credentials()`` failure must NOT latch the resolved + flag. The next ``request()`` call must retry resolution. This preserves + self-healing behavior on transient errors (e.g., IMDS timeouts) and matches + the pre-fix behavior on the failure path. + """ + # First call raises, subsequent calls succeed. + get_credentials_mock = patch( + "botocore.session.Session.get_credentials", + side_effect=[RuntimeError("transient"), mock_credentials, mock_credentials], + ) + with get_credentials_mock as mock_get_credentials: + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + + # 1st request: get_credentials raises, no auth headers added. + headers_first = {} + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers=headers_first) + self.assertNotIn(AUTHORIZATION_HEADER, headers_first) + + # 2nd request: get_credentials succeeds, auth headers must appear. + headers_second = {} + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers=headers_second) + self.assertIn(AUTHORIZATION_HEADER, headers_second) + + # 3rd request: cached credentials reused, no further get_credentials calls. + headers_third = {} + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers=headers_third) + self.assertIn(AUTHORIZATION_HEADER, headers_third) + + # Two resolution attempts: one failed, one succeeded; third request reuses cache. 
+ self.assertEqual(mock_get_credentials.call_count, 2) + + @patch("requests.Session.request", return_value=requests.Response()) + @patch("botocore.session.Session.get_credentials", return_value=mock_credentials) + @patch( + "amazon.opentelemetry.distro.exporter.otlp.aws.common.aws_auth_session" + ".apply_pip_system_certs_compatibility_patch" + ) + def test_pip_system_certs_patch_invoked_on_first_request(self, mock_apply_patch, _, __): + """The ssl.SSLContext rebind helper is invoked on the first ``request()`` call + and not re-invoked on subsequent calls. + + The patch itself is a no-op when pip_system_certs is not installed, so this + test only asserts the call site, not the patch behavior.""" + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers={}) + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers={}) + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers={}) + + self.assertEqual(mock_apply_patch.call_count, 1) + + @patch("requests.Session.request", return_value=requests.Response()) + @patch( + "amazon.opentelemetry.distro.exporter.otlp.aws.common.aws_auth_session" + ".apply_pip_system_certs_compatibility_patch", + side_effect=RuntimeError("simulated patch failure"), + ) + @patch("botocore.session.Session.get_credentials", return_value=mock_credentials) + def test_patch_failure_does_not_break_request(self, _, __, ___): + """If the SSL-context-rebind helper itself raises, the failure is logged + but ``request()`` still proceeds and signs successfully. 
The patch is + defensive infrastructure, not a hard precondition.""" + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + actual_headers: dict = {} + + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers=actual_headers) + + self.assertIn(AUTHORIZATION_HEADER, actual_headers) + + @patch("requests.Session.request", return_value=requests.Response()) + @patch("botocore.session.Session.get_credentials", return_value=mock_credentials) + def test_signing_failure_does_not_break_request(self, _, __): + """If SigV4 signing itself raises, ``request()`` still issues the + unauthenticated request rather than crashing the caller.""" + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + + with patch("amazon.opentelemetry.distro.exporter.otlp.aws.common.aws_auth_session.SigV4Auth") as mock_sigv4: + mock_sigv4.return_value.add_auth.side_effect = RuntimeError("signing boom") + actual_headers: dict = {} + # Should not raise + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers=actual_headers) + + # No auth header because signing raised before headers could be merged. + self.assertNotIn(AUTHORIZATION_HEADER, actual_headers) + + @patch("requests.Session.request", return_value=requests.Response()) + @patch("botocore.session.Session.get_credentials", return_value=mock_credentials) + def test_concurrent_requests_resolve_credentials_once(self, mock_get_credentials, _): + """Multiple threads racing on the first request must all observe a single + credential resolution. 
The double-checked locking in ``_ensure_initialized`` + is what provides this guarantee.""" + # pylint: disable=import-outside-toplevel + from threading import Thread + + session = AwsAuthSession("us-east-1", "xray", get_aws_session()) + + def call(): + session.request("POST", AWS_OTLP_TRACES_ENDPOINT, data="", headers={}) + + threads = [Thread(target=call) for _ in range(8)] + for t in threads: + t.start() + for t in threads: + t.join() + + self.assertEqual(mock_get_credentials.call_count, 1) diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/patches/test_pip_system_certs_patches.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/patches/test_pip_system_certs_patches.py new file mode 100644 index 000000000..38e405ae2 --- /dev/null +++ b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/patches/test_pip_system_certs_patches.py @@ -0,0 +1,146 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +from importlib.metadata import PackageNotFoundError +from unittest import TestCase +from unittest.mock import patch + +from amazon.opentelemetry.distro.patches import _pip_system_certs_patches +from amazon.opentelemetry.distro.patches._pip_system_certs_patches import apply_pip_system_certs_compatibility_patch + + +class TestPipSystemCertsPatches(TestCase): + def setUp(self) -> None: + # Reset the module-level guard before every test so each test exercises the + # full code path. + _pip_system_certs_patches._patch_applied = False + + def tearDown(self) -> None: + # Leave the guard in a clean state for tests that follow. 
+ _pip_system_certs_patches._patch_applied = False + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_no_op_when_pip_system_certs_not_installed(self, mock_version): + """When pip_system_certs is not installed, the patch is a no-op and does not + touch botocore/urllib3 module globals.""" + mock_version.side_effect = PackageNotFoundError("pip_system_certs") + + # pylint: disable=import-outside-toplevel + import botocore.httpsession + import urllib3.util.ssl_ + + sentinel_class = type("SentinelSSLContext", (), {}) + + with patch.object(botocore.httpsession, "SSLContext", sentinel_class): + with patch.object(urllib3.util.ssl_, "SSLContext", sentinel_class): + apply_pip_system_certs_compatibility_patch() + + # References must remain untouched when pip_system_certs is not present. + self.assertIs(botocore.httpsession.SSLContext, sentinel_class) + self.assertIs(urllib3.util.ssl_.SSLContext, sentinel_class) + + self.assertTrue(_pip_system_certs_patches._patch_applied) + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_rebinds_stale_references_when_installed(self, mock_version): + """When pip_system_certs is installed and botocore/urllib3 hold stale + ``ssl.SSLContext`` references, the patch rebinds them to the current + ``ssl.SSLContext``.""" + mock_version.return_value = "5.3" + + # pylint: disable=import-outside-toplevel + import ssl + + import botocore.httpsession + import urllib3.util.ssl_ + + # Simulate the post-injection state: ssl.SSLContext has been replaced with + # truststore's wrapper, but botocore/urllib3 still hold the original. 
+ original_ssl_context = ssl.SSLContext + truststore_like = type("TruststoreSSLContext", (), {}) + + with patch.object(botocore.httpsession, "SSLContext", original_ssl_context): + with patch.object(urllib3.util.ssl_, "SSLContext", original_ssl_context): + with patch.object(ssl, "SSLContext", truststore_like): + apply_pip_system_certs_compatibility_patch() + + self.assertIs(botocore.httpsession.SSLContext, truststore_like) + self.assertIs(urllib3.util.ssl_.SSLContext, truststore_like) + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_no_op_when_references_already_match(self, mock_version): + """When references already match the current ``ssl.SSLContext``, the patch + leaves them untouched (idempotent).""" + mock_version.return_value = "5.3" + + # pylint: disable=import-outside-toplevel + import ssl + + import botocore.httpsession + import urllib3.util.ssl_ + + current = ssl.SSLContext + + with patch.object(botocore.httpsession, "SSLContext", current): + with patch.object(urllib3.util.ssl_, "SSLContext", current): + apply_pip_system_certs_compatibility_patch() + + self.assertIs(botocore.httpsession.SSLContext, current) + self.assertIs(urllib3.util.ssl_.SSLContext, current) + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_runs_only_once(self, mock_version): + """The patch is guarded so the package detection only runs on the first call.""" + mock_version.side_effect = PackageNotFoundError("pip_system_certs") + + apply_pip_system_certs_compatibility_patch() + apply_pip_system_certs_compatibility_patch() + apply_pip_system_certs_compatibility_patch() + + self.assertEqual(mock_version.call_count, 1) + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_botocore_import_failure_does_not_crash(self, mock_version): + """If botocore.httpsession is absent the patch silently skips it and + still processes urllib3.""" + mock_version.return_value = 
"5.3" + + # pylint: disable=import-outside-toplevel + import sys + + import urllib3.util.ssl_ + + saved = sys.modules.get("botocore.httpsession") + # Setting to None forces Python to raise ImportError on `import botocore.httpsession`. + sys.modules["botocore.httpsession"] = None + try: + apply_pip_system_certs_compatibility_patch() + finally: + if saved is None: + sys.modules.pop("botocore.httpsession", None) + else: + sys.modules["botocore.httpsession"] = saved + + # Patch should still mark itself as applied even when one library is missing. + self.assertTrue(_pip_system_certs_patches._patch_applied) + # urllib3 path should still have been considered. + self.assertTrue(hasattr(urllib3.util.ssl_, "SSLContext")) + + @patch("amazon.opentelemetry.distro.patches._pip_system_certs_patches.version") + def test_urllib3_import_failure_does_not_crash(self, mock_version): + """If urllib3.util.ssl_ is absent the patch silently skips it.""" + mock_version.return_value = "5.3" + + # pylint: disable=import-outside-toplevel + import sys + + saved = sys.modules.get("urllib3.util.ssl_") + sys.modules["urllib3.util.ssl_"] = None + try: + apply_pip_system_certs_compatibility_patch() + finally: + if saved is None: + sys.modules.pop("urllib3.util.ssl_", None) + else: + sys.modules["urllib3.util.ssl_"] = saved + + self.assertTrue(_pip_system_certs_patches._patch_applied)