diff --git a/helpers/http_helper.py b/helpers/http_helper.py new file mode 100644 index 00000000..4c2b4798 --- /dev/null +++ b/helpers/http_helper.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +""" +HTTP helper with curl_cffi fallback for WAF / TLS-fingerprint bypass. + +Background +---------- +Many enterprise WAF appliances (Akamai, Imperva, F5 ASM, etc.) drop the +TCP/TLS connection from python-requests at the TLS ClientHello stage, +because requests' cipher suites, extensions, ALPN order and GREASE +values do not match any real browser. The user gets a +``requests.exceptions.ConnectionError`` instead of an HTTP response, +even though a real browser to the same URL succeeds. + +This is common with Swedish government sites (bolagsverket.se, +skatteverket.se, etc.) when running webperf_core's standard-files test. + +Solution +-------- +``curl_cffi`` uses ``curl-impersonate`` under the hood to perform the +TLS handshake (and HTTP/2 frame ordering) with byte-exact browser +fingerprints, which lets us through. We only use it as a fallback when +the normal ``requests`` call has already failed with a connection error, +keeping the common case as fast as it was before. + +Install +------- +Add to ``requirements.txt``:: + + curl-cffi>=0.13.0 + +The module is *optional*: if ``curl_cffi`` is not installed the helper +behaves exactly like a bare ``requests.get`` call. +""" +import requests + +try: + from curl_cffi import requests as cffi_requests # type: ignore + HAS_CURL_CFFI = True +except ImportError: + HAS_CURL_CFFI = False + cffi_requests = None # type: ignore + + +# Browser profile that ``curl_cffi`` uses to spoof the TLS / HTTP/2 fingerprint. +# Keep this reasonably current. See the list of supported impersonations at: +# https://github.com/lexiforest/curl_cffi#supported-impersonate-versions +# Picked Chrome since it has the largest share of real-world traffic and is +# least likely to be the focus of fingerprint-specific anti-bot rules. 
DEFAULT_IMPERSONATE = 'chrome131'


def http_get_with_fallback(url,
                           headers=None,
                           timeout=60,
                           allow_redirects=False):
    """
    Perform an HTTP GET with ``requests``, retrying once through a
    browser-impersonating client when the connection itself fails.

    The first attempt always uses ``requests.get``. If it raises
    ``requests.exceptions.ConnectionError`` the same request is repeated
    once via ``curl_cffi`` using the ``DEFAULT_IMPERSONATE`` profile,
    provided ``curl_cffi`` could be imported and the error is not a
    timeout. If the fallback raises anything at all, the **original**
    error from ``requests`` is re-raised, so callers that catch
    ``requests`` exceptions keep working unchanged.

    Which errors trigger the fallback
    ---------------------------------
    In ``requests``, ``SSLError``, ``ProxyError`` and ``ConnectTimeout``
    are all subclasses of ``ConnectionError``, so they are caught by the
    same ``except`` clause:

    * ``SSLError`` / ``ProxyError`` / plain ``ConnectionError`` -- the
      fallback is attempted.
    * ``ConnectTimeout`` (also a ``Timeout``) -- re-raised immediately.
      Retrying would wait up to another ``timeout`` seconds, and
      timeouts are documented as not being retried.

    The returned object exposes the response interface used by
    webperf_core call sites: ``.text``, ``.content``, ``.status_code``,
    ``.headers``.

    Parameters
    ----------
    url : str
        The URL to GET.
    headers : dict, optional
        HTTP request headers. The same mapping is passed to both
        attempts; ``None`` (or any falsy value) becomes an empty dict.
    timeout : int, optional
        Per-request timeout in seconds, applied to each attempt
        separately. Default 60.
    allow_redirects : bool, optional
        Whether to follow HTTP redirects. Default False.

    Returns
    -------
    Response
        A ``requests.Response`` from the first attempt, or the
        ``curl_cffi`` response object when the fallback succeeded.

    Raises
    ------
    requests.exceptions.ConnectionError
        The primary attempt failed to connect and the fallback was
        unavailable, skipped (timeout) or failed as well. This includes
        the subclasses ``SSLError``, ``ProxyError`` and
        ``ConnectTimeout``.
    Other ``requests`` exceptions
        Propagated unchanged from the primary attempt without any
        fallback (read timeouts, invalid URLs, etc.).
    """
    headers = headers or {}

    try:
        return requests.get(
            url,
            headers=headers,
            timeout=timeout,
            allow_redirects=allow_redirects,
        )
    except requests.exceptions.ConnectionError as primary_err:
        if not HAS_CURL_CFFI:
            # Optional dependency missing: behave like a bare requests.get.
            raise

        # ConnectTimeout inherits from both ConnectionError and Timeout.
        # Timeouts are not retried: the fallback would block for up to
        # another ``timeout`` seconds before the caller sees the failure.
        if isinstance(primary_err, requests.exceptions.Timeout):
            raise

        # NOTE(review): caller-supplied headers (including any User-Agent)
        # are forwarded as-is to the impersonating client. Whether a
        # non-Chrome User-Agent weakens the impersonation is not verified
        # here -- confirm against the headers the call sites send.
        try:
            return cffi_requests.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=allow_redirects,
                impersonate=DEFAULT_IMPERSONATE,
            )
        except Exception:
            # Deliberately broad: whatever the fallback raised, surface
            # the *original* requests error so existing handling in
            # callers (logging, retry-over-HTTPS, etc.) is unaffected.
            raise primary_err


# ---------------------------------------------------------------------------
# Remainder of the original patch (changes to other files), carried over
# unchanged for reference. Whitespace was already collapsed in the source,
# so line breaks and indentation below are not authoritative.
#
# diff --git a/requirements.txt b/requirements.txt
# index 9865f773..a3e28283 100644
# --- a/requirements.txt
# +++ b/requirements.txt
# @@ -12,3 +12,4 @@ Numpy==2.4.6
#  cryptography==48.0.0
#  pylint==4.0.5
#  packaging==26.2
# +curl-cffi>=0.13.0
#
# diff --git a/tests/utils.py b/tests/utils.py
# index 5eb09720..83e5d885 100644
# --- a/tests/utils.py
# +++ b/tests/utils.py
# @@ -25,6 +25,7 @@ import dns.name
#  from helpers.setting_helper import get_config
# +from helpers.http_helper import http_get_with_fallback
#  CONFIG_WARNINGS = {}
#  IP2_LOCATION_DB = {
# @@ -480,8 +481,11 @@ def get_http_content(url, allow_redirects=False, use_text_instead_of_content=Tru
#  hostname = urlparse(url).hostname
#  if hostname == 'api.github.com' and get_config('github.api.key') is not None:
#  headers['authorization'] = f"Bearer {get_config('github.api.key')}"
# -response = requests.get(url, allow_redirects=allow_redirects,
# - headers=headers, timeout=get_config('general.request.timeout')*2)
# +response = http_get_with_fallback(
# + url,
# + headers=headers,
# + timeout=get_config('general.request.timeout')*2,
# + allow_redirects=allow_redirects)
#  if use_text_instead_of_content:
#  content = response.text
# ---------------------------------------------------------------------------