diff --git a/.jules/bolt.md b/.jules/bolt.md index 8f60c46..a538c1b 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -60,6 +60,9 @@ **Learning:** When downloading large files (e.g., blocklists), the default chunk size of HTTP libraries might be small, leading to excessive loop iterations and list operations. Increasing the buffer size (e.g., to 16KB) reduces CPU overhead during I/O-bound operations. **Action:** When using `iter_bytes()` or similar streaming methods for large resources, explicitly set a larger `chunk_size` (e.g., 16384) to improve throughput and reduce CPU usage. +## 2026-02-17 - [Cache DNS Lookups by Hostname] +**Learning:** When validating multiple URLs from the same host (e.g., githubusercontent), caching based on the full URL still triggers redundant DNS lookups for each unique path. Extracting hostname validation into a separate `@lru_cache` function avoids repeated blocking `getaddrinfo` calls for the same domain. +**Action:** Identify expensive validation steps (like DNS) that depend on a subset of the input (hostname vs full URL) and cache them independently. ## 2024-03-24 - [Avoid Regex on Simple Strings] **Learning:** Running complex regex substitutions on every log message (for sanitization) introduces measurable CPU overhead, especially when most strings don't contain sensitive patterns. Simple string checks (`in`) are orders of magnitude faster than regex execution. **Action:** Add early return checks (e.g., `if "://" in s:`) before invoking expensive regex operations in hot paths like logging or string sanitization. diff --git a/main.py b/main.py index e1e516c..a9ce7d7 100644 --- a/main.py +++ b/main.py @@ -894,12 +894,57 @@ def _parse_rate_limit_headers(response: httpx.Response) -> None: log.debug(f"Failed to parse rate limit headers: {e}") +@lru_cache(maxsize=128) +def validate_hostname(hostname: str) -> bool: + """ + Validates a hostname (DNS resolution and IP checks). + Cached to prevent redundant DNS lookups for the same host across different URLs. + """ + # Check for potentially malicious hostnames + if hostname.lower() in ("localhost", "127.0.0.1", "::1"): + log.warning( + f"Skipping unsafe hostname (localhost detected): {sanitize_for_log(hostname)}" + ) + return False + + try: + ip = ipaddress.ip_address(hostname) + if not ip.is_global or ip.is_multicast: + log.warning(f"Skipping unsafe IP: {sanitize_for_log(hostname)}") + return False + return True + except ValueError: + # Not an IP literal, it's a domain. Resolve and check IPs. + try: + # Resolve hostname to IPs (IPv4 and IPv6) + # We filter for AF_INET/AF_INET6 to ensure we get IP addresses + addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) + for res in addr_info: + # res is (family, type, proto, canonname, sockaddr) + # sockaddr is (address, port) for AF_INET/AF_INET6 + ip_str = res[4][0] + ip = ipaddress.ip_address(ip_str) + if not ip.is_global or ip.is_multicast: + log.warning( + f"Skipping unsafe hostname {sanitize_for_log(hostname)} (resolves to non-global/multicast IP {ip})" + ) + return False + except (socket.gaierror, ValueError, OSError) as e: + log.warning( + f"Failed to resolve/validate domain {sanitize_for_log(hostname)}: {sanitize_for_log(e)}" + ) + return False + + if not addr_info: + return False + for res in addr_info: + + @lru_cache(maxsize=128) def validate_folder_url(url: str) -> bool: """ Validates a folder URL. - Cached to avoid repeated DNS lookups (socket.getaddrinfo) for the same URL - during warm-up and sync phases. + Cached to avoid repeated URL parsing for the same URL. """ if not url.startswith("https://"): log.warning( @@ -913,41 +958,7 @@ def validate_folder_url(url: str) -> bool: if not hostname: return False - # Check for potentially malicious hostnames - if hostname.lower() in ("localhost", "127.0.0.1", "::1"): - log.warning( - f"Skipping unsafe URL (localhost detected): {sanitize_for_log(url)}" - ) - return False - - try: - ip = ipaddress.ip_address(hostname) - if not ip.is_global or ip.is_multicast: - log.warning( - f"Skipping unsafe URL (non-global/multicast IP): {sanitize_for_log(url)}" - ) - return False - except ValueError: - # Not an IP literal, it's a domain. Resolve and check IPs. - try: - # Resolve hostname to IPs (IPv4 and IPv6) - # We filter for AF_INET/AF_INET6 to ensure we get IP addresses - addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) - for res in addr_info: - # res is (family, type, proto, canonname, sockaddr) - # sockaddr is (address, port) for AF_INET/AF_INET6 - ip_str = res[4][0] - ip = ipaddress.ip_address(ip_str) - if not ip.is_global or ip.is_multicast: - log.warning( - f"Skipping unsafe URL (domain {hostname} resolves to non-global/multicast IP {ip}): {sanitize_for_log(url)}" - ) - return False - except (socket.gaierror, ValueError, OSError) as e: - log.warning( - f"Failed to resolve/validate domain {hostname}: {sanitize_for_log(e)}" - ) - return False + return validate_hostname(hostname) except Exception as e: log.warning( diff --git a/tests/test_hostname_validation.py b/tests/test_hostname_validation.py new file mode 100644 index 0000000..3e0cd70 --- /dev/null +++ b/tests/test_hostname_validation.py @@ -0,0 +1,73 @@ + +import socket +from unittest.mock import MagicMock, patch + +import pytest +import main + +def test_validate_hostname_caching(): + """ + Verify that validate_hostname caches results and avoids redundant DNS lookups. + """ + # Mock socket.getaddrinfo + with patch("socket.getaddrinfo") as mock_dns: + # Setup mock return value (valid IP) + mock_dns.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', ('93.184.216.34', 443))] + + # Clear cache to start fresh + main.validate_hostname.cache_clear() + + # First call - should trigger DNS lookup + assert main.validate_hostname("example.com") is True + assert mock_dns.call_count == 1 + + # Second call - should use cache + assert main.validate_hostname("example.com") is True + assert mock_dns.call_count == 1 # Still 1 + + # different hostname - should trigger DNS lookup + assert main.validate_hostname("google.com") is True + assert mock_dns.call_count == 2 + +def test_validate_hostname_security(): + """ + Verify security checks in validate_hostname. + """ + # Localhost + assert main.validate_hostname("localhost") is False + assert main.validate_hostname("127.0.0.1") is False + assert main.validate_hostname("::1") is False + + # Private IP + assert main.validate_hostname("192.168.1.1") is False + + # Domain resolving to private IP + with patch("socket.getaddrinfo") as mock_dns: + # Return private IP + mock_dns.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', ('192.168.1.1', 443))] + main.validate_hostname.cache_clear() + + assert main.validate_hostname("private.local") is False + +def test_validate_folder_url_uses_validate_hostname(): + """ + Verify that validate_folder_url calls validate_hostname. + """ + with patch("main.validate_hostname") as mock_validate: + mock_validate.return_value = True + + # Clear cache + main.validate_folder_url.cache_clear() + + url = "https://example.com/data.json" + assert main.validate_folder_url(url) is True + + mock_validate.assert_called_with("example.com") + + # Invalid hostname + mock_validate.return_value = False + + # Clear cache again because URL is the same + main.validate_folder_url.cache_clear() + + assert main.validate_folder_url(url) is False