From 3520639ca41bd994b8634d8b9e7645ae9fff1541 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 17 Feb 2026 15:06:55 +0000 Subject: [PATCH 1/2] feat(perf): cache DNS lookups by hostname to reduce network I/O Co-authored-by: abhimehro <84992105+abhimehro@users.noreply.github.com> --- .jules/bolt.md | 4 ++ main.py | 83 +++++++++++++++++-------------- tests/test_hostname_validation.py | 73 +++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 37 deletions(-) create mode 100644 tests/test_hostname_validation.py diff --git a/.jules/bolt.md b/.jules/bolt.md index 30c2edd..d0c9343 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -58,3 +58,7 @@ ## 2026-02-04 - [Optimize Buffer for Large Downloads] **Learning:** When downloading large files (e.g., blocklists), the default chunk size of HTTP libraries might be small, leading to excessive loop iterations and list operations. Increasing the buffer size (e.g., to 16KB) reduces CPU overhead during I/O-bound operations. **Action:** When using `iter_bytes()` or similar streaming methods for large resources, explicitly set a larger `chunk_size` (e.g., 16384) to improve throughput and reduce CPU usage. + +## 2026-02-17 - [Cache DNS Lookups by Hostname] +**Learning:** When validating multiple URLs from the same host (e.g., githubusercontent), caching based on the full URL still triggers redundant DNS lookups for each unique path. Extracting hostname validation into a separate `@lru_cache` function avoids repeated blocking `getaddrinfo` calls for the same domain. +**Action:** Identify expensive validation steps (like DNS) that depend on a subset of the input (hostname vs full URL) and cache them independently. diff --git a/main.py b/main.py index 1f4f047..9822e98 100644 --- a/main.py +++ b/main.py @@ -638,12 +638,55 @@ def save_disk_cache() -> None: _cache_stats["errors"] += 1 +@lru_cache(maxsize=128) +def validate_hostname(hostname: str) -> bool: + """ + Validates a hostname (DNS resolution and IP checks). + Cached to prevent redundant DNS lookups for the same host across different URLs. + """ + # Check for potentially malicious hostnames + if hostname.lower() in ("localhost", "127.0.0.1", "::1"): + log.warning( + f"Skipping unsafe hostname (localhost detected): {sanitize_for_log(hostname)}" + ) + return False + + try: + ip = ipaddress.ip_address(hostname) + if not ip.is_global or ip.is_multicast: + log.warning(f"Skipping unsafe IP: {sanitize_for_log(hostname)}") + return False + return True + except ValueError: + # Not an IP literal, it's a domain. Resolve and check IPs. + try: + # Resolve hostname to IPs (IPv4 and IPv6) + # We filter for AF_INET/AF_INET6 to ensure we get IP addresses + addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) + for res in addr_info: + # res is (family, type, proto, canonname, sockaddr) + # sockaddr is (address, port) for AF_INET/AF_INET6 + ip_str = res[4][0] + ip = ipaddress.ip_address(ip_str) + if not ip.is_global or ip.is_multicast: + log.warning( + f"Skipping unsafe hostname {sanitize_for_log(hostname)} (resolves to non-global/multicast IP {ip})" + ) + return False + except (socket.gaierror, ValueError, OSError) as e: + log.warning( + f"Failed to resolve/validate domain {sanitize_for_log(hostname)}: {sanitize_for_log(e)}" + ) + return False + + return True + + @lru_cache(maxsize=128) def validate_folder_url(url: str) -> bool: """ Validates a folder URL. - Cached to avoid repeated DNS lookups (socket.getaddrinfo) for the same URL - during warm-up and sync phases. + Cached to avoid repeated URL parsing for the same URL. """ if not url.startswith("https://"): log.warning( @@ -657,41 +700,7 @@ def validate_folder_url(url: str) -> bool: if not hostname: return False - # Check for potentially malicious hostnames - if hostname.lower() in ("localhost", "127.0.0.1", "::1"): - log.warning( - f"Skipping unsafe URL (localhost detected): {sanitize_for_log(url)}" - ) - return False - - try: - ip = ipaddress.ip_address(hostname) - if not ip.is_global or ip.is_multicast: - log.warning( - f"Skipping unsafe URL (non-global/multicast IP): {sanitize_for_log(url)}" - ) - return False - except ValueError: - # Not an IP literal, it's a domain. Resolve and check IPs. - try: - # Resolve hostname to IPs (IPv4 and IPv6) - # We filter for AF_INET/AF_INET6 to ensure we get IP addresses - addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) - for res in addr_info: - # res is (family, type, proto, canonname, sockaddr) - # sockaddr is (address, port) for AF_INET/AF_INET6 - ip_str = res[4][0] - ip = ipaddress.ip_address(ip_str) - if not ip.is_global or ip.is_multicast: - log.warning( - f"Skipping unsafe URL (domain {hostname} resolves to non-global/multicast IP {ip}): {sanitize_for_log(url)}" - ) - return False - except (socket.gaierror, ValueError, OSError) as e: - log.warning( - f"Failed to resolve/validate domain {hostname}: {sanitize_for_log(e)}" - ) - return False + return validate_hostname(hostname) except Exception as e: log.warning( diff --git a/tests/test_hostname_validation.py b/tests/test_hostname_validation.py new file mode 100644 index 0000000..3e0cd70 --- /dev/null +++ b/tests/test_hostname_validation.py @@ -0,0 +1,73 @@ + +import socket +from unittest.mock import MagicMock, patch + +import pytest +import main + +def test_validate_hostname_caching(): + """ + Verify that validate_hostname caches results and avoids redundant DNS lookups. + """ + # Mock socket.getaddrinfo + with patch("socket.getaddrinfo") as mock_dns: + # Setup mock return value (valid IP) + mock_dns.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', ('93.184.216.34', 443))] + + # Clear cache to start fresh + main.validate_hostname.cache_clear() + + # First call - should trigger DNS lookup + assert main.validate_hostname("example.com") is True + assert mock_dns.call_count == 1 + + # Second call - should use cache + assert main.validate_hostname("example.com") is True + assert mock_dns.call_count == 1 # Still 1 + + # different hostname - should trigger DNS lookup + assert main.validate_hostname("google.com") is True + assert mock_dns.call_count == 2 + +def test_validate_hostname_security(): + """ + Verify security checks in validate_hostname. + """ + # Localhost + assert main.validate_hostname("localhost") is False + assert main.validate_hostname("127.0.0.1") is False + assert main.validate_hostname("::1") is False + + # Private IP + assert main.validate_hostname("192.168.1.1") is False + + # Domain resolving to private IP + with patch("socket.getaddrinfo") as mock_dns: + # Return private IP + mock_dns.return_value = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', ('192.168.1.1', 443))] + main.validate_hostname.cache_clear() + + assert main.validate_hostname("private.local") is False + +def test_validate_folder_url_uses_validate_hostname(): + """ + Verify that validate_folder_url calls validate_hostname. + """ + with patch("main.validate_hostname") as mock_validate: + mock_validate.return_value = True + + # Clear cache + main.validate_folder_url.cache_clear() + + url = "https://example.com/data.json" + assert main.validate_folder_url(url) is True + + mock_validate.assert_called_with("example.com") + + # Invalid hostname + mock_validate.return_value = False + + # Clear cache again because URL is the same + main.validate_folder_url.cache_clear() + + assert main.validate_folder_url(url) is False From 7ba3b207c4f029c65567cc13ddd2576d352cbaeb Mon Sep 17 00:00:00 2001 From: Abhi Mehrotra Date: Wed, 18 Feb 2026 13:06:35 -0600 Subject: [PATCH 2/2] Update main.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 9822e98..5e5ea0c 100644 --- a/main.py +++ b/main.py @@ -679,7 +679,9 @@ def validate_hostname(hostname: str) -> bool: ) return False - return True + if not addr_info: + return False + for res in addr_info: @lru_cache(maxsize=128)