# Imports introduced by this change (belong with the other top-of-file imports).
import ipaddress
from urllib.parse import urlparse


def validate_folder_url(url: str) -> bool:
    """Return True only if *url* is an HTTPS URL on an allowlisted GitHub host.

    Validation rules, applied in order:

    1. The URL must start with ``https://``.
    2. The URL must parse and must contain a hostname.
    3. The hostname must not be an IP literal (IPv4 or IPv6). Every IP
       literal is rejected; private, loopback, link-local and reserved
       addresses are logged with a more specific message.
    4. The hostname (compared case-insensitively) must equal, or be a
       subdomain of, ``github.com`` or ``githubusercontent.com``
       (e.g. ``raw.githubusercontent.com``).

    Every rejection is logged as a warning through the module-level ``log``.
    The check is purely lexical: it inspects the URL string only and performs
    no DNS resolution.

    Args:
        url: The folder URL to validate.

    Returns:
        True if the URL passes every rule above, False otherwise.
    """
    if not url.startswith("https://"):
        log.warning(f"Skipping unsafe or invalid URL (not HTTPS): {url}")
        return False

    # Allowlist of trusted domains; subdomains of these are accepted as well.
    allowed_domains = ("github.com", "githubusercontent.com")

    # urlparse() and the .hostname accessor raise ValueError on malformed
    # input (for example an unbalanced IPv6 bracket), so only that is caught.
    try:
        host = urlparse(url).hostname
    except ValueError as e:
        log.warning(f"Parsing error while validating URL {url}: {e}")
        return False

    if not host:
        log.warning(f"Skipping invalid URL (no hostname): {url}")
        return False

    # Direct IP access is never allowed. Decide this explicitly instead of
    # relying on the domain allowlist below to reject public IPs by accident.
    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        ip = None  # Not an IP literal: continue with domain validation.

    if ip is not None:
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            log.warning(f"Skipping URL with private/internal IP address: {url}")
        else:
            log.warning(f"Skipping URL with direct IP address: {url}")
        return False

    # Match the exact domain or a dot-delimited subdomain. The leading "."
    # prevents look-alikes such as "evilgithub.com" from matching.
    host_lower = host.lower()
    is_allowed = any(
        host_lower == domain or host_lower.endswith(f".{domain}")
        for domain in allowed_domains
    )
    if not is_allowed:
        log.warning(f"Skipping URL from untrusted domain: {url} (host: {host}) [rejected for security reasons]")
        return False

    return True