Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 69 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import logging
import time
import re
import ipaddress
from typing import Dict, List, Optional, Any, Set, Sequence
from urllib.parse import urlparse

import httpx
from dotenv import load_dotenv
Expand Down Expand Up @@ -99,10 +101,75 @@ def _api_client() -> httpx.Client:


# Domains that folder URLs may point at; any subdomain of an entry is accepted
# as well (e.g. raw.githubusercontent.com).
# NOTE(review): kept at module level rather than inside validate_folder_url so
# the allowlist is not rebuilt on every call and is easy to find, test, and
# extend.
ALLOWED_FOLDER_DOMAINS = frozenset({
    "github.com",
    "githubusercontent.com",
})


def validate_folder_url(url: str) -> bool:
    """Return True if *url* is an HTTPS URL on an allowlisted domain.

    The check is an SSRF guard and fails closed. A URL is accepted only when
    all of the following hold:

    1. It starts with ``https://``.
    2. It parses and contains a hostname.
    3. The hostname is not an IP literal (IPv4 or IPv6). Private, loopback,
       link-local and reserved addresses are logged as internal; every other
       IP literal is rejected too, because only allowlisted names are trusted.
    4. The hostname equals, or is a subdomain of, an entry in
       ``ALLOWED_FOLDER_DOMAINS``.

    Every rejection is logged as a warning that states the reason.

    Args:
        url: The folder URL to validate.

    Returns:
        True when the URL passes every check, False otherwise.
    """
    if not url.startswith("https://"):
        log.warning(f"Skipping unsafe or invalid URL (not HTTPS): {url}")
        return False

    # urlparse raises ValueError for malformed authorities (for example an
    # unbalanced IPv6 bracket); treat that as an invalid URL.
    try:
        host = urlparse(url).hostname
    except ValueError as e:
        log.warning(f"Parsing error while validating URL {url}: {e}")
        return False

    if not host:
        log.warning(f"Skipping invalid URL (no hostname): {url}")
        return False

    # Direct IP access is never allowed. The private/internal case keeps its
    # own message because it is the most likely sign of an SSRF attempt.
    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal: fall through to the domain allowlist.
        pass
    else:
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            log.warning(f"Skipping URL with private/internal IP address: {url}")
        else:
            log.warning(f"Skipping URL with direct IP address: {url}")
        return False

    # Match the exact domain or a dot-delimited subdomain, so that lookalikes
    # such as "evilgithub.com" or "github.com.evil.com" do not pass.
    host_lower = host.lower()
    is_allowed = any(
        host_lower == domain or host_lower.endswith(f".{domain}")
        for domain in ALLOWED_FOLDER_DOMAINS
    )
    if not is_allowed:
        log.warning(f"Skipping URL from untrusted domain: {url} (host: {host}) [rejected for security reasons]")
        return False

    return True


Expand Down
Loading