From 21ad2f8a7a8b2e684b29f20953388c43c4db1052 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 3 Apr 2026 23:27:21 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20[CRITICAL]?=
 =?UTF-8?q?=20Fix=20SSRF=20vulnerability=20in=20JobParser?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Implemented `_fetch_url_safe` to validate IP addresses before fetching URLs
- Blocked private, loopback, link-local, reserved, and unspecified IPs
- Safely followed redirects to prevent bypasses
- Handled `requests` as an optional dependency securely

Co-authored-by: anchapin <6326294+anchapin@users.noreply.github.com>
---
 cli/integrations/job_parser.py | 109 ++++++++++++++++++++++++++++++---
 1 file changed, 101 insertions(+), 8 deletions(-)

diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py
index 2c3cdf6..832d073 100644
--- a/cli/integrations/job_parser.py
+++ b/cli/integrations/job_parser.py
@@ -15,11 +15,14 @@
 """
 
 import hashlib
+import ipaddress
 import json
 import re
+import socket
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup, Tag
 
@@ -30,6 +33,11 @@
     requests = None
 
 
+class SSRFError(ValueError):
+    """Raised when a potential SSRF attack is detected."""
+    pass
+
+
 @dataclass
 class JobDetails:
     """Structured job posting data."""
@@ -219,11 +227,15 @@ def parse_from_url(self, url: str) -> JobDetails:
         if cached:
             return cached
 
+        # Check dependency first before any exception blocks try to reference it
+        if requests is None:
+            raise NotImplementedError(
+                "URL fetching requires 'requests' library. Install with: pip install requests"
+            )
+
         # Fetch and parse
         try:
-
-            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
-            response = requests.get(url, headers=headers, timeout=30)
+            response = self._fetch_url_safe(url)
             response.raise_for_status()
 
             job_details = self._parse_html(response.text)
@@ -234,13 +246,94 @@ def parse_from_url(self, url: str) -> JobDetails:
 
             return job_details
 
-        except ImportError:
-            raise NotImplementedError(
-                "URL fetching requires 'requests' library. Install with: pip install requests"
-            )
-        except requests.RequestException as e:
+        except (requests.RequestException, SSRFError) as e:
             raise RuntimeError(f"Failed to fetch URL: {e}")
 
+    def _fetch_url_safe(self, url: str) -> Any:
+        """
+        Safely fetch a URL protecting against Server-Side Request Forgery (SSRF).
+
+        Args:
+            url: The URL to fetch.
+
+        Returns:
+            The requests Response object.
+        """
+        parsed_url = urlparse(url)
+        if parsed_url.scheme not in ("http", "https"):
+            raise SSRFError(f"Invalid URL scheme: {parsed_url.scheme}")
+
+        hostname = parsed_url.hostname
+        if not hostname:
+            raise SSRFError("Invalid URL: missing hostname")
+
+        try:
+            # Resolve hostname using socket.getaddrinfo for IPv4 and IPv6
+            # Returns a list of tuples: (family, type, proto, canonname, sockaddr)
+            addr_info = socket.getaddrinfo(hostname, None)
+        except socket.gaierror as e:
+            raise SSRFError(f"DNS resolution failed for hostname {hostname}: {e}")
+
+        # Check all resolved IPs to ensure they are safe
+        for info in addr_info:
+            ip_str = info[4][0]
+            try:
+                ip = ipaddress.ip_address(ip_str)
+            except ValueError:
+                # Fail closed if we cannot parse the IP returned by DNS
+                raise SSRFError(f"SSRF Protection: could not parse resolved IP {ip_str}")
+
+            if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast or ip.is_unspecified:
+                raise SSRFError(f"SSRF Protection: resolved IP {ip_str} is not allowed")
+
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
+
+        # Follow redirects manually to validate them
+        # Note: This approach provides a good first layer of defense against SSRF,
+        # but it is technically vulnerable to Time-Of-Check to Time-Of-Use (TOCTOU) DNS
+        # Rebinding attacks because requests.get() performs its own DNS resolution.
+        max_redirects = 5
+        current_url = url
+
+        for _ in range(max_redirects):
+            response = requests.get(current_url, headers=headers, timeout=30, allow_redirects=False)
+
+            if response.is_redirect:
+                redirect_url = response.headers.get("location")
+                if not redirect_url:
+                    break
+
+                # Handle relative redirects
+                redirect_url = urljoin(current_url, redirect_url)
+
+                # Validate the redirect URL as well to prevent SSRF via redirect
+                parsed_redirect = urlparse(redirect_url)
+                if parsed_redirect.scheme not in ("http", "https"):
+                    raise SSRFError(f"Invalid redirect scheme: {parsed_redirect.scheme}")
+
+                hostname = parsed_redirect.hostname
+                if not hostname:
+                    raise SSRFError("Invalid redirect URL: missing hostname")
+
+                try:
+                    addr_info = socket.getaddrinfo(hostname, None)
+                    for info in addr_info:
+                        ip_str = info[4][0]
+                        try:
+                            ip = ipaddress.ip_address(ip_str)
+                            if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast or ip.is_unspecified:
+                                raise SSRFError(f"SSRF Protection: redirected IP {ip_str} is not allowed")
+                        except ValueError:
+                            raise SSRFError(f"SSRF Protection: could not parse redirected IP {ip_str}")
+                except socket.gaierror as e:
+                    raise SSRFError(f"DNS resolution failed for redirected hostname {hostname}: {e}")
+
+                current_url = redirect_url
+            else:
+                return response
+
+        raise SSRFError("Too many redirects")
+
     def _parse_html(self, html: str) -> JobDetails:
         """
         Parse HTML content and detect source automatically.