From 21bcc061d40ee83f54e006e93e4de62c23ade2fb Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 23:25:39 +0000 Subject: [PATCH 1/2] Fix SSRF vulnerability in job parser Co-authored-by: anchapin <6326294+anchapin@users.noreply.github.com> --- .jules/sentinel.md | 5 ++ cli/integrations/job_parser.py | 84 ++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/.jules/sentinel.md b/.jules/sentinel.md index 3a1d237..b337e37 100644 --- a/.jules/sentinel.md +++ b/.jules/sentinel.md @@ -7,3 +7,8 @@ **Vulnerability:** The `CoverLetterGenerator` used a standard Jinja2 environment (intended for HTML/XML or plain text) to render LaTeX templates. This allowed malicious user input (or AI hallucinations) containing LaTeX control characters (e.g., `\input{...}`) to be injected directly into the LaTeX source, leading to potential Local File Inclusion (LFI) or other exploits. **Learning:** Jinja2's default `autoescape` is context-aware based on file extensions, but usually only for HTML/XML. It does NOT automatically escape LaTeX special characters. Relying on manual filters (like `| latex_escape`) in templates is error-prone and brittle, as developers might forget to apply them to every variable. **Prevention:** Always use a dedicated Jinja2 environment for LaTeX generation that enforces auto-escaping via a `finalize` hook (e.g., `tex_env.finalize = latex_escape`). This ensures *all* variable output is sanitized by default, providing defense-in-depth even if the template author forgets explicit filters. + +## 2024-05-18 - [Critical] SSRF Vulnerability in Job Parser +**Vulnerability:** The `JobParser.parse_from_url` method fetched external URLs provided by users via `requests.get` without any validation of the target host. This allowed an attacker to input internal IP addresses (like `127.0.0.1` or `169.254.169.254`) causing the server to fetch them, leading to Server-Side Request Forgery (SSRF). +**Learning:** URL fetching functionality is inherently risky and requires strict validation to ensure only external, publicly accessible resources are queried. Additionally, simply checking if an IP is private/loopback isn't enough; the `0.0.0.0` (unspecified) address can also resolve to localhost on some systems, and manual redirect tracking is essential to prevent bypasses. +**Prevention:** Implement an SSRF-safe wrapper for `requests.get` that manually parses the URL, resolves the hostname (`socket.getaddrinfo`), explicitly checks if the resulting IP is private/loopback/link-local/multicast/reserved/unspecified, and tracks redirects manually to prevent routing bypasses. diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py index 2c3cdf6..938ad1b 100644 --- a/cli/integrations/job_parser.py +++ b/cli/integrations/job_parser.py @@ -17,6 +17,9 @@ import hashlib import json import re +import socket +import ipaddress +from urllib.parse import urlparse, urljoin from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -219,14 +222,10 @@ def parse_from_url(self, url: str) -> JobDetails: if cached: return cached - # Fetch and parse + # Fetch and parse safely (SSRF protection) try: - - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} - response = requests.get(url, headers=headers, timeout=30) - response.raise_for_status() - - job_details = self._parse_html(response.text) + html_content = self._fetch_url_safe(url) + job_details = self._parse_html(html_content) job_details.url = url # Save to cache @@ -238,8 +237,75 @@ def parse_from_url(self, url: str) -> JobDetails: raise NotImplementedError( "URL fetching requires 'requests' library. Install with: pip install requests" ) - except requests.RequestException as e: - raise RuntimeError(f"Failed to fetch URL: {e}") + + def _fetch_url_safe(self, url: str, timeout: int = 30, max_redirects: int = 5) -> str: + """ + Safely fetch a URL with SSRF protection. + + Args: + url: URL to fetch + timeout: Request timeout in seconds + max_redirects: Maximum number of redirects to follow + + Returns: + HTML content of the page + + Raises: + ValueError: If URL is invalid or points to a restricted IP + RuntimeError: If request fails + """ + if requests is None: + raise ImportError("requests library is not installed") + + current_url = url + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} + + for _ in range(max_redirects): + parsed = urlparse(current_url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme: {parsed.scheme}") + + try: + addr_info = socket.getaddrinfo( + parsed.hostname, parsed.port or (443 if parsed.scheme == "https" else 80) + ) + for family, _, _, _, sockaddr in addr_info: + ip = ipaddress.ip_address(sockaddr[0]) + if hasattr(ip, "ipv4_mapped") and ip.ipv4_mapped: + ip = ip.ipv4_mapped + if ( + ip.is_private + or ip.is_loopback + or ip.is_link_local + or ip.is_multicast + or ip.is_reserved + or ip.is_unspecified + ): + raise ValueError(f"Access to internal/private IP is forbidden: {ip}") + except socket.gaierror: + raise ValueError(f"Could not resolve hostname: {parsed.hostname}") + + try: + response = requests.get( + current_url, headers=headers, timeout=timeout, allow_redirects=False + ) + response.raise_for_status() + except requests.RequestException as e: + raise RuntimeError(f"Failed to fetch URL: {e}") + + if response.status_code in (301, 302, 303, 307, 308): + redirect_url = response.headers.get("Location") + if not redirect_url: + raise RuntimeError("Redirect missing Location header") + if not redirect_url.startswith("http"): + current_url = urljoin(current_url, redirect_url) + else: + current_url = redirect_url + continue + + return response.text + + raise ValueError(f"Too many redirects ({max_redirects})") def _parse_html(self, html: str) -> JobDetails: """ From cb773a1996b58f69fd2ac122bd685faa7b021d79 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 23:29:38 +0000 Subject: [PATCH 2/2] Fix SSRF vulnerability in job parser Co-authored-by: anchapin <6326294+anchapin@users.noreply.github.com> --- cli/integrations/job_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/integrations/job_parser.py b/cli/integrations/job_parser.py index 938ad1b..bf7812b 100644 --- a/cli/integrations/job_parser.py +++ b/cli/integrations/job_parser.py @@ -15,14 +15,14 @@ """ import hashlib +import ipaddress import json import re import socket -import ipaddress -from urllib.parse import urlparse, urljoin from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag