Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .jules/sentinel.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@
**Vulnerability:** The `CoverLetterGenerator` used a standard Jinja2 environment (intended for HTML/XML or plain text) to render LaTeX templates. This allowed malicious user input (or AI hallucinations) containing LaTeX control characters (e.g., `\input{...}`) to be injected directly into the LaTeX source, leading to potential Local File Inclusion (LFI) or other exploits.
**Learning:** Jinja2's default `autoescape` is context-aware based on file extensions, but usually only for HTML/XML. It does NOT automatically escape LaTeX special characters. Relying on manual filters (like `| latex_escape`) in templates is error-prone and brittle, as developers might forget to apply them to every variable.
**Prevention:** Always use a dedicated Jinja2 environment for LaTeX generation that enforces auto-escaping via a `finalize` hook (e.g., `tex_env.finalize = latex_escape`). This ensures *all* variable output is sanitized by default, providing defense-in-depth even if the template author forgets explicit filters.

## 2024-05-18 - [Critical] SSRF Vulnerability in Job Parser
**Vulnerability:** The `JobParser.parse_from_url` method fetched external URLs provided by users via `requests.get` without any validation of the target host. This allowed an attacker to input internal IP addresses (like `127.0.0.1` or `169.254.169.254`) causing the server to fetch them, leading to Server-Side Request Forgery (SSRF).
**Learning:** URL fetching functionality is inherently risky and requires strict validation to ensure only external, publicly accessible resources are queried. Simply checking if an IP is private/loopback isn't enough: the `0.0.0.0` (unspecified) address can also resolve to localhost on some systems. Redirects must likewise be tracked manually, since an initially public URL can redirect to an internal address and bypass a one-time check.
**Prevention:** Implement an SSRF-safe wrapper for `requests.get` that manually parses the URL, resolves the hostname (`socket.getaddrinfo`), explicitly checks if the resulting IP is private/loopback/link-local/multicast/reserved/unspecified, and tracks redirects manually to prevent routing bypasses.
84 changes: 75 additions & 9 deletions cli/integrations/job_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
"""

import hashlib
import ipaddress
import json
import re
import socket
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag

Expand Down Expand Up @@ -219,14 +222,10 @@ def parse_from_url(self, url: str) -> JobDetails:
if cached:
return cached

# Fetch and parse
# Fetch and parse safely (SSRF protection)
try:

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

job_details = self._parse_html(response.text)
html_content = self._fetch_url_safe(url)
job_details = self._parse_html(html_content)
job_details.url = url

# Save to cache
Expand All @@ -238,8 +237,75 @@ def parse_from_url(self, url: str) -> JobDetails:
raise NotImplementedError(
"URL fetching requires 'requests' library. Install with: pip install requests"
)
except requests.RequestException as e:
raise RuntimeError(f"Failed to fetch URL: {e}")

def _fetch_url_safe(self, url: str, timeout: int = 30, max_redirects: int = 5) -> str:
    """
    Safely fetch a URL with SSRF protection.

    The hostname is resolved and validated before every request hop, and
    redirects are followed manually (``allow_redirects=False``) so each
    redirect target is re-validated with the same checks.

    Args:
        url: URL to fetch
        timeout: Request timeout in seconds
        max_redirects: Maximum number of redirects to follow

    Returns:
        HTML content of the page

    Raises:
        ValueError: If URL is invalid or points to a restricted IP
        RuntimeError: If request fails
        ImportError: If the requests library is not installed
    """
    if requests is None:
        raise ImportError("requests library is not installed")

    current_url = url
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    for _ in range(max_redirects):
        parsed = urlparse(current_url)
        if parsed.scheme not in ("http", "https"):
            raise ValueError(f"Invalid URL scheme: {parsed.scheme}")
        # urlparse yields hostname=None for URLs like "http:///path", and
        # getaddrinfo(None, port) resolves to localhost -- which would bypass
        # the IP checks below. Reject such URLs outright.
        if not parsed.hostname:
            raise ValueError(f"URL has no hostname: {current_url}")

        try:
            # Constrain to a single socket type so we get one sockaddr per
            # resolved IP (no duplicate per-protocol entries) and only real
            # IP address families whose sockaddr[0] is an IP string.
            addr_info = socket.getaddrinfo(
                parsed.hostname,
                parsed.port or (443 if parsed.scheme == "https" else 80),
                family=socket.AF_UNSPEC,
                type=socket.SOCK_STREAM,
            )
        except socket.gaierror:
            raise ValueError(f"Could not resolve hostname: {parsed.hostname}")

        for _family, _type, _proto, _canon, sockaddr in addr_info:
            ip = ipaddress.ip_address(sockaddr[0])
            # Unwrap IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) so the IPv4
            # range checks below apply to the embedded address.
            if hasattr(ip, "ipv4_mapped") and ip.ipv4_mapped:
                ip = ip.ipv4_mapped
            if (
                ip.is_private
                or ip.is_loopback
                or ip.is_link_local
                or ip.is_multicast
                or ip.is_reserved
                or ip.is_unspecified
            ):
                raise ValueError(f"Access to internal/private IP is forbidden: {ip}")

        # NOTE(review): requests re-resolves the hostname itself, so a
        # DNS-rebinding attacker could change the record between our check
        # and the fetch; pinning the validated IP for the request would
        # close that gap -- confirm whether that risk is acceptable here.
        try:
            response = requests.get(
                current_url, headers=headers, timeout=timeout, allow_redirects=False
            )
            response.raise_for_status()
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch URL: {e}")

        if response.status_code in (301, 302, 303, 307, 308):
            redirect_url = response.headers.get("Location")
            if not redirect_url:
                raise RuntimeError("Redirect missing Location header")
            # urljoin resolves relative Location values against the current
            # URL and passes absolute URLs through unchanged, so one call
            # covers both cases (including scheme-relative //host/path).
            current_url = urljoin(current_url, redirect_url)
            continue

        return response.text

    raise ValueError(f"Too many redirects ({max_redirects})")

def _parse_html(self, html: str) -> JobDetails:
"""
Expand Down
Loading