Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions .jules/sentinel.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
## 2025-02-12 - [Critical] API Authentication Fail-Open Default
**Vulnerability:** The API authentication mechanism (`api/auth.py`) defaulted to allowing access if the `RESUME_API_KEY` environment variable was not set ("dev mode"). Additionally, it used a timing-vulnerable string comparison for the API key check.
**Learning:** "Dev mode" defaults that bypass security controls are dangerous because they can easily be deployed to production by accident, leaving the system wide open.
**Prevention:** Implement a "fail-closed" strategy. If a security configuration (like an API key) is missing, the application should refuse to start or deny all requests, rather than failing open. Always use `secrets.compare_digest` for sensitive string comparisons.
## 2024-03-22 - [Critical] SSRF in JobParser URL Fetching

## 2025-02-19 - [Critical] LaTeX Injection in Cover Letter Generator
**Vulnerability:** The `CoverLetterGenerator` used a standard Jinja2 environment (intended for HTML/XML or plain text) to render LaTeX templates. This allowed malicious user input (or AI hallucinations) containing LaTeX control characters (e.g., `\input{...}`) to be injected directly into the LaTeX source, leading to potential Local File Inclusion (LFI) or other exploits.
**Learning:** Jinja2's default `autoescape` is context-aware based on file extensions, but usually only for HTML/XML. It does NOT automatically escape LaTeX special characters. Relying on manual filters (like `| latex_escape`) in templates is error-prone and brittle, as developers might forget to apply them to every variable.
**Prevention:** Always use a dedicated Jinja2 environment for LaTeX generation that enforces auto-escaping via a `finalize` hook (e.g., `tex_env.finalize = latex_escape`). This ensures *all* variable output is sanitized by default, providing defense-in-depth even if the template author forgets explicit filters.
**Vulnerability:**
The `JobParser.parse_from_url` method passed arbitrary user-provided URLs directly to `requests.get` without any validation. This exposed the application to Server-Side Request Forgery (SSRF) attacks, where an attacker could coerce the server into making requests to internal network resources (e.g., the AWS IMDS at `169.254.169.254`, `localhost`, etc.).

**Learning:**
When an application takes a URL from untrusted user input and makes a server-side request using it, it is critical to resolve the hostname and validate the target IP address to ensure it does not belong to private or reserved IP ranges. Additionally, the IP validation should fail securely if resolution errors occur, and `0.0.0.0` needs explicit checks as it may resolve to localhost depending on the environment. Time-of-check to time-of-use (TOCTOU) issues via DNS rebinding can still technically occur if the hostname is passed to requests instead of the verified IP, but validating the IP is a crucial first step.

**Prevention:**
Always validate both the scheme and the resolved IP address (using a tool like Python's `ipaddress` library with `is_private`, `is_loopback`, `is_reserved`, etc.) before making requests to user-supplied URLs. Ensure the failure modes fail closed instead of open.
71 changes: 59 additions & 12 deletions cli/integrations/job_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
try:
import requests
except ImportError:
requests = None
requests = None # type: ignore[assignment]


@dataclass
Expand Down Expand Up @@ -200,6 +200,54 @@ def parse_from_file(self, file_path: Path, url: Optional[str] = None) -> JobDeta
job_details.url = url
return job_details

def _fetch_url_safe(self, url: str) -> requests.Response:
"""
Safely fetch a URL protecting against Server-Side Request Forgery (SSRF).

Validates URL scheme, resolves hostname, and ensures the IP is not private,
loopback, link-local, or reserved.

Args:
url: URL to fetch

Returns:
requests.Response
"""
import ipaddress
import socket
from urllib.parse import urlparse

parsed = urlparse(url)
if parsed.scheme not in ("http", "https"):
raise ValueError(f"Invalid URL scheme: {parsed.scheme}")

hostname = parsed.hostname
if not hostname:
raise ValueError("Invalid URL: no hostname")

# Resolve IP to check if it's safe
try:
addrinfo = socket.getaddrinfo(hostname, None)
for result in addrinfo:
ip_addr = result[4][0]
ip = ipaddress.ip_address(ip_addr)
if (
ip.is_private
or ip.is_loopback
or ip.is_link_local
or ip.is_reserved
or ip.is_unspecified
):
raise RuntimeError(f"Access to internal IP address is not allowed: {ip_addr}")
except socket.gaierror as e:
raise ValueError(f"Could not resolve hostname {hostname}: {e}")
except ValueError as e:
# Catch ValueError from ipaddress parsing and fail closed
raise ValueError(f"Invalid IP address resolved for hostname: {e}")

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
return requests.get(url, headers=headers, timeout=30)

def parse_from_url(self, url: str) -> JobDetails:
"""
Parse job posting from URL.
Expand All @@ -221,9 +269,7 @@ def parse_from_url(self, url: str) -> JobDetails:

# Fetch and parse
try:

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
response = requests.get(url, headers=headers, timeout=30)
response = self._fetch_url_safe(url)
response.raise_for_status()

job_details = self._parse_html(response.text)
Expand All @@ -238,7 +284,7 @@ def parse_from_url(self, url: str) -> JobDetails:
raise NotImplementedError(
"URL fetching requires 'requests' library. Install with: pip install requests"
)
except requests.RequestException as e:
except (requests.RequestException, ValueError, RuntimeError) as e:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Catching broad ValueError/RuntimeError here may hide programming errors unrelated to URL fetching.

This except (requests.RequestException, ValueError, RuntimeError) will also catch ValueError/RuntimeError from _parse_html or other later code, incorrectly rewrapping them as fetch failures. Please restrict this handler to the concrete exceptions raised by _fetch_url_safe/requests, or split the try so parsing errors are not masked as network/URL-fetch errors.

Suggested implementation:

            try:
                response = self._fetch_url_safe(url)
                response.raise_for_status()
            except (requests.RequestException, ValueError, RuntimeError) as e:
                # Limit this handler strictly to URL fetching / validation failures
                raise RuntimeError(f"Failed to fetch URL: {e}")

            job_details = self._parse_html(response.text)
            raise NotImplementedError(
                "URL fetching requires 'requests' library. Install with: pip install requests"
            )

If there are other calls to _parse_html wrapped in the same broad except (requests.RequestException, ValueError, RuntimeError) pattern elsewhere in this file, they should be refactored similarly: keep the exception handler around only the fetch/validation logic, and let parsing errors propagate normally.

raise RuntimeError(f"Failed to fetch URL: {e}")

def _parse_html(self, html: str) -> JobDetails:
Expand Down Expand Up @@ -426,7 +472,8 @@ def _parse_generic(self, html: str) -> JobDetails:
# Look for company in meta tags
meta_company = soup.find("meta", attrs={"name": "company"})
if meta_company:
company = meta_company.get("content", "")
content = meta_company.get("content", "")
company = str(content) if isinstance(content, list) else content

# Extract position from h1 or title
position = ""
Expand All @@ -447,11 +494,11 @@ def _parse_generic(self, html: str) -> JobDetails:
salary = self._extract_salary_from_text(html)

# Extract requirements section - look for heading tags first
requirements = []
requirements: List[str] = []
req_heading = soup.find(
["h1", "h2", "h3", "h4", "h5", "h6"],
string=re.compile(r"requirements|qualifications|skills", re.IGNORECASE),
)
) # type: ignore[call-overload]
if req_heading:
# Get the next sibling element(s) containing the list
next_elem = req_heading.find_next_sibling(["ul", "ol", "div", "p"])
Expand All @@ -462,11 +509,11 @@ def _parse_generic(self, html: str) -> JobDetails:
requirements = self._extract_list_by_keyword(html, "requirements")

# Extract responsibilities section
responsibilities = []
responsibilities: List[str] = []
resp_heading = soup.find(
["h1", "h2", "h3", "h4", "h5", "h6"],
string=re.compile(r"responsibilities|duties|what you", re.IGNORECASE),
)
) # type: ignore[call-overload]
if resp_heading:
next_elem = resp_heading.find_next_sibling(["ul", "ol", "div", "p"])
if next_elem:
Expand Down Expand Up @@ -586,8 +633,8 @@ def _extract_sections_from_description(self, description: str) -> Tuple[List[str
Returns:
Tuple of (requirements, responsibilities)
"""
requirements = []
responsibilities = []
requirements: List[str] = []
responsibilities: List[str] = []

if not description:
return requirements, responsibilities
Expand Down
Loading