Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 36 additions & 29 deletions cli/integrations/job_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, Union

from bs4 import BeautifulSoup, Tag

Expand All @@ -29,6 +29,29 @@
except ImportError:
requests = None

# Pre-compiled regex patterns for performance optimization
# Salary patterns, tried in order; the first pattern that matches anywhere in
# the text wins. Patterns without a capture group are meant to be read via
# group(0); the labelled patterns expose the amount as group(1).
_SALARY_PATTERNS = [
    # "$100,000", "$100k", "$100,000 - $150,000", "$100k to $150k".
    # The optional "k" suffix is handled here on purpose: a plain "$digits"
    # pattern listed ahead of a separate "$digits k" pattern always matches
    # first and truncates "$100k - $150k" to "$100".
    # The separator is an alternation (dash run OR the word "to"), not a
    # character class, so a stray "t"/"o" is not treated as a range separator.
    re.compile(r"\$[\d,]+k?(?:\s*(?:[-–]+|to)\s*\$[\d,]+k?)?", re.IGNORECASE),
    # "100k - 150k": no currency symbol, so a full range is required.
    re.compile(r"[\d,]+k(?:\s*(?:[-–]+|to)\s*[\d,]+k)", re.IGNORECASE),
    # "Salary: $X" / "Pay $X" / "Compensation: $X" -> captures from "$" to
    # the end of the line or the next angle bracket.
    re.compile(r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", re.IGNORECASE),
    # "per year: $X" / "/annum $X" -> same capture rule as above.
    re.compile(r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", re.IGNORECASE),
]

# Employment-type keywords, compiled once at import time. Each expression is
# case-insensitive, anchored on word boundaries, and captures the keyword as
# group(1). Order matters to callers that stop at the first matching pattern.
_JOB_TYPE_PATTERNS = [
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b",
        r"\b(permanent|fixed[- ]?term)\b",
    )
]

# Seniority keywords, compiled once at import time. Each expression is
# case-insensitive, anchored on word boundaries, and captures the keyword as
# group(1). Individual-contributor levels are listed before management titles.
_EXPERIENCE_LEVEL_PATTERNS = [
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b",
        r"\b(associate|vice[- ]?president|director|executive)\b",
    )
]


@dataclass
class JobDetails:
Expand Down Expand Up @@ -529,18 +552,21 @@ def _find_by_selectors(self, soup: BeautifulSoup, selectors: List[str]) -> Optio
return elem
return None

def _extract_text_by_pattern(self, text: str, pattern: str) -> Optional[str]:
def _extract_text_by_pattern(self, text: str, pattern: Union[str, re.Pattern]) -> Optional[str]:
"""
Extract text using regex pattern.

Args:
text: Text to search
pattern: Regex pattern
pattern: Regex pattern (string or pre-compiled)

Returns:
Extracted text or None
"""
match = re.search(pattern, text, re.IGNORECASE)
if isinstance(pattern, re.Pattern):
match = pattern.search(text)
else:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(1).strip()
return None
Expand All @@ -555,17 +581,8 @@ def _extract_salary_from_text(self, text: str) -> Optional[str]:
Returns:
Salary string or None
"""
# Common salary patterns
patterns = [
r"\$[\d,]+(?:\s*[-–to]+\s*\$[\d,]+)?", # $100k - $150k
r"\$[\d,]+k(?:\s*[-–to]+\s*\$[\d,]+k)?", # $100k - $150k
r"[\d,]+k(?:\s*[-–to]+\s*[\d,]+k)", # 100k - 150k
r"(?:salary|pay|compensation)[:\s]*(\$[^<>\n]+)", # Salary: $X
r"(?:per|/)\s*(?:year|annum)[:\s]*(\$[^<>\n]+)", # per year: $X
]

for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
for pattern in _SALARY_PATTERNS:
match = pattern.search(text)
if match:
salary = match.group(0) if match.lastindex is None else match.group(1)
# Clean up the salary string
Expand Down Expand Up @@ -805,13 +822,8 @@ def _extract_job_type(self, html: str) -> Optional[str]:
Returns:
Job type string or None
"""
patterns = [
r"\b(full[- ]?time|part[- ]?time|contract|freelance|intern|temporary)\b",
r"\b(permanent|fixed[- ]?term)\b",
]

for pattern in patterns:
match = re.search(pattern, html, re.IGNORECASE)
for pattern in _JOB_TYPE_PATTERNS:
match = pattern.search(html)
if match:
return match.group(1).lower().replace("-", "-")

Expand All @@ -827,13 +839,8 @@ def _extract_experience_level(self, html: str) -> Optional[str]:
Returns:
Experience level string or None
"""
patterns = [
r"\b(entry[- ]?level|junior|mid[- ]?level|senior|staff|principal|lead)\b",
r"\b(associate|vice[- ]?president|director|executive)\b",
]

for pattern in patterns:
match = re.search(pattern, html, re.IGNORECASE)
for pattern in _EXPERIENCE_LEVEL_PATTERNS:
match = pattern.search(html)
if match:
return match.group(1).lower().replace("-", "-")

Expand Down
Loading