diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d0f53b7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Python +__pycache__/ +*.pyc +*.pyo +*.csv + diff --git a/scripts/scrape_fake_jobs.py b/scripts/scrape_fake_jobs.py new file mode 100755 index 0000000..7638ccb --- /dev/null +++ b/scripts/scrape_fake_jobs.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Scrape fake job postings and save them to fake_jobs.csv + +This script demonstrates using BeautifulSoup's class-based find/find_all +features (see https://blog.apify.com/beautifulsoup-find-by-class/) to extract +job postings from the Real Python "fake jobs" demo page and write them to a CSV. + +Output CSV: fake_jobs.csv +Columns (header): Job Title, Company, Location, Date Posted + +Usage: + python3 scripts/scrape_fake_jobs.py + +Requirements: + pip install requests beautifulsoup4 + +The script uses class-based selectors to extract: + - Job Title: <h2 class="title">
+ - Company: <h3 class="company">
+ - Location: <p class="location">
+ - Date Posted:

container. +The script includes fallbacks for slight HTML variations to ensure robustness. +""" + +from __future__ import annotations +import csv +import sys +from typing import List, Dict + +import requests +from bs4 import BeautifulSoup + +SOURCE_URL = "https://realpython.github.io/fake-jobs/" +OUTPUT_CSV = "fake_jobs.csv" +CSV_HEADERS = ["Job Title", "Company", "Location", "Date Posted"] + + +def fetch_page(url: str) -> str: + """ + Fetch HTML content from the given URL. + + Args: + url: The URL to fetch + + Returns: + The HTML content as a string + + Raises: + requests.HTTPError: If the request fails + """ + resp = requests.get(url) + resp.raise_for_status() + return resp.text + + +def _get_text(elem) -> str: + """ + Safely extract text from a BeautifulSoup element. + + Args: + elem: A BeautifulSoup element or None + + Returns: + The stripped text content, or empty string if elem is None + """ + return elem.get_text(strip=True) if elem else "" + + +def parse_jobs(html: str) -> List[Dict[str, str]]: + """ + Parse job postings from HTML content. + + Uses class-based find/find_all per https://blog.apify.com/beautifulsoup-find-by-class/ + to extract job data with fallbacks for HTML variations. + + Args: + html: The HTML content to parse + + Returns: + A list of dictionaries, each containing job data with keys: + "Job Title", "Company", "Location", "Date Posted" + """ + soup = BeautifulSoup(html, "html.parser") + + # The Real Python fake-jobs demo wraps each job card inside + #
<div class="card-content">. We use class_ parameter for find_all + # as demonstrated in https://blog.apify.com/beautifulsoup-find-by-class/ + job_cards = soup.find_all("div", class_="card-content") + + jobs = [] + for card in job_cards: + # Extract job title from <h2 class="title">
+ # Fallback: try "title" class first, then any h2 + title_elem = card.find("h2", class_="title") or card.find("h2") + title = _get_text(title_elem) + + # Extract company from <h3 class="company">
+ # Fallback: try "company" class first, then any h3 + company_elem = card.find("h3", class_="company") or card.find("h3") + company = _get_text(company_elem) + + # Extract location from <p class="location">
+ # Fallback: try "location" class first, then first p tag + location_elem = card.find("p", class_="location") or card.find("p") + location = _get_text(location_elem) + + # Extract date posted from