From b6c5f4d6573b1005160d4b70002b2c48dee812f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 18 Oct 2025 14:13:37 +0000 Subject: [PATCH 1/2] Initial plan From b831b8ae581044740dea56b65a6796a8979db991 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 18 Oct 2025 14:19:21 +0000 Subject: [PATCH 2/2] Add scrape_fake_jobs.py script with comprehensive parsing and CSV output Co-authored-by: ausarkhan <218135094+ausarkhan@users.noreply.github.com> --- .gitignore | 6 ++ scripts/scrape_fake_jobs.py | 180 ++++++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 .gitignore create mode 100755 scripts/scrape_fake_jobs.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d0f53b7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Python +__pycache__/ +*.pyc +*.pyo +*.csv + diff --git a/scripts/scrape_fake_jobs.py b/scripts/scrape_fake_jobs.py new file mode 100755 index 0000000..7638ccb --- /dev/null +++ b/scripts/scrape_fake_jobs.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Scrape fake job postings and save them to fake_jobs.csv + +This script demonstrates using BeautifulSoup's class-based find/find_all +features (see https://blog.apify.com/beautifulsoup-find-by-class/) to extract +job postings from the Real Python "fake jobs" demo page and write them to a CSV. + +Output CSV: fake_jobs.csv +Columns (header): Job Title, Company, Location, Date Posted + +Usage: + python3 scripts/scrape_fake_jobs.py + +Requirements: + pip install requests beautifulsoup4 + +The script uses class-based selectors to extract: + - Job Title:

+ - Company: <h3 class="company">

+ - Location: <p class="location">

+ - Date Posted:

def fetch_page(url: str, *, timeout: float = 30.0) -> str:
    """
    Fetch HTML content from the given URL.

    Args:
        url: The URL to fetch
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without this an unresponsive
            server would block the script indefinitely.

    Returns:
        The HTML content as a string

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, invalid URL, ...)
    """
    resp = requests.get(url, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions instead of parsing an error page.
    resp.raise_for_status()
    return resp.text


def _get_text(elem) -> str:
    """
    Safely extract text from a BeautifulSoup element.

    Args:
        elem: A BeautifulSoup element or None

    Returns:
        The stripped text content, or empty string if elem is None
    """
    # Compare against None explicitly: "missing" means None, and an
    # element-like object that happens to be falsy (e.g. one that defines
    # __len__ and has no children) must still have its text extracted.
    return elem.get_text(strip=True) if elem is not None else ""
. We use class_ parameter for find_all + # as demonstrated in https://blog.apify.com/beautifulsoup-find-by-class/ + job_cards = soup.find_all("div", class_="card-content") + + jobs = [] + for card in job_cards: + # Extract job title from

+ # Fallback: try "title" class first, then any h2 + title_elem = card.find("h2", class_="title") or card.find("h2") + title = _get_text(title_elem) + + # Extract company from

+ # Fallback: try "company" class first, then any h3 + company_elem = card.find("h3", class_="company") or card.find("h3") + company = _get_text(company_elem) + + # Extract location from

+ # Fallback: try "location" class first, then first p tag + location_elem = card.find("p", class_="location") or card.find("p") + location = _get_text(location_elem) + + # Extract date posted from