# Day 26: Project - Simple Web Scraper

# We'll use 'requests' to get the HTML and 'BeautifulSoup' to parse it.
import requests
from bs4 import BeautifulSoup

def simple_scraper(url, output_path="scraped_headings.txt", timeout=10):
    """Scrape all h1/h2/h3 headings from *url* and save them to a file.

    Args:
        url: The page to fetch.
        output_path: File the headings are written to, one per line.
        timeout: Seconds to wait for the HTTP response; without one,
            requests.get() can block indefinitely on a stalled server.

    Returns:
        A list of heading strings on success, or None if the request failed.
    """
    try:
        # Step 1: Fetch the HTML content. Keep the try body minimal —
        # only the network call and status check can raise RequestException.
        print(f"Fetching data from {url}...")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise on 4xx/5xx HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Step 2: Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Step 3: Find and extract all h1, h2, and h3 headings
    headings = soup.find_all(["h1", "h2", "h3"])

    # Step 4: Process and store the extracted data
    extracted_data = [heading.get_text().strip() for heading in headings]

    # Step 5: Save the data to a file, one heading per line
    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(item + "\n" for item in extracted_data)

    print(f"Scraping successful! Headings saved to {output_path}")
    return extracted_data
| 35 | + |
# The website to scrape
TARGET_URL = "https://example.com"


def main():
    """Scrape TARGET_URL and print any headings that were extracted."""
    scraped_headings = simple_scraper(TARGET_URL)
    # simple_scraper returns None on failure and [] when no headings exist;
    # both are falsy, so this prints only when something was found.
    if scraped_headings:
        print("\nExtracted Headings:")
        for heading in scraped_headings:
            print(f"- {heading}")


# Guard the entry point so importing this module does not trigger network I/O.
if __name__ == "__main__":
    main()
0 commit comments