
Commit ef193e5

Implement simple web scraper using requests and BeautifulSoup
1 parent a1f6b35 commit ef193e5

1 file changed: 43 additions & 0 deletions
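Note: the script depends on the third-party packages requests and beautifulsoup4, typically installed with pip install requests beautifulsoup4.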

# Day 26: Project - Simple Web Scraper

# We'll use 'requests' to get the HTML and 'BeautifulSoup' to parse it.
import requests
from bs4 import BeautifulSoup

def simple_scraper(url):
    """Scrapes all the headings from a given URL."""
    try:
        # Step 1: Fetch the HTML content (a timeout keeps the request
        # from hanging indefinitely on an unresponsive server)
        print(f"Fetching data from {url}...")
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for 4xx/5xx responses

        # Step 2: Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Step 3: Find and extract all h1, h2, and h3 headings
        headings = soup.find_all(['h1', 'h2', 'h3'])

        # Step 4: Process and store the extracted data
        extracted_data = [heading.get_text().strip() for heading in headings]

        # Step 5: Save the data to a file
        with open("scraped_headings.txt", "w", encoding="utf-8") as file:
            for item in extracted_data:
                file.write(item + "\n")

        print("Scraping successful! Headings saved to scraped_headings.txt")
        return extracted_data

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# The website to scrape
target_url = "https://example.com"
scraped_headings = simple_scraper(target_url)

if scraped_headings:
    print("\nExtracted Headings:")
    for heading in scraped_headings:
        print(f"- {heading}")
