-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: scrape.py
More file actions
29 lines (23 loc) · 880 Bytes
/
scrape.py
File metadata and controls
29 lines (23 loc) · 880 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from bs4 import BeautifulSoup
import requests
import pandas as pd
def scrape_blog(blog_url, site_url):
    """Scrape a blog's listing page and save every article to articles.csv.

    Fetches *blog_url*, extracts each ``li.content_item`` entry from the
    ``div.custom eme_block`` listing container, follows each entry's link
    (relative to *site_url*) to collect the article body and lead image,
    and writes the result to ``articles.csv`` in the working directory.

    Parameters:
        blog_url: Absolute URL of the blog listing page.
        site_url: Site root prepended to each article's relative link.

    Returns:
        The pandas DataFrame that was written to ``articles.csv``
        (columns: Title, Excerpt, Image, Article).

    Raises:
        requests.HTTPError: if either page request fails.
        ValueError: if the expected listing container is missing.
    """
    page = requests.get(blog_url)
    page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # Original bug: `url = {blog_url}` was a set literal over an undefined
    # name — the URL is now a proper parameter.
    listing = soup.find('div', class_='custom eme_block')
    if listing is None:
        raise ValueError(f"listing container 'custom eme_block' not found at {blog_url}")

    blogs = []
    for article in listing.find_all('li', class_='content_item'):
        anchor = article.find('a')
        title = anchor.text
        excerpt = article.find('p').text
        link = anchor.attrs['href']

        # Original bug: f"site_url{link}" embedded the literal text
        # "site_url" instead of interpolating a variable.
        article_url = f"{site_url}{link}"
        detail = requests.get(article_url)
        detail.raise_for_status()
        detail_soup = BeautifulSoup(detail.content, 'html.parser')

        content = detail_soup.find('div', {'id': 'Content_container'})
        # NOTE(review): assumes every article page has a Content_container
        # div with at least one <img> — confirm against the target site.
        image = content.find('img').attrs['src']
        blogs.append([title, excerpt, image, str(content)])

    df = pd.DataFrame(blogs, columns=['Title', 'Excerpt', 'Image', 'Article'])
    df.to_csv('articles.csv')  # keeps the original behavior (index column included)
    return df


if __name__ == "__main__":
    # TODO: fill in the real blog listing URL and site root before running.
    scrape_blog("https://example.com/blog/", "https://example.com")