-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: scrape.py
More file actions
29 lines (23 loc) · 880 Bytes
/
scrape.py
File metadata and controls
29 lines (23 loc) · 880 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from bs4 import BeautifulSoup
import requests
import pandas as pd
def scrape_blog(blog_url, site_url):
    """Scrape a blog's listing page and save every article to articles.csv.

    Fetches *blog_url*, extracts each ``li.content_item`` entry from the
    ``div.custom eme_block`` listing container, follows each entry's link
    (relative to *site_url*) to collect the article body and lead image,
    and writes the result to ``articles.csv`` in the working directory.

    Parameters:
        blog_url: Absolute URL of the blog listing page.
        site_url: Site root prepended to each article's relative link.

    Returns:
        The pandas DataFrame that was written to ``articles.csv``
        (columns: Title, Excerpt, Image, Article).

    Raises:
        requests.HTTPError: if either page request fails.
        ValueError: if the expected listing container is missing.
    """
    page = requests.get(blog_url)
    page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # Original bug: `url = {blog_url}` was a set literal over an undefined
    # name — the URL is now a proper parameter.
    listing = soup.find('div', class_='custom eme_block')
    if listing is None:
        raise ValueError(f"listing container 'custom eme_block' not found at {blog_url}")

    blogs = []
    for article in listing.find_all('li', class_='content_item'):
        anchor = article.find('a')
        title = anchor.text
        excerpt = article.find('p').text
        link = anchor.attrs['href']

        # Original bug: f"site_url{link}" embedded the literal text
        # "site_url" instead of interpolating a variable.
        article_url = f"{site_url}{link}"
        detail = requests.get(article_url)
        detail.raise_for_status()
        detail_soup = BeautifulSoup(detail.content, 'html.parser')

        content = detail_soup.find('div', {'id': 'Content_container'})
        # NOTE(review): assumes every article page has a Content_container
        # div with at least one <img> — confirm against the target site.
        image = content.find('img').attrs['src']
        blogs.append([title, excerpt, image, str(content)])

    df = pd.DataFrame(blogs, columns=['Title', 'Excerpt', 'Image', 'Article'])
    df.to_csv('articles.csv')  # keeps the original behavior (index column included)
    return df


if __name__ == "__main__":
    # TODO: fill in the real blog listing URL and site root before running.
    scrape_blog("https://example.com/blog/", "https://example.com")