ch4ex1.py
import requests
from bs4 import BeautifulSoup


class Content:
    """
    Common base class for all articles/pages
    """

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))

class Website:
    """
    Contains information about website structure
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

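# Despite their names, titleTag and bodyTag hold CSS selectors (as consumed
# by BeautifulSoup's select()), not bare tag names.
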
class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'lxml')
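    # Note (an assumption, not from the original): the 'lxml' parser needs
    # the third-party lxml package installed; the stdlib 'html.parser' is a
    # drop-in fallback. requests.get() is also called without a timeout, so
    # a hung server can stall the crawl; requests.get(url, timeout=10) is a
    # common safeguard.
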
    def safeGet(self, pageObj, selector):
        """
        Utility function used to get a content string from a
        Beautiful Soup object and a selector. Returns an empty
        string if no object is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join(
                [elem.get_text() for elem in selectedElems])
        return ''
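    # select() always returns a list (possibly empty), so the None check
    # above is defensive rather than strictly required. Illustrative call,
    # assuming this markup:
    #   Crawler().safeGet(BeautifulSoup('<h1>Hi</h1>', 'lxml'), 'h1')  # -> 'Hi'
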
    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            # print(title + '-'*10 + body)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

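# parse() prints its results rather than returning them; returning the
# Content object instead (a change not made here) would make the crawler
# easier to test and reuse.
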
crawler = Crawler()

# Note: the CSS selectors and some minor bugs below were fixed relative to
# the original example.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com',
     'h1', 'div.content > span'],
    ['Reuters', 'http://reuters.com',
     'h1', 'div.StandardArticleBody_body'],
    ['Brookings', 'http://www.brookings.edu',
     'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com',
     'h1', 'article#story > section[name="articleBody"]'],
]

websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0],
              'http://shop.oreilly.com/product/0636920028154.do')
print('\n' + '=' * 80 + '\n')
crawler.parse(websites[1],
              'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
print('\n' + '=' * 80 + '\n')
crawler.parse(websites[2],
              'https://www.brookings.edu/articles/'
              'modeling-with-data-tools-and-techniques-for-scientific-computing/')
print('\n' + '=' * 80 + '\n')
crawler.parse(websites[3],
              'https://www.nytimes.com/2018/01/28/business/'
              'energy-environment/oil-boom.html')
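
# A data-driven alternative to the four explicit calls above (an
# illustrative sketch, not part of the original exercise; the URLs simply
# repeat those already used, and the block is left commented out so the
# pages are not fetched twice):
#
# testUrls = [
#     'http://shop.oreilly.com/product/0636920028154.do',
#     'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0',
#     'https://www.brookings.edu/articles/modeling-with-data-tools-and-techniques-for-scientific-computing/',
#     'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html',
# ]
# for site, url in zip(websites, testUrls):
#     crawler.parse(site, url)
#     print('\n' + '=' * 80 + '\n')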