scrape.py
#!/usr/bin/env python3
import urllib.request as req
from bs4 import BeautifulSoup as bs
import emailFunction
import csv
import time
import os.path as path
'''
This program scrapes https://news.ycombinator.com/ for the first 30 articles on the front page. The
program then saves each article's title, link, score, and age in separate columns of a .csv file.
@author: Jonathan Shreckengost (jonathanshrek@gmail.com)
'''
class Scrape:
    def __init__(self, url, savePath):
        self.url = url
        self.path = savePath
        self.moment = time.strftime("%Y-%b-%d_%H_%M", time.localtime())
        self.fileName = "hackerNews " + self.moment
        self.completeName = path.join(self.path, self.fileName + ".csv")
    def hackScraper(self):
        # Error handling in case the initial request fails
        try:
            page = req.urlopen(self.url)
            pageCount = 1
            while True:
                # Parse the HTML from the site
                soup = bs(page, features="html5lib")
                # Loop over each story and collect its title and link in separate lists
                links = []
                titles = []
                for storyLink in soup.find_all('a', class_="storylink"):
                    links.append(storyLink.get("href"))
                    titles.append(storyLink.string)
                # Loop over each article and collect its Hacker News score in a list
                articleScores = []
                for score in soup.find_all('span', class_="score"):
                    articleScores.append(score.string)
                # Loop over each article and scrape its age (time since posting);
                # this is relative to when the scrape occurs
                ages = []
                for x in soup.find_all('span', class_='age'):
                    ages.append(x.string)
                # The "More" link at the bottom of the page, if present; kept in a
                # variable because it is needed in several places below
                more = soup.find_all('a', class_="morelink")
                moreLink = None
                for x in more:
                    moreLink = x.get("href")
                # The first page also sets the column headings
                if pageCount == 1:
                    with open(self.completeName, 'a', newline='') as f:
                        csv.writer(f).writerow(["TITLES", "LINKS", "SCORE", "AGE"])
                # Append this page's rows so every page (including the first) is saved
                with open(self.completeName, 'a', newline='') as f:
                    csv_writer = csv.writer(f)
                    for row in zip(titles, links, articleScores, ages):
                        csv_writer.writerow(row)
                print(pageCount)
                # This block allows the scraper to continue if there are more pages
                if more:
                    pageCount += 1
                    # time.sleep(30)
                    # Set a page limit to however many pages you would like to scrape,
                    # or leave this commented out to scrape the entire site:
                    # if pageCount == 300:
                    #     break
                    try:
                        page = req.urlopen("https://news.ycombinator.com/" + moreLink)
                    except Exception:
                        return print("Could not make request!")
                else:
                    break
        except Exception:
            return print("Could not make request!")
        try:
            emailFunction.email(self.fileName + ".csv")
        except Exception:
            return print("Could not send email!")
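

# A minimal usage sketch, not part of the original file: the URL and save path
# below are placeholder assumptions, and emailFunction must be configured
# separately for the email step to succeed.
if __name__ == "__main__":
    scraper = Scrape("https://news.ycombinator.com/", ".")
    scraper.hackScraper()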