-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
123 lines (91 loc) · 5.12 KB
/
scraper.py
File metadata and controls
123 lines (91 loc) · 5.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
################################################################################
# Scrapes every pdf document that is a round from the Science Bowl (SB)
# website and extracts the text from those pdf documents.
#
################################################################################
import urllib2
from StringIO import StringIO
from urllib2 import Request, urlopen
from urlparse import urljoin
#import cookielib
#import os

from bs4 import BeautifulSoup
#import pyPDF2
from pyPdf import PdfFileWriter, PdfFileReader
# Website to scrape
HOME_URL = 'http://science.energy.gov/wdts/nsb/high-school/high-school-regionals/hs-rules-forms-resources/sample-science-bowl-questions/'
# File extension to scrape
FILE_EXT = '.pdf'
# Site-root-relative path prefix that an href must start with to count as a round
ROUND_PATH_PREFIX = '/~/media/wdts/nsb/pdf/hs/pdf'

# Request page content
home_req = urllib2.Request(HOME_URL)
home_content = urllib2.urlopen(home_req)
try:
    # Construct BeautifulSoup object of page content
    soup = BeautifulSoup(home_content)
finally:
    # The response has been parsed (or parsing failed); release the connection
    home_content.close()

# round.txt is where all the text of every file goes. The with-block closes it
# even when a download or a pdf parse raises part-way through the loop.
with open('round.txt', 'w') as f:
    for link in soup.find_all('a'):
        candidate = link.get('href')
        # Anchors without an href give None. Skip them explicitly instead of
        # converting with str(), which also fails on non-ASCII hrefs in Python 2.
        if candidate is None:
            continue
        # Keep only links that point at a pdf ...
        if not candidate.endswith(FILE_EXT):
            continue
        # ... and that live under the rounds directory
        if not candidate.startswith(ROUND_PATH_PREFIX):
            continue

        # The href starts with "/", i.e. it is relative to the site root, so it
        # has to be resolved against HOME_URL rather than appended to it
        # (appending yields ".../sample-science-bowl-questions//~/media/...").
        file_url = urljoin(HOME_URL, candidate)
        # note: can use urlencode to remove dangerous characters such as spaces
        file_url = file_url.replace(" ", "%20")
        print(file_url)  # Full URL

        ############################################################
        # At this point, I have file_url which leads to a pdf I want.
        # Download it fully into memory so pyPdf can seek within it.
        response = urlopen(Request(file_url))
        try:
            remoteFile = response.read()
        finally:
            response.close()
        pdfFile = PdfFileReader(StringIO(remoteFile))

        ############################################################
        # Save the scraped file to output.pdf (overwritten for every round)
        writer = PdfFileWriter()
        for currentPage in pdfFile.pages:
            writer.addPage(currentPage)
        with open("output.pdf", "wb") as outputStream:
            writer.write(outputStream)

        ############################################################
        # Read output.pdf and extract text from each page, spit into
        # round.txt
        with open("output.pdf", "rb") as savedFile:
            pdf = PdfFileReader(savedFile)
            for page in pdf.pages:
                # Drop every non-ASCII character from the extracted text
                pgtxt = page.extractText().encode("ascii", "ignore")
                # remove new lines
                pgtxt = pgtxt.replace('\n', ' ').replace('\r', '')
                print(pgtxt)
                f.write(pgtxt)
# Open and save content of the file in pdf_read
#cj = cookielib.CookieJar()
#opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
#pdf_req = urllib2.Request(file_url)
#pdf_content = opener.open(pdf_req)
#pdf_read = pdf_content.read()
#pdf_content.close()
#opener.close()
# Create the path if not defined
#if not os.path.exists(pathname):
# os.makedirs(pathname)
# Write content of the file to the path
#pdf_write = open("round.txt", 'w')
#pdf_write.write(pdf_read)
#pdf_write.close()
#THIS CODE IS THE ONE I LIKE. DON'T DELETE IT
#f= open('round.txt','w')
#pdfl = getPDFContent("test.pdf").encode("ascii", "ignore")
#f.write(pdfl)
#f.close()
#for pageNumber, page in enumerate(PDFDocument.get_pages()):
#if pageNumber == 42:
# print("sup")
# getPage(i).extractText()