Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 26 additions & 14 deletions PaperScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,43 @@


# edit these two parameters
# URL is the listing page that is scraped for PDF links; this one has
# department=CS in its query string.
URL = "https://warwick.ac.uk/services/exampapers/?q=&department=CS&year="
COOKIE_TOKEN = "your token here"

# NOTE: this second assignment rebinds COOKIE_TOKEN, so the empty string is
# the value that ends up in the session cookie below.
COOKIE_TOKEN = ""

# A single session object is used for every request in this script and
# carries the WarwickSSO cookie set here.
# NOTE(review): `rq` is imported outside this excerpt - presumably `requests`.
s = rq.Session()
s.cookies["WarwickSSO"] = COOKIE_TOKEN


def downloadpdf(url: str, dep: str = ""):
    """Download one exam paper and save it under ``papers/``.

    The year folder and the file name are taken from the last two path
    segments of *url* (``.../<year>/<filename>``).

    Args:
        url: Address of the PDF, fetched with the shared session ``s``.
        dep: Folder name for the department. When given, the paper is
            written to ``papers/<dep>/<year>/``; when empty, it is written
            to ``papers/<year>/``.
    """
    file = s.get(url)
    # Do not store an HTTP error body on disk under a .pdf name.
    if not file.ok:
        print(f"skipping {url}: HTTP {file.status_code}")
        return

    segments = url.split('/')
    filename: str = segments[-1]
    year: str = segments[-2]

    folder = f"papers/{dep}/{year}" if dep else f"papers/{year}"
    makedirs(folder, exist_ok=True)

    # "wb" is sufficient: the file is only written, never read back.
    with open(f"{folder}/{filename}", "wb") as f:
        f.write(file.content)


# Fetch the listing page at URL and download every PDF it links to.
page = s.get(URL).text
soup = BeautifulSoup(page, 'html.parser')

for tag in soup.find_all("a"):
    url = tag.get("href")
    # Anchors without an href give None here; calling .endswith on None
    # would raise AttributeError, so skip them together with non-PDF links.
    if url is None or not url.endswith(".pdf"):
        continue
    print(url)
    downloadpdf(url)
def getDepartment(dep: list[str]):
    """Download every PDF linked from one department's exam-paper listing.

    Args:
        dep: Two-item list ``[code, name]``. ``code`` is placed in the
            ``department=`` query parameter of the listing URL; ``name`` is
            passed to ``downloadpdf`` as the folder for the saved files.
    """
    listing_url = f"https://warwick.ac.uk/services/exampapers/?q=&department={dep[0]}&year="
    page = s.get(listing_url).text
    soup = BeautifulSoup(page, 'html.parser')

    for tag in soup.find_all("a"):
        url = tag.get("href")
        # Anchors without an href give None here; calling .endswith on None
        # would raise AttributeError, so skip them together with non-PDF links.
        if url is None or not url.endswith(".pdf"):
            continue
        print(url)
        downloadpdf(url, dep[1])

def chosenDepartments():
    """Read the departments that are enabled in ``departments.txt``.

    Each enabled line has the form ``<code> <name>``: the first
    whitespace-separated token is the department code and the rest of the
    line is its name. Blank lines and lines whose first non-blank character
    is ``#`` are ignored.

    Returns:
        A list of ``[code, name]`` lists in file order. If a line holds only
        a code, the code is also used as the name so that the download
        folder is never empty.
    """
    departments = []
    with open("departments.txt", encoding="utf-8") as dep:
        for line in dep:
            entry = line.strip()
            # A blank line used to produce ["", ""], which requested the
            # unfiltered listing; skip it along with commented-out lines.
            if not entry or entry.startswith("#"):
                continue
            # Split once on any run of whitespace so tabs or repeated spaces
            # after the code do not leak into the name.
            parts = entry.split(None, 1)
            code = parts[0]
            name = parts[1] if len(parts) > 1 else code
            departments.append([code, name])
    return departments

# Script entry: run the download for each department listed as enabled
# in departments.txt.
selected = chosenDepartments()
for entry in selected:
    getDepartment(entry)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ You'll need to provide an SSO token from a cookie so the script can access the p

Next, open devtools and navigate to the storage tab. There you should be able to find your cookies for this site. You want the one with the key `WarwickSSO`. Copy its value and assign it to the constant `COOKIE_TOKEN` in the script.

By default the url points to all DCS past papers for all years. You can change this by navigating to the page you want the papers from, and changing the `URL` parameter at the top of the script.
By default no departments are selected. You can change this by un-commenting the department(s) of your choice in `departments.txt`.

The papers are organised by department and then by year. If you want to make a PR to organise them by module or some other way, feel free.

Expand Down
23 changes: 23 additions & 0 deletions departments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#un-comment any departments to download
#ET Applied Linguistics
#CE Centre for Lifelong Learning
#CY Centre for Scientific Computing
#CH Chemistry
#CX Classics & Ancient History
#AM Comparative American Studies
#CS Computer Science
#EC Economics
#EQ Education Studies
#EN English and Comparative Literary Studies
#PX Physics
#PO Politics & International Studies
#PS Psychology
#ES School of Engineering
#AS School of Health & Social Studies
#LA School of Law
#LN School of Modern Languages and Cultures
#SO Sociology
#ST Statistics
#TH Theatre and Performance Studies
#IB Warwick Business School
#MA Warwick Mathematics Institute