From 0e8a49a128aa37f43e965f013ec0b58dd1d5067a Mon Sep 17 00:00:00 2001 From: Joseph Evans Date: Thu, 29 Jul 2021 01:20:34 +0100 Subject: [PATCH 1/3] made it easy to scrape multiple departments --- .vscode/settings.json | 5 +++++ PaperScraper.py | 40 ++++++++++++++++++++++++++-------------- README.md | 29 +++++++++++++++++++++++++++++ departments.txt | 23 +++++++++++++++++++++++ 4 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 departments.txt diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a3318e4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cSpell.words": [ + "pastpapers" + ] +} \ No newline at end of file diff --git a/PaperScraper.py b/PaperScraper.py index 5040148..6ec0501 100644 --- a/PaperScraper.py +++ b/PaperScraper.py @@ -4,31 +4,43 @@ # edit these two parameters -URL = "https://warwick.ac.uk/services/exampapers/?q=&department=CS&year=" -COOKIE_TOKEN = "your token here" - +COOKIE_TOKEN = "" s = rq.Session() s.cookies["WarwickSSO"] = COOKIE_TOKEN -def downloadpdf(url: str): +def downloadpdf(url: str, dep: str): file = s.get(url) filename: str = url.split('/')[-1] year: str = url.split('/')[-2] - makedirs(f"papers/{year}", exist_ok=True) + makedirs(f"papers/{dep}/{year}", exist_ok=True) - with open(f"papers/{year}/{filename}", "wb+") as f: + with open(f"papers/{dep}/{year}/{filename}", "wb+") as f: f.write(file.content) -page = s.get(URL).text -soup = BeautifulSoup(page, 'html.parser') - -for tag in soup.find_all("a"): - url: str = tag.get("href") - if url.endswith(".pdf"): - print(url) - downloadpdf(url) +def getDepartment(dep: list[str]): + URL=f"https://warwick.ac.uk/services/exampapers/?q=&department={dep[0]}&year=" + page = s.get(URL).text + soup = BeautifulSoup(page, 'html.parser') + + for tag in soup.find_all("a"): + url: str = tag.get("href") + if url.endswith(".pdf"): + print(url) + downloadpdf(url,dep[1]) + +def 
chosenDepartments(): + departments=[] + with open("departments.txt") as dep: + for line in dep.readlines(): + if line[0]!="#": + parts=line.strip().split(" ") + departments.append([parts[0]," ".join(parts[1:])]) + return departments + +for department in chosenDepartments(): + getDepartment(department) \ No newline at end of file diff --git a/README.md b/README.md index 2ec639e..f291032 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,35 @@ By default the url points to all DCS past papers for all years. You can change t The papers are organised by year. If you want to make a PR to organise them by module or some other way feel free. +## Department Codes + +Each department has a two letter code + +department| code +--|-- +Applied Linguistics | ET +Centre for Lifelong Learning | CE +Centre for Scientific Computing | CY +Chemistry | CH +Classics & Ancient History | CX +Comparative American Studies | AM +Computer Science | CS +Economics | EC +Education Studies | EQ +English and Comparative Literary Studies | EN +Physics |PX +Politics & International Studies |PO +Psychology |PS +School of Engineering |ES +School of Health & Social Studies |AS +School of Law |LA +School of Modern Languages and Cultures |LN +Sociology |SO +Statistics |ST +Theatre and Performance Studies |TH +Warwick Business School |IB +Warwick Mathematics Institute |MA + ## Disclaimer This script is provided for personal use only. Downloaded papers may not be shared with anyone, even other students. I am not responsible for anything you do with this script, nor anything you do with the files you download using it. 
diff --git a/departments.txt b/departments.txt new file mode 100644 index 0000000..d68ffe3 --- /dev/null +++ b/departments.txt @@ -0,0 +1,23 @@ +#un-comment any departments to download +#ET Applied Linguistics +#CE Centre for Lifelong Learning +#CY Centre for Scientific Computing +#CH Chemistry +#CX Classics & Ancient History +#AM Comparative American Studies +#CS Computer Science +#EC Economics +#EQ Education Studies +#EN English and Comparative Literary Studies +#PX Physics +#PO Politics & International Studies +#PS Psychology +#ES School of Engineering +#AS School of Health & Social Studies +#LA School of Law +#LN School of Modern Languages and Cultures +#SO Sociology +#ST Statistics +#TH Theatre and Performance Studies +#IB Warwick Business School +#MA Warwick Mathematics Institute \ No newline at end of file From 0b4ba9bcae05133d521676efa64090cad29dca29 Mon Sep 17 00:00:00 2001 From: Joseph Date: Thu, 29 Jul 2021 01:25:20 +0100 Subject: [PATCH 2/3] removed department codes from Readme, not needed --- README.md | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/README.md b/README.md index f291032..6172d93 100644 --- a/README.md +++ b/README.md @@ -8,39 +8,10 @@ You'll need to provide an SSO token from a cookie so the script can access the p Next open devtools, and navigate to the storage tab. There you should be able to find your cookies for this site. You want with one with the key `WarwickSSO`. Copy the value of it, and assign it to the constant `COOKIE_TOKEN` in the script. -By default the url points to all DCS past papers for all years. You can change this by navigating to the page you want the papers from, and changing the `URL` parameter at the top of the script. +By default no departments are selected; you can change this by un-commenting the department(s) of your choice in departments.txt. The papers are organised by year. If you want to make a PR to organise them by module or some other way feel free. 
-## Department Codes - -Each department has a two letter code - -department| code ---|-- -Applied Linguistics | ET -Centre for Lifelong Learning | CE -Centre for Scientific Computing | CY -Chemistry | CH -Classics & Ancient History | CX -Comparative American Studies | AM -Computer Science | CS -Economics | EC -Education Studies | EQ -English and Comparative Literary Studies | EN -Physics |PX -Politics & International Studies |PO -Psychology |PS -School of Engineering |ES -School of Health & Social Studies |AS -School of Law |LA -School of Modern Languages and Cultures |LN -Sociology |SO -Statistics |ST -Theatre and Performance Studies |TH -Warwick Business School |IB -Warwick Mathematics Institute |MA - ## Disclaimer This script is provided for personal use only. Downloaded papers may not be shared with anyone, even other students. I am not responsible for anything you do with this script, nor anything you do with the files you download using it. From f50f321fb1c569abae524814117244ec86ca067d Mon Sep 17 00:00:00 2001 From: Joseph Date: Thu, 29 Jul 2021 01:30:34 +0100 Subject: [PATCH 3/3] removed vscode settings --- .vscode/settings.json | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index a3318e4..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "cSpell.words": [ - "pastpapers" - ] -} \ No newline at end of file