diff --git a/PaperScraper.py b/PaperScraper.py index 5040148..6ec0501 100644 --- a/PaperScraper.py +++ b/PaperScraper.py @@ -4,31 +4,43 @@ # edit these two parameters -URL = "https://warwick.ac.uk/services/exampapers/?q=&department=CS&year=" -COOKIE_TOKEN = "your token here" - +COOKIE_TOKEN = "" s = rq.Session() s.cookies["WarwickSSO"] = COOKIE_TOKEN -def downloadpdf(url: str): +def downloadpdf(url: str, dep: str): file = s.get(url) filename: str = url.split('/')[-1] year: str = url.split('/')[-2] - makedirs(f"papers/{year}", exist_ok=True) + makedirs(f"papers/{dep}/{year}", exist_ok=True) - with open(f"papers/{year}/{filename}", "wb+") as f: + with open(f"papers/{dep}/{year}/{filename}", "wb+") as f: f.write(file.content) -page = s.get(URL).text -soup = BeautifulSoup(page, 'html.parser') - -for tag in soup.find_all("a"): - url: str = tag.get("href") - if url.endswith(".pdf"): - print(url) - downloadpdf(url) +def getDepartment(dep: list[str]): + URL=f"https://warwick.ac.uk/services/exampapers/?q=&department={dep[0]}&year=" + page = s.get(URL).text + soup = BeautifulSoup(page, 'html.parser') + + for tag in soup.find_all("a"): + url: str = tag.get("href") + if url.endswith(".pdf"): + print(url) + downloadpdf(url,dep[1]) + +def chosenDepartments(): + departments=[] + with open("departments.txt") as dep: + for line in dep.readlines(): + if line[0]!="#": + parts=line.strip().split(" ") + departments.append([parts[0]," ".join(parts[1:])]) + return departments + +for department in chosenDepartments(): + getDepartment(department) \ No newline at end of file diff --git a/README.md b/README.md index 2ec639e..6172d93 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ You'll need to provide an SSO token from a cookie so the script can access the p Next open devtools, and navigate to the storage tab. There you should be able to find your cookies for this site. You want with one with the key `WarwickSSO`. 
Copy the value of it, and assign it to the constant `COOKIE_TOKEN` in the script. -By default the url points to all DCS past papers for all years. You can change this by navigating to the page you want the papers from, and changing the `URL` parameter at the top of the script. +By default no departments are selected; you can change this by un-commenting the department(s) of your choice in `departments.txt`. The papers are organised by year. If you want to make a PR to organise them by module or some other way feel free. diff --git a/departments.txt b/departments.txt new file mode 100644 index 0000000..d68ffe3 --- /dev/null +++ b/departments.txt @@ -0,0 +1,23 @@ +#un-comment any departments to download +#ET Applied Linguistics +#CE Centre for Lifelong Learning +#CY Centre for Scientific Computing +#CH Chemistry +#CX Classics & Ancient History +#AM Comparative American Studies +#CS Computer Science +#EC Economics +#EQ Education Studies +#EN English and Comparative Literary Studies +#PX Physics +#PO Politics & International Studies +#PS Psychology +#ES School of Engineering +#AS School of Health & Social Studies +#LA School of Law +#LN School of Modern Languages and Cultures +#SO Sociology +#ST Statistics +#TH Theatre and Performance Studies +#IB Warwick Business School +#MA Warwick Mathematics Institute \ No newline at end of file