forked from AjaniStewart/SpringerLinkDownload
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdl.py
More file actions
executable file
·136 lines (122 loc) · 4.86 KB
/
dl.py
File metadata and controls
executable file
·136 lines (122 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/local/bin/python3
'''
Author: Ajani Stewart
Contributors: Fortrieb
This code is licensed under the MIT license
'''
from sys import argv
from csv import DictReader
from os import mkdir, chdir, path
from argparse import ArgumentParser, Namespace
from threading import Lock
from functools import reduce
import logging
import glob
import requests
import concurrent.futures
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Downloader")
BASE_PDF_URL = "https://link.springer.com/content/pdf/"
BASE_EPUB_URL = "https://link.springer.com/download/epub/"
def dl_arg_parser(cli_args=None) -> Namespace:
    """Parse cli argument for download program

    :param cli_args: optional list of argument strings to parse instead of
        the process command line; ``None`` (the default) makes argparse
        read ``sys.argv[1:]``
    :return: parsed argument namespace with ``folder`` (str), ``epub``
        (bool) and ``debug`` (bool)
    """
    parser = ArgumentParser()
    parser.add_argument("folder", type=str, help="Path folder with CSV files")
    # store_true flags already default to False and derive ``dest`` from
    # the option name, so neither needs to be spelled out.
    parser.add_argument("--epub",
                        action="store_true",
                        help="Trying to download ePub file")
    parser.add_argument("--debug",
                        action="store_true",
                        help="Enable debug logging")
    return parser.parse_args(cli_args)
def download_book(csv, folder: str, epub: bool, max_workers: int = 4) -> int:
    """ Process a csv file containing book meta data

    Every row of the CSV file is handed to ``process_row`` on a thread
    pool; the per-row download counts are summed once all rows are done.

    :param csv: CSV file path
    :param folder: save files in this folder path
    :param epub: Epub download
    :param max_workers: number of rows processed concurrently
    :return: amount of downloaded files
    """
    # The ``with`` block shuts the pool down (waiting for submitted rows)
    # even if reading the CSV file raises half-way through.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # newline="" lets the csv module handle embedded line breaks itself.
        with open(csv, "r", newline="") as csv_file:
            reader = DictReader(csv_file, quotechar='"', skipinitialspace=True)
            pending = [pool.submit(process_row, row, folder, epub)
                       for row in reader]
    # All futures are finished here; result() re-raises a worker exception.
    return sum(future.result() for future in pending)
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 60


def _fetch_to_file(url: str, target: str, kind: str, doi: str) -> int:
    """ Download one URL and store the response body on disk

    The target file is only created after a 200 response, so a failed
    download never leaves an empty file behind.

    :param url: address to download
    :param target: file path the content is written to
    :param kind: label used in log messages ("PDF" or "EPUB")
    :param doi: DOI of the item, used in log messages
    :return: 1 if the file was written, 0 otherwise
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # A network failure for one item must not abort the whole batch.
        log.error("%s download failed for %s: %s", kind, doi, err)
        return 0
    if response.status_code != 200:
        log.error("%s not found for %s", kind, doi)
        return 0
    with open(target, 'wb') as book:
        book.write(response.content)
    log.info("Finished %s", path.basename(target))
    return 1


def process_row(row: dict, folder: str, epub: bool) -> int:
    """ Row processesing

    Builds the download URLs from the row's DOI and stores the PDF (and,
    on request, the EPUB) as ``<year>_<content type>_<title>.<ext>``.

    :param row: current row from CSV file; the keys 'Item Title',
        'Item DOI', 'Publication Year' and 'Content Type' are read
    :param folder: folder to store file
    :param epub: EPUB file should downloaded
    :return: amount of downloaded files
    """
    row_title = row['Item Title'].strip('\"')
    log.debug("Start downloading: %s", row_title)

    doi = row['Item DOI']
    # Encode every slash rather than indexing split('/') parts, so a DOI
    # with no slash or with several slashes is still handled.
    encoded_doi = doi.replace('/', '%2f')

    stem = '_'.join((row['Publication Year'],
                     row['Content Type'],
                     row_title.replace(' ', '')))
    # A path separator inside the title would point open() at a
    # non-existing sub directory.
    for separator in filter(None, (path.sep, path.altsep)):
        stem = stem.replace(separator, '_')

    download_counter = 0
    # Epub
    if epub:
        epub_path = path.join(folder, stem + ".epub")
        log.debug("Epub: %s", epub_path)
        download_counter += _fetch_to_file(
            BASE_EPUB_URL + encoded_doi + ".epub", epub_path, "EPUB", doi)
    # PDF
    pdf_path = path.join(folder, stem + ".pdf")
    log.debug("PDF: %s", pdf_path)
    download_counter += _fetch_to_file(
        BASE_PDF_URL + encoded_doi + ".pdf", pdf_path, "PDF", doi)
    return download_counter
def main() -> None:
    """ Download the books listed in every CSV file of the given folder

    For each ``*.csv`` file in the folder passed on the command line a
    sub directory named after the file (without extension) is created
    next to it, and the file is processed by ``download_book``.
    """
    args = dl_arg_parser()
    # check debug status
    if args.debug:
        log.setLevel(logging.DEBUG)
    log.debug("CSV files in folder %s", args.folder)
    # read CSV files
    csv_files = glob.glob(path.join(args.folder, "*.csv"))
    log.debug("Files found: %s", csv_files)
    if not csv_files:
        log.warning("No CSV files found in %s", args.folder)

    futures = []
    # Leaving the ``with`` block waits until all submitted files are done
    # and releases the worker threads even when an error occurs.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        for csv_path in csv_files:
            log.debug("Process file: %s", csv_path)
            base_name = path.basename(csv_path)
            out_dir = path.join(args.folder, path.splitext(base_name)[0])
            # check output directory is present
            if not path.exists(out_dir):
                log.debug("Output directory not exists. Create %s", out_dir)
                mkdir(out_dir)
            log.debug("Output dir: %s", out_dir)
            futures.append(
                executor.submit(download_book, csv_path, out_dir, args.epub))

    count_files = sum(future.result() for future in futures)
    log.info("%s files downloaded", count_files)


if __name__ == "__main__":
    main()