18 changes: 14 additions & 4 deletions README.md
@@ -57,12 +57,22 @@ However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org)

```py
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
medrxiv() # Takes <1h -> +90K papers (~200 MB file)
biorxiv() # Up to 6h -> +400K papers (~800 MB file)
chemrxiv() # Takes <15min -> +50K papers (~30 MB file)
medrxiv() # Takes <30min -> +100K papers (~200 MB file)
biorxiv()   # Takes <3h   -> +450K papers (~800 MB file)
```
*NOTE*: Once the dumps are stored, please make sure to restart the Python interpreter so that the changes take effect.
*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
*NOTE*: If you experience API connection issues, retries and request behavior can be tuned, e.g.:

```py
biorxiv(
max_retries=12,
request_timeout=(5.0, 45.0), # connect timeout, read timeout
retry_backoff_seconds=1.0, # initial retry backoff
max_workers=8, # number of parallel date windows
window_days=30, # smaller windows increase parallelism
)
```
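
For intuition on how `window_days` and `max_workers` interact: per the docstrings, the requested date range is split into fixed-size windows that are fetched by parallel workers, so the window count bounds useful parallelism. A back-of-the-envelope sketch (the date range below is illustrative, not a library default):

```py
from datetime import date

# Illustrative range only; actual server coverage differs.
start, end = date(2019, 1, 1), date(2025, 1, 1)
window_days = 30

# Ceiling division: number of windows the range splits into.
n_windows = -(-(end - start).days // window_days)
print(n_windows)  # 74 -> with max_workers=8, up to 8 windows in flight
```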

Since v0.2.5, `paperscraper` also allows scraping {med/bio/chem}rxiv for specific dates.
```py
medrxiv(start_date="2020-06-01", end_date="2020-06-09")  # dates in YYYY-MM-DD
```
44 changes: 29 additions & 15 deletions paperscraper/get_dumps/biorxiv.py
@@ -1,11 +1,8 @@
"""Dump bioRxiv data in JSONL format."""

import json
import os
from datetime import datetime
from typing import Optional

from tqdm import tqdm
from typing import Optional, Tuple

from ..utils import get_server_dumps_dir
from ..xrxiv.xrxiv_api import BioRxivApi
@@ -22,6 +19,10 @@ def biorxiv(
end_date: Optional[str] = None,
save_path: str = save_path,
max_retries: int = 10,
request_timeout: Tuple[float, float] = (5.0, 30.0),
retry_backoff_seconds: float = 1.0,
window_days: int = 30,
max_workers: int = 8,
):
"""Fetches papers from biorxiv based on time range, i.e., start_date and end_date.
If the start_date and end_date are not provided, papers will be fetched from biorxiv
@@ -37,15 +38,28 @@
Defaults to save_path.
max_retries (int, optional): Number of retries when API shows connection issues.
Defaults to 10.
request_timeout (Tuple[float, float], optional): (connect timeout, read timeout).
Defaults to (5.0, 30.0).
retry_backoff_seconds (float, optional): Initial retry backoff.
Defaults to 1.0.
window_days (int, optional): Date-window size used for pagination.
Defaults to 30.
max_workers (int, optional): Number of parallel workers over date windows.
Defaults to 8.
"""
# create API client
api = BioRxivApi(max_retries=max_retries)

# dump all papers
with open(save_path, "w") as fp:
for index, paper in enumerate(
tqdm(api.get_papers(start_date=start_date, end_date=end_date))
):
if index > 0:
fp.write(os.linesep)
fp.write(json.dumps(paper))
api = BioRxivApi(
max_retries=max_retries,
request_timeout=request_timeout,
retry_backoff_seconds=retry_backoff_seconds,
window_days=max(1, int(window_days)),
)
api.dump_papers(
save_path=save_path,
start_date=start_date,
end_date=end_date,
max_retries=max_retries,
max_workers=max_workers,
window_days=window_days,
deduplicate_dois=False,
show_progress=True,
)
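
This refactor replaces the hand-rolled `tqdm` loop (one `json.dumps` per line) with a single `dump_papers` call on the API client; the medrxiv change below is identical. A rough sketch of the parallel date-window pattern the new parameters imply, using hypothetical helpers rather than the library's actual internals:

```py
import json
from concurrent.futures import ThreadPoolExecutor

def dump_papers_sketch(fetch_window, windows, save_path, max_workers=8):
    """fetch_window(window) -> list of paper dicts for that date window."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in submission order, so the JSONL output
        # stays ordered even though windows are fetched concurrently.
        results = pool.map(fetch_window, windows)
        with open(save_path, "w") as fp:
            for papers in results:
                for paper in papers:
                    fp.write(json.dumps(paper) + "\n")
```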
43 changes: 29 additions & 14 deletions paperscraper/get_dumps/medrxiv.py
@@ -1,11 +1,8 @@
"""Dump medrxiv data in JSONL format."""

import json
import os
from datetime import datetime
from typing import Optional

from tqdm import tqdm
from typing import Optional, Tuple

from ..utils import get_server_dumps_dir
from ..xrxiv.xrxiv_api import MedRxivApi
@@ -20,6 +17,10 @@ def medrxiv(
end_date: Optional[str] = None,
save_path: str = save_path,
max_retries: int = 10,
request_timeout: Tuple[float, float] = (5.0, 30.0),
retry_backoff_seconds: float = 1.0,
window_days: int = 30,
max_workers: int = 8,
):
"""Fetches papers from medrxiv based on time range, i.e., start_date and end_date.
If the start_date and end_date are not provided, then papers will be fetched from
@@ -35,14 +36,28 @@
Defaults to save_path.
max_retries (int, optional): Number of retries when API shows connection issues.
Defaults to 10.
request_timeout (Tuple[float, float], optional): (connect timeout, read timeout).
Defaults to (5.0, 30.0).
retry_backoff_seconds (float, optional): Initial retry backoff.
Defaults to 1.0.
window_days (int, optional): Date-window size used for pagination.
Defaults to 30.
max_workers (int, optional): Number of parallel workers over date windows.
Defaults to 8.
"""
# create API client
api = MedRxivApi(max_retries=max_retries)
# dump all papers
with open(save_path, "w") as fp:
for index, paper in enumerate(
tqdm(api.get_papers(start_date=start_date, end_date=end_date))
):
if index > 0:
fp.write(os.linesep)
fp.write(json.dumps(paper))
api = MedRxivApi(
max_retries=max_retries,
request_timeout=request_timeout,
retry_backoff_seconds=retry_backoff_seconds,
window_days=max(1, int(window_days)),
)
api.dump_papers(
save_path=save_path,
start_date=start_date,
end_date=end_date,
max_retries=max_retries,
max_workers=max_workers,
window_days=window_days,
deduplicate_dois=False,
show_progress=True,
)
1 change: 1 addition & 0 deletions paperscraper/get_dumps/utils/chemrxiv/utils.py
@@ -73,6 +73,7 @@ def get_metrics(metrics_list: List[Dict]) -> Dict:
# This assumes that the .jsonl is constructed at roughly the same date
# where this entry was obtained from the API
metric_dict.update({"timestamp": today})
return metric_dict


def parse_dump(source_path: str, target_path: str) -> None:
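Context for this one-line fix: without the `return`, `get_metrics` built `metric_dict` and then implicitly returned `None`, as Python functions without a return statement do. A minimal repro:

```py
def build_without_return():
    metric_dict = {"views": 10}
    metric_dict.update({"timestamp": "2025-01-01"})
    # missing return -> the caller receives None

print(build_without_return())  # None
```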
6 changes: 5 additions & 1 deletion paperscraper/load_dumps.py
@@ -20,7 +20,11 @@
dump_root = get_server_dumps_dir()

for db in ["biorxiv", "chemrxiv", "medrxiv"]:
dump_paths = glob.glob(os.path.join(dump_root, db + "*"))
dump_paths = [
path
for path in glob.glob(os.path.join(dump_root, db + "*"))
if os.path.isfile(path) and path.endswith(".jsonl")
]
if not dump_paths:
logger.warning(f" No dump found for {db}. Skipping entry.")
continue
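The stricter filter only picks up flat `.jsonl` files under the dump root, skipping directories and files with other suffixes. A quick illustration with hypothetical filenames:

```py
# Hypothetical entries under the dump root (the isfile() check in the
# real code additionally excludes directories):
candidates = [
    "biorxiv_2025-01-01.jsonl",     # kept: .jsonl suffix
    "biorxiv_2025-01-01.jsonl.gz",  # skipped: wrong suffix
    "biorxiv_backup",               # skipped: no .jsonl suffix
]
print([p for p in candidates if p.endswith(".jsonl")])
# ['biorxiv_2025-01-01.jsonl']
```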
35 changes: 31 additions & 4 deletions paperscraper/pdf/pdf.py
@@ -335,10 +335,37 @@ def save_pdf(
):
pdf_url = meta_pdf.get("content")
try:
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
pdf_headers = {
**user_agent,
"Accept": "application/pdf,application/octet-stream;q=0.9,*/*;q=0.8",
"Referer": resolved_url,
}
pdf_response = None
# Some publisher endpoints (e.g. Nature) are sensitive to cookies and
# referer headers. Try with cookies from the landing page first.
if "response" in locals():
try:
pdf_response = requests.get(
pdf_url,
timeout=60,
headers=pdf_headers,
cookies=response.cookies,
allow_redirects=True,
)
pdf_response.raise_for_status()
except Exception:
pdf_response = None

if pdf_response is None:
pdf_response = requests.get(
pdf_url,
timeout=60,
headers=pdf_headers,
allow_redirects=True,
)
pdf_response.raise_for_status()

if response.content[:4] != b"%PDF":
if pdf_response.content[:4] != b"%PDF":
logger.warning(
f"The file from {url} does not appear to be a valid PDF."
)
@@ -360,7 +387,7 @@ def save_pdf(
return True
else:
with open(output_path.with_suffix(".pdf"), "wb+") as f:
f.write(response.content)
f.write(pdf_response.content)
except Exception as e:
logger.warning(f"Could not download {pdf_url}: {e}")
else: # if no citation_pdf_url meta tag found, try other fallbacks
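The new download logic tries a cookie-bearing request first, because some publisher endpoints (e.g. Nature) reject PDF requests that lack the landing page's cookies and referer, then falls back to a bare request. The same two-attempt pattern as a standalone helper; a sketch under those assumptions, not part of the module:

```py
import requests

def fetch_pdf(pdf_url, headers, landing_cookies=None, timeout=60):
    """Try with landing-page cookies first; fall back to a bare request."""
    attempts = []
    if landing_cookies is not None:
        attempts.append({"cookies": landing_cookies})
    attempts.append({})  # bare request as the final attempt
    last_error = None
    for extra in attempts:
        try:
            resp = requests.get(
                pdf_url,
                timeout=timeout,
                headers=headers,
                allow_redirects=True,
                **extra,
            )
            resp.raise_for_status()
            if resp.content[:4] == b"%PDF":  # magic-byte sanity check
                return resp
            last_error = ValueError("response is not a PDF")
        except requests.RequestException as e:
            last_error = e
    raise last_error
```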