Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ venv

# created from tests
export.ris

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove from this PR?

# extra benchmark data only for internal use (because of copyright)
benchmark_data
tests/test_benchmark_extra.py
benchmark_*.svg
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ so these may need to be modified for specific export systems:
'TI': 'title',
'TT': 'translated_title',
'TY': 'type_of_reference',
'UK': 'unknown_tag',
'UR': 'urls',
'VL': 'volume',
'Y1': 'publication_year',
Expand Down
3 changes: 2 additions & 1 deletion rispy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""A Python reader/writer of RIS reference files"""

from .config import LIST_TYPE_TAGS, TAG_KEY_MAPPING, TYPE_OF_REFERENCE_MAPPING
from .parser import RisParser, WokParser, load, loads
from .parser import PubMedParser, RisParser, WokParser, load, loads
from .writer import BaseWriter, RisWriter, dump, dumps

__version__ = "0.9.0"
Expand All @@ -11,6 +11,7 @@
"TAG_KEY_MAPPING",
"TYPE_OF_REFERENCE_MAPPING",
"BaseWriter",
"PubMedParser",
"RisParser",
"RisWriter",
"WokParser",
Expand Down
125 changes: 124 additions & 1 deletion rispy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@
"Y1": "publication_year",
"Y2": "access_date",
"ER": "end_of_reference",
"UK": "unknown_tag",
}

TYPE_OF_REFERENCE_MAPPING = {
Expand Down Expand Up @@ -227,3 +226,127 @@
"ER": "end_of_record",
"EF": "end_of_file",
}
# PubMed tags handled as list-type tags. PubMedParser uses this set as its
# DEFAULT_LIST_TAGS; the parser routes tags found in its list-tag collection
# through the list-accumulating code path instead of storing a single value.
# Kept in alphabetical order; being a set, the order carries no meaning.
PUBMED_LIST_TYPE_TAGS = {
    "AD", "AID", "AU", "AUID",
    "CIN", "CN", "CON",
    "EDAT", "EIN",
    "FAU", "FIR",
    "GR",
    "IR", "IRAD", "IS",
    "LA", "LID",
    "MH", "MHDA",
    "OT",
    "PHST", "PMC", "PST", "PT",
    "RN",
    "SB", "SI", "SO",
}

# Mapping from PubMed field tags (the 2-4 letter code at the start of a line)
# to the key names used in parsed records. PubMedParser uses this dict as its
# DEFAULT_MAPPING.
# from https://pubmed.ncbi.nlm.nih.gov/help/#pubmed-format
PUBMED_TAG_KEY_MAPPING = {
    "AB": "abstract",
    "AD": "affiliation",
    "AID": "article_identifier",
    "AU": "author",
    "AUID": "author_identifier",
    "BTI": "book_title",
    "CI": "copyright_information",
    "CIN": "comment_in",
    "CN": "corporate_author",
    "COIS": "conflict_of_interest",  # mistake in the doc (COIS instead of COI, March 2025)
    "CON": "comment_on",
    "CP": "chapter",
    "CRDT": "create_date",
    "CRF": "corrected_and_republished_from",
    "CRI": "corrected_and_republished_in",
    "CTDT": "contribution_date",
    "CTI": "collection_title",
    "DCOM": "completion_date",
    "DDIN": "dataset_described_in",
    "DRIN": "dataset_use_reported_in",
    "DEP": "date_of_electronic_publication",
    "DP": "publication_date",
    "DRDT": "date_revised",
    "ECF": "expression_of_concern_for",
    "ECI": "expression_of_concern_in",
    "EDAT": "entry_date",
    "EFR": "erratum_for",
    "EIN": "erratum_in",
    "ED": "editor",
    "EN": "edition",
    "FAU": "full_author_name",
    "FED": "full_editor_name",
    "FIR": "full_investigator_name",
    "FPS": "full_personal_name_as_subject",
    "GN": "general_note",
    "GR": "grants_and_funding",
    "GS": "gene_symbol",
    "IP": "issue",
    "IR": "investigator",
    "IRAD": "investigator_affiliation",
    "IS": "issn",
    "ISBN": "isbn",
    "JID": "nlm_unique_id",
    "JT": "full_journal_title",
    "LA": "language",
    "LID": "location_id",
    "LR": "modification_date",
    "MH": "mesh_terms",
    "MHDA": "mesh_date",
    "MID": "manuscript_identifier",
    "NM": "substance_name",
    "OAB": "other_abstract",
    "OABL": "other_abstract_language",
    "OCI": "other_copyright_information",
    "OID": "other_id",
    "ORI": "original_report_in",
    "OT": "other_term",
    "OTO": "other_term_owner",
    "OWN": "owner",
    "PB": "publisher",
    "PG": "pagination",
    "PHST": "publication_history_status_date",
    "PL": "place_of_publication",
    "PMC": "pubmed_central_identifier",
    "PMCR": "pmc_release",
    "PMID": "pubmed_unique_identifier",
    "PS": "personal_name_as_subject",
    "PST": "publication_status",
    "PT": "publication_type",
    "RF": "number_of_references",
    "RIN": "retraction_in",
    "RN": "ec_rn_number",
    "ROF": "retraction_of",
    "RPF": "republished_from",
    "RPI": "republished_in",
    "RRI": "retracted_and_republished_in",
    "RRF": "retracted_and_republished_from",
    "SB": "subset",
    "SFM": "space_flight_mission",
    "SI": "secondary_source_id",
    "SO": "source",
    "SPIN": "summary_for_patients_in",
    "STAT": "status_tag",
    "TA": "journal_title_abbreviation",
    "TI": "title",
    "TT": "transliterated_title",
    "UIN": "update_in",
    "UOF": "update_of",
    "VI": "volume",
    "VTI": "volume_title",
}
113 changes: 82 additions & 31 deletions rispy/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from .config import (
DELIMITED_TAG_MAPPING,
LIST_TYPE_TAGS,
PUBMED_LIST_TYPE_TAGS,
PUBMED_TAG_KEY_MAPPING,
TAG_KEY_MAPPING,
WOK_LIST_TYPE_TAGS,
WOK_TAG_KEY_MAPPING,
Expand Down Expand Up @@ -45,7 +47,6 @@ class RisParser:

START_TAG: str = "TY"
END_TAG: str = "ER"
UNKNOWN_TAG: str = "UK"
PATTERN: str
DEFAULT_IGNORE: ClassVar[list[str]] = []
DEFAULT_MAPPING: dict = TAG_KEY_MAPPING
Expand All @@ -63,6 +64,7 @@ def __init__(
skip_unknown_tags: bool = False,
enforce_list_tags: bool = True,
newline: Optional[str] = None,
undo_wrapping: bool = False,
):
"""Initialize the parser function.

Expand Down Expand Up @@ -98,6 +100,7 @@ def __init__(
self.skip_unknown_tags = skip_unknown_tags
self.enforce_list_tags = enforce_list_tags
self.newline = newline if newline is not None else self.DEFAULT_NEWLINE
self.undo_wrapping = undo_wrapping

def _iter_till_start(self, lines) -> dict:
while True:
Expand All @@ -122,24 +125,39 @@ def parse_lines(self, lines: Union[TextIO, list[str]]) -> list[dict]:
while True:
tag, content = self.parse_line(next(lines))

if tag is None:
self._add_tag(record, last_tag, content, extend_multiline=True)
continue

if tag in self.ignore:
continue

if tag == self.END_TAG:
if self.END_TAG and tag == self.END_TAG:
result.append(record)

last_tag = tag
record = self._iter_till_start(lines)
continue

self._add_tag(record, tag, content)
last_tag = tag
if self.END_TAG is None and tag == self.START_TAG:
result.append(record)
record = {self.mapping[self.START_TAG]: content}
last_tag = tag
continue

if tag is None and not self.undo_wrapping and last_tag in self.list_tags:
self._add_tag(record, last_tag, content)
elif tag is None:
self._extend_tag(record, last_tag, content)
else:
self._add_tag(record, tag, content)
last_tag = tag

except StopIteration:
return result
pass

if self.END_TAG is not None and last_tag != self.END_TAG:
raise ParseError(f"Missing end tag: {self.END_TAG}")

if self.END_TAG is None:
result.append(record)

return result

def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
"""Parse line of RIS file.
Expand Down Expand Up @@ -178,18 +196,11 @@ def _add_single_value(
The output for a tag can be a list when a delimiter is specified,
even if it is not a list tag.
"""
if not is_multi:
if self.enforce_list_tags or name not in record:
ignore_this_if_has_one = value
record.setdefault(name, ignore_this_if_has_one)
else:
self._add_list_value(record, name, value)
if self.enforce_list_tags or name not in record:
ignore_this_if_has_one = value
record.setdefault(name, ignore_this_if_has_one)
else:
value_must_exist_or_is_bug = record[name]
if isinstance(value, list):
record[name].extend(value)
else:
record[name] = " ".join((value_must_exist_or_is_bug, value))
self._add_list_value(record, name, value)

def _add_list_value(self, record: dict, name: str, value: Union[str, list[str]]) -> None:
"""Process tags with multiple values."""
Expand All @@ -204,29 +215,34 @@ def _add_list_value(self, record: dict, name: str, value: Union[str, list[str]])
must_exist = record[name]
record[name] = [must_exist, *value_list]

def _add_tag(
self, record: dict, tag: str, content: str, extend_multiline: bool = False
) -> None:
def _extend_tag(self, record: dict, tag: str, content: Union[str, list[str]]) -> None:
"""Extend tags with multiline values."""

sep = " " if self.undo_wrapping else "\n"

name = self.mapping[tag]
if isinstance(record[name], list):
record[name][-1] = sep.join((record[name][-1], content))
else:
record[name] = sep.join((record[name], content))

def _add_tag(self, record: dict, tag: str, content: str) -> None:
try:
name = self.mapping[tag]
except KeyError:
if self.skip_unknown_tags:
return

# handle unknown tag
name = self.mapping[self.UNKNOWN_TAG]
if name not in record:
record[name] = defaultdict(list)
record[name][tag].append(content)

record.setdefault("unknown_tag", defaultdict(list))[tag].append(content)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we removed `UK`, make `"unknown_tag"` a named constant (`UNKNOWN_TAG`); right now it's a magic string used in a few places.

return
else:
if delimiter := self.delimiter_map.get(tag):
content = [i.strip() for i in content.split(delimiter)]

if tag in self.list_tags:
self._add_list_value(record, name, content)
else:
self._add_single_value(record, name, content, is_multi=extend_multiline)
self._add_single_value(record, name, content)


class WokParser(RisParser):
Expand All @@ -238,6 +254,9 @@ class WokParser(RisParser):
DEFAULT_LIST_TAGS = WOK_LIST_TYPE_TAGS
DEFAULT_DELIMITER_MAPPING: ClassVar[dict] = {}

def __init__(self, undo_wrapping: bool = True, **kw):
    """Initialize the parser with ``undo_wrapping`` enabled by default.

    Parameters
    ----------
    undo_wrapping : bool
        Forwarded to the parent initializer; defaults to ``True`` here.
    **kw
        Any other keyword arguments, forwarded unchanged to the parent
        initializer.
    """
    super().__init__(undo_wrapping=undo_wrapping, **kw)

def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
"""Parse line of RIS file.

Expand All @@ -262,6 +281,38 @@ def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
return (line[0:2], line[3:].strip())


class PubMedParser(RisParser):
    """Parser for files in the PubMed tagged format.

    A record starts at a ``PMID`` line. There is no end tag
    (``END_TAG`` is ``None``).
    """

    START_TAG: str = "PMID"
    END_TAG: None = None
    DEFAULT_MAPPING: dict = PUBMED_TAG_KEY_MAPPING
    DEFAULT_LIST_TAGS: list[str] = PUBMED_LIST_TYPE_TAGS
    DEFAULT_DELIMITER_MAPPING: ClassVar[dict] = {}

    def __init__(self, undo_wrapping: bool = True, **kw):
        """Initialize the parser with ``undo_wrapping`` enabled by default.

        All other keyword arguments are passed through to the parent
        initializer.
        """
        super().__init__(undo_wrapping=undo_wrapping, **kw)

    def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
        """Split one line of a PubMed file into tag and content.

        Parameters
        ----------
        line : str
            A single line of the PubMed file.

        Returns
        -------
        tuple
            ``(tag, content)`` when the fifth character is ``-``. The tag is
            the first four characters with trailing whitespace removed.
            Otherwise ``(None, content)``, marking a continuation line. In
            both cases the content is everything from the seventh character
            on, right-stripped.
        """
        # Slicing never raises IndexError, so short or blank lines need no
        # length guard; they simply yield ``(None, "")``.
        content = line[6:].rstrip()
        is_tagged = line[4:5] == "-"
        tag = line[0:4].rstrip() if is_tagged else None
        return (tag, content)


def load(
file: Union[TextIO, Path],
*,
Expand Down
7 changes: 5 additions & 2 deletions rispy/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,11 @@ def _format_reference(self, ref, count, n):
try:
tag = self._rev_mapping[label.lower()]
except KeyError:
warnings.warn(UserWarning(f"label `{label}` not exported"), stacklevel=2)
continue
if label.lower() == "unknown_tag":
Copy link

Copilot AI May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ensure that the constant for unknown tags (self.UNKNOWN_TAG) is defined and maintained consistently across modules. Consider extracting this value into a shared configuration to avoid discrepancies between parser and writer behavior.

Suggested change
if label.lower() == "unknown_tag":
if label.lower() == self.UNKNOWN_TAG:

Copilot uses AI. Check for mistakes.
tag = self.UNKNOWN_TAG
else:
warnings.warn(UserWarning(f"label `{label}` not exported"), stacklevel=2)
continue

# ignore
if tag in tags_to_skip:
Expand Down
Loading