MrTango · J535D165 · May 22, 2025 · May 22, 2025 · May 23, 2025 · May 23, 2025
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,8 @@ venv
 
 # created from tests
 export.ris
+
+# extra benchmark data only for internal use (because of copyright)
+benchmark_data
+tests/test_benchmark_extra.py
+benchmark_*.svg
diff --git a/README.md b/README.md
@@ -176,7 +176,6 @@ so these may need to be modified for specific export systems:
  'TI': 'title',
  'TT': 'translated_title',
  'TY': 'type_of_reference',
- 'UK': 'unknown_tag',
  'UR': 'urls',
  'VL': 'volume',
  'Y1': 'publication_year',

diff --git a/rispy/__init__.py b/rispy/__init__.py
@@ -1,7 +1,7 @@
 """A Python reader/writer of RIS reference files"""
 
 from .config import LIST_TYPE_TAGS, TAG_KEY_MAPPING, TYPE_OF_REFERENCE_MAPPING
-from .parser import RisParser, WokParser, load, loads
+from .parser import PubMedParser, RisParser, WokParser, load, loads
 from .writer import BaseWriter, RisWriter, dump, dumps
 
 __version__ = "0.9.0"
@@ -11,6 +11,7 @@
     "TAG_KEY_MAPPING",
     "TYPE_OF_REFERENCE_MAPPING",
     "BaseWriter",
+    "PubMedParser",
     "RisParser",
     "RisWriter",
     "WokParser",

diff --git a/rispy/config.py b/rispy/config.py
@@ -80,7 +80,6 @@
     "Y1": "publication_year",
     "Y2": "access_date",
     "ER": "end_of_reference",
-    "UK": "unknown_tag",
 }
 
 TYPE_OF_REFERENCE_MAPPING = {
@@ -227,3 +226,127 @@
     "ER": "end_of_record",
     "EF": "end_of_file",
 }
+PUBMED_LIST_TYPE_TAGS = {
+    "AD",
+    "AID",
+    "AU",
+    "AUID",
+    "CIN",
+    "CON",
+    "CN",
+    "RN",
+    "EDAT",
+    "EIN",
+    "FAU",
+    "FIR",
+    "GR",
+    "IR",
+    "IRAD",
+    "IS",
+    "LA",
+    "LID",
+    "MHDA",
+    "MH",
+    "OT",
+    "PHST",
+    "PST",
+    "PT",
+    "PMC",
+    "SI",
+    "SO",
+    "SB",
+}
+
+# from https://pubmed.ncbi.nlm.nih.gov/help/#pubmed-format
+PUBMED_TAG_KEY_MAPPING = {
+    "AB": "abstract",
+    "AD": "affiliation",
+    "AID": "article_identifier",
+    "AU": "author",
+    "AUID": "author_identifier",
+    "BTI": "book_title",
+    "CI": "copyright_information",
+    "CIN": "comment_in",
+    "CN": "corporate_author",
+    "COIS": "conflict_of_interest",  # mistake in the doc (COIS instead of COI, March 2025)
+    "CON": "comment_on",
+    "CP": "chapter",
+    "CRDT": "create_date",
+    "CRF": "corrected_and_republished_from",
+    "CRI": "corrected_and_republished_in",
+    "CTDT": "contribution_date",
+    "CTI": "collection_title",
+    "DCOM": "completion_date",
+    "DDIN": "dataset_described_in",
+    "DRIN": "dataset_use_reported_in",
+    "DEP": "date_of_electronic_publication",
+    "DP": "publication_date",
+    "DRDT": "date_revised",
+    "ECF": "expression_of_concern_for",
+    "ECI": "expression_of_concern_in",
+    "EDAT": "entry_date",
+    "EFR": "erratum_for",
+    "EIN": "erratum_in",
+    "ED": "editor",
+    "EN": "edition",
+    "FAU": "full_author_name",
+    "FED": "full_editor_name",
+    "FIR": "full_investigator_name",
+    "FPS": "full_personal_name_as_subject",
+    "GN": "general_note",
+    "GR": "grants_and_funding",
+    "GS": "gene_symbol",
+    "IP": "issue",
+    "IR": "investigator",
+    "IRAD": "investigator_affiliation",
+    "IS": "issn",
+    "ISBN": "isbn",
+    "JID": "nlm_unique_id",
+    "JT": "full_journal_title",
+    "LA": "language",
+    "LID": "location_id",
+    "LR": "modification_date",
+    "MH": "mesh_terms",
+    "MHDA": "mesh_date",
+    "MID": "manuscript_identifier",
+    "NM": "substance_name",
+    "OAB": "other_abstract",
+    "OABL": "other_abstract_language",
+    "OCI": "other_copyright_information",
+    "OID": "other_id",
+    "ORI": "original_report_in",
+    "OT": "other_term",
+    "OTO": "other_term_owner",
+    "OWN": "owner",
+    "PB": "publisher",
+    "PG": "pagination",
+    "PHST": "publication_history_status_date",
+    "PL": "place_of_publication",
+    "PMC": "pubmed_central_identifier",
+    "PMCR": "pmc_release",
+    "PMID": "pubmed_unique_identifier",
+    "PS": "personal_name_as_subject",
+    "PST": "publication_status",
+    "PT": "publication_type",
+    "RF": "number_of_references",
+    "RIN": "retraction_in",
+    "RN": "ec_rn_number",
+    "ROF": "retraction_of",
+    "RPF": "republished_from",
+    "RPI": "republished_in",
+    "RRI": "retracted_and_republished_in",
+    "RRF": "retracted_and_republished_from",
+    "SB": "subset",
+    "SFM": "space_flight_mission",
+    "SI": "secondary_source_id",
+    "SO": "source",
+    "SPIN": "summary_for_patients_in",
+    "STAT": "status_tag",
+    "TA": "journal_title_abbreviation",
+    "TI": "title",
+    "TT": "transliterated_title",
+    "UIN": "update_in",
+    "UOF": "update_of",
+    "VI": "volume",
+    "VTI": "volume_title",
+}
diff --git a/rispy/parser.py b/rispy/parser.py
@@ -7,6 +7,8 @@
 from .config import (
     DELIMITED_TAG_MAPPING,
     LIST_TYPE_TAGS,
+    PUBMED_LIST_TYPE_TAGS,
+    PUBMED_TAG_KEY_MAPPING,
     TAG_KEY_MAPPING,
     WOK_LIST_TYPE_TAGS,
     WOK_TAG_KEY_MAPPING,
@@ -45,7 +47,6 @@ class RisParser:
 
     START_TAG: str = "TY"
     END_TAG: str = "ER"
-    UNKNOWN_TAG: str = "UK"
     PATTERN: str
     DEFAULT_IGNORE: ClassVar[list[str]] = []
     DEFAULT_MAPPING: dict = TAG_KEY_MAPPING
@@ -63,6 +64,7 @@ def __init__(
         skip_unknown_tags: bool = False,
         enforce_list_tags: bool = True,
         newline: Optional[str] = None,
+        undo_wrapping: bool = False,
     ):
         """Initialize the parser function.
 
@@ -98,6 +100,7 @@ def __init__(
         self.skip_unknown_tags = skip_unknown_tags
         self.enforce_list_tags = enforce_list_tags
         self.newline = newline if newline is not None else self.DEFAULT_NEWLINE
+        self.undo_wrapping = undo_wrapping
 
     def _iter_till_start(self, lines) -> dict:
         while True:
@@ -122,24 +125,39 @@ def parse_lines(self, lines: Union[TextIO, list[str]]) -> list[dict]:
             while True:
                 tag, content = self.parse_line(next(lines))
 
-                if tag is None:
-                    self._add_tag(record, last_tag, content, extend_multiline=True)
-                    continue
-
                 if tag in self.ignore:
                     continue
 
-                if tag == self.END_TAG:
+                if self.END_TAG and tag == self.END_TAG:
                     result.append(record)
-
+                    last_tag = tag
                     record = self._iter_till_start(lines)
                     continue
 
-                self._add_tag(record, tag, content)
-                last_tag = tag
+                if self.END_TAG is None and tag == self.START_TAG:
+                    result.append(record)
+                    record = {self.mapping[self.START_TAG]: content}
+                    last_tag = tag
+                    continue
+
+                if tag is None and not self.undo_wrapping and last_tag in self.list_tags:
+                    self._add_tag(record, last_tag, content)
+                elif tag is None:
+                    self._extend_tag(record, last_tag, content)
+                else:
+                    self._add_tag(record, tag, content)
+                    last_tag = tag
 
         except StopIteration:
-            return result
+            pass
+
+        if self.END_TAG is not None and last_tag != self.END_TAG:
+            raise ParseError(f"Missing end tag: {self.END_TAG}")
+
+        if self.END_TAG is None:
+            result.append(record)
+
+        return result
 
     def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
         """Parse line of RIS file.
@@ -178,18 +196,11 @@ def _add_single_value(
         The output for a tag can be a list when a delimiter is specified,
         even if it is not a list tag.
         """
-        if not is_multi:
-            if self.enforce_list_tags or name not in record:
-                ignore_this_if_has_one = value
-                record.setdefault(name, ignore_this_if_has_one)
-            else:
-                self._add_list_value(record, name, value)
+        if self.enforce_list_tags or name not in record:
+            ignore_this_if_has_one = value
+            record.setdefault(name, ignore_this_if_has_one)
         else:
-            value_must_exist_or_is_bug = record[name]
-            if isinstance(value, list):
-                record[name].extend(value)
-            else:
-                record[name] = " ".join((value_must_exist_or_is_bug, value))
+            self._add_list_value(record, name, value)
 
     def _add_list_value(self, record: dict, name: str, value: Union[str, list[str]]) -> None:
         """Process tags with multiple values."""
@@ -204,29 +215,34 @@ def _add_list_value(self, record: dict, name: str, value: Union[str, list[str]])
             must_exist = record[name]
             record[name] = [must_exist, *value_list]
 
-    def _add_tag(
-        self, record: dict, tag: str, content: str, extend_multiline: bool = False
-    ) -> None:
+    def _extend_tag(self, record: dict, tag: str, content: Union[str, list[str]]) -> None:
+        """Append a continuation line to the last value stored for a tag."""
+
+        sep = " " if self.undo_wrapping else "\n"
+
+        name = self.mapping[tag]
+        if isinstance(record[name], list):
+            record[name][-1] = sep.join((record[name][-1], content))
+        else:
+            record[name] = sep.join((record[name], content))
+
+    def _add_tag(self, record: dict, tag: str, content: str) -> None:
         try:
             name = self.mapping[tag]
         except KeyError:
             if self.skip_unknown_tags:
                 return
 
-            # handle unknown tag
-            name = self.mapping[self.UNKNOWN_TAG]
-            if name not in record:
-                record[name] = defaultdict(list)
-            record[name][tag].append(content)
-
+            record.setdefault("unknown_tag", defaultdict(list))[tag].append(content)
+            return
         else:
             if delimiter := self.delimiter_map.get(tag):
                 content = [i.strip() for i in content.split(delimiter)]
 
             if tag in self.list_tags:
                 self._add_list_value(record, name, content)
             else:
-                self._add_single_value(record, name, content, is_multi=extend_multiline)
+                self._add_single_value(record, name, content)
 
 
 class WokParser(RisParser):
@@ -238,6 +254,9 @@ class WokParser(RisParser):
     DEFAULT_LIST_TAGS = WOK_LIST_TYPE_TAGS
     DEFAULT_DELIMITER_MAPPING: ClassVar[dict] = {}
 
+    def __init__(self, undo_wrapping: bool = True, **kw):
+        super().__init__(undo_wrapping=undo_wrapping, **kw)
+
     def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
         """Parse line of RIS file.
 
@@ -262,6 +281,38 @@ def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
             return (line[0:2], line[3:].strip())
 
 
+class PubMedParser(RisParser):
+    """Subclass of RisParser for reading PubMed format files."""
+
+    START_TAG: str = "PMID"
+    END_TAG: None = None
+    DEFAULT_MAPPING: dict = PUBMED_TAG_KEY_MAPPING
+    DEFAULT_LIST_TAGS: list[str] = PUBMED_LIST_TYPE_TAGS
+    DEFAULT_DELIMITER_MAPPING: ClassVar[dict] = {}
+
+    def __init__(self, undo_wrapping: bool = True, **kw):
+        super().__init__(undo_wrapping=undo_wrapping, **kw)
+
+    def parse_line(self, line: str) -> Union[tuple[str, str], tuple[None, str]]:
+        """Parse line of PubMed file.
+
+        Parameters
+        ----------
+        line : str
+            Line of a PubMed format file (a tag line or a continuation line).
+
+        Returns
+        -------
+        tuple
+            Tuple of (tag, content); the tag is None for a continuation line.
+        """
+
+        if line[4:5] == "-":
+            return (line[0:4].rstrip(), line[6:].rstrip())
+        else:
+            return (None, line[6:].rstrip())
+
+
 def load(
     file: Union[TextIO, Path],
     *,

diff --git a/rispy/writer.py b/rispy/writer.py
@@ -117,8 +117,11 @@ def _format_reference(self, ref, count, n):
             try:
                 tag = self._rev_mapping[label.lower()]
             except KeyError:
-                warnings.warn(UserWarning(f"label `{label}` not exported"), stacklevel=2)
-                continue
+                if label.lower() == "unknown_tag":
+                    tag = self.UNKNOWN_TAG
+                else:
+                    warnings.warn(UserWarning(f"label `{label}` not exported"), stacklevel=2)
+                    continue
 
             # ignore
             if tag in tags_to_skip: