lsbnb
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎netmedex/biocjson_parser.py‎
Lines changed: 89 additions & 62 deletions b/‎netmedex/biocjson_parser.py‎
Lines changed: 89 additions & 62 deletions
diff --git a/‎netmedex/cli.py‎
Lines changed: 50 additions & 41 deletions b/‎netmedex/cli.py‎
Lines changed: 50 additions & 41 deletions
@@ -2,6 +2,7 @@
 dev*/
 */example*
 cache/
+webapp-*/
 pytest.ini
 
 # Byte-compiled / optimized / DLL files
 
@@ -1,50 +1,72 @@
 import logging
 from collections import defaultdict
-from typing import Literal
+from datetime import datetime
+from typing import Any
+
+from netmedex.pubtator_data import PubTatorAnnotation, PubTatorArticle, PubTatorRelation
 
 logger = logging.getLogger(__name__)
 
 
 def biocjson_to_pubtator(
     res_json,
-    retain_ori_text: bool = True,
-    only_abstract: bool = False,
-    role_type: Literal["identifier", "name"] = "identifier",
-):
-    # 2024/05/26: PubTator has changed the format of the response
+    full_text: bool = False,
+) -> list[PubTatorArticle]:
     res_json = res_json["PubTator3"]
 
-    converted_strs = []
+    output = []
     for each_res_json in res_json:
         pmid = each_res_json["pmid"]
 
         title_passage = extract_passage(each_res_json, "TITLE")
         abstract_passage = extract_passage(each_res_json, "ABSTRACT")
+        journal = each_res_json.get("journal")
+        date = None
+        if (date_str := each_res_json.get("date")) is not None:
+            try:
+                date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
+            except Exception:
+                pass
 
         if not title_passage or not abstract_passage:
             continue
 
-        if only_abstract:
-            abstract_idx = title_passage["idx"] + abstract_passage["idx"]
+        if full_text:
+            paragraph_indices = None
         else:
-            abstract_idx = None
+            paragraph_indices = title_passage["idx"] + abstract_passage["idx"]
 
-        annotation_list = get_biocjson_annotations(
-            each_res_json, retain_ori_text, abstract_idx=abstract_idx
+        annotation_list = create_pubtator_annotation(
+            pmid=pmid,
+            annotation_list=get_biocjson_annotations(
+                each_res_json, paragraph_indices=paragraph_indices
+            ),
+        )
+        relation_list = create_pubtator_relation(
+            pmid=pmid, relation_list=get_biocjson_relations(each_res_json)
         )
-        relation_list = get_biocjson_relations(each_res_json, role_type)
-        converted_strs.append(
-            create_pubtator_str(
-                pmid,
-                # Only one title passage
-                title_passage["text"][0],
-                " ".join(abstract_passage["text"]),
-                annotation_list,
-                relation_list,
+
+        # Only one title passage
+        title = title_passage["text"][0]
+        # There may be multiple abstract passages
+        abstract = " ".join(abstract_passage["text"])
+
+        output.append(
+            PubTatorArticle(
+                pmid=pmid,
+                date=date,
+                journal=journal,
+                title=title,
+                abstract=abstract,
+                annotations=annotation_list,
+                relations=relation_list,
+                identifiers={
+                    annotation.mesh: annotation.identifier_name for annotation in annotation_list
+                },
             )
         )
 
-    return "".join(converted_strs)
+    return output
 
 
 def extract_passage(content, name):
@@ -67,26 +89,29 @@ def extract_passage(content, name):
     return passage_info
 
 
-def get_biocjson_annotations(res_json, retain_ori_text, abstract_idx=None):
+def get_biocjson_annotations(res_json, paragraph_indices=None):
     n_passages = len(res_json["passages"])
 
-    annotation_list = []
-    # TODO: extract from specific passages only (if full_text)?
-    if abstract_idx:
-        passages = [res_json["passages"][i]["annotations"] for i in abstract_idx]
+    annotation_list: list[dict[str, Any]] = []
+    if paragraph_indices:
+        passages = [res_json["passages"][i]["annotations"] for i in paragraph_indices]
     else:
         passages = [res_json["passages"][i]["annotations"] for i in range(n_passages)]
+
     for annotation_entries in passages:
         for annotation_entry in annotation_entries:
             annotation = {}
             try:
                 id = annotation_entry["infons"]["identifier"]
             except Exception:
                 id = "-"
-            annotation["id"] = "-" if id == "None" or id is None else id
+            annotation["id"] = "-" if id == "None" or not id else id
             annotation["type"] = annotation_entry["infons"]["type"]
             annotation["locations"] = annotation_entry["locations"][0]
-            annotation["name"] = get_name(retain_ori_text, annotation_entry, annotation)
+            annotation["name"] = annotation_entry["text"]
+            annotation["identifier_name"] = get_identifier_name(
+                annotation_entry, annotation["type"]
+            )
             if annotation["type"] == "Variant":
                 annotation["type"] = annotation_entry["infons"]["subtype"]
 
@@ -97,15 +122,13 @@ def get_biocjson_annotations(res_json, retain_ori_text, abstract_idx=None):
     return annotation_list
 
 
-def get_name(retain_ori_text, annotation_entry, annotation):
+def get_identifier_name(annotation_entry, annotation_type):
     try:
-        if retain_ori_text:
-            name = annotation_entry["text"]
+        if annotation_type == "Species":
             # In type == "species", the entity name is stored in "text"
-        elif annotation["type"] == "Species":
             name = annotation_entry["text"]
             # Variant can be either SNP, DNAMutation, or ProteinMutation
-        elif annotation["type"] == "Variant":
+        elif annotation_type == "Variant":
             # Some variants may not have standardized name
             try:
                 name = annotation_entry["infons"]["name"]
@@ -125,43 +148,47 @@ def get_name(retain_ori_text, annotation_entry, annotation):
     return name
 
 
-def get_biocjson_relations(res_json, role_type):
+def get_biocjson_relations(res_json):
     relation_list = []
     for relation_entry in res_json["relations"]:
         each_relation = {}
-        each_relation["role1"] = relation_entry["infons"]["role1"][role_type]
-        each_relation["role2"] = relation_entry["infons"]["role2"][role_type]
+        each_relation["role1"] = relation_entry["infons"]["role1"]["identifier"]
+        each_relation["name1"] = relation_entry["infons"]["role1"]["name"]
+        each_relation["role2"] = relation_entry["infons"]["role2"]["identifier"]
+        each_relation["name2"] = relation_entry["infons"]["role2"]["name"]
         each_relation["type"] = relation_entry["infons"]["type"]
         relation_list.append(each_relation)
 
     return relation_list
 
 
-def create_pubtator_str(pmid, title, abstract, annotation_list, relation_list):
-    title_str = f"{pmid}|t|{title}\n"
-    abstract_str = f"{pmid}|a|{abstract}\n"
-    annotation_list.sort(key=lambda x: x["locations"]["offset"])
-    annotation_str = [
-        (
-            f"{pmid}\t"
-            f"{annotation['locations']['offset']}\t"
-            f"{annotation['locations']['length'] + annotation['locations']['offset']}\t"
-            f"{annotation['name']}\t"
-            f"{annotation['type']}\t"
-            f"{annotation['id']}"
+def create_pubtator_annotation(pmid: str, annotation_list: list[dict[str, Any]]):
+    return sorted(
+        [
+            PubTatorAnnotation(
+                pmid=pmid,
+                start=annotation["locations"]["offset"],
+                end=annotation["locations"]["length"] + annotation["locations"]["offset"],
+                name=annotation["name"],
+                identifier_name=annotation["identifier_name"],
+                type=annotation["type"],
+                mesh=annotation["id"],
+            )
+            for annotation in annotation_list
+        ],
+        key=lambda x: (x.start, x.end),
+    )
+
+
+def create_pubtator_relation(pmid: str, relation_list: list[dict[str, Any]]):
+    return [
+        PubTatorRelation(
+            pmid=pmid,
+            relation_type=relation["type"],
+            mesh1=relation["role1"],
+            name1=relation["name1"],
+            mesh2=relation["role2"],
+            name2=relation["name2"],
         )
-        for annotation in annotation_list
-    ]
-    relation_str = [
-        (f"{pmid}\t" f"{relation['type']}\t" f"{relation['role1']}\t" f"{relation['role2']}")
         for relation in relation_list
     ]
-
-    return (
-        title_str
-        + abstract_str
-        + "\n".join(annotation_str)
-        + "\n"
-        + "\n".join(relation_str)
-        + "\n\n"
-    )
 
@@ -16,90 +16,99 @@ def main():
 
 
 def pubtator_entry(args):
+    from netmedex.cli_utils import load_pmids
     from netmedex.exceptions import EmptyInput, NoArticles, UnsuccessfulRequest
-    from netmedex.pubtator_core import PubTatorAPI
-    from netmedex.pubtator_utils import create_savepath, load_pmids
+    from netmedex.pubtator import PubTatorAPI
 
+    # Logging
     debug = args.debug
-    logfile_name = "search" if debug else None
+    logfile_name = "pubtator-api" if debug else None
     config_logger(debug, logfile_name)
 
+    # Input
     num_inputs = sum(arg is not None for arg in [args.pmids, args.pmid_file, args.query])
     if num_inputs != 1:
         logger.info("Please specify only one of the following: --query, --pmids, --pmid_file")
         sys.exit()
 
+    # Config
     query = None
     pmid_list = None
     if args.query is not None:
-        search_type = "query"
         query = args.query
-    elif args.pmids is not None:
-        search_type = "pmids"
-        pmid_list = load_pmids(args.pmids, load_from="string")
-        logger.info(f"Find {len(pmid_list)} PMIDs")
-    elif args.pmid_file is not None:
-        search_type = "pmids"
-        logger.info(f"Load PMIDs from: {args.pmid_file}")
-        pmid_list = load_pmids(args.pmid_file, load_from="file")
-        logger.info(f"Find {len(pmid_list)} PMIDs")
-
-    if search_type == "query":
-        suffix = query.replace(" ", "_")
-    if search_type == "pmids":
-        if pmid_list:
-            suffix = f"{pmid_list[0]}_total_{len(pmid_list)}"
-        else:
-            suffix = ""
-    savepath = create_savepath(args.output, type=search_type, suffix=suffix)
-
-    pubtator_api = PubTatorAPI(
+        suffix = query.replace(" ", "_").replace('"', "")
+        savepath = args.output if args.output is not None else f"./query_{suffix}.pubtator"
+    else:
+        if args.pmids is not None:
+            pmid_list = load_pmids(args.pmids, load_from="string")
+        elif args.pmid_file is not None:
+            logger.info(f"Load PMIDs from: {args.pmid_file}")
+            pmid_list = load_pmids(args.pmid_file, load_from="file")
+        logger.info(f"Found {len(pmid_list)} PMIDs")
+        suffix = f"{pmid_list[0]}_total_{len(pmid_list)}" if pmid_list else ""
+        savepath = args.output if args.output is not None else f"./pmids_{suffix}.pubtator"
+
+    # Always use "biocjson" format
+    request_format = "biocjson"
+
+    # Request articles
+    api = PubTatorAPI(
         query=query,
         pmid_list=pmid_list,
-        savepath=str(savepath),
-        search_type=search_type,
         sort=args.sort,
+        request_format=request_format,
         max_articles=args.max_articles,
         full_text=args.full_text,
-        use_mesh=args.use_mesh,
-        debug=args.debug,
         queue=None,
     )
 
     try:
-        pubtator_api.run()
+        collection = api.run()
+        with open(savepath, "w") as f:
+            f.write(collection.to_pubtator_str(annotation_use_identifier_name=args.use_mesh))
+        logger.info(f"Save PubTator file to {savepath}")
     except (NoArticles, EmptyInput, UnsuccessfulRequest) as e:
         logger.error(str(e))
 
 
 def network_entry(args):
-    from netmedex.network_core import NetworkBuilder
+    from netmedex.graph import PubTatorGraphBuilder, save_graph
+    from netmedex.pubtator_parser import PubTatorIO
 
+    # Logging
     debug = args.debug
-    logfile_name = "network" if debug else None
+    logfile_name = "graph" if debug else None
     config_logger(debug, logfile_name)
 
+    # Input
     pubtator_filepath = Path(args.input)
+    if not pubtator_filepath.exists():
+        logger.error(f"PubTator file not found: {pubtator_filepath}")
+        sys.exit()
+
+    # Output
     if args.output is None:
         savepath = pubtator_filepath.with_suffix(f".{args.format}")
     else:
         savepath = Path(args.output)
         savepath.parent.mkdir(parents=True, exist_ok=True)
 
-    network_builder = NetworkBuilder(
-        pubtator_filepath=str(pubtator_filepath),
-        savepath=str(savepath),
-        node_type=args.node_type,
-        output_filetype=args.format,
+    # Parse input PubTator file
+    collection = PubTatorIO.parse(pubtator_filepath)
+
+    # Graph
+    graph_builder = PubTatorGraphBuilder(node_type=args.node_type)
+    graph_builder.add_collection(collection)
+    G = graph_builder.build(
+        pmid_weights=args.pmid_weight,
         weighting_method=args.weighting_method,
         edge_weight_cutoff=args.cut_weight,
-        pmid_weight_filepath=args.pmid_weight,
-        max_edges=args.max_edges,
         community=args.community,
-        debug=args.debug,
+        max_edges=args.max_edges,
     )
 
-    network_builder.run()
+    # Save graph
+    save_graph(G, savepath, output_filetype=args.format)
 
 
 def webapp_entry(args):
@@ -218,7 +227,7 @@ def get_network_parser():
     parser.add_argument(
         "-f",
         "--format",
-        choices=["xgmml", "html", "json"],
+        choices=["xgmml", "html", "json", "pickle"],
         default="html",
         help="Output format (default: html)",
     )