Skip to content

Commit 23cd149

Browse files
committed
Release v0.2.0
1 parent 1fd3401 commit 23cd149

43 files changed

Lines changed: 8963 additions & 2027 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
dev*/
33
*/example*
44
cache/
5+
webapp-*/
56
pytest.ini
67

78
# Byte-compiled / optimized / DLL files

netmedex/biocjson_parser.py

Lines changed: 89 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,72 @@
11
import logging
22
from collections import defaultdict
3-
from typing import Literal
3+
from datetime import datetime
4+
from typing import Any
5+
6+
from netmedex.pubtator_data import PubTatorAnnotation, PubTatorArticle, PubTatorRelation
47

58
logger = logging.getLogger(__name__)
69

710

811
def biocjson_to_pubtator(
912
res_json,
10-
retain_ori_text: bool = True,
11-
only_abstract: bool = False,
12-
role_type: Literal["identifier", "name"] = "identifier",
13-
):
14-
# 2024/05/26: PubTator has changed the format of the response
13+
full_text: bool = False,
14+
) -> list[PubTatorArticle]:
1515
res_json = res_json["PubTator3"]
1616

17-
converted_strs = []
17+
output = []
1818
for each_res_json in res_json:
1919
pmid = each_res_json["pmid"]
2020

2121
title_passage = extract_passage(each_res_json, "TITLE")
2222
abstract_passage = extract_passage(each_res_json, "ABSTRACT")
23+
journal = each_res_json.get("journal")
24+
date = None
25+
if (date_str := each_res_json.get("date")) is not None:
26+
try:
27+
date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
28+
except Exception:
29+
pass
2330

2431
if not title_passage or not abstract_passage:
2532
continue
2633

27-
if only_abstract:
28-
abstract_idx = title_passage["idx"] + abstract_passage["idx"]
34+
if full_text:
35+
paragraph_indices = None
2936
else:
30-
abstract_idx = None
37+
paragraph_indices = title_passage["idx"] + abstract_passage["idx"]
3138

32-
annotation_list = get_biocjson_annotations(
33-
each_res_json, retain_ori_text, abstract_idx=abstract_idx
39+
annotation_list = create_pubtator_annotation(
40+
pmid=pmid,
41+
annotation_list=get_biocjson_annotations(
42+
each_res_json, paragraph_indices=paragraph_indices
43+
),
44+
)
45+
relation_list = create_pubtator_relation(
46+
pmid=pmid, relation_list=get_biocjson_relations(each_res_json)
3447
)
35-
relation_list = get_biocjson_relations(each_res_json, role_type)
36-
converted_strs.append(
37-
create_pubtator_str(
38-
pmid,
39-
# Only one title passage
40-
title_passage["text"][0],
41-
" ".join(abstract_passage["text"]),
42-
annotation_list,
43-
relation_list,
48+
49+
# Only one title passage
50+
title = title_passage["text"][0]
51+
# There may be multiple abstract passages
52+
abstract = " ".join(abstract_passage["text"])
53+
54+
output.append(
55+
PubTatorArticle(
56+
pmid=pmid,
57+
date=date,
58+
journal=journal,
59+
title=title,
60+
abstract=abstract,
61+
annotations=annotation_list,
62+
relations=relation_list,
63+
identifiers={
64+
annotation.mesh: annotation.identifier_name for annotation in annotation_list
65+
},
4466
)
4567
)
4668

47-
return "".join(converted_strs)
69+
return output
4870

4971

5072
def extract_passage(content, name):
@@ -67,26 +89,29 @@ def extract_passage(content, name):
6789
return passage_info
6890

6991

70-
def get_biocjson_annotations(res_json, retain_ori_text, abstract_idx=None):
92+
def get_biocjson_annotations(res_json, paragraph_indices=None):
7193
n_passages = len(res_json["passages"])
7294

73-
annotation_list = []
74-
# TODO: extract from specific passages only (if full_text)?
75-
if abstract_idx:
76-
passages = [res_json["passages"][i]["annotations"] for i in abstract_idx]
95+
annotation_list: list[dict[str, Any]] = []
96+
if paragraph_indices:
97+
passages = [res_json["passages"][i]["annotations"] for i in paragraph_indices]
7798
else:
7899
passages = [res_json["passages"][i]["annotations"] for i in range(n_passages)]
100+
79101
for annotation_entries in passages:
80102
for annotation_entry in annotation_entries:
81103
annotation = {}
82104
try:
83105
id = annotation_entry["infons"]["identifier"]
84106
except Exception:
85107
id = "-"
86-
annotation["id"] = "-" if id == "None" or id is None else id
108+
annotation["id"] = "-" if id == "None" or not id else id
87109
annotation["type"] = annotation_entry["infons"]["type"]
88110
annotation["locations"] = annotation_entry["locations"][0]
89-
annotation["name"] = get_name(retain_ori_text, annotation_entry, annotation)
111+
annotation["name"] = annotation_entry["text"]
112+
annotation["identifier_name"] = get_identifier_name(
113+
annotation_entry, annotation["type"]
114+
)
90115
if annotation["type"] == "Variant":
91116
annotation["type"] = annotation_entry["infons"]["subtype"]
92117

@@ -97,15 +122,13 @@ def get_biocjson_annotations(res_json, retain_ori_text, abstract_idx=None):
97122
return annotation_list
98123

99124

100-
def get_name(retain_ori_text, annotation_entry, annotation):
125+
def get_identifier_name(annotation_entry, annotation_type):
101126
try:
102-
if retain_ori_text:
103-
name = annotation_entry["text"]
127+
if annotation_type == "Species":
104128
# In type == "species", the entity name is stored in "text"
105-
elif annotation["type"] == "Species":
106129
name = annotation_entry["text"]
107130
# Variant can be either SNP, DNAMutation, or ProteinMutation
108-
elif annotation["type"] == "Variant":
131+
elif annotation_type == "Variant":
109132
# Some variants may not have standardized name
110133
try:
111134
name = annotation_entry["infons"]["name"]
@@ -125,43 +148,47 @@ def get_name(retain_ori_text, annotation_entry, annotation):
125148
return name
126149

127150

128-
def get_biocjson_relations(res_json, role_type):
151+
def get_biocjson_relations(res_json):
129152
relation_list = []
130153
for relation_entry in res_json["relations"]:
131154
each_relation = {}
132-
each_relation["role1"] = relation_entry["infons"]["role1"][role_type]
133-
each_relation["role2"] = relation_entry["infons"]["role2"][role_type]
155+
each_relation["role1"] = relation_entry["infons"]["role1"]["identifier"]
156+
each_relation["name1"] = relation_entry["infons"]["role1"]["name"]
157+
each_relation["role2"] = relation_entry["infons"]["role2"]["identifier"]
158+
each_relation["name2"] = relation_entry["infons"]["role2"]["name"]
134159
each_relation["type"] = relation_entry["infons"]["type"]
135160
relation_list.append(each_relation)
136161

137162
return relation_list
138163

139164

140-
def create_pubtator_str(pmid, title, abstract, annotation_list, relation_list):
141-
title_str = f"{pmid}|t|{title}\n"
142-
abstract_str = f"{pmid}|a|{abstract}\n"
143-
annotation_list.sort(key=lambda x: x["locations"]["offset"])
144-
annotation_str = [
145-
(
146-
f"{pmid}\t"
147-
f"{annotation['locations']['offset']}\t"
148-
f"{annotation['locations']['length'] + annotation['locations']['offset']}\t"
149-
f"{annotation['name']}\t"
150-
f"{annotation['type']}\t"
151-
f"{annotation['id']}"
165+
def create_pubtator_annotation(pmid: str, annotation_list: list[dict[str, Any]]):
166+
return sorted(
167+
[
168+
PubTatorAnnotation(
169+
pmid=pmid,
170+
start=annotation["locations"]["offset"],
171+
end=annotation["locations"]["length"] + annotation["locations"]["offset"],
172+
name=annotation["name"],
173+
identifier_name=annotation["identifier_name"],
174+
type=annotation["type"],
175+
mesh=annotation["id"],
176+
)
177+
for annotation in annotation_list
178+
],
179+
key=lambda x: (x.start, x.end),
180+
)
181+
182+
183+
def create_pubtator_relation(pmid: str, relation_list: list[dict[str, Any]]):
184+
return [
185+
PubTatorRelation(
186+
pmid=pmid,
187+
relation_type=relation["type"],
188+
mesh1=relation["role1"],
189+
name1=relation["name1"],
190+
mesh2=relation["role2"],
191+
name2=relation["name2"],
152192
)
153-
for annotation in annotation_list
154-
]
155-
relation_str = [
156-
(f"{pmid}\t" f"{relation['type']}\t" f"{relation['role1']}\t" f"{relation['role2']}")
157193
for relation in relation_list
158194
]
159-
160-
return (
161-
title_str
162-
+ abstract_str
163-
+ "\n".join(annotation_str)
164-
+ "\n"
165-
+ "\n".join(relation_str)
166-
+ "\n\n"
167-
)

netmedex/cli.py

Lines changed: 50 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -16,90 +16,99 @@ def main():
1616

1717

1818
def pubtator_entry(args):
19+
from netmedex.cli_utils import load_pmids
1920
from netmedex.exceptions import EmptyInput, NoArticles, UnsuccessfulRequest
20-
from netmedex.pubtator_core import PubTatorAPI
21-
from netmedex.pubtator_utils import create_savepath, load_pmids
21+
from netmedex.pubtator import PubTatorAPI
2222

23+
# Logging
2324
debug = args.debug
24-
logfile_name = "search" if debug else None
25+
logfile_name = "pubtator-api" if debug else None
2526
config_logger(debug, logfile_name)
2627

28+
# Input
2729
num_inputs = sum(arg is not None for arg in [args.pmids, args.pmid_file, args.query])
2830
if num_inputs != 1:
2931
logger.info("Please specify only one of the following: --query, --pmids, --pmid_file")
3032
sys.exit()
3133

34+
# Config
3235
query = None
3336
pmid_list = None
3437
if args.query is not None:
35-
search_type = "query"
3638
query = args.query
37-
elif args.pmids is not None:
38-
search_type = "pmids"
39-
pmid_list = load_pmids(args.pmids, load_from="string")
40-
logger.info(f"Find {len(pmid_list)} PMIDs")
41-
elif args.pmid_file is not None:
42-
search_type = "pmids"
43-
logger.info(f"Load PMIDs from: {args.pmid_file}")
44-
pmid_list = load_pmids(args.pmid_file, load_from="file")
45-
logger.info(f"Find {len(pmid_list)} PMIDs")
46-
47-
if search_type == "query":
48-
suffix = query.replace(" ", "_")
49-
if search_type == "pmids":
50-
if pmid_list:
51-
suffix = f"{pmid_list[0]}_total_{len(pmid_list)}"
52-
else:
53-
suffix = ""
54-
savepath = create_savepath(args.output, type=search_type, suffix=suffix)
55-
56-
pubtator_api = PubTatorAPI(
39+
suffix = query.replace(" ", "_").replace('"', "")
40+
savepath = args.output if args.output is not None else f"./query_{suffix}.pubtator"
41+
else:
42+
if args.pmids is not None:
43+
pmid_list = load_pmids(args.pmids, load_from="string")
44+
elif args.pmid_file is not None:
45+
logger.info(f"Load PMIDs from: {args.pmid_file}")
46+
pmid_list = load_pmids(args.pmid_file, load_from="file")
47+
logger.info(f"Found {len(pmid_list)} PMIDs")
48+
suffix = f"{pmid_list[0]}_total_{len(pmid_list)}" if pmid_list else ""
49+
savepath = args.output if args.output is not None else f"./pmids_{suffix}.pubtator"
50+
51+
# Always use "biocjson" format
52+
request_format = "biocjson"
53+
54+
# Request articles
55+
api = PubTatorAPI(
5756
query=query,
5857
pmid_list=pmid_list,
59-
savepath=str(savepath),
60-
search_type=search_type,
6158
sort=args.sort,
59+
request_format=request_format,
6260
max_articles=args.max_articles,
6361
full_text=args.full_text,
64-
use_mesh=args.use_mesh,
65-
debug=args.debug,
6662
queue=None,
6763
)
6864

6965
try:
70-
pubtator_api.run()
66+
collection = api.run()
67+
with open(savepath, "w") as f:
68+
f.write(collection.to_pubtator_str(annotation_use_identifier_name=args.use_mesh))
69+
logger.info(f"Save PubTator file to {savepath}")
7170
except (NoArticles, EmptyInput, UnsuccessfulRequest) as e:
7271
logger.error(str(e))
7372

7473

7574
def network_entry(args):
76-
from netmedex.network_core import NetworkBuilder
75+
from netmedex.graph import PubTatorGraphBuilder, save_graph
76+
from netmedex.pubtator_parser import PubTatorIO
7777

78+
# Logging
7879
debug = args.debug
79-
logfile_name = "network" if debug else None
80+
logfile_name = "graph" if debug else None
8081
config_logger(debug, logfile_name)
8182

83+
# Input
8284
pubtator_filepath = Path(args.input)
85+
if not pubtator_filepath.exists():
86+
logger.error(f"PubTator file not found: {pubtator_filepath}")
87+
sys.exit()
88+
89+
# Output
8390
if args.output is None:
8491
savepath = pubtator_filepath.with_suffix(f".{args.format}")
8592
else:
8693
savepath = Path(args.output)
8794
savepath.parent.mkdir(parents=True, exist_ok=True)
8895

89-
network_builder = NetworkBuilder(
90-
pubtator_filepath=str(pubtator_filepath),
91-
savepath=str(savepath),
92-
node_type=args.node_type,
93-
output_filetype=args.format,
96+
# Parse input PubTator file
97+
collection = PubTatorIO.parse(pubtator_filepath)
98+
99+
# Graph
100+
graph_builder = PubTatorGraphBuilder(node_type=args.node_type)
101+
graph_builder.add_collection(collection)
102+
G = graph_builder.build(
103+
pmid_weights=args.pmid_weight,
94104
weighting_method=args.weighting_method,
95105
edge_weight_cutoff=args.cut_weight,
96-
pmid_weight_filepath=args.pmid_weight,
97-
max_edges=args.max_edges,
98106
community=args.community,
99-
debug=args.debug,
107+
max_edges=args.max_edges,
100108
)
101109

102-
network_builder.run()
110+
# Save graph
111+
save_graph(G, savepath, output_filetype=args.format)
103112

104113

105114
def webapp_entry(args):
@@ -218,7 +227,7 @@ def get_network_parser():
218227
parser.add_argument(
219228
"-f",
220229
"--format",
221-
choices=["xgmml", "html", "json"],
230+
choices=["xgmml", "html", "json", "pickle"],
222231
default="html",
223232
help="Output format (default: html)",
224233
)

0 commit comments

Comments
 (0)