11import logging
22from collections import defaultdict
3- from typing import Literal
3+ from datetime import datetime
4+ from typing import Any
5+
6+ from netmedex .pubtator_data import PubTatorAnnotation , PubTatorArticle , PubTatorRelation
47
58logger = logging .getLogger (__name__ )
69
710
def biocjson_to_pubtator(
    res_json,
    full_text: bool = False,
) -> list[PubTatorArticle]:
    """Convert a PubTator3 BioC-JSON response into ``PubTatorArticle`` objects.

    Args:
        res_json: Decoded response mapping; the article records are read from
            its ``"PubTator3"`` entry.
        full_text: When true, annotations from every passage are kept. When
            false, only annotations from the title and abstract passages are
            kept.

    Returns:
        One ``PubTatorArticle`` per record that has both a title passage and
        an abstract passage. Records missing either are skipped.
    """
    articles = res_json["PubTator3"]

    output = []
    for article_json in articles:
        pmid = article_json["pmid"]

        title_passage = extract_passage(article_json, "TITLE")
        abstract_passage = extract_passage(article_json, "ABSTRACT")

        # Skip incomplete records before doing any further work on them.
        if not title_passage or not abstract_passage:
            continue

        journal = article_json.get("journal")

        # Normalise the timestamp to YYYY-MM-DD; a malformed or non-string
        # value leaves the date unset instead of aborting the conversion.
        date = None
        date_str = article_json.get("date")
        if date_str is not None:
            try:
                date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
            except (TypeError, ValueError):
                logger.debug("Could not parse date %r for PMID %s", date_str, pmid)

        # ``None`` tells get_biocjson_annotations to read every passage.
        if full_text:
            paragraph_indices = None
        else:
            paragraph_indices = title_passage["idx"] + abstract_passage["idx"]

        annotation_list = create_pubtator_annotation(
            pmid=pmid,
            annotation_list=get_biocjson_annotations(
                article_json, paragraph_indices=paragraph_indices
            ),
        )
        relation_list = create_pubtator_relation(
            pmid=pmid, relation_list=get_biocjson_relations(article_json)
        )

        # Only one title passage
        title = title_passage["text"][0]
        # There may be multiple abstract passages
        abstract = " ".join(abstract_passage["text"])

        output.append(
            PubTatorArticle(
                pmid=pmid,
                date=date,
                journal=journal,
                title=title,
                abstract=abstract,
                annotations=annotation_list,
                relations=relation_list,
                # If several annotations share an identifier, the last one
                # in sorted order determines the stored name.
                identifiers={
                    annotation.mesh: annotation.identifier_name
                    for annotation in annotation_list
                },
            )
        )

    return output
4870
4971
5072def extract_passage (content , name ):
@@ -67,26 +89,29 @@ def extract_passage(content, name):
6789 return passage_info
6890
6991
70- def get_biocjson_annotations (res_json , retain_ori_text , abstract_idx = None ):
92+ def get_biocjson_annotations (res_json , paragraph_indices = None ):
7193 n_passages = len (res_json ["passages" ])
7294
73- annotation_list = []
74- # TODO: extract from specific passages only (if full_text)?
75- if abstract_idx :
76- passages = [res_json ["passages" ][i ]["annotations" ] for i in abstract_idx ]
95+ annotation_list : list [dict [str , Any ]] = []
96+ if paragraph_indices :
97+ passages = [res_json ["passages" ][i ]["annotations" ] for i in paragraph_indices ]
7798 else :
7899 passages = [res_json ["passages" ][i ]["annotations" ] for i in range (n_passages )]
100+
79101 for annotation_entries in passages :
80102 for annotation_entry in annotation_entries :
81103 annotation = {}
82104 try :
83105 id = annotation_entry ["infons" ]["identifier" ]
84106 except Exception :
85107 id = "-"
86- annotation ["id" ] = "-" if id == "None" or id is None else id
108+ annotation ["id" ] = "-" if id == "None" or not id else id
87109 annotation ["type" ] = annotation_entry ["infons" ]["type" ]
88110 annotation ["locations" ] = annotation_entry ["locations" ][0 ]
89- annotation ["name" ] = get_name (retain_ori_text , annotation_entry , annotation )
111+ annotation ["name" ] = annotation_entry ["text" ]
112+ annotation ["identifier_name" ] = get_identifier_name (
113+ annotation_entry , annotation ["type" ]
114+ )
90115 if annotation ["type" ] == "Variant" :
91116 annotation ["type" ] = annotation_entry ["infons" ]["subtype" ]
92117
@@ -97,15 +122,13 @@ def get_biocjson_annotations(res_json, retain_ori_text, abstract_idx=None):
97122 return annotation_list
98123
99124
100- def get_name ( retain_ori_text , annotation_entry , annotation ):
125+ def get_identifier_name ( annotation_entry , annotation_type ):
101126 try :
102- if retain_ori_text :
103- name = annotation_entry ["text" ]
127+ if annotation_type == "Species" :
104128 # In type == "species", the entity name is stored in "text"
105- elif annotation ["type" ] == "Species" :
106129 name = annotation_entry ["text" ]
107130 # Variant can be either SNP, DNAMutation, or ProteinMutation
108- elif annotation [ "type" ] == "Variant" :
131+ elif annotation_type == "Variant" :
109132 # Some variants may not have standardized name
110133 try :
111134 name = annotation_entry ["infons" ]["name" ]
@@ -125,43 +148,47 @@ def get_name(retain_ori_text, annotation_entry, annotation):
125148 return name
126149
127150
def get_biocjson_relations(res_json):
    """Extract the relations of one BioC-JSON article record as plain dicts.

    Args:
        res_json: Mapping for a single article. Its ``"relations"`` entry, if
            present, is a list whose items carry an ``"infons"`` mapping with
            ``"role1"``, ``"role2"`` and ``"type"``.

    Returns:
        A list of dicts with keys ``"role1"``/``"role2"`` (the role
        identifiers), ``"name1"``/``"name2"`` (the role names) and ``"type"``.
        A record with no relations, or a null ``"relations"`` value, yields
        an empty list.
    """
    relation_list = []
    # A missing or null "relations" entry means "no relations", not an error.
    for relation_entry in res_json.get("relations") or []:
        infons = relation_entry["infons"]
        role1 = infons["role1"]
        role2 = infons["role2"]
        relation_list.append(
            {
                "role1": role1["identifier"],
                "name1": role1["name"],
                "role2": role2["identifier"],
                "name2": role2["name"],
                "type": infons["type"],
            }
        )

    return relation_list
138163
139164
140- def create_pubtator_str (pmid , title , abstract , annotation_list , relation_list ):
141- title_str = f"{ pmid } |t|{ title } \n "
142- abstract_str = f"{ pmid } |a|{ abstract } \n "
143- annotation_list .sort (key = lambda x : x ["locations" ]["offset" ])
144- annotation_str = [
145- (
146- f"{ pmid } \t "
147- f"{ annotation ['locations' ]['offset' ]} \t "
148- f"{ annotation ['locations' ]['length' ] + annotation ['locations' ]['offset' ]} \t "
149- f"{ annotation ['name' ]} \t "
150- f"{ annotation ['type' ]} \t "
151- f"{ annotation ['id' ]} "
def create_pubtator_annotation(pmid: str, annotation_list: list[dict[str, Any]]):
    """Turn raw annotation dicts into ``PubTatorAnnotation`` objects.

    Each input dict supplies ``"locations"`` (with ``"offset"`` and
    ``"length"``), ``"name"``, ``"identifier_name"``, ``"type"`` and ``"id"``.
    The end position is computed as length plus offset.

    Returns:
        The annotations ordered by ``(start, end)``. Entries with equal spans
        keep their input order.
    """
    converted = []
    for entry in annotation_list:
        span = entry["locations"]
        converted.append(
            PubTatorAnnotation(
                pmid=pmid,
                start=span["offset"],
                end=span["length"] + span["offset"],
                name=entry["name"],
                identifier_name=entry["identifier_name"],
                type=entry["type"],
                mesh=entry["id"],
            )
        )

    # list.sort is stable, matching sorted() on the same key.
    converted.sort(key=lambda item: (item.start, item.end))
    return converted
181+
182+
def create_pubtator_relation(pmid: str, relation_list: list[dict[str, Any]]):
    """Turn raw relation dicts into ``PubTatorRelation`` objects.

    Each input dict supplies ``"type"``, ``"role1"``, ``"name1"``,
    ``"role2"`` and ``"name2"``. The role values are passed on as the
    ``mesh1``/``mesh2`` fields.

    Returns:
        The relations in the same order as ``relation_list``.
    """
    converted = []
    for entry in relation_list:
        converted.append(
            PubTatorRelation(
                pmid=pmid,
                relation_type=entry["type"],
                mesh1=entry["role1"],
                name1=entry["name1"],
                mesh2=entry["role2"],
                name2=entry["name2"],
            )
        )
    return converted
159-
160- return (
161- title_str
162- + abstract_str
163- + "\n " .join (annotation_str )
164- + "\n "
165- + "\n " .join (relation_str )
166- + "\n \n "
167- )
0 commit comments