-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_metadata_preservation.py
More file actions
169 lines (132 loc) · 5.51 KB
/
test_metadata_preservation.py
File metadata and controls
169 lines (132 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python3
"""
Test script to verify that confidence scores and evidence text are preserved in graph edges.
"""
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from netmedex.graph import PubTatorGraphBuilder
from netmedex.pubtator_parser import PubTatorIO
def _format_confidence(conf):
    """Return *conf* formatted to three decimals, or ``str(conf)`` if it cannot be.

    Values that do not support the ``.3f`` format spec (``None``, strings, ...)
    are shown as-is so that a single odd value does not abort the whole report.
    """
    try:
        return f"{conf:.3f}"
    except (TypeError, ValueError):
        return str(conf)


def _shorten_evidence(evid, limit=100):
    """Return *evid* as text, cut to *limit* characters plus "..." when longer."""
    text = evid if isinstance(evid, str) else str(evid)
    return text[:limit] + "..." if len(text) > limit else text


def _print_coverage_verdict(covered, total, what, prefix=""):
    """Print a SUCCESS / PARTIAL / FAILURE line for *covered* out of *total* edges.

    SUCCESS is only reported when the graph actually has edges; an empty graph
    (``total == 0``) falls through to FAILURE instead of a vacuous success.
    """
    if total and covered == total:
        print(f"{prefix}✅ SUCCESS: All edges have {what}!")
    elif covered > 0:
        print(f"{prefix}⚠️ PARTIAL: {covered}/{total} edges have {what}")
    else:
        print(f"{prefix}❌ FAILURE: No edges have {what}")


def test_metadata_preservation():
    """Report whether semantic metadata is preserved in graph edges.

    Steps:
      1. Parse the bundled PubTator file with ``PubTatorIO.parse``.
      2. Read ``OPENAI_API_KEY`` from the environment; print a notice and
         return early when it is not set.
      3. Build a graph with ``PubTatorGraphBuilder(edge_method="semantic")``.
      4. Count edges whose data carries ``"confidences"`` / ``"evidences"``,
         print details for the first five edges, print a verdict, and show
         how to read the metadata of the first edge.

    NOTE(review): this function only prints a report. It never asserts, and it
    returns normally when the API key is missing, so a test runner records a
    pass regardless of what metadata is found.
    """
    banner = "=" * 80

    print(banner)
    print("Testing Semantic Metadata Preservation")
    print(banner)

    # Load test data (path is relative to the current working directory).
    test_file = "tests/test_data/22429397_abstract_240916.pubtator"
    collection = PubTatorIO.parse(test_file)

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("\n❌ OPENAI_API_KEY not set")
        print("This test requires an API key to generate semantic edges")
        return

    print(f"\n📄 Loading data from: {test_file}")
    print(f" Articles: {len(collection.articles)}")

    # Build graph with semantic analysis. The import is local so that the
    # module can be loaded even where the openai package is unavailable.
    from openai import OpenAI

    class SimpleLLMClient:
        """Minimal holder for an OpenAI client and a model name.

        Presumably these are the attributes the graph builder reads -
        TODO confirm against PubTatorGraphBuilder.
        """

        def __init__(self):
            self.client = OpenAI(api_key=api_key)
            self.model = "gpt-3.5-turbo"

    llm_client = SimpleLLMClient()

    print("\n🤖 Building graph with semantic analysis...")
    builder = PubTatorGraphBuilder(
        node_type="all",
        edge_method="semantic",
        llm_client=llm_client,
        semantic_threshold=0.5,
    )
    builder.add_collection(collection)
    graph = builder.build(
        edge_weight_cutoff=0,
        community=False,
        max_edges=0,
    )

    # The graph is not modified below, so the edge count is read once.
    total_edges = graph.number_of_edges()

    print("\n📊 Graph built:")
    print(f" Nodes: {graph.number_of_nodes()}")
    print(f" Edges: {total_edges}")

    # Check for metadata in edges
    print("\n" + banner)
    print("Checking Edge Metadata")
    print(banner)

    edges_with_confidence = 0
    edges_with_evidence = 0
    edges_without_metadata = 0

    for _, _, data in graph.edges(data=True):
        has_conf = data.get("confidences") is not None
        has_evid = data.get("evidences") is not None
        if has_conf:
            edges_with_confidence += 1
        if has_evid:
            edges_with_evidence += 1
        if not (has_conf or has_evid):
            edges_without_metadata += 1

    print("\n📊 Metadata Statistics:")
    print(f" Edges with confidence scores: {edges_with_confidence}/{total_edges}")
    print(f" Edges with evidence text: {edges_with_evidence}/{total_edges}")
    print(f" Edges without metadata: {edges_without_metadata}/{total_edges}")

    # Show detailed examples
    print("\n" + banner)
    print("Example Edges with Metadata (first 5)")
    print(banner)

    for position, (u, v, data) in enumerate(graph.edges(data=True), start=1):
        if position > 5:
            break

        confidences = data.get("confidences", {})
        evidences = data.get("evidences", {})

        # Fall back to the raw node id when a node has no 'name' attribute.
        u_name = graph.nodes[u].get('name', u)
        v_name = graph.nodes[v].get('name', v)

        print(f"\n{position}. {u_name} ↔ {v_name}")
        print(f" Relations: {data.get('relations', {})}")

        if confidences:
            print(" ✅ Confidences:")
            for pmid, rel_confs in confidences.items():
                for rel_type, conf in rel_confs.items():
                    print(f" PMID {pmid}, {rel_type}: {_format_confidence(conf)}")
        else:
            print(" ❌ No confidence data")

        if evidences:
            print(" ✅ Evidence:")
            for pmid, rel_evids in evidences.items():
                for rel_type, evid in rel_evids.items():
                    # Truncate long evidence
                    evid_short = _shorten_evidence(evid)
                    print(f" PMID {pmid}, {rel_type}:")
                    print(f' "{evid_short}"')
        else:
            print(" ❌ No evidence data")

    # Validation
    print("\n" + banner)
    print("Validation Results")
    print(banner)

    _print_coverage_verdict(
        edges_with_confidence, total_edges, "confidence scores", prefix="\n"
    )
    _print_coverage_verdict(edges_with_evidence, total_edges, "evidence text")

    # Test accessing metadata programmatically
    print("\n" + banner)
    print("Programmatic Access Example")
    print(banner)

    # Only the first edge is needed, so take it without listing every edge.
    first_edge = next(iter(graph.edges(data=True)), None)
    if first_edge is not None:
        u, v, data = first_edge
        print(f"\nAccessing metadata for edge: {u} ↔ {v}")

        print("\nMethod 1: Direct dictionary access")
        print(f" confidences = graph.edges['{u}', '{v}']['confidences']")
        print(f" evidences = graph.edges['{u}', '{v}']['evidences']")

        confidences = data.get('confidences', {})
        evidences = data.get('evidences', {})

        print("\nMethod 2: From edge data")
        print(f" confidences = {confidences}")
        print(f" evidences = {evidences}")
if __name__ == "__main__":
    # Allow running this file directly as a script, outside any test runner.
    test_metadata_preservation()