-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkg_clean.py
More file actions
24 lines (19 loc) · 762 Bytes
/
kg_clean.py
File metadata and controls
24 lines (19 loc) · 762 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
# Clean the raw knowledge-graph edge list in 'kg.csv' and write a smaller
# drug / disease / gene-protein subgraph to 'kg_clean.csv'.
print("Loading original data...")
# low_memory=False: parse the file in one pass instead of internal chunks,
# which avoids mixed-dtype inference on large columns.
df = pd.read_csv('kg.csv', low_memory=False)

# Record the row count BEFORE any rows are removed. `df` is rebound by the
# de-duplication step below, so len(df) taken later would under-report the
# size of the file that was actually loaded.
original_size = len(df)

# 1. DROP DUPLICATES (just in case): remove rows identical in every column.
df = df.drop_duplicates()

# 2. FILTER: keep only edges whose BOTH endpoints are a drug, a disease, or a
# gene/protein. Every other node type (e.g. anatomy, pathway) is dropped to
# keep the output lightweight.
valid_types = ['drug', 'disease', 'gene/protein']
keep_edge = df['x_type'].isin(valid_types) & df['y_type'].isin(valid_types)
df_clean = df[keep_edge]

# 3. REMOVE SELF-LOOPS (Drug A -> Drug A)
# NOTE(review): endpoints are compared by name only, so an edge between two
# different-typed nodes that happen to share a name is dropped as well --
# confirm that is intended, or also compare x_type / y_type.
df_clean = df_clean[df_clean['x_name'] != df_clean['y_name']]

# 4. SAVE: report how much was removed, then write the result without the
# pandas index column.
print(f"Original size: {original_size}")
print(f"Cleaned size: {len(df_clean)}")
df_clean.to_csv('kg_clean.csv', index=False)
print("Saved to 'kg_clean.csv'. Use this file for the AI!")