Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
__pycache__/

wikipathways_graphs/annotated_diagram
wikipathways_graphs/literature_comparison
wikipathways_graphs/literature_comparison/*
!wikipathways_graphs/literature_comparison/Evaluation_Files/
wikipathways_graphs/literature_comparison/Evaluation_Files/*
!wikipathways_graphs/literature_comparison/Evaluation_Files/concept_idf_annotated.csv
wikipathways_graphs/pkl
wikipathways_graphs/WP*
!wikipathways_graphs/PFOCR_url_list.txt
Expand Down
70 changes: 70 additions & 0 deletions BioBERT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Load model directly
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Function to get embeddings
def get_embeddings(model, tokenizer, sentence):
    """Return a single fixed-size embedding for ``sentence``.

    Tokenizes the input, runs the transformer with gradient tracking
    disabled, and mean-pools the final hidden states over the sequence
    dimension into a ``(batch, hidden_size)`` tensor.

    When the tokenizer output carries an ``attention_mask`` (padded /
    batched input), padding positions are excluded from the mean so the
    pooled vector reflects only real tokens. For a single unpadded
    sentence the mask is all ones and the result is identical to the
    plain ``mean(dim=1)``.

    Args:
        model: Hugging Face-style model whose output exposes
            ``last_hidden_state``.
        tokenizer: callable returning PyTorch tensors (``return_tensors='pt'``).
        sentence: text (or list of texts) to embed.

    Returns:
        torch.Tensor of shape ``(batch, hidden_size)``.
    """
    encoded_input = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
    hidden = output.last_hidden_state  # (batch, seq_len, hidden_size)
    # BatchEncoding behaves like a dict; fall back to a plain mean when no mask.
    mask = encoded_input.get('attention_mask') if hasattr(encoded_input, 'get') else None
    if mask is None:
        return hidden.mean(dim=1)
    # Mask-aware mean: zero padding positions, divide by the real token count
    # (clamped to avoid division by zero on an all-padding row).
    mask = mask.unsqueeze(-1).to(hidden.dtype)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)




# --- BioBERT demo: embed gene terms and compare their cosine distances ---

# Pre-trained biomedical BERT checkpoint (downloaded from the Hub on first use).
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

# Earlier experiment inputs (kept for reference):
#   gene symbols: "SKI", "SMAD1", "PYROXD2"
#   NCBI ids:     "NCBI:6497", "NCBI:4086", "NCBI:84795"

# Gene symbols to compare
sentence1 = "STAT1"
sentence2 = "IFNG"
sentence3 = "PYROXD2"

# Embed each term once.
embeddings1 = get_embeddings(model, tokenizer, sentence1)
embeddings2 = get_embeddings(model, tokenizer, sentence2)
embeddings3 = get_embeddings(model, tokenizer, sentence3)

# Report the cosine distance (1 - similarity) of sentence1 vs. each other term.
for other_sentence, other_embedding in ((sentence2, embeddings2), (sentence3, embeddings3)):
    distance = 1 - cosine_similarity(embeddings1, other_embedding).item()
    print("Cosine Distance between", sentence1, "and", other_sentence, ":", distance)


###########################################################

# power transformation function for range increase


# def modified_power_scale(x, xmin, p):
# """ Scale the distance using a modified power function to amplify differences. """
# return (x - xmin) ** p

# # Constants
# xmin = 0.001 # Slightly less than our minimum expected value
# p = 0.25 # High power to significantly amplify differences

# # Values
# distance1 = 0.007
# distance2 = 0.008

# # Apply scaling
# scaled_distance1 = modified_power_scale(distance1, xmin, p)
# scaled_distance2 = modified_power_scale(distance2, xmin, p)

# print("Scaled Distance 1:", scaled_distance1)
# print("Scaled Distance 2:", scaled_distance2)




70 changes: 70 additions & 0 deletions RoBERTa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Function to get embeddings
def get_embeddings(model, tokenizer, sentence):
    """Embed ``sentence`` by mean-pooling the model's last hidden layer.

    Runs the tokenizer and model in inference mode (no gradients) and
    averages the final hidden states across the sequence dimension.

    Returns:
        torch.Tensor of shape ``(batch, hidden_size)``.
    """
    tokens = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        result = model(**tokens)
    hidden_states = result.last_hidden_state
    return hidden_states.mean(dim=1)

# --- RoBERTa baseline: embed gene terms and compare their cosine distances ---

# General-domain RoBERTa checkpoint (downloaded from the Hub on first use).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Earlier experiment inputs (kept for reference):
#   gene symbols: "SKI", "SMAD1", "PYROXD2"
#   NCBI ids:     "NCBI:6497", "NCBI:4086", "NCBI:84795"

# Gene symbols to compare
sentence1 = "STAT1"
sentence2 = "IFNG"
sentence3 = "PYROXD2"

# Embed each term once.
embeddings1 = get_embeddings(model, tokenizer, sentence1)
embeddings2 = get_embeddings(model, tokenizer, sentence2)
embeddings3 = get_embeddings(model, tokenizer, sentence3)

# Report the cosine distance (1 - similarity) of sentence1 vs. each other term.
for other_sentence, other_embedding in ((sentence2, embeddings2), (sentence3, embeddings3)):
    distance = 1 - cosine_similarity(embeddings1, other_embedding).item()
    print("Cosine Distance between", sentence1, "and", other_sentence, ":", distance)



# ###########################################################3

# # power transformation function for range increase


# def modified_power_scale(x, xmin, p):
# """ Scale the distance using a modified power function to amplify differences. """
# return (x - xmin) ** p

# # Constants
# xmin = 0.001 # Slightly less than our minimum expected value
# p = 0.25 # High power to significantly amplify differences

# # Values
# distance1 = 0.007
# distance2 = 0.008

# # Apply scaling
# scaled_distance1 = modified_power_scale(distance1, xmin, p)
# scaled_distance2 = modified_power_scale(distance2, xmin, p)

# print("Scaled Distance 1:", scaled_distance1)
# print("Scaled Distance 2:", scaled_distance2)




47 changes: 47 additions & 0 deletions biomed_RoBERTa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Mean-pooled sentence embedding from a transformer's final hidden layer.
def get_embeddings(model, tokenizer, sentence):
    """Tokenize ``sentence``, run ``model`` without gradient tracking,
    and average the last hidden states over the sequence dimension.

    Returns:
        torch.Tensor of shape ``(batch, hidden_size)``.
    """
    inputs = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**inputs)
    return torch.mean(model_output.last_hidden_state, dim=1)

# Cosine similarity between two embedding tensors, as a Python float.
def get_cosine_similarity(embedding1, embedding2):
    """Return ``cosine_similarity(embedding1, embedding2)`` as a scalar float."""
    similarity = cosine_similarity(embedding1, embedding2)
    return similarity.item()

# # Initialize tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('allenai/biomed_roberta_base')
# model = RobertaModel.from_pretrained('allenai/biomed_roberta_base')




# # Define sentences
# sentence1 = "SKI"
# sentence2 = "SMAD1"
# sentence3 = "PYROXD2"

# # Define sentences
# sentence1 = "NCBI:6497"
# sentence2 = "NCBI:4086"
# sentence3 = "NCBI:84795"


# # Define sentences
# sentence1 = "STAT1"
# sentence2 = "IFNG"
# sentence3 = "PYROXD2"

# # Get embeddings
# embeddings1 = get_embeddings(model, tokenizer, sentence1)
# embeddings2 = get_embeddings(model, tokenizer, sentence2)
# embeddings3 = get_embeddings(model, tokenizer, sentence3)

# # Calculate distances
# print("Cosine Distance between", sentence1, "and", sentence2, ":", 1 - cosine_similarity(embeddings1, embeddings2).item())
# print("Cosine Distance between", sentence1, "and", sentence3, ":", 1 - cosine_similarity(embeddings1, embeddings3).item())

2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ dependencies:
- networkx==3.2.1
- requests==2.31.0
- oaklib==0.5.33
- torch==2.4.0
- transformers==4.44.0
6 changes: 3 additions & 3 deletions evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def output_num_paths_pairs(output_dir,num_paths_df,subgraph_algorithm):
num_paths_df.to_csv(output_folder+'/num_paths_'+subgraph_algorithm+'.csv',sep=',',index=False)
logging.info('Create number of paths file: %s',output_folder+'/num_paths_'+subgraph_algorithm+'.csv')

def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim):
def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim,search_type):

output_folder = output_dir+'/Evaluation_Files'
#Check for existence of output directory
Expand All @@ -309,8 +309,8 @@ def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim):

all_subgraphs_cosine_sim_df = pd.DataFrame.from_dict(all_subgraphs_cosine_sim, orient='columns')

all_subgraphs_cosine_sim_df.to_csv(output_folder+'/literature_comparison_evaluation.csv',sep=',',index=False)
logging.info('Create literature comparison evaluation file: %s',output_folder+'/literature_comparison_evaluation.csv')
all_subgraphs_cosine_sim_df.to_csv(output_folder+'/literature_comparison_evaluation_' + search_type + '.csv',sep=',',index=False)
logging.info('Create literature comparison evaluation file: %s',output_folder+'/literature_comparison_evaluation_' + search_type + '.csv')

return all_subgraphs_cosine_sim_df

Expand Down
15 changes: 7 additions & 8 deletions evaluation_plots_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,15 @@ def visualize_literature_comparison_heatmap(term_averages_cosine_sim_df,all_wiki


#Generates boxplot of each
def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

output_folder = all_wikipathways_dir+'/literature_comparison/Evaluation_Files'

all_subgraphs_other_pathways = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df.Compared_Pathway != "Same_Pathway"]

all_subgraphs_same_pathway = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df.Compared_Pathway == "Same_Pathway"]

plt_file = output_folder + '/Literature_Comparison_all_pathways_boxplot.png'
plt_file = output_folder + '/Literature_Comparison_all_pathways_boxplot_' + search_type + '.png'
sns.swarmplot(data=all_subgraphs_same_pathway, x="Pathway_ID", y="avg_zscore_per_pathway",hue='Algorithm',palette="flare",dodge=True, legend=False, marker="x", linewidth=1,size=10)
sns.swarmplot(data=all_subgraphs_other_pathways, x="Pathway_ID", y="avg_zscore_per_pathway",hue="Algorithm", dodge=True, legend=False)
sns.boxplot(data=all_subgraphs_other_pathways, x='Pathway_ID', y = 'avg_zscore_per_pathway',hue='Algorithm').set_title("Z-Score of Cosine Similarity to All Pathway Abstracts")
Expand All @@ -165,14 +165,14 @@ def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df
plt.close()
logging.info('Created png: %s',plt_file)

def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

pathways = all_subgraphs_zscore_df.Pathway_ID.unique()

for pathway in pathways:

df = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df['Pathway_ID'] == pathway]
plt_file = all_wikipathways_dir + '/' + pathway + '_output/Evaluation_Files/Literature_Comparison_all_pathways_scatterplot.png'
plt_file = all_wikipathways_dir + '/' + pathway + '_output/Evaluation_Files/Literature_Comparison_all_pathways_scatterplot_' + search_type + '.png'
sns_plot = sns.swarmplot(data=df, x='Algorithm', y = 'avg_zscore_per_pathway',hue='Compared_Pathway')
sns.lineplot(x="Algorithm", dashes=False, y="avg_zscore_per_pathway", hue="Compared_Pathway", style="Compared_Pathway", data=df,legend=False).set_title("Z-Score of Cosine Similarity to All Pathway Abstracts for" + pathway)
sns.move_legend(sns_plot,"upper left", bbox_to_anchor=(1, 1))
Expand All @@ -181,14 +181,13 @@ def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscor
plt.close()
logging.info('Created png: %s',plt_file)

def visualize_literature_comparison_heatmap_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_heatmap_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

output_folder = all_wikipathways_dir+'/literature_comparison/Evaluation_Files'

plt_file = output_folder + '/Literature_Comparison_all_pathways_heatmap.png'
plt_file = output_folder + '/Literature_Comparison_all_pathways_heatmap_' + search_type + '.png'
df_matrix = all_subgraphs_zscore_df.pivot_table(index='Pathway_ID',columns='Algorithm',values='avg_zscore_per_pathway')
sns.heatmap(df_matrix, fmt="g", cmap='viridis').set_title("Z-Score of Subgraphs to All Other Pathways")
plt.savefig(plt_file,bbox_inches="tight")
plt.close()
logging.info('Created png: %s',plt_file)

logging.info('Created png: %s',plt_file)
Loading