diff --git a/DESCRIPTION b/DESCRIPTION index 1959e65..76c4edd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,11 @@ Imports: tidyr, htmlwidgets, grDevices, - stats + stats, + text2vec, + stopwords, + xml2, + rentrez Suggests: data.table, BiocStyle, diff --git a/NAMESPACE b/NAMESPACE index c433f2b..28f35aa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(annotateProteinInfoFromIndra) export(cytoscapeNetwork) export(cytoscapeNetworkOutput) export(exportNetworkToHTML) +export(filterSubnetworkByContext) export(getSubnetworkFromIndra) export(previewNetworkInBrowser) export(renderCytoscapeNetwork) @@ -16,13 +17,27 @@ importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) importFrom(httr,content) +importFrom(httr,content_type_json) importFrom(httr,status_code) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) importFrom(r2r,hashmap) importFrom(r2r,keys) importFrom(r2r,query) +importFrom(rentrez,entrez_fetch) importFrom(stats,cor) importFrom(stats,setNames) +importFrom(stopwords,stopwords) +importFrom(text2vec,TfIdf) +importFrom(text2vec,create_dtm) +importFrom(text2vec,create_vocabulary) +importFrom(text2vec,fit_transform) +importFrom(text2vec,itoken) +importFrom(text2vec,prune_vocabulary) +importFrom(text2vec,vocab_vectorizer) +importFrom(text2vec,word_tokenizer) importFrom(tidyr,pivot_wider) importFrom(utils,browseURL) +importFrom(xml2,read_xml) +importFrom(xml2,xml_find_all) +importFrom(xml2,xml_text) diff --git a/R/filterSubnetworkByContext.R b/R/filterSubnetworkByContext.R new file mode 100644 index 0000000..2bb07b4 --- /dev/null +++ b/R/filterSubnetworkByContext.R @@ -0,0 +1,380 @@ +#' Filter a subnetwork by contextual relevance +#' +#' Fetches PubMed abstracts for evidence PMIDs, scores each abstract against a +#' user-supplied query, and returns only the nodes, edges, and evidence rows +#' whose abstracts meet the scoring cutoff. 
+#'
+#' Two scoring methods are available, controlled by the \code{method} argument:
+#'
+#' \describe{
+#'   \item{\code{"tag_count"} (default)}{
+#'     Counts how many tags from \code{query} appear as substrings in the
+#'     abstract (case-insensitive). The score for each abstract is an integer
+#'     in \code{[0, length(query)]}. Set \code{cutoff} to the minimum number of
+#'     tags that must appear - e.g. \code{cutoff = 2} keeps abstracts that
+#'     mention at least 2 of your tags. \code{query} must be a character
+#'     \emph{vector} of tags when using this method.
+#'   }
+#'   \item{\code{"cosine"}}{
+#'     Scores abstracts using TF-IDF cosine similarity against \code{query}.
+#'     Scores are in \code{[-1, 1]} (in practice \code{[0, 1]} for text).
+#'     Set \code{cutoff} to a decimal threshold - e.g. \code{cutoff = 0.10}.
+#'     \code{query} should be a single character string; expand it with
+#'     synonyms and related terms for better recall under exact token matching.
+#'   }
+#' }
+#'
+#' @param nodes A dataframe of network nodes.
+#' @param edges A dataframe of network edges with columns: source, target,
+#'   interaction, site, evidenceLink, stmt_hash.
+#' @param query For \code{method = "tag_count"}: a character vector of tags,
+#'   e.g. \code{c("CHEK1", "DNA damage", "DNA damage repair")}.
+#'   For \code{method = "cosine"}: a single character string.
+#' @param cutoff Numeric threshold applied to the chosen scoring method.
+#'   \itemize{
+#'     \item \code{"tag_count"}: integer >= 0; abstracts must
+#'       contain at least this many tags. Max possible value is
+#'       \code{length(query)}. Default \code{1}.
+#'     \item \code{"cosine"}: numeric in \code{[0, 1]}; abstracts
+#'       must score >= this value. Default \code{0.10}.
+#'   }
+#' @param method One of \code{"tag_count"} (default) or \code{"cosine"}. 
+#' +#' @return A named list with three elements: +#' \item{nodes}{Filtered nodes dataframe (only nodes present in kept edges)} +#' \item{edges}{Filtered edges dataframe} +#' \item{evidence}{Dataframe with columns: source, target, interaction, site, +#' evidenceLink, stmt_hash, text, pmid, score. The \code{score} column +#' contains tag counts (integer) or cosine similarities (numeric) depending +#' on the method used.} +#' +#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary +#' vocab_vectorizer create_dtm TfIdf fit_transform +#' @importFrom stopwords stopwords +#' @export +filterSubnetworkByContext <- function(nodes, + edges, + query, + cutoff = NULL, + method = c("tag_count", "cosine")) { + + method <- match.arg(method) + + if (method == "tag_count") { + if (!is.character(query) || length(query) < 1 || + any(is.na(query)) || any(!nzchar(query))) { + stop("`query` must be a character vector of tags when method = 'tag_count'.") + } + if (is.null(cutoff)) cutoff <- 1L + if (!is.numeric(cutoff) || length(cutoff) != 1L || is.na(cutoff) || + cutoff < 0 || cutoff > length(query) || cutoff != as.integer(cutoff)) { + stop("`cutoff` must be a single integer in [0, length(query)] when method = 'tag_count'.") + } + cat(sprintf( + "Method: tag_count | Tags: %d | Cutoff: >= %d tag(s)\n", + length(query), cutoff + )) + } else { + if (!is.character(query) || length(query) != 1L || + is.na(query) || !nzchar(query)) { + stop("`query` must be a single character string when method = 'cosine'.") + } + if (is.null(cutoff)) cutoff <- 0.10 + if (!is.numeric(cutoff) || length(cutoff) != 1L || is.na(cutoff) || + !is.finite(cutoff) || cutoff < 0 || cutoff > 1) { + stop("`cutoff` must be a single numeric value in [0, 1] when method = 'cosine'.") + } + cat(sprintf( + "Method: cosine | Cutoff: >= %.2f\n", cutoff + )) + } + + evidence <- .extract_evidence_text(edges) + + if (nrow(evidence) == 0) { + evidence$score <- if (method == "tag_count") integer(0) else 
numeric(0) + warning("No evidence text found - returning unfiltered inputs.") + return(list(nodes = nodes, edges = edges, evidence = evidence)) + } + pmids <- unique(evidence$pmid[!is.na(evidence$pmid) & nchar(evidence$pmid) > 0]) + + if (length(pmids) == 0) { + evidence$score <- if (method == "tag_count") { + rep(NA_integer_, nrow(evidence)) + } else { + rep(NA_real_, nrow(evidence)) + } + warning("No PMIDs found in evidence - returning unfiltered inputs.") + return(list(nodes = nodes, edges = edges, evidence = evidence)) + } + + abstract_list <- .fetch_clean_abstracts_xml(pmids) + abstracts_df <- data.frame( + pmid = names(abstract_list), + abstract = unlist(abstract_list, use.names = FALSE), + stringsAsFactors = FALSE + ) + + if (method == "tag_count") { + abstracts_df$score <- .score_by_tag_count(abstracts_df$abstract, query) + } else { + abstracts_df$score <- .score_by_cosine(query, abstracts_df$abstract) + } + + passing_pmids <- abstracts_df$pmid[abstracts_df$score >= cutoff] + + cat(sprintf( + "\n%d / %d abstracts passed cutoff (score >= %s)\n", + length(passing_pmids), nrow(abstracts_df), cutoff + )) + + evidence_scored <- merge( + evidence, + abstracts_df[, c("pmid", "score")], + by = "pmid", + all.x = TRUE + ) + evidence_scored$score[is.na(evidence_scored$score)] <- 0 + + evidence_filtered <- evidence_scored[ + evidence_scored$pmid %in% passing_pmids, + c("source", "target", "interaction", "site", + "evidenceLink", "stmt_hash", "text", "pmid", "score") + ] + + surviving_hashes <- unique(evidence_filtered$stmt_hash) + edges_filtered <- edges[edges$stmt_hash %in% surviving_hashes, ] + + surviving_nodes <- union(edges_filtered$source, edges_filtered$target) + if (!"id" %in% names(nodes)) { + stop("`nodes` must contain an `id` column.") + } + nodes_filtered <- nodes[nodes$id %in% surviving_nodes, ] + + cat(sprintf( + "Retained: %d edges (of %d), %d nodes (of %d), %d evidence rows (of %d)\n", + nrow(edges_filtered), nrow(edges), + nrow(nodes_filtered), 
nrow(nodes), + nrow(evidence_filtered), nrow(evidence_scored) + )) + + return(list( + nodes = nodes_filtered, + edges = edges_filtered, + evidence = evidence_filtered + )) +} + + +#' Score abstracts by tag count +#' +#' For each abstract, counts how many tags appear as case-insensitive substrings. +#' +#' @param abstracts Character vector of abstract texts. +#' @param tags Character vector of tags to search for. +#' @return Integer vector of tag hit counts, same length as \code{abstracts}. +#' @keywords internal +#' @noRd +.score_by_tag_count <- function(abstracts, tags) { + abstracts_lower <- tolower(abstracts) + tags_lower <- tolower(tags) + + sapply(abstracts_lower, function(abstract) { + sum(sapply(tags_lower, function(tag) grepl(tag, abstract, fixed = TRUE))) + }, USE.NAMES = FALSE) +} + + +#' Score abstracts by TF-IDF cosine similarity +#' +#' Vectorises the query and all abstracts together with TF-IDF, then returns +#' the cosine similarity of each abstract against the query vector. +#' +#' @param query Single character string query. +#' @param abstracts Character vector of abstract texts. +#' @return Numeric vector of cosine similarities in \code{[0, 1]}, same length +#' as \code{abstracts}. 
+#' @keywords internal +#' @noRd +#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary +#' vocab_vectorizer create_dtm TfIdf fit_transform +#' @importFrom stopwords stopwords +.score_by_cosine <- function(query, abstracts) { + all_texts <- c(query, abstracts) + + tokens <- itoken(all_texts, + preprocessor = tolower, + tokenizer = word_tokenizer) + vocab <- create_vocabulary(tokens, stopwords = stopwords("en")) + vocab <- prune_vocabulary(vocab, term_count_min = 1) + vectorizer <- vocab_vectorizer(vocab) + dtm <- create_dtm(tokens, vectorizer) + tfidf <- TfIdf$new() + dtm_tfidf <- fit_transform(dtm, tfidf) + + .cos_sim <- function(a, b) { + a <- as.numeric(a) + b <- as.numeric(b) + denom <- sqrt(sum(a^2)) * sqrt(sum(b^2)) + if (denom == 0) return(0) + sum(a * b) / denom + } + + query_vec <- dtm_tfidf[1, , drop = FALSE] + abstract_vecs <- dtm_tfidf[-1, , drop = FALSE] + + scores <- sapply(seq_len(nrow(abstract_vecs)), function(i) { + .cos_sim(query_vec, abstract_vecs[i, , drop = FALSE]) + }) + + round(scores, 4) +} + + + +#' Extract evidence text from edges dataframe via INDRA API +#' @param df Edges dataframe with columns: source, target, interaction, site, +#' evidenceLink, stmt_hash +#' @return Dataframe with additional columns: text, pmid +#' @keywords internal +#' @noRd +.extract_evidence_text <- function(df) { + + required_cols <- c("source", "target", "interaction", "site", "evidenceLink", "stmt_hash") + missing_cols <- setdiff(required_cols, names(df)) + if (length(missing_cols) > 0) { + stop(sprintf("Missing required columns: %s", paste(missing_cols, collapse = ", "))) + } + + results_list <- list() + result_count <- 0 + unique_hashes <- unique(df$stmt_hash) + n_hashes <- length(unique_hashes) + + cat(sprintf("Processing %d unique statement hashes...\n", n_hashes)) + + for (i in seq_along(unique_hashes)) { + stmt_hash <- unique_hashes[i] + + if (i %% 10 == 0) cat(sprintf("Progress: %d/%d\n", i, n_hashes)) + + evidence_list <- 
.query_indra_evidence(stmt_hash) + if (is.null(evidence_list) || length(evidence_list) == 0) next + + matching_indices <- which(df$stmt_hash == stmt_hash) + + for (evidence in evidence_list) { + if (!is.null(evidence[["text"]]) && nchar(evidence[["text"]]) > 0) { + for (idx in matching_indices) { + result_count <- result_count + 1 + results_list[[result_count]] <- data.frame( + source = df$source[idx], + target = df$target[idx], + interaction = df$interaction[idx], + site = df$site[idx], + evidenceLink = df$evidenceLink[idx], + stmt_hash = df$stmt_hash[idx], + text = evidence[["text"]], + pmid = if (is.null(evidence[["pmid"]])) "" else evidence[["pmid"]], + stringsAsFactors = FALSE + ) + } + } + } + } + + if (result_count == 0) { + warning("No evidence text found for any statement hash") + return(data.frame( + source = character(), target = character(), interaction = character(), + site = character(), evidenceLink = character(), stmt_hash = character(), + text = character(), pmid = character(), stringsAsFactors = FALSE + )) + } + + results_df <- do.call(rbind, results_list) + cat(sprintf("\nComplete! 
Found %d evidence text entries.\n", nrow(results_df))) + return(results_df) +} + + +#' Fetch and clean PubMed abstracts via rentrez +#' @param pmids Character vector of PubMed IDs +#' @return Named list: pmid -> abstract text +#' @keywords internal +#' @noRd +#' @importFrom rentrez entrez_fetch +#' @importFrom xml2 read_xml xml_find_all xml_text +.fetch_clean_abstracts_xml <- function(pmids) { + results <- list() + total <- length(pmids) + + cat(sprintf("Fetching %d abstracts...\n", total)) + + for (i in seq_along(pmids)) { + pmid <- pmids[i] + + record <- tryCatch( + entrez_fetch(db = "pubmed", id = pmid, rettype = "xml"), + error = function(e) { + cat(sprintf("Error fetching PMID %s at %d/%d: %s\n", pmid, i, total, e$message)) + NULL + } + ) + + if (is.null(record)) { + results[[pmid]] <- "" + next + } + + doc <- read_xml(record) + abstract_nodes <- xml_find_all(doc, ".//AbstractText") + + if (length(abstract_nodes) > 0) { + results[[pmid]] <- paste(trimws(xml_text(abstract_nodes)), collapse = " ") + } else { + results[[pmid]] <- "" + } + + if (i %% 10 == 0 || i == total) { + cat(sprintf("Progress: %d/%d (%.1f%%)\n", i, total, (i / total) * 100)) + } + + Sys.sleep(0.34) + } + + cat("Done fetching abstracts!\n") + return(results) +} + + +#' Query INDRA API for evidence text +#' @param stmt_hash A statement hash string +#' @return A list of evidence objects from the API, or NULL if error +#' @keywords internal +#' @noRd +#' @importFrom httr POST status_code content content_type_json +#' @importFrom jsonlite fromJSON +.query_indra_evidence <- function(stmt_hash) { + url <- "https://discovery.indra.bio/api/get_evidences_for_stmt_hash" + + tryCatch({ + response <- POST( + url, + body = list(stmt_hash = stmt_hash), + encode = "json", + content_type_json() + ) + + if (status_code(response) != 200) { + warning(sprintf("API returned status %d for stmt_hash: %s", + status_code(response), stmt_hash)) + return(NULL) + } + + content(response, as = "parsed") + }, error = 
function(e) { + warning(sprintf("Error querying stmt_hash %s: %s", stmt_hash, e$message)) + return(NULL) + }) +} \ No newline at end of file diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 228ff02..ef14ef2 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -426,3 +426,5 @@ correlations <- cor(wide_data, use = "pairwise.complete.obs") return(correlations) } + + diff --git a/man/filterSubnetworkByContext.Rd b/man/filterSubnetworkByContext.Rd new file mode 100644 index 0000000..91d176c --- /dev/null +++ b/man/filterSubnetworkByContext.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filterSubnetworkByContext.R +\name{filterSubnetworkByContext} +\alias{filterSubnetworkByContext} +\title{Filter a subnetwork by contextual relevance} +\usage{ +filterSubnetworkByContext( + nodes, + edges, + query, + cutoff = NULL, + method = c("tag_count", "cosine") +) +} +\arguments{ +\item{nodes}{A dataframe of network nodes.} + +\item{edges}{A dataframe of network edges with columns: source, target, +interaction, site, evidenceLink, stmt_hash.} + +\item{query}{For \code{method = "tag_count"}: a character vector of tags, +e.g. \code{c("CHEK1", "DNA damage", "DNA damage repair")}. +For \code{method = "cosine"}: a single character string.} + +\item{cutoff}{Numeric threshold applied to the chosen scoring method. +\itemize{ + \item \code{"tag_count"}: integer >= 0; abstracts must + contain at least this many tags. Max possible value is + \code{length(query)}. Default \code{1}. + \item \code{"cosine"}: numeric in \code{[-1, 1]}; abstracts + must score >= this value. Default \code{0.10}. 
+}} + +\item{method}{One of \code{"tag_count"} (default) or \code{"cosine"}.} +} +\value{ +A named list with three elements: + \item{nodes}{Filtered nodes dataframe (only nodes present in kept edges)} + \item{edges}{Filtered edges dataframe} + \item{evidence}{Dataframe with columns: source, target, interaction, site, + evidenceLink, stmt_hash, text, pmid, score. The \code{score} column + contains tag counts (integer) or cosine similarities (numeric) depending + on the method used.} +} +\description{ +Fetches PubMed abstracts for evidence PMIDs, scores each abstract against a +user-supplied query, and returns only the nodes, edges, and evidence rows +whose abstracts meet the scoring cutoff. +} +\details{ +Two scoring methods are available, controlled by the \code{method} argument: + +\describe{ + \item{\code{"tag_count"} (default)}{ + Counts how many tags from \code{query} appear as substrings in the + abstract (case-insensitive). The score for each abstract is an integer + in \code{[0, length(query)]}. Set \code{cutoff} to the minimum number of + tags that must appear - e.g. \code{cutoff = 2} keeps abstracts that + mention at least 2 of your tags. \code{query} must be a character + \emph{vector} of tags when using this method. + } + \item{\code{"cosine"}}{ + Scores abstracts using TF-IDF cosine similarity against \code{query}. + Scores are in \code{[-1, 1]} (in practice \code{[0, 1]} for text). + Set \code{cutoff} to a decimal threshold - e.g. \code{cutoff = 0.10}. + \code{query} should be a single character string; expand it with + synonyms and related terms for better recall under exact token matching. 
+ } +} +} diff --git a/tests/testthat/test-filterSubnetworkByContext.R b/tests/testthat/test-filterSubnetworkByContext.R new file mode 100644 index 0000000..4f77b03 --- /dev/null +++ b/tests/testthat/test-filterSubnetworkByContext.R @@ -0,0 +1,154 @@ +make_edges <- function() { + data.frame( + source = c("A", "B"), + target = c("B", "C"), + interaction = c("activates", "inhibits"), + site = c("T308", "S473"), + evidenceLink = c("https://example.com/1", "https://example.com/2"), + stmt_hash = c("hash1", "hash2"), + stringsAsFactors = FALSE + ) +} + +make_nodes <- function() { + data.frame( + id = c("A", "B", "C"), + label = c("GeneA", "GeneB", "GeneC"), + stringsAsFactors = FALSE + ) +} + +describe(".score_by_tag_count", { + + test_that("returns 0 for an abstract that contains none of the tags", { + scores <- .score_by_tag_count("nothing relevant here", c("CHEK1", "DNA damage")) + expect_equal(scores, 0L) + }) + + test_that("counts multiple matching tags correctly", { + abstract <- "CHEK1 is involved in DNA damage repair pathways." + scores <- .score_by_tag_count(abstract, c("chek1", "DNA damage", "apoptosis")) + expect_equal(scores, 2L) + }) + +}) + +describe(".score_by_cosine", { + + test_that("returns a numeric vector of the same length as abstracts", { + abstracts <- c("DNA damage repair involves CHEK1.", "Unrelated text about metabolism.") + scores <- .score_by_cosine("CHEK1 DNA damage", abstracts) + expect_true(is.numeric(scores)) + expect_length(scores, 2) + }) + + test_that("scores a highly relevant abstract higher than an irrelevant one", { + relevant <- "CHEK1 mediates the DNA damage checkpoint response." + irrelevant <- "Photosynthesis occurs in the chloroplast of plant cells." 
+ scores <- .score_by_cosine("CHEK1 DNA damage checkpoint", c(relevant, irrelevant)) + expect_gt(scores[1], scores[2]) + }) + +}) + +describe(".extract_evidence_text", { + + test_that("stops when required columns are missing from the edges dataframe", { + bad_df <- data.frame(source = "A", target = "B", stringsAsFactors = FALSE) + expect_error( + .extract_evidence_text(bad_df), + regexp = "Missing required columns" + ) + }) + + test_that("returns an empty dataframe with correct columns when the INDRA API returns nothing", { + edges <- make_edges() + + # Mock .query_indra_evidence to always return NULL + mockery::stub(.extract_evidence_text, ".query_indra_evidence", NULL) + + result <- suppressWarnings(.extract_evidence_text(edges)) + expect_s3_class(result, "data.frame") + expect_true(all(c("source", "target", "interaction", "site", + "evidenceLink", "stmt_hash", "text", "pmid") %in% names(result))) + expect_equal(nrow(result), 0) + }) + +}) + +describe(".fetch_clean_abstracts_xml", { + + test_that("returns an empty list when given an empty pmids vector", { + result <- .fetch_clean_abstracts_xml(character(0)) + expect_true(is.list(result)) + expect_length(result, 0) + }) + + test_that("stores an empty string for a PMID that triggers an API error", { + mockery::stub( + .fetch_clean_abstracts_xml, + "entrez_fetch", + function(...) 
stop("network error") + ) + result <- suppressMessages(.fetch_clean_abstracts_xml(c("99999999"))) + expect_true("99999999" %in% names(result)) + expect_equal(result[["99999999"]], "") + }) + +}) + +describe("filterSubnetworkByContext", { + + test_that("returns nodes, edges, and evidence that match the query tags (happy path)", { + nodes <- make_nodes() + edges <- make_edges() + + # --- mock .extract_evidence_text --- + mock_evidence <- data.frame( + source = c("A", "B"), + target = c("B", "C"), + interaction = c("activates", "inhibits"), + site = c("T308", "S473"), + evidenceLink = c("https://example.com/1", "https://example.com/2"), + stmt_hash = c("hash1", "hash2"), + text = c( + "CHEK1 phosphorylates CDC25A in response to DNA damage.", + "Unrelated text about lipid metabolism and glucose uptake." + ), + pmid = c("11111111", "22222222"), + stringsAsFactors = FALSE + ) + mockery::stub(filterSubnetworkByContext, ".extract_evidence_text", mock_evidence) + + # --- mock .fetch_clean_abstracts_xml --- + mock_abstracts <- list( + "11111111" = "CHEK1 phosphorylates CDC25A in response to DNA damage.", + "22222222" = "Unrelated text about lipid metabolism and glucose uptake." 
+ ) + mockery::stub(filterSubnetworkByContext, ".fetch_clean_abstracts_xml", mock_abstracts) + + result <- filterSubnetworkByContext( + nodes = nodes, + edges = edges, + query = c("CHEK1", "DNA damage"), + cutoff = 1, + method = "tag_count" + ) + + # Structure check + expect_named(result, c("nodes", "edges", "evidence")) + + # Only the CHEK1/DNA-damage abstract passed the cutoff + expect_equal(nrow(result$edges), 1) + expect_equal(result$edges$stmt_hash, "hash1") + + # Nodes should only contain those referenced by surviving edges + expect_true(all(result$nodes$id %in% c("A", "B", "C"))) + expect_false("C" %in% result$nodes$id) # C only appears in the filtered-out edge + + # Evidence rows carry a score column + expect_true("score" %in% names(result$evidence)) + expect_true(all(result$evidence$score >= 1)) + }) + +}) \ No newline at end of file diff --git a/vignettes/Filter-By-Context.Rmd b/vignettes/Filter-By-Context.Rmd new file mode 100644 index 0000000..fbc167a --- /dev/null +++ b/vignettes/Filter-By-Context.Rmd @@ -0,0 +1,410 @@ +--- +title: "Filtering Subnetworks by Biological Context" +author: "" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +vignette: > + %\VignetteIndexEntry{Filtering Subnetworks by Biological Context} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +## Overview + +This vignette demonstrates how to use `filterSubnetworkByContext()` to filter a +protein interaction subnetwork by the contextual relevance of its supporting +literature. The function: + +1. Retrieves evidence sentences from the INDRA database for each edge in the network +2. Fetches the corresponding PubMed abstracts +3. Scores each abstract against a user-supplied text query using TF-IDF cosine score +4. 
Returns only the nodes, edges, and evidence whose abstracts exceed a score cutoff + +This is useful when a subnetwork contains many edges supported by literature from +unrelated biological contexts, and you want to focus on edges relevant to a +specific research question — in this case, **DNA damage repair in cancer**. + +## Input Data + +`filterSubnetworkByContext()` expects a nodes and edges dataframe, typically +produced by `getSubnetworkFromIndra()`. For this example we construct a small +representative input table directly, mimicking the structure of a proteomics +experiment centred on the DNA damage response kinase **CHK1**. + +The input table contains one row per protein with columns for the UniProt +mnemonic identifier, the log2 fold-change, and the adjusted p-value from a +differential expression analysis. + +```{r input-data} +input <- data.frame( + Protein = c("CHK1_HUMAN", "RFA1_HUMAN", "CLH1_HUMAN", "CRTC3_HUMAN"), + log2FC = c(2.31, 1.87, 1.45, 1.12), + adj.pvalue = c(0.0021, 0.0089, 0.0310, 0.0490), + stringsAsFactors = FALSE +) + +input +``` + +``` +## Protein log2FC adj.pvalue +## 1 CHK1_HUMAN 2.31 0.0021 +## 2 RFA1_HUMAN 1.87 0.0089 +## 3 CLH1_HUMAN 1.45 0.0310 +## 4 CRTC3_HUMAN 1.12 0.0490 +``` + +All four proteins are up-regulated (positive log2FC) and statistically +significant (adj.pvalue < 0.05). + +--- + +## Building the Subnetwork + +### Step 1 — Annotate proteins with INDRA metadata + +`annotateProteinInfoFromIndra()` maps UniProt mnemonics to HGNC gene identifiers +and other metadata used downstream by the INDRA query engine. + +```{r annotate} +library(MSstatsBioNet) +annotated_df <- annotateProteinInfoFromIndra(input, "Uniprot_Mnemonic") +``` + +### Step 2 — Retrieve the interaction subnetwork + +`getSubnetworkFromIndra()` queries the INDRA database for curated causal +interactions among the annotated proteins and returns a list containing `$nodes` +and `$edges` dataframes. 
+ +Key parameters used here: + +- **`pvalueCutoff = 0.2`** — relaxed threshold to retain more candidate edges for + downstream context filtering +- **`evidence_count_cutoff = 1`** — keep edges supported by at least one + literature statement +- **`force_include_other = "HGNC:1925"`** — always include CHK1 (HGNC:1925) + regardless of significance, as it is the focal protein of interest +- **`filter_by_curation = FALSE`** — include both curated and automatically + extracted interactions + +```{r subnetwork} +subnetwork <- getSubnetworkFromIndra( + annotated_df, + pvalueCutoff = 0.2, + logfc_cutoff = NULL, + evidence_count_cutoff = 1, + sources_filter = NULL, + force_include_other = "HGNC:1925", + filter_by_curation = FALSE +) + +# Inspect the unfiltered network +nrow(subnetwork$nodes) +nrow(subnetwork$edges) +``` + +--- + +## Filtering by Context: Tag Count + +### Defining the Query + +The query string is compared against each PubMed abstract supporting the network +edges. A richer query — one that includes synonyms, abbreviations, and related +terms — improves recall under TF-IDF, which relies on exact token matching rather +than semantic understanding. + +The expanded query below was produced with the help of a chatbot and +covers the major vocabulary used in the DNA damage repair and cancer literature. 
+
+```{r tags}
+tags <- c(
+  "dna damage repair",
+  "cancer",
+  "oncology",
+  "dna repair",
+  "genome integrity",
+  "genomic instability",
+  "double strand break",
+  "dsb",
+  "single strand break",
+  "ssb",
+  "base excision repair",
+  "ber",
+  "nucleotide excision repair",
+  "ner",
+  "mismatch repair",
+  "mmr",
+  "homologous recombination",
+  "hr",
+  "non homologous end joining",
+  "nhej",
+  "brca1",
+  "brca2",
+  "atm",
+  "atr",
+  "p53",
+  "tp53",
+  "parp",
+  "tumor suppressor",
+  "oncogene",
+  "carcinogenesis",
+  "tumorigenesis",
+  "chemotherapy resistance",
+  "radiation resistance",
+  "genotoxic stress",
+  "replication stress",
+  "oxidative dna damage",
+  "somatic mutation",
+  "tumor mutational burden",
+  "tmb"
+)
+```
+
+> **Tip:** You can iteratively refine `tags` by inspecting the
+> scores in `filtered_network$evidence` and adding terms that appear frequently
+> in high-scoring abstracts but are absent from your query.
+
+`filterSubnetworkByContext()` ties everything together. The
+`cutoff` parameter controls stringency — only edges whose supporting
+abstracts score at or above this value are retained.
+
+```{r filter}
+filtered_network <- filterSubnetworkByContext(
+  nodes = subnetwork$nodes,
+  edges = subnetwork$edges,
+  method = "tag_count",
+  cutoff = 3,
+  query = tags
+)
+```
+
+The function prints a progress summary to the console:
+
+```
+Processing N unique statement hashes...
+Fetching M abstracts...
+Progress: M/M (100.0%)
+Done fetching abstracts!
+
+X / M abstracts passed cutoff (score >= 3)
+Retained: A edges (of B), C nodes (of D), E evidence rows (of F)
+```
+
+
+### Filtered nodes
+
+```{r nodes}
+filtered_network$nodes
+```
+
+Only proteins connected by at least one contextually relevant edge are retained.
+
+### Filtered edges
+
+```{r edges}
+filtered_network$edges
+```
+
+Each row represents a causal interaction (e.g. phosphorylation, activation)
+supported by literature that passed the score threshold. 
+ +### Evidence with scores + +```{r evidence} +filtered_network$evidence +``` + +The evidence dataframe contains the following columns: + +| Column | Description | +|---|---| +| `source` | Source protein / gene | +| `target` | Target protein / gene | +| `interaction` | Interaction type (e.g. Phosphorylation) | +| `site` | Modification site if applicable | +| `evidenceLink` | URL to the INDRA evidence viewer | +| `stmt_hash` | Unique INDRA statement identifier | +| `text` | Sentence extracted from the supporting paper | +| `pmid` | PubMed ID of the source article | +| `score` | Cosine score of the abstract vs. query | + +You can sort by score to identify the most on-topic supporting evidence: + +```{r sort-evidence} +filtered_network$evidence[ + order(filtered_network$evidence$score, decreasing = TRUE), +] +``` + +### Defining a cutoff + +```{r explore-cutoff} +# Run with permissive cutoff to see full score distribution +exploratory <- filterSubnetworkByContext( + nodes = subnetwork$nodes, + edges = subnetwork$edges, + cutoff = 0.0, + query = tags +) + +summary(exploratory$evidence$score) +hist(exploratory$evidence$score, + breaks = 30, + main = "Distribution of abstract scores", + xlab = "Number of tags matched", + col = "steelblue") +``` + +--- + +## Filtering by Context: Cosine score + +### Defining the Query + +The query string is compared against each PubMed abstract supporting the network +edges. A richer query — one that includes synonyms, abbreviations, and related +terms — improves recall under TF-IDF, which relies on exact token matching rather +than semantic understanding. + +The expanded query below was produced with the help of a chatbot and +covers the major vocabulary used in the DNA damage repair and cancer literature. 
+ +```{r query} +my_query <- "DNA damage repair cancer oncology DNA repair genome integrity + genomic instability double strand break DSB single strand break SSB + base excision repair BER nucleotide excision repair NER mismatch repair MMR + homologous recombination HR non-homologous end joining NHEJ BRCA1 BRCA2 + ATM ATR p53 TP53 PARP tumor suppressor oncogene carcinogenesis tumorigenesis + chemotherapy resistance radiation resistance genotoxic stress replication stress + oxidative DNA damage somatic mutation tumor mutational burden TMB" +``` + +> **Tip:** You can iteratively refine `my_query` by inspecting the +> scores in `filtered_network$evidence` and adding terms that appear frequently +> in high-scoring abstracts but are absent from your query. + +`filterSubnetworkByContext()` ties everything together. The +`cutoff` parameter controls stringency — only edges whose supporting +abstracts score at or above this value are retained. + +```{r} +filtered_network <- filterSubnetworkByContext( + nodes = subnetwork$nodes, + edges = subnetwork$edges, + method = "cosine", + cutoff = 0.10, + query = my_query +) +``` + +The function prints a progress summary to the console: + +``` +Processing N unique statement hashes... +Fetching M abstracts... +Progress: M/M (100.0%) +Done fetching abstracts! + +X / M abstracts passed score cutoff (>= 0.10) +Retained: A edges (of B), C nodes (of D), E evidence rows (of F) +``` + + +### Filtered nodes + +```{r} +filtered_network$nodes +``` + +Only proteins connected by at least one contextually relevant edge are retained. + +### Filtered edges + +```{r} +filtered_network$edges +``` + +Each row represents a causal interaction (e.g. phosphorylation, activation) +supported by literature that passed the score threshold. 
+ +### Evidence with scores + +```{r} +filtered_network$evidence +``` + +The evidence dataframe contains the following columns: + +| Column | Description | +|---|---| +| `source` | Source protein / gene | +| `target` | Target protein / gene | +| `interaction` | Interaction type (e.g. Phosphorylation) | +| `site` | Modification site if applicable | +| `evidenceLink` | URL to the INDRA evidence viewer | +| `stmt_hash` | Unique INDRA statement identifier | +| `text` | Sentence extracted from the supporting paper | +| `pmid` | PubMed ID of the source article | +| `score` | Relevance score (tag count or cosine similarity) | + +You can sort by score to identify the most on-topic supporting evidence: + +```{r} +filtered_network$evidence[ + order(filtered_network$evidence$score, decreasing = TRUE), +] +``` + +### Choosing a score Cutoff + +The right cutoff depends on how broadly the query overlaps with the literature +in your network. As a rough guide: + +| Cutoff | Effect | +|---|---| +| `0.05` | Permissive — removes only completely off-topic abstracts | +| `0.10` | Recommended default for domain-specific queries | +| `0.20` | Stringent — retains only highly on-topic edges | +| `> 0.30` | Very stringent — use only with highly specific queries | + +To explore the score distribution before committing to a cutoff, run the +function at a low threshold and inspect the scores: + +```{r} +# Run with permissive cutoff to see full score distribution +exploratory <- filterSubnetworkByContext( + nodes = subnetwork$nodes, + edges = subnetwork$edges, + cutoff = 0.0, + method = "cosine", + query = my_query +) + +summary(exploratory$evidence$score) +hist(exploratory$evidence$score, + breaks = 30, + main = "Distribution of abstract scores", + xlab = "Cosine score to query", + col = "steelblue") +``` + +--- + +## Session Info + +```{r session-info, eval=TRUE} +sessionInfo() +```