From cb324d449f255df8abd37e1330638761be23aad4 Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 01:20:56 +0800 Subject: [PATCH 1/7] chore: update ignore files for LLM config --- .Rbuildignore | 1 + .gitignore | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 66d27f9..7f92f90 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ outdated inst/sticker local_test .trae +.env diff --git a/.gitignore b/.gitignore index 7847d1b..7420ac0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,10 @@ __pycache__ __init__.py __init__.pyc .Rproj.user +.Rhistory +.RData +*.Rproj .vscode/ .trae - +.env \ No newline at end of file From ce59da68a826e0fb74c37edf2175a2589734f351 Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 01:21:04 +0800 Subject: [PATCH 2/7] refactor: replace fanyi with aisdk and add provider support in interpret --- R/interpret.R | 1703 ++++++++++++++++++++++++++----------------------- 1 file changed, 895 insertions(+), 808 deletions(-) diff --git a/R/interpret.R b/R/interpret.R index caa717b..8c28475 100644 --- a/R/interpret.R +++ b/R/interpret.R @@ -11,256 +11,264 @@ #' @param context A string describing the experimental background. #' @param n_pathways Number of top pathways to consider initially. Default is 50 (Agent 1 will filter them). #' @param model The LLM model to use. +#' @param provider The LLM provider. Default is NULL (inferred from model or handled by aisdk). #' @param api_key The API key for the LLM. #' @param add_ppi Boolean, whether to use PPI network integration. #' @param gene_fold_change Named vector of logFC for expression context. #' @return A detailed interpretation list. 
#' @author Guangchuang Yu #' @export -interpret_agent <- function(x, context = NULL, n_pathways = 50, model = "deepseek-chat", api_key = NULL, add_ppi = FALSE, gene_fold_change = NULL) { - if (missing(x)) { - stop("enrichment result 'x' is required.") +interpret_agent <- function(x, context = NULL, n_pathways = 50, model = "deepseek-chat", provider = NULL, api_key = NULL, add_ppi = FALSE, gene_fold_change = NULL) { + if (missing(x)) { + stop("enrichment result 'x' is required.") + } + + # Process input into a list of data frames (one per cluster/group) + res_list <- process_enrichment_input(x, n_pathways) + + if (length(res_list) == 0) { + return("No significant pathways found to interpret.") + } + + # Process each cluster with the multi-agent pipeline + results <- lapply(names(res_list), function(name) { + item <- res_list[[name]] + df <- item$df + original_genes <- item$genes + + # Check for fallback mode + fallback_mode <- FALSE + pathway_text <- "" + + if (nrow(df) == 0) { + if (!is.null(original_genes) && length(original_genes) > 0) { + fallback_mode <- TRUE + warning(sprintf("Cluster '%s': No enriched pathways. Falling back to gene-based interpretation. Confidence may be lower.", name)) + pathway_text <- paste( + "No significant pathways enriched.", + "Top Genes:", paste(head(original_genes, 50), collapse = ", ") + ) + } else { + return(NULL) + } + } else { + message(sprintf("Processing cluster '%s' with Agent 1: The Cleaner...", name)) + + # Format initial pathways for Agent 1 + cols_to_keep <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df)) + pathway_text <- paste( + apply(df[, cols_to_keep, drop = FALSE], 1, function(row) { + paste(names(row), row, sep = ": ", collapse = ", ") + }), + collapse = "\n" + ) + } + + # --- Step 1: Agent Cleaner --- + # Skip cleaner in fallback mode or adapt it? + # For now, if fallback, we skip cleaner as there are no pathways to clean. 
+ cleaned_pathways <- pathway_text + if (!fallback_mode) { + clean_res <- run_agent_cleaner(pathway_text, context, model, provider, api_key) + if (is.null(clean_res) || is.null(clean_res$kept_pathways)) { + warning("Agent Cleaner failed or returned empty results. Falling back to using top pathways.") + cleaned_pathways <- pathway_text # Fallback + } else { + cleaned_pathways <- paste("Selected Relevant Pathways (filtered by Agent Cleaner):", + paste(clean_res$kept_pathways, collapse = ", "), + "\nReasoning:", clean_res$reasoning, + sep = "\n" + ) + } } - # Process input into a list of data frames (one per cluster/group) - res_list <- process_enrichment_input(x, n_pathways) + # --- Step 2: Agent Detective --- + message(sprintf("Processing cluster '%s' with Agent 2: The Detective...", name)) - if (length(res_list) == 0) { - return("No significant pathways found to interpret.") + # Prepare Network Data (PPI) + ppi_network_text <- NULL + if (add_ppi) { + # Extract genes + all_genes <- NULL + if (fallback_mode) { + all_genes <- original_genes + } else { + all_genes <- unique(unlist(strsplit(df$geneID, "/"))) + } + + if (length(all_genes) > 0) { + ppi_network_text <- .get_ppi_context_text(all_genes, x) + } } - # Process each cluster with the multi-agent pipeline - results <- lapply(names(res_list), function(name) { - item <- res_list[[name]] - df <- item$df - original_genes <- item$genes - - # Check for fallback mode - fallback_mode <- FALSE - pathway_text <- "" - - if (nrow(df) == 0) { - if (!is.null(original_genes) && length(original_genes) > 0) { - fallback_mode <- TRUE - warning(sprintf("Cluster '%s': No enriched pathways. Falling back to gene-based interpretation. 
Confidence may be lower.", name)) - pathway_text <- paste("No significant pathways enriched.", - "Top Genes:", paste(head(original_genes, 50), collapse=", ")) - } else { - return(NULL) - } - } else { - message(sprintf("Processing cluster '%s' with Agent 1: The Cleaner...", name)) - - # Format initial pathways for Agent 1 - cols_to_keep <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df)) - pathway_text <- paste( - apply(df[, cols_to_keep, drop=FALSE], 1, function(row) { - paste(names(row), row, sep=": ", collapse=", ") - }), - collapse="\n" - ) - } - - # --- Step 1: Agent Cleaner --- - # Skip cleaner in fallback mode or adapt it? - # For now, if fallback, we skip cleaner as there are no pathways to clean. - cleaned_pathways <- pathway_text - if (!fallback_mode) { - clean_res <- run_agent_cleaner(pathway_text, context, model, api_key) - if (is.null(clean_res) || is.null(clean_res$kept_pathways)) { - warning("Agent Cleaner failed or returned empty results. 
Falling back to using top pathways.") - cleaned_pathways <- pathway_text # Fallback - } else { - cleaned_pathways <- paste("Selected Relevant Pathways (filtered by Agent Cleaner):", - paste(clean_res$kept_pathways, collapse=", "), - "\nReasoning:", clean_res$reasoning, sep="\n") - } - } - - # --- Step 2: Agent Detective --- - message(sprintf("Processing cluster '%s' with Agent 2: The Detective...", name)) - - # Prepare Network Data (PPI) - ppi_network_text <- NULL - if (add_ppi) { - # Extract genes - all_genes <- NULL - if (fallback_mode) { - all_genes <- original_genes + # Prepare Fold Change Data + fc_text <- NULL + if (!is.null(gene_fold_change)) { + all_genes <- NULL + if (fallback_mode) { + all_genes <- original_genes + } else { + all_genes <- unique(unlist(strsplit(df$geneID, "/"))) + } + + common_genes <- intersect(all_genes, names(gene_fold_change)) + if (length(common_genes) > 0) { + fc_subset <- gene_fold_change[common_genes] + fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] + top_fc <- head(fc_subset, 20) + fc_text <- paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ") + } + } + + detective_res <- run_agent_detective(cleaned_pathways, ppi_network_text, fc_text, context, model, provider, api_key, fallback_mode) + + # --- Step 3: Agent Synthesizer --- + message(sprintf("Processing cluster '%s' with Agent 3: The Storyteller...", name)) + + final_res <- run_agent_synthesizer(cleaned_pathways, detective_res, context, model, provider, api_key, fallback_mode) + + # Post-processing: Add cluster name and parse refined network if available + if (is.list(final_res)) { + final_res$cluster <- name + if (fallback_mode) { + final_res$data_source <- "gene_list_only" + } + + # Merge Detective findings into final result for transparency + if (!is.null(detective_res)) { + final_res$regulatory_drivers <- detective_res$key_drivers + final_res$refined_network <- detective_res$refined_network + final_res$network_evidence <- 
detective_res$network_evidence + } + + # Helper to parse refined network to igraph + if (!is.null(final_res$refined_network)) { + rn_df <- tryCatch( + { + if (is.data.frame(final_res$refined_network)) { + final_res$refined_network } else { - all_genes <- unique(unlist(strsplit(df$geneID, "/"))) - } - - if (length(all_genes) > 0) { - ppi_network_text <- .get_ppi_context_text(all_genes, x) + do.call(rbind, lapply(final_res$refined_network, as.data.frame)) } - } - - # Prepare Fold Change Data - fc_text <- NULL - if (!is.null(gene_fold_change)) { - all_genes <- NULL - if (fallback_mode) { - all_genes <- original_genes - } else { - all_genes <- unique(unlist(strsplit(df$geneID, "/"))) - } - - common_genes <- intersect(all_genes, names(gene_fold_change)) - if (length(common_genes) > 0) { - fc_subset <- gene_fold_change[common_genes] - fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] - top_fc <- head(fc_subset, 20) - fc_text <- paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ") - } - } - - detective_res <- run_agent_detective(cleaned_pathways, ppi_network_text, fc_text, context, model, api_key, fallback_mode) - - # --- Step 3: Agent Synthesizer --- - message(sprintf("Processing cluster '%s' with Agent 3: The Storyteller...", name)) - - final_res <- run_agent_synthesizer(cleaned_pathways, detective_res, context, model, api_key, fallback_mode) + }, + error = function(e) NULL + ) - # Post-processing: Add cluster name and parse refined network if available - if (is.list(final_res)) { - final_res$cluster <- name - if (fallback_mode) { - final_res$data_source <- "gene_list_only" - } - - # Merge Detective findings into final result for transparency - if (!is.null(detective_res)) { - final_res$regulatory_drivers <- detective_res$key_drivers - final_res$refined_network <- detective_res$refined_network - final_res$network_evidence <- detective_res$network_evidence - } - - # Helper to parse refined network to igraph - if 
(!is.null(final_res$refined_network)) { - rn_df <- tryCatch({ - if (is.data.frame(final_res$refined_network)) { - final_res$refined_network - } else { - do.call(rbind, lapply(final_res$refined_network, as.data.frame)) - } - }, error = function(e) NULL) - - if (!is.null(rn_df) && nrow(rn_df) > 0) { - colnames(rn_df)[colnames(rn_df) == "source"] <- "from" - colnames(rn_df)[colnames(rn_df) == "target"] <- "to" - if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) { - final_res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE) - } - } - } + if (!is.null(rn_df) && nrow(rn_df) > 0) { + colnames(rn_df)[colnames(rn_df) == "source"] <- "from" + colnames(rn_df)[colnames(rn_df) == "target"] <- "to" + if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) { + final_res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE) + } } - - return(final_res) - }) - - names(results) <- names(res_list) - - if (length(results) == 1 && names(results)[1] == "Default") { - return(results[[1]]) - } else { - class(results) <- c("interpretation_list", "list") - return(results) + } } + + return(final_res) + }) + + names(results) <- names(res_list) + + if (length(results) == 1 && names(results)[1] == "Default") { + return(results[[1]]) + } else { + class(results) <- c("interpretation_list", "list") + return(results) + } } -run_agent_cleaner <- function(pathways, context, model, api_key) { - prompt <- paste0( - "You are 'Agent Cleaner', an expert bioinformatics curator.\n", - "Your task is to filter a list of enriched pathways to retain only those relevant to the specific experimental context.\n\n", - if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", - "Raw Enriched Pathways:\n", pathways, "\n\n", - "Instructions:\n", - "1. Identify and REMOVE 'housekeeping' pathways (e.g., Ribosome, Spliceosome, RNA transport) unless they are specifically relevant to the context (e.g., cancer proliferation).\n", - "2. 
Identify and REMOVE redundant or overly broad terms.\n",
-        "3. KEEP disease-specific, tissue-specific, or phenotype-driving pathways.\n\n",
-        "Output JSON format:\n",
-        "{\n",
-        "  \"kept_pathways\": [\"List of pathway names to keep\"],\n",
-        "  \"discarded_pathways\": [\"List of discarded pathways\"],\n",
-        "  \"reasoning\": \"Brief explanation of the filtering strategy used.\"\n",
-        "}"
-    )
-
-    call_llm_fanyi(prompt, model, api_key)
+run_agent_cleaner <- function(pathways, context, model, provider, api_key) {
+  prompt <- paste0(
+    "You are 'Agent Cleaner', an expert bioinformatics curator.\n",
+    "Your task is to filter a list of enriched pathways to retain only those relevant to the specific experimental context.\n\n",
+    if (!is.null(context)) paste0("Context: ", context, "\n\n") else "",
+    "Raw Enriched Pathways:\n", pathways, "\n\n",
+    "Instructions:\n",
+    "1. Identify and REMOVE 'housekeeping' pathways (e.g., Ribosome, Spliceosome, RNA transport) unless they are specifically relevant to the context (e.g., cancer proliferation).\n",
+    "2. Identify and REMOVE redundant or overly broad terms.\n",
+    "3. KEEP disease-specific, tissue-specific, or phenotype-driving pathways.\n\n",
+    "Output JSON format:\n",
+    "{\n",
+    "  \"kept_pathways\": [\"List of pathway names to keep\"],\n",
+    "  \"discarded_pathways\": [\"List of discarded pathways\"],\n",
+    "  \"reasoning\": \"Brief explanation of the filtering strategy used.\"\n",
+    "}"
+  )
+
+  call_llm_aisdk(prompt, model, provider, api_key)
}

-run_agent_detective <- function(pathways, ppi_network, fold_change, context, model, api_key, fallback_mode = FALSE) {
-    prompt <- paste0(
-        "You are 'Agent Detective', an expert systems biologist.\n",
-        "Your task is to identify Key Drivers (Regulators) and Functional Modules based on the filtered pathways and available network data.\n\n",
-        if (!is.null(context)) paste0("Context: ", context, "\n\n") else "",
-        if (fallback_mode) "WARNING: No significant enriched pathways found. 
You are analyzing RAW GENE LISTS. Proceed with caution.\n" else "", - "Filtered Pathways:\n", pathways, "\n\n", - if (!is.null(ppi_network)) paste0("PPI Network Evidence:\n", ppi_network, "\n\n") else "", - if (!is.null(fold_change)) paste0("Gene Fold Changes:\n", fold_change, "\n\n") else "", - "Instructions:\n", - "1. Identify potential Master Regulators (TFs, Kinases) that explain the pathways.\n", - "2. Define Functional Modules (groups of interacting proteins) using the PPI network.\n", - "3. Refine the PPI network to a core regulatory sub-network.\n\n", - "Output JSON format:\n", - "{\n", - " \"key_drivers\": [\"List of top 3-5 driver genes\"],\n", - " \"functional_modules\": [\"List of identified modules (e.g. 'TCR Complex', 'Cell Cycle G1/S')\"],\n", - " \"refined_network\": [{\"source\": \"GeneA\", \"target\": \"GeneB\", \"interaction\": \"activation\", \"reason\": \"evidence\"}],\n", - " \"network_evidence\": \"Narrative describing how the network supports the drivers.\"\n", - "}" - ) - - call_llm_fanyi(prompt, model, api_key) +run_agent_detective <- function(pathways, ppi_network, fold_change, context, model, provider, api_key, fallback_mode = FALSE) { + prompt <- paste0( + "You are 'Agent Detective', an expert systems biologist.\n", + "Your task is to identify Key Drivers (Regulators) and Functional Modules based on the filtered pathways and available network data.\n\n", + if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", + if (fallback_mode) "WARNING: No significant enriched pathways found. You are analyzing RAW GENE LISTS. Proceed with caution.\n" else "", + "Filtered Pathways:\n", pathways, "\n\n", + if (!is.null(ppi_network)) paste0("PPI Network Evidence:\n", ppi_network, "\n\n") else "", + if (!is.null(fold_change)) paste0("Gene Fold Changes:\n", fold_change, "\n\n") else "", + "Instructions:\n", + "1. Identify potential Master Regulators (TFs, Kinases) that explain the pathways.\n", + "2. 
Define Functional Modules (groups of interacting proteins) using the PPI network.\n", + "3. Refine the PPI network to a core regulatory sub-network.\n\n", + "Output JSON format:\n", + "{\n", + " \"key_drivers\": [\"List of top 3-5 driver genes\"],\n", + " \"functional_modules\": [\"List of identified modules (e.g. 'TCR Complex', 'Cell Cycle G1/S')\"],\n", + " \"refined_network\": [{\"source\": \"GeneA\", \"target\": \"GeneB\", \"interaction\": \"activation\", \"reason\": \"evidence\"}],\n", + " \"network_evidence\": \"Narrative describing how the network supports the drivers.\"\n", + "}" + ) + + call_llm_aisdk(prompt, model, provider, api_key) } -run_agent_synthesizer <- function(pathways, detective_report, context, model, api_key, fallback_mode = FALSE) { - # Convert detective report to string if it's a list - detective_text <- "" - if (!is.null(detective_report) && is.list(detective_report)) { - # Check if fields exist before accessing to avoid errors - key_drivers <- if (!is.null(detective_report$key_drivers)) paste(detective_report$key_drivers, collapse=", ") else "None identified" - functional_modules <- if (!is.null(detective_report$functional_modules)) paste(detective_report$functional_modules, collapse=", ") else "None identified" - network_evidence <- if (!is.null(detective_report$network_evidence)) detective_report$network_evidence else "None provided" - - detective_text <- paste( - "Key Drivers: ", key_drivers, "\n", - "Functional Modules: ", functional_modules, "\n", - "Network Evidence: ", network_evidence, - sep="" - ) - } +run_agent_synthesizer <- function(pathways, detective_report, context, model, provider, api_key, fallback_mode = FALSE) { + # Convert detective report to string if it's a list + detective_text <- "" + if (!is.null(detective_report) && is.list(detective_report)) { + # Check if fields exist before accessing to avoid errors + key_drivers <- if (!is.null(detective_report$key_drivers)) paste(detective_report$key_drivers, collapse = ", 
") else "None identified" + functional_modules <- if (!is.null(detective_report$functional_modules)) paste(detective_report$functional_modules, collapse = ", ") else "None identified" + network_evidence <- if (!is.null(detective_report$network_evidence)) detective_report$network_evidence else "None provided" - prompt <- paste0( - "You are 'Agent Storyteller', a senior scientific writer.\n", - "Your task is to synthesize the findings from previous agents into a coherent biological narrative.\n\n", - if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", - if (fallback_mode) "WARNING: No significant enriched pathways found. You are analyzing RAW GENE LISTS. Interpretation confidence should be evaluated cautiously.\n" else "", - "Data Sources:\n", - "1. Relevant Pathways:\n", pathways, "\n\n", - "2. Detective's Report (Drivers & Modules):\n", detective_text, "\n\n", - "Instructions:\n", - "1. Write a comprehensive Overview.\n", - "2. Explain Key Mechanisms, explicitly linking Regulators -> Modules -> Pathways.\n", - "3. Formulate a Hypothesis.\n", - "4. Draft a Narrative paragraph for a paper.\n\n", - "Output JSON format:\n", - "{\n", - " \"overview\": \"...\",\n", - " \"key_mechanisms\": \"...\",\n", - " \"hypothesis\": \"...\",\n", - " \"narrative\": \"...\"\n", - "}" + detective_text <- paste( + "Key Drivers: ", key_drivers, "\n", + "Functional Modules: ", functional_modules, "\n", + "Network Evidence: ", network_evidence, + sep = "" ) - - call_llm_fanyi(prompt, model, api_key) + } + + prompt <- paste0( + "You are 'Agent Storyteller', a senior scientific writer.\n", + "Your task is to synthesize the findings from previous agents into a coherent biological narrative.\n\n", + if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", + if (fallback_mode) "WARNING: No significant enriched pathways found. You are analyzing RAW GENE LISTS. Interpretation confidence should be evaluated cautiously.\n" else "", + "Data Sources:\n", + "1. 
Relevant Pathways:\n", pathways, "\n\n", + "2. Detective's Report (Drivers & Modules):\n", detective_text, "\n\n", + "Instructions:\n", + "1. Write a comprehensive Overview.\n", + "2. Explain Key Mechanisms, explicitly linking Regulators -> Modules -> Pathways.\n", + "3. Formulate a Hypothesis.\n", + "4. Draft a Narrative paragraph for a paper.\n\n", + "Output JSON format:\n", + "{\n", + " \"overview\": \"...\",\n", + " \"key_mechanisms\": \"...\",\n", + " \"hypothesis\": \"...\",\n", + " \"narrative\": \"...\"\n", + "}" + ) + + call_llm_aisdk(prompt, model, provider, api_key) } #' Interpret enrichment results using Large Language Models (LLM) #' -#' This function sends the enrichment results (top significant pathways) along with -#' an optional experimental context to an LLM (e.g., DeepSeek) to generate +#' This function sends the enrichment results (top significant pathways) along with +#' an optional experimental context to an LLM (e.g., DeepSeek) to generate #' a biological interpretation, hypothesis, and narrative suitable for a paper. #' #' @title interpret @@ -268,6 +276,7 @@ run_agent_synthesizer <- function(pathways, detective_report, context, model, ap #' @param context A string describing the experimental background (e.g., "scRNA-seq of mouse myocardial infarction at day 3"). #' @param n_pathways Number of top significant pathways to include in the analysis. Default is 20. #' @param model The LLM model to use. Default is "deepseek-chat". Supported models include "deepseek-chat", "glm-4", "qwen-turbo" etc. +#' @param provider The LLM provider. Default is NULL (inferred from model or handled by aisdk). #' @param api_key The API key for the LLM. If NULL, it tries to fetch from `getOption('yulab_translate')` based on the model. #' @param task Task type, default is "interpretation". Other options include "cell_type"/"annotation" and "phenotype"/"phenotyping". #' @param prior Optional prior knowledge (e.g., a biological hypothesis) to guide the task. 
@@ -276,201 +285,204 @@ run_agent_synthesizer <- function(pathways, detective_report, context, model, ap #' @return A character string containing the LLM-generated interpretation. #' @author Guangchuang Yu #' @export -interpret <- function(x, context = NULL, n_pathways = 20, model = "deepseek-chat", api_key = NULL, task = "interpretation", prior = NULL, add_ppi = FALSE, gene_fold_change = NULL) { - if (missing(x)) { - stop("enrichment result 'x' is required.") - } +interpret <- function(x, context = NULL, n_pathways = 20, model = "deepseek-chat", provider = NULL, api_key = NULL, task = "interpretation", prior = NULL, add_ppi = FALSE, gene_fold_change = NULL) { + if (missing(x)) { + stop("enrichment result 'x' is required.") + } + + # Process input into a list of data frames (one per cluster/group) + res_list <- process_enrichment_input(x, n_pathways) + + if (length(res_list) == 0) { + return("No significant pathways found to interpret.") + } + + # Process each item + results <- lapply(names(res_list), function(name) { + message(sprintf("Interpreting cluster: %s", name)) + item <- res_list[[name]] + df <- item$df + genes <- item$genes - # Process input into a list of data frames (one per cluster/group) - res_list <- process_enrichment_input(x, n_pathways) - - if (length(res_list) == 0) { - return("No significant pathways found to interpret.") + # Get raw genes for this cluster to identify specific markers + # even if no pathways are enriched or if pathways obscure them + if (is.null(genes)) { + # Try to get from x if possible (for single result) + genes <- tryCatch(process_enrichment_input(x, n_pathways)[[name]]$genes, error = function(e) NULL) } - # Process each item - results <- lapply(names(res_list), function(name) { - message(sprintf("Interpreting cluster: %s", name)) - item <- res_list[[name]] - df <- item$df - genes <- item$genes - - # Get raw genes for this cluster to identify specific markers - # even if no pathways are enriched or if pathways obscure them 
- if (is.null(genes)) { - # Try to get from x if possible (for single result) - genes <- tryCatch(process_enrichment_input(x, n_pathways)[[name]]$genes, error=function(e) NULL) - } - - # Top specific genes text - top_genes_text <- NULL - if (!is.null(genes) && length(genes) > 0) { - # If we have fold change, use it to rank - if (!is.null(gene_fold_change)) { - common <- intersect(genes, names(gene_fold_change)) - if (length(common) > 0) { - fc <- gene_fold_change[common] - # Get top upregulated - top_up <- head(names(fc[order(fc, decreasing = TRUE)]), 20) - top_genes_text <- paste(top_up, collapse = ", ") - } else { - top_genes_text <- paste(head(genes, 20), collapse = ", ") - } - } else { - top_genes_text <- paste(head(genes, 20), collapse = ", ") - } - } - - if (nrow(df) == 0) { - # Fallback logic handled inside prompts or earlier? - # For now, let's allow empty df if we have genes - if (is.null(top_genes_text)) { - res <- list( - cluster = name, - overview = "No significant pathways enriched and no marker genes available for interpretation.", - confidence = "None" - ) - class(res) <- c("interpretation", "list") - return(res) - } - pathway_text <- "No significant enriched pathways found." 
- } else { - # Format pathways for prompt - # We typically need ID, Description, GeneRatio/NES, p.adjust, geneID - cols_to_keep <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df)) - pathway_text <- paste( - apply(df[, cols_to_keep, drop=FALSE], 1, function(row) { - paste(names(row), row, sep=": ", collapse=", ") - }), - collapse="\n" - ) - } - - # Determine prior for this cluster - current_prior <- NULL - if (!is.null(prior)) { - if (length(prior) == 1 && is.null(names(prior))) { - current_prior <- prior - } else if (name %in% names(prior)) { - current_prior <- prior[[name]] - } - } - - # Determine PPI/Hub Genes info if requested - ppi_network_text <- NULL - if (add_ppi) { - # Extract all unique genes from the top pathways OR from the top genes list - all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/"))) - if (length(all_genes) == 0 && !is.null(genes)) all_genes <- head(genes, 50) - - if (length(all_genes) > 0) { - ppi_network_text <- .get_ppi_context_text(all_genes, x) - } - } - - # Determine Fold Change info if provided - fc_text <- NULL - if (!is.null(gene_fold_change)) { - # gene_fold_change should be a named vector of logFC - # We filter for genes present in the pathways OR top genes - all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/"))) - if (length(all_genes) == 0 && !is.null(genes)) all_genes <- genes - - common_genes <- intersect(all_genes, names(gene_fold_change)) - - if (length(common_genes) > 0) { - # Sort by absolute FC to show most regulated genes - fc_subset <- gene_fold_change[common_genes] - fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] - - # Take top 20 - top_fc <- head(fc_subset, 20) - fc_text <- paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ") - } - } - - # Construct Prompt based on task - if (task == "annotation" || task == "cell_type") { - prompt <- construct_annotation_prompt(pathway_text, context, name, current_prior, 
ppi_network_text, fc_text, top_genes_text) - } else if (task == "phenotype" || task == "phenotyping") { - prompt <- construct_phenotype_prompt(pathway_text, context, name, ppi_network_text, fc_text) + # Top specific genes text + top_genes_text <- NULL + if (!is.null(genes) && length(genes) > 0) { + # If we have fold change, use it to rank + if (!is.null(gene_fold_change)) { + common <- intersect(genes, names(gene_fold_change)) + if (length(common) > 0) { + fc <- gene_fold_change[common] + # Get top upregulated + top_up <- head(names(fc[order(fc, decreasing = TRUE)]), 20) + top_genes_text <- paste(top_up, collapse = ", ") } else { - prompt <- construct_interpretation_prompt(pathway_text, context, ppi_network_text, fc_text) + top_genes_text <- paste(head(genes, 20), collapse = ", ") } + } else { + top_genes_text <- paste(head(genes, 20), collapse = ", ") + } + } + + if (nrow(df) == 0) { + # Fallback logic handled inside prompts or earlier? + # For now, let's allow empty df if we have genes + if (is.null(top_genes_text)) { + res <- list( + cluster = name, + overview = "No significant pathways enriched and no marker genes available for interpretation.", + confidence = "None" + ) + class(res) <- c("interpretation", "list") + return(res) + } + pathway_text <- "No significant enriched pathways found." 
+ } else { + # Format pathways for prompt + # We typically need ID, Description, GeneRatio/NES, p.adjust, geneID + cols_to_keep <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df)) + pathway_text <- paste( + apply(df[, cols_to_keep, drop = FALSE], 1, function(row) { + paste(names(row), row, sep = ": ", collapse = ", ") + }), + collapse = "\n" + ) + } + + # Determine prior for this cluster + current_prior <- NULL + if (!is.null(prior)) { + if (length(prior) == 1 && is.null(names(prior))) { + current_prior <- prior + } else if (name %in% names(prior)) { + current_prior <- prior[[name]] + } + } + + # Determine PPI/Hub Genes info if requested + ppi_network_text <- NULL + if (add_ppi) { + # Extract all unique genes from the top pathways OR from the top genes list + all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/"))) + if (length(all_genes) == 0 && !is.null(genes)) all_genes <- head(genes, 50) + + if (length(all_genes) > 0) { + ppi_network_text <- .get_ppi_context_text(all_genes, x) + } + } + + # Determine Fold Change info if provided + fc_text <- NULL + if (!is.null(gene_fold_change)) { + # gene_fold_change should be a named vector of logFC + # We filter for genes present in the pathways OR top genes + all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/"))) + if (length(all_genes) == 0 && !is.null(genes)) all_genes <- genes + + common_genes <- intersect(all_genes, names(gene_fold_change)) + + if (length(common_genes) > 0) { + # Sort by absolute FC to show most regulated genes + fc_subset <- gene_fold_change[common_genes] + fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] - # Call LLM via fanyi - res <- call_llm_fanyi(prompt, model, api_key) - - # If result is a list (JSON parsed), add cluster name - if (is.list(res)) { - res$cluster <- name - - # Post-process refined_network if present to be an igraph object - if (!is.null(res$refined_network)) { - # refined_network is a list of 
lists/dataframes from JSON - # Convert to dataframe - rn_df <- tryCatch({ - # It might be a list of lists or a dataframe already depending on jsonlite parsing - if (is.data.frame(res$refined_network)) { - res$refined_network - } else { - # Check if all elements are lists with same structure - if (all(sapply(res$refined_network, is.list))) { - # Convert each list element to dataframe row - do.call(rbind, lapply(res$refined_network, function(x) as.data.frame(x, stringsAsFactors=FALSE))) - } else { - NULL - } - } - }, error = function(e) NULL) - - if (!is.null(rn_df) && nrow(rn_df) > 0) { - # Create igraph object - # Columns expected: source, target, interaction, reason - # Map source/target to from/to for igraph - colnames(rn_df)[colnames(rn_df) == "source"] <- "from" - colnames(rn_df)[colnames(rn_df) == "target"] <- "to" - - # Ensure we have from and to - if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) { - res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE) # Assuming undirected for simplicity or directed if interaction implies - } - } + # Take top 20 + top_fc <- head(fc_subset, 20) + fc_text <- paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ") + } + } + + # Construct Prompt based on task + if (task == "annotation" || task == "cell_type") { + prompt <- construct_annotation_prompt(pathway_text, context, name, current_prior, ppi_network_text, fc_text, top_genes_text) + } else if (task == "phenotype" || task == "phenotyping") { + prompt <- construct_phenotype_prompt(pathway_text, context, name, ppi_network_text, fc_text) + } else { + prompt <- construct_interpretation_prompt(pathway_text, context, ppi_network_text, fc_text) + } + + # Call LLM via fanyi/aisdk + res <- call_llm_aisdk(prompt, model, provider, api_key) + + # If result is a list (JSON parsed), add cluster name + if (is.list(res)) { + res$cluster <- name + + # Post-process refined_network if present to be an igraph object + if (!is.null(res$refined_network)) { + # 
refined_network is a list of lists/dataframes from JSON + # Convert to dataframe + rn_df <- tryCatch( + { + # It might be a list of lists or a dataframe already depending on jsonlite parsing + if (is.data.frame(res$refined_network)) { + res$refined_network + } else { + # Check if all elements are lists with same structure + if (all(sapply(res$refined_network, is.list))) { + # Convert each list element to dataframe row + do.call(rbind, lapply(res$refined_network, function(x) as.data.frame(x, stringsAsFactors = FALSE))) + } else { + NULL + } } - } else if (is.character(res)) { - # Handle raw text response fallback - res <- list( - cluster = name, - overview = res, # Put raw text in overview - confidence = "Low", - reasoning = "Failed to parse structured JSON response from LLM. Raw text provided." - ) - class(res) <- c("interpretation", "list") - } + }, + error = function(e) NULL + ) - # If res is NULL (which shouldn't happen with call_llm_fanyi wrapper unless error caught inside and returned NULL, but wrapper returns raw text on error), handle it - if (is.null(res)) { - res <- list( - cluster = name, - overview = "Failed to retrieve interpretation from LLM.", - confidence = "None", - reasoning = "API call failed or returned empty response." 
- ) - class(res) <- c("interpretation", "list") + if (!is.null(rn_df) && nrow(rn_df) > 0) { + # Create igraph object + # Columns expected: source, target, interaction, reason + # Map source/target to from/to for igraph + colnames(rn_df)[colnames(rn_df) == "source"] <- "from" + colnames(rn_df)[colnames(rn_df) == "target"] <- "to" + + # Ensure we have from and to + if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) { + res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE) # Assuming undirected for simplicity or directed if interaction implies + } } - - return(res) - }) - - names(results) <- names(res_list) + } + } else if (is.character(res)) { + # Handle raw text response fallback + res <- list( + cluster = name, + overview = res, # Put raw text in overview + confidence = "Low", + reasoning = "Failed to parse structured JSON response from LLM. Raw text provided." + ) + class(res) <- c("interpretation", "list") + } - # Return structure - if (length(results) == 1 && names(results)[1] == "Default") { - return(results[[1]]) - } else { - class(results) <- c("interpretation_list", "list") - return(results) + # If res is NULL (which shouldn't happen with call_llm_aisdk wrapper unless error caught inside and returned NULL, but wrapper returns raw text on error), handle it + if (is.null(res)) { + res <- list( + cluster = name, + overview = "Failed to retrieve interpretation from LLM.", + confidence = "None", + reasoning = "API call failed or returned empty response." 
+ ) + class(res) <- c("interpretation", "list") } + + return(res) + }) + + names(results) <- names(res_list) + + # Return structure + if (length(results) == 1 && names(results)[1] == "Default") { + return(results[[1]]) + } else { + class(results) <- c("interpretation_list", "list") + return(results) + } } #' Interpret enrichment results using a hierarchical strategy (Major -> Minor clusters) @@ -485,208 +497,213 @@ interpret <- function(x, context = NULL, n_pathways = 20, model = "deepseek-chat #' @return A list of interpretation results. #' @author Guangchuang Yu #' @export -interpret_hierarchical <- function(x_minor, x_major, mapping, model = "deepseek-chat", api_key = NULL, task = "cell_type") { +interpret_hierarchical <- function(x_minor, x_major, mapping, model = "deepseek-chat", provider = NULL, api_key = NULL, task = "cell_type") { + # 1. Interpret Major Clusters + message("Step 1: Interpreting Major Clusters to establish lineage context...") + res_major <- interpret(x_major, context = NULL, model = model, provider = provider, api_key = api_key, task = "cell_type") + + # 2. Interpret Sub-clusters with Context + message("Step 2: Interpreting Sub-clusters using hierarchical constraints...") + + # Use internal helper to process x_minor into list of dataframes + res_list_minor <- process_enrichment_input(x_minor, n_pathways = 20) + + results <- lapply(names(res_list_minor), function(name) { + # name is the sub-cluster ID - # 1. 
Interpret Major Clusters - message("Step 1: Interpreting Major Clusters to establish lineage context...") - res_major <- interpret(x_major, context = NULL, model = model, api_key = api_key, task = "cell_type") + # Determine Major Context + specific_context <- NULL + if (name %in% names(mapping)) { + major_id <- mapping[[name]] + + # Extract major result + major_info <- NULL + if (inherits(res_major, "interpretation_list") && major_id %in% names(res_major)) { + major_info <- res_major[[major_id]] + } else if (inherits(res_major, "interpretation")) { + # Handle case where res_major might be a single result (if only 1 major cluster) + # Check if it matches major_id or is just default + if (!is.null(res_major$cluster) && res_major$cluster == major_id) { + major_info <- res_major + } else if (is.null(res_major$cluster)) { + # Assume it's the only one + major_info <- res_major + } + } + + if (!is.null(major_info) && !is.null(major_info$cell_type)) { + major_label <- major_info$cell_type + specific_context <- paste0("Hierarchical Constraint: This cluster is a confirmed subcluster of the '", major_label, "' lineage (identified in major cluster analysis). Please focus on distinguishing the specific subtype or state within this lineage.") + } + } - # 2. 
Interpret Sub-clusters with Context - message("Step 2: Interpreting Sub-clusters using hierarchical constraints...") + if (is.null(specific_context)) { + warning(paste("No major lineage context found for sub-cluster:", name)) + } - # Use internal helper to process x_minor into list of dataframes - res_list_minor <- process_enrichment_input(x_minor, n_pathways = 20) + # Call interpret for this single cluster + # We pass the dataframe directly + res <- interpret(res_list_minor[[name]], context = specific_context, model = model, provider = provider, api_key = api_key, task = task) - results <- lapply(names(res_list_minor), function(name) { - # name is the sub-cluster ID - - # Determine Major Context - specific_context <- NULL - if (name %in% names(mapping)) { - major_id <- mapping[[name]] - - # Extract major result - major_info <- NULL - if (inherits(res_major, "interpretation_list") && major_id %in% names(res_major)) { - major_info <- res_major[[major_id]] - } else if (inherits(res_major, "interpretation")) { - # Handle case where res_major might be a single result (if only 1 major cluster) - # Check if it matches major_id or is just default - if (!is.null(res_major$cluster) && res_major$cluster == major_id) { - major_info <- res_major - } else if (is.null(res_major$cluster)) { - # Assume it's the only one - major_info <- res_major - } - } - - if (!is.null(major_info) && !is.null(major_info$cell_type)) { - major_label <- major_info$cell_type - specific_context <- paste0("Hierarchical Constraint: This cluster is a confirmed subcluster of the '", major_label, "' lineage (identified in major cluster analysis). 
Please focus on distinguishing the specific subtype or state within this lineage.") - } - } - - if (is.null(specific_context)) { - warning(paste("No major lineage context found for sub-cluster:", name)) - } - - # Call interpret for this single cluster - # We pass the dataframe directly - res <- interpret(res_list_minor[[name]], context = specific_context, model = model, api_key = api_key, task = task) - - # Ensure cluster name is preserved - if (is.list(res)) res$cluster <- name - - return(res) - }) + # Ensure cluster name is preserved + if (is.list(res)) res$cluster <- name - names(results) <- names(res_list_minor) - class(results) <- c("interpretation_list", "list") - return(results) + return(res) + }) + + names(results) <- names(res_list_minor) + class(results) <- c("interpretation_list", "list") + return(results) } process_enrichment_input <- function(x, n_pathways) { - # Helper to convert object to data frame - get_df <- function(obj) { - if (inherits(obj, "compareClusterResult") || inherits(obj, "enrichResult") || inherits(obj, "gseaResult")) { - return(as.data.frame(obj)) - } else if (is.data.frame(obj)) { - return(obj) - } - stop("Unsupported input type. Expected enrichResult, compareClusterResult, gseaResult, or data.frame.") + # Helper to convert object to data frame + get_df <- function(obj) { + if (inherits(obj, "compareClusterResult") || inherits(obj, "enrichResult") || inherits(obj, "gseaResult")) { + return(as.data.frame(obj)) + } else if (is.data.frame(obj)) { + return(obj) } - - # Helper to get top N - get_top_n <- function(df, n) { - if (nrow(df) == 0) return(df) - if ("p.adjust" %in% names(df)) { - df <- df[order(df$p.adjust), ] - } else if ("pvalue" %in% names(df)) { - df <- df[order(df$pvalue), ] - } - head(df, n) + stop("Unsupported input type. 
Expected enrichResult, compareClusterResult, gseaResult, or data.frame.") + } + + # Helper to get top N + get_top_n <- function(df, n) { + if (nrow(df) == 0) { + return(df) } - - # Helper to get gene list from object - get_genes <- function(obj, cluster = NULL) { - if (inherits(obj, "enrichResult")) { - return(obj@gene) - } else if (inherits(obj, "compareClusterResult")) { - if (!is.null(cluster) && !is.null(obj@geneClusters)) { - if (cluster %in% names(obj@geneClusters)) { - return(obj@geneClusters[[cluster]]) - } - } + if ("p.adjust" %in% names(df)) { + df <- df[order(df$p.adjust), ] + } else if ("pvalue" %in% names(df)) { + df <- df[order(df$pvalue), ] + } + head(df, n) + } + + # Helper to get gene list from object + get_genes <- function(obj, cluster = NULL) { + if (inherits(obj, "enrichResult")) { + return(obj@gene) + } else if (inherits(obj, "compareClusterResult")) { + if (!is.null(cluster) && !is.null(obj@geneClusters)) { + if (cluster %in% names(obj@geneClusters)) { + return(obj@geneClusters[[cluster]]) } - return(NULL) + } + } + return(NULL) + } + + # Check if input is a list of enrichment objects (Mixed Database Strategy) + if (is.list(x) && !inherits(x, "enrichResult") && !inherits(x, "gseaResult") && !inherits(x, "compareClusterResult") && !is.data.frame(x)) { + # Check if it is already a processed item (has 'df' and 'genes') + if ("df" %in% names(x) && is.data.frame(x$df)) { + # It seems to be a single processed item (e.g. passed from interpret_hierarchical) + # Return it as a single-item list + return(list(Default = x)) } - # Check if input is a list of enrichment objects (Mixed Database Strategy) - if (is.list(x) && !inherits(x, "enrichResult") && !inherits(x, "gseaResult") && !inherits(x, "compareClusterResult") && !is.data.frame(x)) { - - # Check if it is already a processed item (has 'df' and 'genes') - if ("df" %in% names(x) && is.data.frame(x$df)) { - # It seems to be a single processed item (e.g. 
passed from interpret_hierarchical) - # Return it as a single-item list - return(list(Default = x)) - } - - # Convert all elements to data frames - dfs <- lapply(x, get_df) - - # Check if they look like compareCluster results (have 'Cluster' column) - has_cluster <- all(sapply(dfs, function(d) "Cluster" %in% names(d))) - - combined_df <- do.call(rbind, dfs) - - if (has_cluster) { - # Split by Cluster and get top N for each cluster - df_list <- split(combined_df, combined_df$Cluster) - res_list <- lapply(names(df_list), function(cl_name) { - list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = NULL) # List input usually doesn't store raw genes in a structured way easily accessible here - }) - names(res_list) <- names(df_list) - return(res_list) - } else { - # Assume single group (e.g. list of enrichResult for same sample) - return(list(Default = list(df = get_top_n(combined_df, n_pathways), genes = NULL))) - } + # Convert all elements to data frames + dfs <- lapply(x, get_df) + + # Check if they look like compareCluster results (have 'Cluster' column) + has_cluster <- all(sapply(dfs, function(d) "Cluster" %in% names(d))) + + combined_df <- do.call(rbind, dfs) + + if (has_cluster) { + # Split by Cluster and get top N for each cluster + df_list <- split(combined_df, combined_df$Cluster) + res_list <- lapply(names(df_list), function(cl_name) { + list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = NULL) # List input usually doesn't store raw genes in a structured way easily accessible here + }) + names(res_list) <- names(df_list) + return(res_list) } else { - # Single object - df <- get_df(x) - if ("Cluster" %in% names(df)) { - # compareClusterResult - df_list <- split(df, df$Cluster) - - # Map back to genes if possible - res_list <- lapply(names(df_list), function(cl_name) { - genes <- get_genes(x, cl_name) - list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = genes) - }) - names(res_list) <- names(df_list) - return(res_list) - } else { - # 
enrichResult / gseaResult - genes <- get_genes(x) - return(list(Default = list(df = get_top_n(df, n_pathways), genes = genes))) - } + # Assume single group (e.g. list of enrichResult for same sample) + return(list(Default = list(df = get_top_n(combined_df, n_pathways), genes = NULL))) + } + } else { + # Single object + df <- get_df(x) + if ("Cluster" %in% names(df)) { + # compareClusterResult + df_list <- split(df, df$Cluster) + + # Map back to genes if possible + res_list <- lapply(names(df_list), function(cl_name) { + genes <- get_genes(x, cl_name) + list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = genes) + }) + names(res_list) <- names(df_list) + return(res_list) + } else { + # enrichResult / gseaResult + genes <- get_genes(x) + return(list(Default = list(df = get_top_n(df, n_pathways), genes = genes))) } + } } .get_ppi_context_text <- function(genes, x = NULL, limit = 50) { - if (length(genes) == 0) return(NULL) - - input_for_ppi <- head(genes, limit) - - # Try to determine taxID - current_taxID <- "auto" - if (!is.null(x) && inherits(x, "enrichResult") && !is.list(x)) { - current_taxID <- tryCatch(getTaxID(x@organism), error=function(e) "auto") - } - - ppi_res <- tryCatch({ - g <- getPPI(input_for_ppi, taxID = current_taxID, output = "igraph", network_type = "functional") - - if (!is.null(g)) { - el <- igraph::as_data_frame(g, what="edges") - if (nrow(el) > 0) { - if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] - el_subset <- head(el, limit) - edges_text <- apply(el_subset, 1, function(row) { - score_info <- "" - if ("score" %in% names(row)) score_info <- paste0(" (Score: ", row["score"], ")") - paste0(row["from"], " -- ", row["to"], score_info) - }) - paste(edges_text, collapse = "\n") - } else { - NULL - } + if (length(genes) == 0) { + return(NULL) + } + + input_for_ppi <- head(genes, limit) + + # Try to determine taxID + current_taxID <- "auto" + if (!is.null(x) && inherits(x, "enrichResult") && !is.list(x)) { + 
current_taxID <- tryCatch(getTaxID(x@organism), error = function(e) "auto") + } + + ppi_res <- tryCatch( + { + g <- getPPI(input_for_ppi, taxID = current_taxID, output = "igraph", network_type = "functional") + + if (!is.null(g)) { + el <- igraph::as_data_frame(g, what = "edges") + if (nrow(el) > 0) { + if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] + el_subset <- head(el, limit) + edges_text <- apply(el_subset, 1, function(row) { + score_info <- "" + if ("score" %in% names(row)) score_info <- paste0(" (Score: ", row["score"], ")") + paste0(row["from"], " -- ", row["to"], score_info) + }) + paste(edges_text, collapse = "\n") } else { - NULL + NULL } - }, error = function(e) NULL) - - return(ppi_res) + } else { + NULL + } + }, + error = function(e) NULL + ) + + return(ppi_res) } construct_interpretation_prompt <- function(pathways, context, ppi_network = NULL, fold_change = NULL) { - base_prompt <- "You are an expert biologist and bioinformatics researcher. I have performed functional enrichment analyses using multiple databases (e.g., KEGG, Reactome, GO, ChEA/Transcription Factors, Disease Ontologies)." - - if (!is.null(context) && context != "") { - base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) - } - - base_prompt <- paste0(base_prompt, "\n\nTop Enriched Terms (Mixed Sources):\n", pathways) - - if (!is.null(ppi_network)) { - base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) - } - - if (!is.null(fold_change)) { - base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) - } - - base_prompt <- paste0(base_prompt, "\n\nPlease use a **Chain-of-Thought** approach to analyze these results before generating the final report. Follow these reasoning steps: + base_prompt <- "You are an expert biologist and bioinformatics researcher. 
I have performed functional enrichment analyses using multiple databases (e.g., KEGG, Reactome, GO, ChEA/Transcription Factors, Disease Ontologies)." + + if (!is.null(context) && context != "") { + base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) + } + + base_prompt <- paste0(base_prompt, "\n\nTop Enriched Terms (Mixed Sources):\n", pathways) + + if (!is.null(ppi_network)) { + base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) + } + + if (!is.null(fold_change)) { + base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) + } + + base_prompt <- paste0(base_prompt, "\n\nPlease use a **Chain-of-Thought** approach to analyze these results before generating the final report. Follow these reasoning steps: 1. **Source Deconvolution**: Identify the nature of the enriched terms. Distinguish between: - **Biological Processes/Pathways** (e.g., 'Cell Cycle', 'TCR Signaling') -> WHAT is happening. - **Upstream Regulators/TFs** (e.g., 'E2F1', 'NFKB1 target genes') -> WHO is driving it. @@ -716,37 +733,37 @@ Every biological claim or interpretation MUST be supported by specific evidence Please be scientifically rigorous, citing standard biological knowledge where appropriate, and avoid hallucinations. Ensure the response is a valid JSON object. Do not include any markdown formatting (like ```json).") - - return(base_prompt) + + return(base_prompt) } construct_annotation_prompt <- function(pathways, context, cluster_id, prior = NULL, ppi_network = NULL, fold_change = NULL, top_genes = NULL) { - base_prompt <- paste0("You are an expert cell biologist. 
I have a cell cluster (", cluster_id, ") from a single-cell RNA-seq experiment.") - - if (!is.null(context) && context != "") { - base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) - } - - if (!is.null(prior) && prior != "") { - base_prompt <- paste0(base_prompt, "\n\nPreliminary Annotation (from automated tool):\n", prior) - } - - base_prompt <- paste0(base_prompt, "\n\nEnriched Terms (Mixed Sources: Pathways/TFs):\n", pathways) - - if (!is.null(top_genes) && top_genes != "") { - base_prompt <- paste0(base_prompt, "\n\nTop Specific/Marker Genes (Highest Fold Change):\n", top_genes) - } - - if (!is.null(ppi_network)) { - base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) - } - - if (!is.null(fold_change)) { - base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) - } - - # Common Logic Section - logic_section <- " + base_prompt <- paste0("You are an expert cell biologist. I have a cell cluster (", cluster_id, ") from a single-cell RNA-seq experiment.") + + if (!is.null(context) && context != "") { + base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) + } + + if (!is.null(prior) && prior != "") { + base_prompt <- paste0(base_prompt, "\n\nPreliminary Annotation (from automated tool):\n", prior) + } + + base_prompt <- paste0(base_prompt, "\n\nEnriched Terms (Mixed Sources: Pathways/TFs):\n", pathways) + + if (!is.null(top_genes) && top_genes != "") { + base_prompt <- paste0(base_prompt, "\n\nTop Specific/Marker Genes (Highest Fold Change):\n", top_genes) + } + + if (!is.null(ppi_network)) { + base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) + } + + if (!is.null(fold_change)) { + base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) + } + + # Common Logic Section + logic_section <- " Use the following logic: 1. 
**Source Deconvolution**: Distinguish between Cell Type Markers, Biological Pathways, and Upstream TFs. 2. **Comparative Analysis (CRITICAL)**: Do NOT just look at the top 1 enriched term. @@ -764,9 +781,9 @@ Use the following logic: - **Medium**: Strong shared markers/pathways, but discriminatory markers are weak or absent. - **Low**: Conflicting evidence. " - - if (!is.null(prior) && prior != "") { - base_prompt <- paste0(base_prompt, "\n\nTask: + + if (!is.null(prior) && prior != "") { + base_prompt <- paste0(base_prompt, "\n\nTask: Please validate and refine the preliminary annotation based on the enrichment and marker evidence.", logic_section, " **GROUNDING INSTRUCTION (STRICT):** @@ -785,8 +802,8 @@ Provide the result as a JSON object with the following keys: - network_evidence: Describe specific protein complexes or signaling modules found in the network that support your conclusion. Ensure the response is a valid JSON object. Do not include any markdown formatting (like ```json).") - } else { - base_prompt <- paste0(base_prompt, "\n\nBased on these enrichment results and marker genes, identify the cell type of this cluster.", logic_section, " + } else { + base_prompt <- paste0(base_prompt, "\n\nBased on these enrichment results and marker genes, identify the cell type of this cluster.", logic_section, " **GROUNDING INSTRUCTION (STRICT):** - **NO HALLUCINATION**: Do not invent markers or pathways not present in the input. @@ -803,29 +820,29 @@ Provide the result as a JSON object with the following keys: - network_evidence: Describe specific protein complexes or signaling modules found in the network that support your conclusion. Ensure the response is a valid JSON object. Do not include any markdown formatting (like ```json).") - } - - return(base_prompt) + } + + return(base_prompt) } construct_phenotype_prompt <- function(pathways, context, group_id, ppi_network = NULL, fold_change = NULL) { - base_prompt <- paste0("You are an expert biologist. 
I have a list of enriched pathways/terms for a biological group (", group_id, "). The enrichment may include results from multiple databases (e.g., Pathways, TFs, Ontologies).") - - if (!is.null(context) && context != "") { - base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) - } - - base_prompt <- paste0(base_prompt, "\n\nEnriched Terms:\n", pathways) - - if (!is.null(ppi_network)) { - base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) - } - - if (!is.null(fold_change)) { - base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) - } - - base_prompt <- paste0(base_prompt, "\n\nBased on these enrichment results, characterize the specific biological phenotype or functional state of this group. + base_prompt <- paste0("You are an expert biologist. I have a list of enriched pathways/terms for a biological group (", group_id, "). The enrichment may include results from multiple databases (e.g., Pathways, TFs, Ontologies).") + + if (!is.null(context) && context != "") { + base_prompt <- paste0(base_prompt, "\n\nExperimental Context:\n", context) + } + + base_prompt <- paste0(base_prompt, "\n\nEnriched Terms:\n", pathways) + + if (!is.null(ppi_network)) { + base_prompt <- paste0(base_prompt, "\n\nPPI Network (Edge List from STRING):\n", ppi_network) + } + + if (!is.null(fold_change)) { + base_prompt <- paste0(base_prompt, "\n\nTop Regulated Genes (Log2 Fold Change):\n", fold_change) + } + + base_prompt <- paste0(base_prompt, "\n\nBased on these enrichment results, characterize the specific biological phenotype or functional state of this group. Use the following logic: 1. **Source Deconvolution**: Separate observed processes (Pathways) from drivers (TFs). 2. Synthesize the enriched terms to identify the dominant biological theme (e.g., Inflammation, Cell Cycle, Metabolism, Stress Response). 
@@ -847,202 +864,272 @@ Provide the result as a JSON object with the following keys: - network_evidence: Describe specific protein complexes or signaling modules found in the network that support your conclusion. Ensure the response is a valid JSON object. Do not include any markdown formatting (like ```json).") - - return(base_prompt) + + return(base_prompt) } -call_llm_fanyi <- function(prompt, model, api_key) { - if (!requireNamespace("fanyi", quietly = TRUE)) { - stop("Package 'fanyi' is required for interpret(). Please install it.") +call_llm_aisdk <- function(prompt, model, provider = NULL, api_key = NULL) { + if (!requireNamespace("aisdk", quietly = TRUE)) { + stop("Package 'aisdk' is required for interpret(). Please install it.") + } + + # Fallback to yulab_translate if api_key is NULL (matching original fanyi behaviour) + if (is.null(api_key)) { + api_key <- getOption("yulab_translate") + } + + # Optional: ensure local .env is loaded if available (often used in aisdk tests) + if (file.exists(".env")) { + readRenviron(".env") + } + + # Standardize model string and extract provider + model_str <- model + provider_name <- provider + + if (is.null(provider_name)) { + if (grepl(":", model)) { + parts <- strsplit(model, ":")[[1]] + provider_name <- parts[1] + model_name <- parts[2] + } else { + # Heuristic inference if no provider/colon given + if (grepl("^deepseek-", model)) { + provider_name <- "deepseek" + } else if (grepl("^gpt-", model) || grepl("^o1-|^o3-", model)) { + provider_name <- "openai" + } else if (grepl("^qwen-|^glm-", model)) { + provider_name <- "bailian" + } else if (grepl("claude", model)) { + provider_name <- "anthropic" + } else if (grepl("^doubao-", model)) { + provider_name <- "volcengine" + } else if (grepl("^step-", model)) { + provider_name <- "stepfun" + } + model_name <- model } - - # Call fanyi::chat_request - res_content <- tryCatch({ - fanyi::chat_request(x = prompt, model = model, api_key = api_key, max_tokens = 4096) - }, error 
= function(e) { - stop("Failed to call fanyi::chat_request. Error: ", e$message) - }) - - # Try to parse JSON response if the prompt asked for JSON - tryCatch({ - # Clean up potential markdown code blocks like ```json ... ``` - json_str <- res_content - if (grepl("```json", json_str)) { - json_str <- sub(".*?```json\\s*", "", json_str) - json_str <- sub("\\s*```.*", "", json_str) - } else if (grepl("```", json_str)) { - json_str <- sub(".*?```\\s*", "", json_str) - json_str <- sub("\\s*```.*", "", json_str) - } - - if (!requireNamespace("jsonlite", quietly = TRUE)) { - stop("Package 'jsonlite' is required.") + } else { + model_name <- model + } + + # Setup model object via explicit provider instantiation to avoid stale global registry + model_obj <- if (is.null(provider_name)) model_str else paste0(provider_name, ":", model_name) + + if (!is.null(provider_name)) { + provider_factory_name <- paste0("create_", provider_name) + # Check if factory exists in aisdk namespace + if (exists(provider_factory_name, envir = asNamespace("aisdk"), mode = "function")) { + provider_factory <- get(provider_factory_name, envir = asNamespace("aisdk")) + tryCatch( + { + # Use provided api_key or fallback to NULL (letting factory read fresh Sys.getenv) + provider_instance <- provider_factory(api_key = api_key) + model_obj <- provider_instance$language_model(model_name) + }, + error = function(e) { + warning(sprintf("Failed to instantiate provider '%s' explicitly: %s. Falling back to default resolution.", provider_name, e$message)) } - - parsed_res <- jsonlite::fromJSON(json_str) - class(parsed_res) <- c("interpretation", class(parsed_res)) - return(parsed_res) - }, error = function(e) { - warning("Failed to parse JSON response from LLM. Returning raw text. 
Error: ", e$message) - return(res_content) - }) + ) + } + } + + # Call aisdk::generate_text (temperature = NULL prevents sending unsupported params for reasoning models) + res_content <- tryCatch( + { + res <- aisdk::generate_text(model = model_obj, prompt = prompt, max_tokens = 4096, temperature = NULL) + res$text + }, + error = function(e) { + stop("Failed to call aisdk::generate_text. Error: ", e$message) + } + ) + + # Try to parse JSON response if the prompt asked for JSON + tryCatch( + { + # Clean up potential markdown code blocks like ```json ... ``` + json_str <- res_content + if (grepl("```json", json_str)) { + json_str <- sub(".*?```json\\s*", "", json_str) + json_str <- sub("\\s*```.*", "", json_str) + } else if (grepl("```", json_str)) { + json_str <- sub(".*?```\\s*", "", json_str) + json_str <- sub("\\s*```.*", "", json_str) + } + + # Use aisdk's safe_parse_json for better resilience (handles truncated JSON) + parsed_res <- aisdk::safe_parse_json(json_str) + + if (is.null(parsed_res)) { + stop("Failed to parse JSON even after repair.") + } + + class(parsed_res) <- c("interpretation", class(parsed_res)) + return(parsed_res) + }, + error = function(e) { + warning("Failed to parse JSON response from LLM. Returning raw text. Error: ", e$message) + return(res_content) + } + ) } #' @method print interpretation #' @export print.interpretation <- function(x, ...) 
{ - # Check if it is an annotation result - if (!is.null(x$cell_type)) { - cat("## Cell Type Annotation\n\n") - if (!is.null(x$cluster)) cat(sprintf("### Cluster: %s\n\n", x$cluster)) - - cat(sprintf("**Cell Type:** %s\n", x$cell_type)) - cat(sprintf("**Confidence:** %s\n", x$confidence)) - cat("\n**Reasoning:**\n", x$reasoning, "\n") - - if (!is.null(x$markers)) { - cat("\n**Supporting Markers/Pathways:**\n") - if (is.list(x$markers) || length(x$markers) > 1) { - cat(paste("-", unlist(x$markers), collapse="\n"), "\n") - } else { - cat(x$markers, "\n") - } - } - cat("\n") - return(invisible(x)) - } - - # Check if it is a phenotyping result - if (!is.null(x$phenotype)) { - cat("## Phenotype Characterization\n\n") - if (!is.null(x$cluster)) cat(sprintf("### Group/Cluster: %s\n\n", x$cluster)) - - cat(sprintf("**Phenotype:** %s\n", x$phenotype)) - cat(sprintf("**Confidence:** %s\n", x$confidence)) - cat("\n**Reasoning:**\n", x$reasoning, "\n") - - if (!is.null(x$key_processes)) { - cat("\n**Key Processes:**\n") - if (is.list(x$key_processes) || length(x$key_processes) > 1) { - cat(paste("-", unlist(x$key_processes), collapse="\n"), "\n") - } else { - cat(x$key_processes, "\n") - } - } - cat("\n") - return(invisible(x)) - } - - cat("## Interpretation Result\n\n") + # Check if it is an annotation result + if (!is.null(x$cell_type)) { + cat("## Cell Type Annotation\n\n") if (!is.null(x$cluster)) cat(sprintf("### Cluster: %s\n\n", x$cluster)) - if (!is.null(x$overview)) { - cat("### 1. Overview\n") - cat(x$overview, "\n\n") - } + cat(sprintf("**Cell Type:** %s\n", x$cell_type)) + cat(sprintf("**Confidence:** %s\n", x$confidence)) + cat("\n**Reasoning:**\n", x$reasoning, "\n") - if (!is.null(x$regulatory_drivers)) { - cat("### 2. 
Regulatory Drivers (TFs/Hubs)\n") - if (is.list(x$regulatory_drivers) || length(x$regulatory_drivers) > 1) { - # If list or vector - drivers <- unlist(x$regulatory_drivers) - cat(paste("-", drivers, collapse="\n"), "\n\n") - } else { - cat(x$regulatory_drivers, "\n\n") - } + if (!is.null(x$markers)) { + cat("\n**Supporting Markers/Pathways:**\n") + if (is.list(x$markers) || length(x$markers) > 1) { + cat(paste("-", unlist(x$markers), collapse = "\n"), "\n") + } else { + cat(x$markers, "\n") + } } + cat("\n") + return(invisible(x)) + } + + # Check if it is a phenotyping result + if (!is.null(x$phenotype)) { + cat("## Phenotype Characterization\n\n") + if (!is.null(x$cluster)) cat(sprintf("### Group/Cluster: %s\n\n", x$cluster)) - if (!is.null(x$key_mechanisms)) { - cat("### 3. Key Mechanisms\n") - if (is.list(x$key_mechanisms)) { - for (mechanism_name in names(x$key_mechanisms)) { - cat(sprintf("#### %s\n", mechanism_name)) - mechanism <- x$key_mechanisms[[mechanism_name]] - - # Check if mechanism is a list (structured) or just a character string - if (is.list(mechanism)) { - if (!is.null(mechanism$explanation)) { - cat(mechanism$explanation, "\n") - } - if (!is.null(mechanism$pathways)) { - cat("**Pathways:** ", paste(mechanism$pathways, collapse = ", "), "\n") - } - if (!is.null(mechanism$genes)) { - cat("**Key Genes:** ", paste(head(mechanism$genes, 10), collapse = ", "), ifelse(length(mechanism$genes) > 10, "...", ""), "\n") - } - } else { - # mechanism is likely a simple character string description - cat(mechanism, "\n") - } - cat("\n") - } - } else { - cat(x$key_mechanisms, "\n\n") - } - } + cat(sprintf("**Phenotype:** %s\n", x$phenotype)) + cat(sprintf("**Confidence:** %s\n", x$confidence)) + cat("\n**Reasoning:**\n", x$reasoning, "\n") - if (!is.null(x$crosstalk)) { - cat("### 4. 
Crosstalk & Interactions\n") - cat(x$crosstalk, "\n\n") + if (!is.null(x$key_processes)) { + cat("\n**Key Processes:**\n") + if (is.list(x$key_processes) || length(x$key_processes) > 1) { + cat(paste("-", unlist(x$key_processes), collapse = "\n"), "\n") + } else { + cat(x$key_processes, "\n") + } } - - if (!is.null(x$hypothesis)) { - cat("### 5. Hypothesis\n") - if (is.list(x$hypothesis)) { - if (!is.null(x$hypothesis$what)) { - cat("**Observation (What):** ", x$hypothesis$what, "\n\n") - } - if (!is.null(x$hypothesis$so_what)) { - cat("**Implication (So What):** ", x$hypothesis$so_what, "\n\n") - } + cat("\n") + return(invisible(x)) + } + + cat("## Interpretation Result\n\n") + if (!is.null(x$cluster)) cat(sprintf("### Cluster: %s\n\n", x$cluster)) + + if (!is.null(x$overview)) { + cat("### 1. Overview\n") + cat(x$overview, "\n\n") + } + + if (!is.null(x$regulatory_drivers)) { + cat("### 2. Regulatory Drivers (TFs/Hubs)\n") + if (is.list(x$regulatory_drivers) || length(x$regulatory_drivers) > 1) { + # If list or vector + drivers <- unlist(x$regulatory_drivers) + cat(paste("-", drivers, collapse = "\n"), "\n\n") + } else { + cat(x$regulatory_drivers, "\n\n") + } + } + + if (!is.null(x$key_mechanisms)) { + cat("### 3. 
Key Mechanisms\n") + if (is.list(x$key_mechanisms)) { + for (mechanism_name in names(x$key_mechanisms)) { + cat(sprintf("#### %s\n", mechanism_name)) + mechanism <- x$key_mechanisms[[mechanism_name]] + + # Check if mechanism is a list (structured) or just a character string + if (is.list(mechanism)) { + if (!is.null(mechanism$explanation)) { + cat(mechanism$explanation, "\n") + } + if (!is.null(mechanism$pathways)) { + cat("**Pathways:** ", paste(mechanism$pathways, collapse = ", "), "\n") + } + if (!is.null(mechanism$genes)) { + cat("**Key Genes:** ", paste(head(mechanism$genes, 10), collapse = ", "), ifelse(length(mechanism$genes) > 10, "...", ""), "\n") + } } else { - cat(x$hypothesis, "\n\n") + # mechanism is likely a simple character string description + cat(mechanism, "\n") } + cat("\n") + } + } else { + cat(x$key_mechanisms, "\n\n") } - - if (!is.null(x$narrative)) { - cat("### 6. Narrative Draft\n") - cat(x$narrative, "\n\n") + } + + if (!is.null(x$crosstalk)) { + cat("### 4. Crosstalk & Interactions\n") + cat(x$crosstalk, "\n\n") + } + + if (!is.null(x$hypothesis)) { + cat("### 5. Hypothesis\n") + if (is.list(x$hypothesis)) { + if (!is.null(x$hypothesis$what)) { + cat("**Observation (What):** ", x$hypothesis$what, "\n\n") + } + if (!is.null(x$hypothesis$so_what)) { + cat("**Implication (So What):** ", x$hypothesis$so_what, "\n\n") + } + } else { + cat(x$hypothesis, "\n\n") } - - if (!is.null(x$network)) { - cat("### 7. 
Refined Regulatory Network\n") - # Simple ASCII visualization of the network - # Edge list with interaction type - el <- igraph::as_data_frame(x$network, what = "edges") - if (nrow(el) > 0) { - cat("Key Interactions:\n") - for (i in 1:nrow(el)) { - interaction_type <- ifelse("interaction" %in% names(el), paste0(" (", el[i, "interaction"], ")"), "") - reason <- ifelse("reason" %in% names(el), paste0(" - ", el[i, "reason"]), "") - cat(sprintf(" %s -- %s%s%s\n", el[i, "from"], el[i, "to"], interaction_type, reason)) - } - cat("\n") - } - - if (!is.null(x$network_evidence)) { - cat("**Network Evidence:**\n") - cat(x$network_evidence, "\n\n") - } + } + + if (!is.null(x$narrative)) { + cat("### 6. Narrative Draft\n") + cat(x$narrative, "\n\n") + } + + if (inherits(x$network, "igraph")) { + cat("### 7. Refined Regulatory Network\n") + # Simple ASCII visualization of the network + # Edge list with interaction type + el <- igraph::as_data_frame(x$network, what = "edges") + if (nrow(el) > 0) { + cat("Key Interactions:\n") + for (i in 1:nrow(el)) { + interaction_type <- ifelse("interaction" %in% names(el), paste0(" (", el[i, "interaction"], ")"), "") + reason <- ifelse("reason" %in% names(el), paste0(" - ", el[i, "reason"]), "") + cat(sprintf(" %s -- %s%s%s\n", el[i, "from"], el[i, "to"], interaction_type, reason)) + } + cat("\n") } - # Fallback if no content printed - if (is.null(x$cell_type) && is.null(x$phenotype) && is.null(x$overview) && is.null(x$key_mechanisms)) { - cat("No structured interpretation content found.\n") - cat("Raw result structure:\n") - utils::str(x) + if (!is.null(x$network_evidence)) { + cat("**Network Evidence:**\n") + cat(x$network_evidence, "\n\n") } - - invisible(x) + } + + # Fallback if no content printed + if (is.null(x$cell_type) && is.null(x$phenotype) && is.null(x$overview) && is.null(x$key_mechanisms)) { + cat("No structured interpretation content found.\n") + cat("Raw result structure:\n") + utils::str(x) + } + + invisible(x) } #' @method 
print interpretation_list #' @export print.interpretation_list <- function(x, ...) { - cat("# Enrichment Interpretation / Annotation Report\n\n") - for (i in seq_along(x)) { - print(x[[i]]) - cat("---\n\n") - } - invisible(x) + cat("# Enrichment Interpretation / Annotation Report\n\n") + for (i in seq_along(x)) { + print(x[[i]]) + cat("---\n\n") + } + invisible(x) } From 818182e59ac34ea2a680c82fe675c1aa1c81b1ec Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 01:21:12 +0800 Subject: [PATCH 3/7] docs: update documentation for interpret functions --- man/interpret.Rd | 7 +++++-- man/interpret_agent.Rd | 3 +++ man/interpret_hierarchical.Rd | 1 + man/reexports.Rd | 4 ++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/man/interpret.Rd b/man/interpret.Rd index ee1779c..549edb4 100644 --- a/man/interpret.Rd +++ b/man/interpret.Rd @@ -9,6 +9,7 @@ interpret( context = NULL, n_pathways = 20, model = "deepseek-chat", + provider = NULL, api_key = NULL, task = "interpretation", prior = NULL, @@ -25,6 +26,8 @@ interpret( \item{model}{The LLM model to use. Default is "deepseek-chat". Supported models include "deepseek-chat", "glm-4", "qwen-turbo" etc.} +\item{provider}{The LLM provider. Default is NULL (inferred from model or handled by aisdk).} + \item{api_key}{The API key for the LLM. If NULL, it tries to fetch from `getOption('yulab_translate')` based on the model.} \item{task}{Task type, default is "interpretation". Other options include "cell_type"/"annotation" and "phenotype"/"phenotyping".} @@ -42,8 +45,8 @@ A character string containing the LLM-generated interpretation. 
Interpret enrichment results using Large Language Models (LLM) } \details{ -This function sends the enrichment results (top significant pathways) along with -an optional experimental context to an LLM (e.g., DeepSeek) to generate +This function sends the enrichment results (top significant pathways) along with +an optional experimental context to an LLM (e.g., DeepSeek) to generate a biological interpretation, hypothesis, and narrative suitable for a paper. } \author{ diff --git a/man/interpret_agent.Rd b/man/interpret_agent.Rd index 1941463..287f1eb 100644 --- a/man/interpret_agent.Rd +++ b/man/interpret_agent.Rd @@ -9,6 +9,7 @@ interpret_agent( context = NULL, n_pathways = 50, model = "deepseek-chat", + provider = NULL, api_key = NULL, add_ppi = FALSE, gene_fold_change = NULL @@ -23,6 +24,8 @@ interpret_agent( \item{model}{The LLM model to use.} +\item{provider}{The LLM provider. Default is NULL (inferred from model or handled by aisdk).} + \item{api_key}{The API key for the LLM.} \item{add_ppi}{Boolean, whether to use PPI network integration.} diff --git a/man/interpret_hierarchical.Rd b/man/interpret_hierarchical.Rd index 4d889d0..ccc3431 100644 --- a/man/interpret_hierarchical.Rd +++ b/man/interpret_hierarchical.Rd @@ -9,6 +9,7 @@ interpret_hierarchical( x_major, mapping, model = "deepseek-chat", + provider = NULL, api_key = NULL, task = "cell_type" ) diff --git a/man/reexports.Rd b/man/reexports.Rd index f079145..d4e448a 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -39,14 +39,14 @@ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ + \item{GOSemSim}{\code{\link[GOSemSim]{buildGOmap}}, \code{\link[GOSemSim]{get_organism}}, \code{\link[GOSemSim]{read.blast2go}}, \code{\link[GOSemSim:read-gaf]{read.gaf}}} + \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr:context]{n}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{select}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr]{summarise}}} \item{enrichit}{\code{\link[enrichit]{geneID}}, \code{\link[enrichit]{geneInCategory}}, \code{\link[enrichit]{gsfilter}}, \code{\link[enrichit]{setReadable}}} \item{enrichplot}{\code{\link[enrichplot:reexports]{cnetplot}}, \code{\link[enrichplot]{dotplot}}, \code{\link[enrichplot]{emapplot}}, \code{\link[enrichplot]{goplot}}, \code{\link[enrichplot]{gseaplot}}, \code{\link[enrichplot]{heatplot}}, \code{\link[enrichplot]{ridgeplot}}} - \item{GOSemSim}{\code{\link[GOSemSim]{buildGOmap}}, \code{\link[GOSemSim]{get_organism}}, \code{\link[GOSemSim]{read.blast2go}}, \code{\link[GOSemSim:read-gaf]{read.gaf}}} - \item{gson}{\code{\link[gson:read-gmt]{read.gmt}}, \code{\link[gson:read-gmt]{read.gmt.wp}}} \item{magrittr}{\code{\link[magrittr:compound]{\%<>\%}}, \code{\link[magrittr:pipe]{\%>\%}}} From a52882ff38a4399fc775e8f91a6da4fbaec41827 Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 02:23:46 +0800 Subject: [PATCH 4/7] refactor: clean up and simplify print.interpretation method --- R/interpret.R | 1867 ++++++++++++++++++++++++++----------------------- 1 file changed, 977 insertions(+), 890 deletions(-) diff --git a/R/interpret.R b/R/interpret.R index 8c28475..a09b7d1 100644 --- a/R/interpret.R +++ b/R/interpret.R @@ -1,586 +1,238 @@ -#' Interpret enrichment results using a multi-agent system (Deep Mode) +#' @title Interpret Enrichment Results Using LLMs +#' @description +#' Functions for interpreting functional enrichment analysis results using +#' Large Language Models. 
Supports single-call interpretation, multi-agent +#' deep analysis, and hierarchical cluster strategies. #' -#' This function employs a multi-agent strategy to provide a more rigorous and comprehensive -#' biological interpretation. It breaks down the task into three specialized agents: -#' 1. Agent Cleaner: Filters noise and selects relevant pathways. -#' 2. Agent Detective: Identifies key regulators and functional modules using PPI/TF data. -#' 3. Agent Synthesizer: Synthesizes findings into a coherent narrative. -#' -#' @title interpret_agent -#' @param x An enrichment result object (e.g., `enrichResult` or `gseaResult`). -#' @param context A string describing the experimental background. -#' @param n_pathways Number of top pathways to consider initially. Default is 50 (Agent 1 will filter them). -#' @param model The LLM model to use. -#' @param provider The LLM provider. Default is NULL (inferred from model or handled by aisdk). -#' @param api_key The API key for the LLM. -#' @param add_ppi Boolean, whether to use PPI network integration. -#' @param gene_fold_change Named vector of logFC for expression context. -#' @return A detailed interpretation list. 
-#' @author Guangchuang Yu -#' @export -interpret_agent <- function(x, context = NULL, n_pathways = 50, model = "deepseek-chat", provider = NULL, api_key = NULL, add_ppi = FALSE, gene_fold_change = NULL) { - if (missing(x)) { - stop("enrichment result 'x' is required.") - } - - # Process input into a list of data frames (one per cluster/group) - res_list <- process_enrichment_input(x, n_pathways) - - if (length(res_list) == 0) { - return("No significant pathways found to interpret.") - } - - # Process each cluster with the multi-agent pipeline - results <- lapply(names(res_list), function(name) { - item <- res_list[[name]] - df <- item$df - original_genes <- item$genes - - # Check for fallback mode - fallback_mode <- FALSE - pathway_text <- "" - - if (nrow(df) == 0) { - if (!is.null(original_genes) && length(original_genes) > 0) { - fallback_mode <- TRUE - warning(sprintf("Cluster '%s': No enriched pathways. Falling back to gene-based interpretation. Confidence may be lower.", name)) - pathway_text <- paste( - "No significant pathways enriched.", - "Top Genes:", paste(head(original_genes, 50), collapse = ", ") - ) - } else { - return(NULL) - } - } else { - message(sprintf("Processing cluster '%s' with Agent 1: The Cleaner...", name)) - - # Format initial pathways for Agent 1 - cols_to_keep <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df)) - pathway_text <- paste( - apply(df[, cols_to_keep, drop = FALSE], 1, function(row) { - paste(names(row), row, sep = ": ", collapse = ", ") - }), - collapse = "\n" - ) - } - - # --- Step 1: Agent Cleaner --- - # Skip cleaner in fallback mode or adapt it? - # For now, if fallback, we skip cleaner as there are no pathways to clean. 
- cleaned_pathways <- pathway_text - if (!fallback_mode) { - clean_res <- run_agent_cleaner(pathway_text, context, model, provider, api_key) - if (is.null(clean_res) || is.null(clean_res$kept_pathways)) { - warning("Agent Cleaner failed or returned empty results. Falling back to using top pathways.") - cleaned_pathways <- pathway_text # Fallback - } else { - cleaned_pathways <- paste("Selected Relevant Pathways (filtered by Agent Cleaner):", - paste(clean_res$kept_pathways, collapse = ", "), - "\nReasoning:", clean_res$reasoning, - sep = "\n" - ) - } - } - - # --- Step 2: Agent Detective --- - message(sprintf("Processing cluster '%s' with Agent 2: The Detective...", name)) - - # Prepare Network Data (PPI) - ppi_network_text <- NULL - if (add_ppi) { - # Extract genes - all_genes <- NULL - if (fallback_mode) { - all_genes <- original_genes - } else { - all_genes <- unique(unlist(strsplit(df$geneID, "/"))) - } - - if (length(all_genes) > 0) { - ppi_network_text <- .get_ppi_context_text(all_genes, x) - } - } - - # Prepare Fold Change Data - fc_text <- NULL - if (!is.null(gene_fold_change)) { - all_genes <- NULL - if (fallback_mode) { - all_genes <- original_genes - } else { - all_genes <- unique(unlist(strsplit(df$geneID, "/"))) - } - - common_genes <- intersect(all_genes, names(gene_fold_change)) - if (length(common_genes) > 0) { - fc_subset <- gene_fold_change[common_genes] - fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] - top_fc <- head(fc_subset, 20) - fc_text <- paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ") - } - } - - detective_res <- run_agent_detective(cleaned_pathways, ppi_network_text, fc_text, context, model, provider, api_key, fallback_mode) - - # --- Step 3: Agent Synthesizer --- - message(sprintf("Processing cluster '%s' with Agent 3: The Storyteller...", name)) - - final_res <- run_agent_synthesizer(cleaned_pathways, detective_res, context, model, provider, api_key, fallback_mode) - - # Post-processing: Add cluster 
name and parse refined network if available - if (is.list(final_res)) { - final_res$cluster <- name - if (fallback_mode) { - final_res$data_source <- "gene_list_only" - } - - # Merge Detective findings into final result for transparency - if (!is.null(detective_res)) { - final_res$regulatory_drivers <- detective_res$key_drivers - final_res$refined_network <- detective_res$refined_network - final_res$network_evidence <- detective_res$network_evidence - } - - # Helper to parse refined network to igraph - if (!is.null(final_res$refined_network)) { - rn_df <- tryCatch( - { - if (is.data.frame(final_res$refined_network)) { - final_res$refined_network - } else { - do.call(rbind, lapply(final_res$refined_network, as.data.frame)) - } - }, - error = function(e) NULL - ) - - if (!is.null(rn_df) && nrow(rn_df) > 0) { - colnames(rn_df)[colnames(rn_df) == "source"] <- "from" - colnames(rn_df)[colnames(rn_df) == "target"] <- "to" - if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) { - final_res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE) - } - } - } - } - - return(final_res) - }) - - names(results) <- names(res_list) - - if (length(results) == 1 && names(results)[1] == "Default") { - return(results[[1]]) - } else { - class(results) <- c("interpretation_list", "list") - return(results) - } +#' Built on top of aisdk's `generate_object()` for reliable structured output, +#' and the Agent/Session system for multi-agent pipelines. 
+#' @name interpret +NULL + +# ============================================================================ +# Schema Definitions +# ============================================================================ +#' @importFrom aisdk generate_object z_object z_string z_array z_enum +.interpretation_schema <- function() { + z_object( + overview = z_string("High-level summary of the key biological processes identified"), + regulatory_drivers = z_string( + "Key transcription factors or master regulators and their potential role in driving the observed pathways" + ), + key_mechanisms = z_string( + "Explanation of underlying biological mechanisms, grouping pathways into major themes" + ), + crosstalk = z_string( + "Discussion of potential interactions and regulatory networks between pathways" + ), + hypothesis = z_string( + "A coherent biological hypothesis connecting the pathways (What) to biological meaning (So What)" + ), + narrative = z_string( + "A cohesive paragraph suitable for the Results or Discussion section of a scientific paper" + ), + refined_network = z_array( + z_object( + source = z_string("Source gene or protein"), + target = z_string("Target gene or protein"), + interaction = z_string("Type of interaction: binding, activation, inhibition, etc."), + reason = z_string("Why this edge is biologically relevant") + ), + description = "Core regulatory network edges" + ), + network_evidence = z_string( + "Description of specific protein complexes or signaling modules that support the conclusion" + ) + ) } -run_agent_cleaner <- function(pathways, context, model, provider, api_key) { - prompt <- paste0( - "You are 'Agent Cleaner', an expert bioinformatics curator.\n", - "Your task is to filter a list of enriched pathways to retain only those relevant to the specific experimental context.\n\n", - if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", - "Raw Enriched Pathways:\n", pathways, "\n\n", - "Instructions:\n", - "1. 
Identify and REMOVE 'housekeeping' pathways (e.g., Ribosome, Spliceosome, RNA transport) unless they are specifically relevant to the context (e.g., cancer proliferation).\n", - "2. Identify and REMOVE redundant or overly broad terms.\n", - "3. KEEP disease-specific, tissue-specific, or phenotype-driving pathways.\n\n", - "Output JSON format:\n", - "{\n", - " \"kept_pathways\": [\"List of pathway names to keep\"],\n", - " \"discarded_pathways\": [\"List of discarded pathways\"],\n", - " \"reasoning\": \"Brief explanation of the filtering strategy used.\"\n", - "}" +.annotation_schema <- function() { + z_object( + cell_type = z_string("The identified cell type label"), + confidence = z_enum(c("High", "Medium", "Low"), description = "Confidence level of the annotation"), + reasoning = z_string( + "Explanation of why this cell type was assigned, citing specific markers or pathways from the input" + ), + regulatory_drivers = z_string("Key TFs or master regulators that define this cell type/state"), + markers = z_array( + z_string(), + description = "Key markers or pathways from the input that support this decision" + ), + refined_network = z_array( + z_object( + source = z_string("Source gene or protein"), + target = z_string("Target gene or protein"), + interaction = z_string("Type of interaction"), + reason = z_string("Why this edge is relevant") + ), + description = "Core regulatory network edges" + ), + network_evidence = z_string("Protein complexes or signaling modules that support the conclusion") ) - - call_llm_aisdk(prompt, model, api_key) } -run_agent_detective <- function(pathways, ppi_network, fold_change, context, model, provider, api_key, fallback_mode = FALSE) { - prompt <- paste0( - "You are 'Agent Detective', an expert systems biologist.\n", - "Your task is to identify Key Drivers (Regulators) and Functional Modules based on the filtered pathways and available network data.\n\n", - if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", - 
if (fallback_mode) "WARNING: No significant enriched pathways found. You are analyzing RAW GENE LISTS. Proceed with caution.\n" else "", - "Filtered Pathways:\n", pathways, "\n\n", - if (!is.null(ppi_network)) paste0("PPI Network Evidence:\n", ppi_network, "\n\n") else "", - if (!is.null(fold_change)) paste0("Gene Fold Changes:\n", fold_change, "\n\n") else "", - "Instructions:\n", - "1. Identify potential Master Regulators (TFs, Kinases) that explain the pathways.\n", - "2. Define Functional Modules (groups of interacting proteins) using the PPI network.\n", - "3. Refine the PPI network to a core regulatory sub-network.\n\n", - "Output JSON format:\n", - "{\n", - " \"key_drivers\": [\"List of top 3-5 driver genes\"],\n", - " \"functional_modules\": [\"List of identified modules (e.g. 'TCR Complex', 'Cell Cycle G1/S')\"],\n", - " \"refined_network\": [{\"source\": \"GeneA\", \"target\": \"GeneB\", \"interaction\": \"activation\", \"reason\": \"evidence\"}],\n", - " \"network_evidence\": \"Narrative describing how the network supports the drivers.\"\n", - "}" +.annotation_refinement_schema <- function() { + z_object( + cell_type = z_string("The final identified cell type label (refined or corrected)"), + refinement_status = z_enum( + c("Confirmed", "Refined", "Corrected"), + description = "Whether the preliminary annotation was confirmed, refined, or corrected" + ), + confidence = z_enum(c("High", "Medium", "Low"), description = "Confidence level"), + reasoning = z_string( + "Why you confirmed, refined, or corrected the label, citing specific evidence" + ), + regulatory_drivers = z_string("Key TFs or master regulators"), + markers = z_array(z_string(), description = "Supporting markers or pathways"), + refined_network = z_array( + z_object( + source = z_string("Source gene or protein"), + target = z_string("Target gene or protein"), + interaction = z_string("Type of interaction"), + reason = z_string("Why this edge is relevant") + ), + description = "Core regulatory 
network edges" + ), + network_evidence = z_string("Protein complexes or signaling modules supporting the conclusion") ) - - call_llm_aisdk(prompt, model, provider, api_key) } -run_agent_synthesizer <- function(pathways, detective_report, context, model, provider, api_key, fallback_mode = FALSE) { - # Convert detective report to string if it's a list - detective_text <- "" - if (!is.null(detective_report) && is.list(detective_report)) { - # Check if fields exist before accessing to avoid errors - key_drivers <- if (!is.null(detective_report$key_drivers)) paste(detective_report$key_drivers, collapse = ", ") else "None identified" - functional_modules <- if (!is.null(detective_report$functional_modules)) paste(detective_report$functional_modules, collapse = ", ") else "None identified" - network_evidence <- if (!is.null(detective_report$network_evidence)) detective_report$network_evidence else "None provided" - - detective_text <- paste( - "Key Drivers: ", key_drivers, "\n", - "Functional Modules: ", functional_modules, "\n", - "Network Evidence: ", network_evidence, - sep = "" - ) - } - - prompt <- paste0( - "You are 'Agent Storyteller', a senior scientific writer.\n", - "Your task is to synthesize the findings from previous agents into a coherent biological narrative.\n\n", - if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", - if (fallback_mode) "WARNING: No significant enriched pathways found. You are analyzing RAW GENE LISTS. Interpretation confidence should be evaluated cautiously.\n" else "", - "Data Sources:\n", - "1. Relevant Pathways:\n", pathways, "\n\n", - "2. Detective's Report (Drivers & Modules):\n", detective_text, "\n\n", - "Instructions:\n", - "1. Write a comprehensive Overview.\n", - "2. Explain Key Mechanisms, explicitly linking Regulators -> Modules -> Pathways.\n", - "3. Formulate a Hypothesis.\n", - "4. 
Draft a Narrative paragraph for a paper.\n\n", - "Output JSON format:\n", - "{\n", - " \"overview\": \"...\",\n", - " \"key_mechanisms\": \"...\",\n", - " \"hypothesis\": \"...\",\n", - " \"narrative\": \"...\"\n", - "}" +.phenotype_schema <- function() { + z_object( + phenotype = z_string("A concise label for the biological phenotype or state"), + confidence = z_enum(c("High", "Medium", "Low"), description = "Confidence level"), + reasoning = z_string("How the enriched terms support this phenotype"), + regulatory_drivers = z_string("Key TFs or master regulators that drive this phenotype"), + key_processes = z_array( + z_string(), + description = "Key pathways or terms that define this phenotype" + ), + refined_network = z_array( + z_object( + source = z_string("Source gene or protein"), + target = z_string("Target gene or protein"), + interaction = z_string("Type of interaction"), + reason = z_string("Why this edge is relevant") + ), + description = "Core regulatory network edges" + ), + network_evidence = z_string("Protein complexes or signaling modules supporting the conclusion") ) - - call_llm_aisdk(prompt, model, provider, api_key) } -#' Interpret enrichment results using Large Language Models (LLM) -#' -#' This function sends the enrichment results (top significant pathways) along with -#' an optional experimental context to an LLM (e.g., DeepSeek) to generate -#' a biological interpretation, hypothesis, and narrative suitable for a paper. 
+.cleaner_schema <- function() { + z_object( + kept_pathways = z_array(z_string(), description = "Pathway names to keep"), + discarded_pathways = z_array(z_string(), description = "Discarded pathway names"), + reasoning = z_string("Explanation of the filtering strategy used") + ) +} + +.detective_schema <- function() { + z_object( + key_drivers = z_array( + z_string(), + description = "Top 3-5 driver genes (TFs, Kinases)" + ), + functional_modules = z_array( + z_string(), + description = "Identified functional modules (e.g., TCR Complex, Cell Cycle G1/S)" + ), + refined_network = z_array( + z_object( + source = z_string("Source gene"), + target = z_string("Target gene"), + interaction = z_string("Type: activation, inhibition, binding, etc."), + reason = z_string("Evidence for this interaction") + ), + description = "Core regulatory sub-network" + ), + network_evidence = z_string("How the network supports the identified drivers") + ) +} + +# ============================================================================ +# Model ID Helper +# ============================================================================ + +#' @title Infer Model ID +#' @description +#' Maps bare model names to the aisdk `provider:model` format for backward +#' compatibility. Emits a warning when guessing and suggests the explicit form. +#' If the model already contains a colon, it is returned as-is. #' -#' @title interpret -#' @param x An enrichment result object (e.g., `enrichResult` or `gseaResult`). -#' @param context A string describing the experimental background (e.g., "scRNA-seq of mouse myocardial infarction at day 3"). -#' @param n_pathways Number of top significant pathways to include in the analysis. Default is 20. -#' @param model The LLM model to use. Default is "deepseek-chat". Supported models include "deepseek-chat", "glm-4", "qwen-turbo" etc. -#' @param provider The LLM provider. Default is NULL (inferred from model or handled by aisdk). 
-#' @param api_key The API key for the LLM. If NULL, it tries to fetch from `getOption('yulab_translate')` based on the model. -#' @param task Task type, default is "interpretation". Other options include "cell_type"/"annotation" and "phenotype"/"phenotyping". -#' @param prior Optional prior knowledge (e.g., a biological hypothesis) to guide the task. -#' @param add_ppi Boolean, whether to use PPI network integration. -#' @param gene_fold_change Named vector of logFC for expression context. -#' @return A character string containing the LLM-generated interpretation. -#' @author Guangchuang Yu -#' @export -interpret <- function(x, context = NULL, n_pathways = 20, model = "deepseek-chat", provider = NULL, api_key = NULL, task = "interpretation", prior = NULL, add_ppi = FALSE, gene_fold_change = NULL) { - if (missing(x)) { - stop("enrichment result 'x' is required.") +#' @param model A model string, either bare (e.g., "deepseek-chat") or fully +#' qualified (e.g., "deepseek:deepseek-chat"). +#' @return A string in `provider:model` format. 
#' Infer a fully-qualified model ID from a bare model name
#'
#' aisdk expects models in "provider:model" format. When the caller passes a
#' bare model name (e.g. "deepseek-chat"), try to infer the provider from
#' well-known naming fingerprints and warn that the explicit form is
#' preferred. Unrecognized names abort with guidance.
#'
#' @param model A model ID, either "provider:model" or a bare model name.
#' @return The model ID in "provider:model" format.
#' @keywords internal
infer_model_id <- function(model) {
  # Already fully qualified -- pass through untouched.
  if (grepl(":", model, fixed = TRUE)) {
    return(model)
  }

  # Ordered provider fingerprints; the FIRST match wins, so specific
  # prefixes must precede generic ones (e.g. "nvidia/" before the
  # catch-all "/" used by openrouter).
  # NOTE: "^deepseek-" matches every deepseek-* model (including
  # "deepseek-r1"), so such models are always attributed to the deepseek
  # provider; a volcengine alternative for deepseek-r1 would be unreachable
  # and is intentionally not listed here.
  patterns <- list(
    deepseek = "^deepseek-",
    openai = "^(gpt-|o[0-9]|chatgpt-)",
    anthropic = "claude",
    gemini = "^gemini-",
    bailian = "^(qwen-|qwq-|glm-)",
    volcengine = "^doubao-",
    stepfun = "^step-",
    xai = "^grok-",
    nvidia = "^(nvidia/|meta/llama)",
    openrouter = "/"
  )

  provider <- NULL
  for (name in names(patterns)) {
    if (grepl(patterns[[name]], model, ignore.case = TRUE)) {
      provider <- name
      break
    }
  }

  if (is.null(provider)) {
    rlang::abort(c(
      paste0("Cannot infer provider for model: ", model),
      "i" = "Use the explicit 'provider:model' format.",
      "i" = "Example: 'gemini:gemini-3-flash-preview', 'openai:gpt-4o', 'deepseek:deepseek-chat'"
    ))
  }

  full_id <- paste0(provider, ":", model)
  # Inference is best-effort: nudge the user toward the explicit format.
  rlang::warn(c(
    paste0("Inferred model ID as '", full_id, "'. Consider using the explicit format."),
    "i" = paste0("Use: model = '", full_id, "'")
  ))
  full_id
}
Expected enrichResult, compareClusterResult, gseaResult, or data.frame.") } - # Helper to get top N get_top_n <- function(df, n) { - if (nrow(df) == 0) { - return(df) - } + if (nrow(df) == 0) return(df) if ("p.adjust" %in% names(df)) { df <- df[order(df$p.adjust), ] } else if ("pvalue" %in% names(df)) { df <- df[order(df$pvalue), ] } - head(df, n) + utils::head(df, n) } - # Helper to get gene list from object get_genes <- function(obj, cluster = NULL) { if (inherits(obj, "enrichResult")) { return(obj@gene) @@ -591,430 +243,884 @@ process_enrichment_input <- function(x, n_pathways) { } } } - return(NULL) + NULL } - # Check if input is a list of enrichment objects (Mixed Database Strategy) - if (is.list(x) && !inherits(x, "enrichResult") && !inherits(x, "gseaResult") && !inherits(x, "compareClusterResult") && !is.data.frame(x)) { - # Check if it is already a processed item (has 'df' and 'genes') + if (is.list(x) && !inherits(x, "enrichResult") && !inherits(x, "gseaResult") && + !inherits(x, "compareClusterResult") && !is.data.frame(x)) { if ("df" %in% names(x) && is.data.frame(x$df)) { - # It seems to be a single processed item (e.g. 
passed from interpret_hierarchical) - # Return it as a single-item list return(list(Default = x)) } - - # Convert all elements to data frames dfs <- lapply(x, get_df) - - # Check if they look like compareCluster results (have 'Cluster' column) - has_cluster <- all(sapply(dfs, function(d) "Cluster" %in% names(d))) - + has_cluster <- all(vapply(dfs, function(d) "Cluster" %in% names(d), logical(1))) combined_df <- do.call(rbind, dfs) - if (has_cluster) { - # Split by Cluster and get top N for each cluster df_list <- split(combined_df, combined_df$Cluster) - res_list <- lapply(names(df_list), function(cl_name) { - list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = NULL) # List input usually doesn't store raw genes in a structured way easily accessible here + res_list <- lapply(names(df_list), function(cl) { + list(df = get_top_n(df_list[[cl]], n_pathways), genes = NULL) }) names(res_list) <- names(df_list) return(res_list) } else { - # Assume single group (e.g. list of enrichResult for same sample) return(list(Default = list(df = get_top_n(combined_df, n_pathways), genes = NULL))) } } else { - # Single object df <- get_df(x) if ("Cluster" %in% names(df)) { - # compareClusterResult df_list <- split(df, df$Cluster) - - # Map back to genes if possible - res_list <- lapply(names(df_list), function(cl_name) { - genes <- get_genes(x, cl_name) - list(df = get_top_n(df_list[[cl_name]], n_pathways), genes = genes) + res_list <- lapply(names(df_list), function(cl) { + list(df = get_top_n(df_list[[cl]], n_pathways), genes = get_genes(x, cl)) }) names(res_list) <- names(df_list) return(res_list) } else { - # enrichResult / gseaResult - genes <- get_genes(x) - return(list(Default = list(df = get_top_n(df, n_pathways), genes = genes))) + return(list(Default = list(df = get_top_n(df, n_pathways), genes = get_genes(x)))) } } } +# ============================================================================ +# PPI Helper +# 
============================================================================ + +#' @keywords internal .get_ppi_context_text <- function(genes, x = NULL, limit = 50) { - if (length(genes) == 0) { - return(NULL) - } + if (length(genes) == 0) return(NULL) - input_for_ppi <- head(genes, limit) - - # Try to determine taxID + input_for_ppi <- utils::head(genes, limit) current_taxID <- "auto" if (!is.null(x) && inherits(x, "enrichResult") && !is.list(x)) { current_taxID <- tryCatch(getTaxID(x@organism), error = function(e) "auto") } - ppi_res <- tryCatch( - { - g <- getPPI(input_for_ppi, taxID = current_taxID, output = "igraph", network_type = "functional") - - if (!is.null(g)) { - el <- igraph::as_data_frame(g, what = "edges") - if (nrow(el) > 0) { - if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] - el_subset <- head(el, limit) - edges_text <- apply(el_subset, 1, function(row) { - score_info <- "" - if ("score" %in% names(row)) score_info <- paste0(" (Score: ", row["score"], ")") - paste0(row["from"], " -- ", row["to"], score_info) - }) - paste(edges_text, collapse = "\n") - } else { - NULL - } - } else { - NULL - } - }, - error = function(e) NULL + tryCatch({ + g <- getPPI(input_for_ppi, taxID = current_taxID, output = "igraph", network_type = "functional") + if (is.null(g)) return(NULL) + el <- igraph::as_data_frame(g, what = "edges") + if (nrow(el) == 0) return(NULL) + if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] + el_subset <- utils::head(el, limit) + edges_text <- apply(el_subset, 1, function(row) { + score_info <- if ("score" %in% names(row)) paste0(" (Score: ", row["score"], ")") else "" + paste0(row["from"], " -- ", row["to"], score_info) + }) + paste(edges_text, collapse = "\n") + }, error = function(e) NULL) +} + +# ============================================================================ +# Prompt Helpers (return system + user prompt pair) +# 
# ============================================================================
# Prompt Helpers (return system + user prompt pair)
# ============================================================================

#' Format a pathway data frame as one "key: value" line per pathway
#'
#' Keeps only the standard enrichment columns that exist in `df`, in a fixed
#' order. Returns "" for an empty data frame.
#'
#' @param df Enrichment data frame (possible columns: ID, Description,
#'   GeneRatio, NES, p.adjust, pvalue, geneID).
#' @return A single newline-joined string, one line per row.
#' @keywords internal
.format_pathway_text <- function(df) {
  cols <- intersect(c("ID", "Description", "GeneRatio", "NES", "p.adjust", "pvalue", "geneID"), names(df))
  sub <- df[, cols, drop = FALSE]
  if (nrow(sub) == 0L) return("")
  # Extract values column-wise with as.character() instead of apply(), which
  # would coerce the data frame to a character matrix and format()-pad
  # numeric columns with spaces.
  rows <- vapply(seq_len(nrow(sub)), function(i) {
    vals <- vapply(cols, function(cn) as.character(sub[[cn]][i]), character(1))
    paste(cols, vals, sep = ": ", collapse = ", ")
  }, character(1))
  paste(rows, collapse = "\n")
}

#' Assemble the user-prompt data sections in a fixed order
#'
#' Optional sections (context, marker genes, PPI, fold change) are included
#' only when non-NULL (and, for text fields, non-empty); the pathway section
#' is always present.
#'
#' @param pathway_text Pre-formatted pathway text (always included).
#' @param context Optional experimental context string.
#' @param ppi_network Optional PPI edge-list text.
#' @param fold_change Optional fold-change summary text.
#' @param top_genes Optional marker-gene summary text.
#' @return A single string with sections separated by blank lines.
#' @keywords internal
.build_data_sections <- function(pathway_text, context = NULL, ppi_network = NULL,
                                 fold_change = NULL, top_genes = NULL) {
  parts <- character(0)
  if (!is.null(context) && nzchar(context)) {
    parts <- c(parts, paste0("Experimental Context:\n", context))
  }
  parts <- c(parts, paste0("Top Enriched Terms:\n", pathway_text))
  if (!is.null(top_genes) && nzchar(top_genes)) {
    parts <- c(parts, paste0("Top Marker Genes (Highest Fold Change):\n", top_genes))
  }
  if (!is.null(ppi_network)) {
    parts <- c(parts, paste0("PPI Network (Edge List from STRING):\n", ppi_network))
  }
  if (!is.null(fold_change)) {
    parts <- c(parts, paste0("Top Regulated Genes (Log2 Fold Change):\n", fold_change))
  }
  paste(parts, collapse = "\n\n")
}

#' System prompt for the generic "interpretation" task
#' @return A single prompt string (constant).
#' @keywords internal
.interpretation_system_prompt <- function() {
  paste0(
    "You are an expert biologist and bioinformatics researcher.\n\n",
    "Analyze enrichment results using Chain-of-Thought reasoning:\n",
    "1. Source Deconvolution: Distinguish Biological Processes/Pathways (WHAT), ",
    "Upstream Regulators/TFs (WHO), and Phenotypes/Diseases (OUTCOME).\n",
    "2. Gene-Level Analysis: Identify shared key genes and unique functional modules.\n",
    "3. Causal Integration: Connect Regulators -> Processes -> Outcomes.\n",
    "4. Contextual Mapping: Map themes to the experimental context.\n",
    "5. Network Analysis: Identify functional modules and key hubs from PPI data.\n",
    "6. Network Refinement: Prune PPI to the most biologically relevant edges.\n\n",
    "GROUNDING INSTRUCTION:\n",
    "- Every claim MUST be supported by specific evidence from the provided enrichment results.\n",
    "- Cite supporting pathway names in parentheses.\n",
    "- Do not make claims that cannot be directly inferred from the provided data."
  )
}

#' System prompt for the cell-type annotation task
#'
#' @param cluster_id Cluster identifier interpolated into the prompt.
#' @param has_prior Logical; when TRUE the task line asks the model to
#'   validate/refine a preliminary annotation instead of identifying de novo.
#' @return A single prompt string.
#' @keywords internal
.annotation_system_prompt <- function(cluster_id, has_prior = FALSE) {
  base <- paste0(
    "You are an expert cell biologist analyzing cluster ", cluster_id,
    " from a single-cell RNA-seq experiment.\n\n",
    "Use the following logic:\n",
    "1. Source Deconvolution: Distinguish Cell Type Markers, Biological Pathways, and Upstream TFs.\n",
    "2. Comparative Analysis (CRITICAL): Do NOT just look at the top 1 enriched term.\n",
    "   - Compare the top 3-5 enriched terms as candidates.\n",
    "   - Use specific Marker Genes to vote among candidates.\n",
    "   - Rule of Exclusion: If top term is 'Cell Type A' but gene list has markers for 'Cell Type B' ",
    "and lacks key markers for 'Cell Type A', assign 'Cell Type B'.\n",
    "3. Pathway Context: Use functional pathways to infer cell state or function.\n",
    "4. Integration: Combine marker specificity with pathway function.\n",
    "5. Network Analysis: Use PPI data to identify functional modules.\n\n",
    "Confidence Calibration:\n",
    "- High: Specific markers definitively distinguish the label.\n",
    "- Medium: Strong shared markers but weak discriminatory markers.\n",
    "- Low: Conflicting evidence.\n\n",
    "GROUNDING INSTRUCTION (STRICT):\n",
    "- NO HALLUCINATION: Do not invent markers or pathways not present in the input.\n",
    "- CITATION REQUIRED: Every conclusion must cite specific pathways or markers from the data."
  )
  if (has_prior) {
    paste0(base, "\n\nTask: Validate and refine the preliminary annotation based on the enrichment and marker evidence.")
  } else {
    paste0(base, "\n\nTask: Identify the cell type of this cluster based on the enrichment results and marker genes.")
  }
}

#' System prompt for the phenotype characterization task
#'
#' @param group_id Group identifier interpolated into the prompt.
#' @return A single prompt string.
#' @keywords internal
.phenotype_system_prompt <- function(group_id) {
  paste0(
    "You are an expert biologist characterizing the biological phenotype of group ", group_id, ".\n\n",
    "Use the following logic:\n",
    "1. Source Deconvolution: Separate observed processes (Pathways) from drivers (TFs).\n",
    "2. Synthesize enriched terms to identify the dominant biological theme.\n",
    "3. Be specific about direction or nature of the state ",
    "(e.g., 'M1 Macrophage Polarization' over 'Immune response').\n",
    "4. Assign a concise Phenotype Label.\n",
    "5. Network Analysis: Use PPI data to identify functional modules and key drivers.\n\n",
    "GROUNDING INSTRUCTION:\n",
    "- Every phenotype claim must be supported by evidence.\n",
    "- List the specific pathways or TFs that define this phenotype."
  )
}

# ============================================================================
# Core: interpret()
# ============================================================================
#' Interpret enrichment results using LLMs
#'
#' Sends enrichment results along with optional experimental context to an LLM
#' to generate a structured biological interpretation, hypothesis, and narrative
#' suitable for a publication.
#'
#' Uses `generate_object()` internally for reliable structured output with
#' automatic JSON repair, eliminating manual parsing failures.
#'
#' @param x An enrichment result object (`enrichResult`, `gseaResult`,
#'   `compareClusterResult`, or a `data.frame` with pathway columns).
#' @param context A string describing the experimental background
#'   (e.g., "scRNA-seq of mouse myocardial infarction at day 3").
#' @param n_pathways Number of top significant pathways to include. Default 20.
#' @param model The LLM model in `provider:model` format
#'   (e.g., "deepseek:deepseek-chat", "gemini:gemini-2.5-flash").
#'   Bare model names are supported with a warning (e.g., "deepseek-chat").
#' @param task Task type: "interpretation" (default), "cell_type"/"annotation",
#'   or "phenotype"/"phenotyping".
#' @param prior Optional prior knowledge or preliminary annotation to guide the task.
#' @param add_ppi Logical, whether to query STRING PPI network data. Default FALSE.
#' @param gene_fold_change Named numeric vector of log fold changes for expression context.
#' @param max_tokens Maximum tokens for the LLM response. Default 8192.
#'   Some models (especially reasoning models) may need much higher values
#'   (e.g., 16384 or more) to produce complete structured output.
#' @param temperature Sampling temperature. Default 0.3.
#' @param verbose Logical, whether to print debug messages showing raw API
#'   responses, token usage, and JSON parsing details. Default FALSE.
#'   Equivalent to setting `options(aisdk.debug = TRUE)` for the call.
#' @return An `interpretation` object (list) with task-specific fields.
#'   For "interpretation": overview, key_mechanisms, hypothesis, narrative, etc.
#'   For "annotation": cell_type, confidence, reasoning, markers, etc.
#'   For "phenotype": phenotype, confidence, reasoning, key_processes, etc.
#' @export
#' @examples
#' \dontrun{
#' # Basic usage with a data frame
#' df <- data.frame(
#'   ID = c("GO:0006915", "GO:0008284"),
#'   Description = c("apoptotic process", "positive regulation of proliferation"),
#'   GeneRatio = c("10/100", "20/100"),
#'   p.adjust = c(0.01, 0.02),
#'   geneID = c("TP53/BAX", "MYC/CCND1/CDK4")
#' )
#' res <- interpret(df, model = "deepseek:deepseek-chat",
#'                  context = "Cancer proliferation study")
#' print(res)
#' }
interpret <- function(x,
                      context = NULL,
                      n_pathways = 20,
                      model = "deepseek:deepseek-chat",
                      task = "interpretation",
                      prior = NULL,
                      add_ppi = FALSE,
                      gene_fold_change = NULL,
                      max_tokens = 8192,
                      temperature = 0.3,
                      verbose = FALSE) {
  if (missing(x)) rlang::abort("Enrichment result 'x' is required.")

  # Enable aisdk debug output only for the duration of this call; the
  # previous option value is read and restored inside the branch so the
  # common (non-verbose) path touches no global state.
  if (isTRUE(verbose)) {
    old_debug <- getOption("aisdk.debug", FALSE)
    options(aisdk.debug = TRUE)
    on.exit(options(aisdk.debug = old_debug), add = TRUE)
  }

  model <- infer_model_id(model)
  res_list <- process_enrichment_input(x, n_pathways)

  if (length(res_list) == 0) {
    return(structure(
      list(overview = "No significant pathways found to interpret.", confidence = "None"),
      class = c("interpretation", "list")
    ))
  }

  results <- lapply(names(res_list), function(name) {
    message(sprintf("Interpreting cluster: %s", name))
    item <- res_list[[name]]
    df <- item$df
    genes <- item$genes

    top_genes_text <- .get_top_genes_text(genes, gene_fold_change)

    # Nothing to interpret for this cluster: no pathways and no markers.
    if (nrow(df) == 0 && is.null(top_genes_text)) {
      res <- list(
        cluster = name,
        overview = "No significant pathways enriched and no marker genes available for interpretation.",
        confidence = "None"
      )
      class(res) <- c("interpretation", "list")
      return(res)
    }

    pathway_text <- if (nrow(df) > 0) .format_pathway_text(df) else "No significant enriched pathways found."

    current_prior <- .resolve_prior(prior, name)
    ppi_text <- .get_ppi_if_requested(add_ppi, df, genes, x)
    fc_text <- .get_fc_text(gene_fold_change, df, genes)

    user_prompt <- .build_data_sections(
      pathway_text, context, ppi_text, fc_text, top_genes_text
    )

    if (!is.null(current_prior) && nzchar(current_prior)) {
      user_prompt <- paste0(user_prompt, "\n\nPreliminary Annotation:\n", current_prior)
    }

    # Structured-output LLM call; task selects the system prompt and schema.
    res <- .call_generate_object(
      model = model,
      task = task,
      cluster_id = name,
      user_prompt = user_prompt,
      has_prior = !is.null(current_prior),
      max_tokens = max_tokens,
      temperature = temperature
    )

    res$cluster <- name
    .postprocess_network(res)
  })

  names(results) <- names(res_list)

  # A single unnamed group ("Default") is unwrapped for convenience.
  if (length(results) == 1 && names(results)[1] == "Default") {
    return(results[[1]])
  }
  class(results) <- c("interpretation_list", "list")
  results
}
# ============================================================================
# Core: interpret_agent()
# ============================================================================

#' Interpret enrichment results using a multi-agent pipeline (Deep Mode)
#'
#' Employs three specialized AI agents in sequence for rigorous interpretation:
#' \enumerate{
#'   \item Agent Cleaner: Filters noise and selects relevant pathways.
#'   \item Agent Detective: Identifies key regulators and functional modules.
#'   \item Agent Synthesizer: Produces a coherent biological narrative.
#' }
#'
#' Uses aisdk's Agent and Session system for shared context across agents.
#'
#' @param x An enrichment result object.
#' @param context A string describing the experimental background.
#' @param n_pathways Number of top pathways to consider initially. Default 50.
#' @param model The LLM model in `provider:model` format.
#' @param add_ppi Logical, whether to query PPI data. Default FALSE.
#' @param gene_fold_change Named numeric vector of log fold changes.
#' @param max_tokens Maximum tokens per agent call. Default 4096.
#' @param temperature Sampling temperature. Default 0.3.
#' @param verbose Logical; if TRUE, enable aisdk debug output for the duration
#'   of the call (restored on exit). Default FALSE.
#' @return An `interpretation` object with deep analysis fields plus
#'   regulatory_drivers, refined_network, and network_evidence from the
#'   detective agent.
#' @export
#' @examples
#' \dontrun{
#' res <- interpret_agent(df, model = "openai:gpt-4o",
#'                        context = "scRNA-seq of mouse MI day 3")
#' print(res)
#' }
interpret_agent <- function(x,
                            context = NULL,
                            n_pathways = 50,
                            model = "deepseek:deepseek-chat",
                            add_ppi = FALSE,
                            gene_fold_change = NULL,
                            max_tokens = 4096,
                            temperature = 0.3,
                            verbose = FALSE) {
  if (missing(x)) rlang::abort("Enrichment result 'x' is required.")

  # Temporarily enable aisdk debug output; restore the caller's setting on exit.
  old_debug <- getOption("aisdk.debug", FALSE)
  if (isTRUE(verbose)) {
    options(aisdk.debug = TRUE)
    on.exit(options(aisdk.debug = old_debug), add = TRUE)
  }

  model <- infer_model_id(model)
  res_list <- process_enrichment_input(x, n_pathways)

  if (length(res_list) == 0) {
    return(structure(
      list(overview = "No significant pathways found to interpret."),
      class = c("interpretation", "list")
    ))
  }

  # One multi-agent pass per cluster/group.
  results <- lapply(names(res_list), function(name) {
    item <- res_list[[name]]
    df <- item$df
    original_genes <- item$genes
    fallback_mode <- FALSE

    if (nrow(df) == 0) {
      if (!is.null(original_genes) && length(original_genes) > 0) {
        # No enriched pathways: degrade gracefully to a gene-list-only analysis.
        fallback_mode <- TRUE
        rlang::warn(sprintf(
          "Cluster '%s': No enriched pathways. Falling back to gene-based interpretation.", name
        ))
        pathway_text <- paste(
          "No significant pathways enriched.",
          "Top Genes:", paste(utils::head(original_genes, 50), collapse = ", ")
        )
      } else {
        return(NULL)
      }
    } else {
      pathway_text <- .format_pathway_text(df)
    }

    ppi_text <- .get_ppi_if_requested(add_ppi, df, original_genes, x, fallback_mode)
    fc_text <- .get_fc_text(gene_fold_change, df, original_genes, fallback_mode)

    # Shared session so downstream agents can access earlier agents' results.
    session <- ChatSession$new(model = model)

    # --- Agent 1: Cleaner (skipped in fallback mode: nothing to filter) ---
    cleaned_pathways <- pathway_text
    if (!fallback_mode) {
      message(sprintf("Processing cluster '%s' with Agent 1: The Cleaner...", name))
      cleaner <- Agent$new(
        name = "cleaner",
        description = "Filters noise and selects relevant pathways from enrichment results",
        system_prompt = paste0(
          "You are 'Agent Cleaner', an expert bioinformatics curator.\n",
          "Your task is to filter enriched pathways to retain only those relevant to the experimental context.\n\n",
          "Instructions:\n",
          "1. REMOVE 'housekeeping' pathways (Ribosome, Spliceosome, RNA transport) unless specifically relevant.\n",
          "2. REMOVE redundant or overly broad terms.\n",
          "3. KEEP disease-specific, tissue-specific, or phenotype-driving pathways."
        ),
        model = model
      )

      cleaner_prompt <- paste0(
        if (!is.null(context)) paste0("Context: ", context, "\n\n") else "",
        "Raw Enriched Pathways:\n", pathway_text
      )

      cleaner_res <- tryCatch({
        gen <- generate_object(
          model = model, prompt = cleaner_prompt,
          schema = .cleaner_schema(), schema_name = "cleaner_result",
          system = cleaner$system_prompt,
          temperature = temperature, max_tokens = max_tokens
        )
        gen$object
      }, error = function(e) {
        # A cleaner failure is non-fatal: proceed with unfiltered pathways.
        rlang::warn(paste0("Agent Cleaner failed: ", e$message, ". Using unfiltered pathways."))
        NULL
      })

      if (!is.null(cleaner_res) && !is.null(cleaner_res$kept_pathways)) {
        cleaned_pathways <- paste(
          "Selected Relevant Pathways (filtered by Agent Cleaner):",
          paste(cleaner_res$kept_pathways, collapse = ", "),
          "\nReasoning:", cleaner_res$reasoning,
          sep = "\n"
        )
        session$set_memory("cleaner_result", cleaner_res)
      }
    }

    # --- Agent 2: Detective ---
    message(sprintf("Processing cluster '%s' with Agent 2: The Detective...", name))
    detective <- Agent$new(
      name = "detective",
      description = "Identifies key regulators and functional modules from enrichment and network data",
      system_prompt = paste0(
        "You are 'Agent Detective', an expert systems biologist.\n",
        "Identify Key Drivers (Regulators) and Functional Modules from the filtered pathways and network data.\n\n",
        "Instructions:\n",
        "1. Identify potential Master Regulators (TFs, Kinases) that explain the pathways.\n",
        "2. Define Functional Modules using PPI network connections.\n",
        "3. Refine the PPI network to a core regulatory sub-network."
      ),
      model = model
    )

    detective_prompt <- paste0(
      if (!is.null(context)) paste0("Context: ", context, "\n\n") else "",
      if (fallback_mode) "WARNING: No significant enriched pathways found. Analyzing RAW GENE LISTS.\n\n" else "",
      "Filtered Pathways:\n", cleaned_pathways,
      if (!is.null(ppi_text)) paste0("\n\nPPI Network Evidence:\n", ppi_text) else "",
      if (!is.null(fc_text)) paste0("\n\nGene Fold Changes:\n", fc_text) else ""
    )

    detective_res <- tryCatch({
      gen <- generate_object(
        model = model, prompt = detective_prompt,
        schema = .detective_schema(), schema_name = "detective_result",
        system = detective$system_prompt,
        temperature = temperature, max_tokens = max_tokens
      )
      gen$object
    }, error = function(e) {
      rlang::warn(paste0("Agent Detective failed: ", e$message))
      NULL
    })

    if (!is.null(detective_res)) {
      session$set_memory("detective_result", detective_res)
    }

    # --- Agent 3: Synthesizer ---
    message(sprintf("Processing cluster '%s' with Agent 3: The Storyteller...", name))
    detective_text <- .format_detective_report(detective_res)

    synthesizer_prompt <- paste0(
      if (!is.null(context)) paste0("Context: ", context, "\n\n") else "",
      if (fallback_mode) "WARNING: No significant enriched pathways found. Interpret with caution.\n\n" else "",
      "Data Sources:\n",
      "1. Relevant Pathways:\n", cleaned_pathways, "\n\n",
      "2. Detective's Report (Drivers & Modules):\n", detective_text
    )

    synth_system <- paste0(
      "You are 'Agent Storyteller', a senior scientific writer.\n",
      "Synthesize the findings from previous agents into a coherent biological narrative.\n\n",
      "Instructions:\n",
      "1. Write a comprehensive Overview.\n",
      "2. Explain Key Mechanisms, linking Regulators -> Modules -> Pathways.\n",
      "3. Formulate a Hypothesis.\n",
      "4. Draft a Narrative paragraph for a paper."
    )

    final_res <- tryCatch({
      gen <- generate_object(
        model = model, prompt = synthesizer_prompt,
        schema = .interpretation_schema(), schema_name = "synthesis_result",
        system = synth_system,
        temperature = temperature, max_tokens = max_tokens
      )
      gen$object
    }, error = function(e) {
      rlang::warn(paste0("Agent Synthesizer failed: ", e$message))
      list(overview = "Agent Synthesizer failed to produce structured output.", confidence = "None")
    })

    # Attach provenance and the detective's structured findings to the result.
    if (is.list(final_res)) {
      final_res$cluster <- name
      if (fallback_mode) final_res$data_source <- "gene_list_only"
      if (!is.null(detective_res)) {
        final_res$regulatory_drivers <- detective_res$key_drivers
        final_res$refined_network <- detective_res$refined_network
        final_res$network_evidence <- detective_res$network_evidence
      }
    }

    .postprocess_network(final_res)
  })

  results <- Filter(Negate(is.null), results)
  names(results) <- vapply(results, function(r) r$cluster %||% "Unknown", character(1))

  # Single unnamed input: unwrap to a plain interpretation object.
  if (length(results) == 1 && names(results)[1] == "Default") {
    return(results[[1]])
  }
  class(results) <- c("interpretation_list", "list")
  results
}

# ============================================================================
# Core: interpret_hierarchical()
# ============================================================================

#' Interpret enrichment results using a hierarchical strategy
#'
#' First interprets major clusters to establish lineage context, then interprets
#' sub-clusters with hierarchical constraints from the major cluster annotations.
#'
#' @param x_minor Enrichment result for sub-clusters.
#' @param x_major Enrichment result for major clusters.
#' @param mapping A named vector mapping sub-cluster IDs to major cluster IDs.
#' @param model The LLM model in `provider:model` format.
#' @param task Task type, default "cell_type".
#' @param max_tokens Maximum tokens. Default 4096.
#' @param temperature Sampling temperature. Default 0.3.
#' @return An `interpretation_list` object.
#' @export
interpret_hierarchical <- function(x_minor,
                                   x_major,
                                   mapping,
                                   model = "deepseek:deepseek-chat",
                                   task = "cell_type",
                                   max_tokens = 4096,
                                   temperature = 0.3) {
  message("Step 1: Interpreting Major Clusters to establish lineage context...")
  res_major <- interpret(
    x_major, context = NULL, model = model, task = "cell_type",
    max_tokens = max_tokens, temperature = temperature
  )

  message("Step 2: Interpreting Sub-clusters using hierarchical constraints...")
  res_list_minor <- process_enrichment_input(x_minor, n_pathways = 20)

  results <- lapply(names(res_list_minor), function(name) {
    # Build a lineage constraint from the parent cluster's annotation, if any.
    specific_context <- NULL
    if (name %in% names(mapping)) {
      major_id <- mapping[[name]]
      major_info <- .extract_major_info(res_major, major_id)
      if (!is.null(major_info) && !is.null(major_info$cell_type)) {
        specific_context <- paste0(
          "Hierarchical Constraint: This cluster is a confirmed subcluster of the '",
          major_info$cell_type,
          "' lineage. Focus on distinguishing the specific subtype or state within this lineage."
        )
      }
    }

    if (is.null(specific_context)) {
      rlang::warn(paste("No major lineage context found for sub-cluster:", name))
    }

    res <- interpret(
      res_list_minor[[name]], context = specific_context, model = model,
      task = task, max_tokens = max_tokens, temperature = temperature
    )
    if (is.list(res)) res$cluster <- name
    res
  })

  names(results) <- names(res_list_minor)
  class(results) <- c("interpretation_list", "list")
  results
}
# ============================================================================
# Internal Helpers
# ============================================================================

# Translate a failed generate_object() call into a human-readable diagnosis.
# Returns a list with `message` (what went wrong), `suggestion` (how to fix it),
# and `fallback_overview` (text to surface in place of a structured result).
.diagnose_failure <- function(finish_reason, raw_text, max_tokens) {
  # Explicit NULL check instead of %||% so this helper depends only on base R.
  raw_len <- nchar(if (is.null(raw_text)) "" else raw_text)

  # Case 1: the model burned all tokens on reasoning and emitted nothing.
  if (finish_reason == "length" && raw_len == 0) {
    return(list(
      message = paste0(
        "Token limit exhausted (max_tokens=", max_tokens, "). ",
        "The model used all tokens on internal reasoning and produced no output."
      ),
      suggestion = paste0(
        "Increase max_tokens. Try: max_tokens = ", max(max_tokens * 4, 16384)
      ),
      fallback_overview = paste0(
        "Model ran out of tokens (max_tokens=", max_tokens,
        "). Re-run with a higher max_tokens value."
      )
    ))
  }

  # Case 2: output started but was truncated mid-JSON.
  if (finish_reason == "length" && raw_len > 0) {
    return(list(
      message = paste0(
        "Token limit exhausted (max_tokens=", max_tokens, "). ",
        "The JSON output was truncated, causing parse failure."
      ),
      suggestion = paste0(
        "Increase max_tokens. Try: max_tokens = ", max(max_tokens * 2, 8192)
      ),
      fallback_overview = raw_text
    ))
  }

  # Case 3: complete output that simply wasn't valid JSON.
  if (raw_len > 0) {
    return(list(
      message = "Model returned text but it could not be parsed as valid JSON.",
      suggestion = "The model may not follow JSON instructions well. Try a different model or lower temperature.",
      fallback_overview = raw_text
    ))
  }

  # Case 4: nothing came back at all.
  list(
    message = "Model returned an empty response.",
    suggestion = "Try a different model, or check that your API key has access to this model.",
    fallback_overview = "Model returned empty response. Try a different model or increase max_tokens."
  )
}

# Summarize a gene list as comma-separated text. When fold changes are
# supplied, report the top 20 up-regulated genes; otherwise the first 20.
.get_top_genes_text <- function(genes, gene_fold_change) {
  if (is.null(genes) || length(genes) == 0) return(NULL)
  if (!is.null(gene_fold_change)) {
    common <- intersect(genes, names(gene_fold_change))
    if (length(common) > 0) {
      fc <- gene_fold_change[common]
      top_up <- utils::head(names(fc[order(fc, decreasing = TRUE)]), 20)
      return(paste(top_up, collapse = ", "))
    }
  }
  paste(utils::head(genes, 20), collapse = ", ")
}

# Resolve a prior annotation for a given cluster name. Accepts either a single
# unnamed prior (applies to everything) or a named list/vector keyed by cluster.
.resolve_prior <- function(prior, name) {
  if (is.null(prior)) return(NULL)
  if (length(prior) == 1 && is.null(names(prior))) return(prior)
  if (name %in% names(prior)) return(prior[[name]])
  NULL
}

# Fetch PPI context text for the genes in `df` (or the raw gene list when in
# fallback mode). Returns NULL when PPI integration is disabled or no genes.
.get_ppi_if_requested <- function(add_ppi, df, genes, x, fallback_mode = FALSE) {
  if (!add_ppi) return(NULL)
  if (fallback_mode) {
    all_genes <- genes
  } else {
    all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/")))
    if (length(all_genes) == 0 && !is.null(genes)) all_genes <- utils::head(genes, 50)
  }
  if (length(all_genes) > 0) .get_ppi_context_text(all_genes, x) else NULL
}

# Format the 20 genes with the largest |logFC| as "GENE:fc" pairs, restricted
# to genes present in the enrichment results (or raw list in fallback mode).
.get_fc_text <- function(gene_fold_change, df, genes, fallback_mode = FALSE) {
  if (is.null(gene_fold_change)) return(NULL)
  if (fallback_mode) {
    all_genes <- genes
  } else {
    all_genes <- unique(unlist(strsplit(as.character(df$geneID), "/")))
    if (length(all_genes) == 0 && !is.null(genes)) all_genes <- genes
  }
  common_genes <- intersect(all_genes, names(gene_fold_change))
  if (length(common_genes) == 0) return(NULL)
  fc_subset <- gene_fold_change[common_genes]
  fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)]
  top_fc <- utils::head(fc_subset, 20)
  paste(names(top_fc), round(top_fc, 2), sep = ":", collapse = ", ")
}

# Dispatch a single structured-output LLM call for the requested task and wrap
# failures into a classed "interpretation" fallback. Always returns an
# "interpretation" object (fix: the NULL-object fallback branch previously
# returned a bare list, unlike the error branch, breaking S3 print dispatch).
.call_generate_object <- function(model, task, cluster_id, user_prompt,
                                  has_prior = FALSE, max_tokens = 4096,
                                  temperature = 0.3) {
  # Select system prompt + JSON schema per task.
  if (task %in% c("annotation", "cell_type")) {
    sys <- .annotation_system_prompt(cluster_id, has_prior)
    schema <- if (has_prior) .annotation_refinement_schema() else .annotation_schema()
    schema_name <- "annotation_result"
  } else if (task %in% c("phenotype", "phenotyping")) {
    sys <- .phenotype_system_prompt(cluster_id)
    schema <- .phenotype_schema()
    schema_name <- "phenotype_result"
  } else {
    sys <- .interpretation_system_prompt()
    schema <- .interpretation_schema()
    schema_name <- "interpretation_result"
  }

  debug <- isTRUE(getOption("aisdk.debug", FALSE))

  if (debug) {
    message("[DEBUG] .call_generate_object: model=", model,
            " task=", task, " cluster=", cluster_id, " max_tokens=", max_tokens)
  }

  result <- tryCatch({
    gen <- generate_object(
      model = model, prompt = user_prompt, schema = schema,
      schema_name = schema_name, system = sys,
      temperature = temperature, max_tokens = max_tokens
    )

    if (debug) {
      message("[DEBUG] generate_object result:")
      message("[DEBUG]   finish_reason: ", gen$finish_reason %||% "NULL")
      message("[DEBUG]   raw_text length: ", nchar(gen$raw_text %||% ""), " chars")
      message("[DEBUG]   object is NULL: ", is.null(gen$object))
      if (!is.null(gen$usage)) {
        message("[DEBUG]   usage: prompt=", gen$usage$prompt_tokens,
                " completion=", gen$usage$completion_tokens,
                " total=", gen$usage$total_tokens)
      }
      if (!is.null(gen$raw_text) && nchar(gen$raw_text) > 0) {
        preview <- substr(gen$raw_text, 1, min(500, nchar(gen$raw_text)))
        message("[DEBUG] raw_text preview:\n", preview,
                if (nchar(gen$raw_text) > 500) "\n... [truncated]" else "")
      }
    }

    if (!is.null(gen$object)) {
      obj <- gen$object
      class(obj) <- c("interpretation", class(obj))
      obj
    } else {
      # Structured parse failed: diagnose, warn, and return a usable fallback.
      raw <- gen$raw_text %||% ""
      finish <- gen$finish_reason %||% "unknown"
      raw_preview <- if (nchar(raw) > 200) paste0(substr(raw, 1, 200), "...") else raw

      diagnosis <- .diagnose_failure(finish, raw, max_tokens)

      warn_parts <- c(
        paste0("generate_object() returned NULL for cluster '", cluster_id, "'."),
        "i" = paste0("Model: ", model),
        "i" = paste0("finish_reason: ", finish),
        "i" = paste0("raw_text (", nchar(raw), " chars): ",
                     if (nzchar(raw_preview)) raw_preview else "")
      )
      warn_parts <- c(warn_parts, "!" = diagnosis$message)
      if (!is.null(diagnosis$suggestion)) {
        warn_parts <- c(warn_parts, ">" = diagnosis$suggestion)
      }
      rlang::warn(warn_parts)

      fallback_text <- if (nzchar(raw)) raw else diagnosis$fallback_overview
      res <- list(
        overview = fallback_text,
        confidence = "Low",
        cluster = cluster_id
      )
      # Class the fallback like every other return path.
      class(res) <- c("interpretation", "list")
      res
    }
  }, error = function(e) {
    rlang::warn(c(
      paste0("LLM call failed for cluster '", cluster_id, "': ", e$message),
      "i" = paste0("Model: ", model),
      "i" = "Tip: Re-run with verbose=TRUE for full debug output"
    ))
    res <- list(
      overview = paste0("LLM call failed: ", e$message),
      confidence = "None",
      cluster = cluster_id
    )
    class(res) <- c("interpretation", "list")
    res
  })

  result
}

# Render the detective agent's structured output as plain text for the
# synthesizer prompt.
.format_detective_report <- function(detective_res) {
  if (is.null(detective_res) || !is.list(detective_res)) return("No detective report available.")
  key_drivers <- if (!is.null(detective_res$key_drivers)) {
    paste(detective_res$key_drivers, collapse = ", ")
  } else "None identified"
  modules <- if (!is.null(detective_res$functional_modules)) {
    paste(detective_res$functional_modules, collapse = ", ")
  } else "None identified"
  evidence <- detective_res$network_evidence %||% "None provided"
  paste0(
    "Key Drivers: ", key_drivers, "\n",
    "Functional Modules: ", modules, "\n",
    "Network Evidence: ", evidence
  )
}

# Convert the LLM-reported refined_network (list of edges or data.frame) into
# an igraph object on `res$network`, and ensure the "interpretation" class.
.postprocess_network <- function(res) {
  if (!is.list(res) || is.null(res$refined_network)) {
    class(res) <- union("interpretation", class(res))
    return(res)
  }
  rn <- res$refined_network
  # LLM output shape varies; coerce defensively and give up quietly on failure.
  rn_df <- tryCatch({
    if (is.data.frame(rn)) {
      rn
    } else if (is.list(rn) && length(rn) > 0 && all(vapply(rn, is.list, logical(1)))) {
      do.call(rbind, lapply(rn, function(r) as.data.frame(r, stringsAsFactors = FALSE)))
    } else {
      NULL
    }
  }, error = function(e) NULL)

  if (!is.null(rn_df) && nrow(rn_df) > 0) {
    # igraph expects 'from'/'to' as the first two edge columns.
    colnames(rn_df)[colnames(rn_df) == "source"] <- "from"
    colnames(rn_df)[colnames(rn_df) == "target"] <- "to"
    if ("from" %in% names(rn_df) && "to" %in% names(rn_df)) {
      if (requireNamespace("igraph", quietly = TRUE)) {
        res$network <- igraph::graph_from_data_frame(rn_df, directed = FALSE)
      }
    }
  }
  class(res) <- union("interpretation", class(res))
  res
}

# Look up the annotation of a major cluster from an interpret() result, which
# may be an interpretation_list (multi-cluster) or a single interpretation.
.extract_major_info <- function(res_major, major_id) {
  if (inherits(res_major, "interpretation_list") && major_id %in% names(res_major)) {
    return(res_major[[major_id]])
  }
  if (inherits(res_major, "interpretation")) {
    if (!is.null(res_major$cluster) && res_major$cluster == major_id) return(res_major)
    if (is.null(res_major$cluster)) return(res_major)
  }
  NULL
}
unlist(x$key_processes), collapse = "\n"), "\n") - } else { - cat(x$key_processes, "\n") - } + cat(paste("-", unlist(x$key_processes), collapse = "\n"), "\n") } cat("\n") return(invisible(x)) @@ -1029,42 +1135,37 @@ print.interpretation <- function(x, ...) { } if (!is.null(x$regulatory_drivers)) { - cat("### 2. Regulatory Drivers (TFs/Hubs)\n") - if (is.list(x$regulatory_drivers) || length(x$regulatory_drivers) > 1) { - # If list or vector - drivers <- unlist(x$regulatory_drivers) + cat("### 2. Regulatory Drivers\n") + drivers <- unlist(x$regulatory_drivers) + if (length(drivers) > 1) { cat(paste("-", drivers, collapse = "\n"), "\n\n") } else { - cat(x$regulatory_drivers, "\n\n") + cat(drivers, "\n\n") } } if (!is.null(x$key_mechanisms)) { cat("### 3. Key Mechanisms\n") - if (is.list(x$key_mechanisms)) { - for (mechanism_name in names(x$key_mechanisms)) { - cat(sprintf("#### %s\n", mechanism_name)) - mechanism <- x$key_mechanisms[[mechanism_name]] - - # Check if mechanism is a list (structured) or just a character string + if (is.list(x$key_mechanisms) && !is.null(names(x$key_mechanisms))) { + for (mname in names(x$key_mechanisms)) { + cat(sprintf("#### %s\n", mname)) + mechanism <- x$key_mechanisms[[mname]] if (is.list(mechanism)) { - if (!is.null(mechanism$explanation)) { - cat(mechanism$explanation, "\n") - } + if (!is.null(mechanism$explanation)) cat(mechanism$explanation, "\n") if (!is.null(mechanism$pathways)) { cat("**Pathways:** ", paste(mechanism$pathways, collapse = ", "), "\n") } if (!is.null(mechanism$genes)) { - cat("**Key Genes:** ", paste(head(mechanism$genes, 10), collapse = ", "), ifelse(length(mechanism$genes) > 10, "...", ""), "\n") + cat("**Key Genes:** ", paste(utils::head(mechanism$genes, 10), collapse = ", "), + if (length(mechanism$genes) > 10) "..." 
else "", "\n") } } else { - # mechanism is likely a simple character string description cat(mechanism, "\n") } cat("\n") } } else { - cat(x$key_mechanisms, "\n\n") + cat(as.character(x$key_mechanisms), "\n\n") } } @@ -1076,12 +1177,8 @@ print.interpretation <- function(x, ...) { if (!is.null(x$hypothesis)) { cat("### 5. Hypothesis\n") if (is.list(x$hypothesis)) { - if (!is.null(x$hypothesis$what)) { - cat("**Observation (What):** ", x$hypothesis$what, "\n\n") - } - if (!is.null(x$hypothesis$so_what)) { - cat("**Implication (So What):** ", x$hypothesis$so_what, "\n\n") - } + if (!is.null(x$hypothesis$what)) cat("**Observation (What):** ", x$hypothesis$what, "\n\n") + if (!is.null(x$hypothesis$so_what)) cat("**Implication (So What):** ", x$hypothesis$so_what, "\n\n") } else { cat(x$hypothesis, "\n\n") } @@ -1092,34 +1189,24 @@ print.interpretation <- function(x, ...) { cat(x$narrative, "\n\n") } - if (inherits(x$network, "igraph")) { + if (!is.null(x$network) && requireNamespace("igraph", quietly = TRUE) && inherits(x$network, "igraph")) { cat("### 7. 
Refined Regulatory Network\n") - # Simple ASCII visualization of the network - # Edge list with interaction type el <- igraph::as_data_frame(x$network, what = "edges") if (nrow(el) > 0) { cat("Key Interactions:\n") - for (i in 1:nrow(el)) { - interaction_type <- ifelse("interaction" %in% names(el), paste0(" (", el[i, "interaction"], ")"), "") - reason <- ifelse("reason" %in% names(el), paste0(" - ", el[i, "reason"]), "") + for (i in seq_len(nrow(el))) { + interaction_type <- if ("interaction" %in% names(el)) paste0(" (", el[i, "interaction"], ")") else "" + reason <- if ("reason" %in% names(el)) paste0(" - ", el[i, "reason"]) else "" cat(sprintf(" %s -- %s%s%s\n", el[i, "from"], el[i, "to"], interaction_type, reason)) } cat("\n") } - if (!is.null(x$network_evidence)) { cat("**Network Evidence:**\n") cat(x$network_evidence, "\n\n") } } - # Fallback if no content printed - if (is.null(x$cell_type) && is.null(x$phenotype) && is.null(x$overview) && is.null(x$key_mechanisms)) { - cat("No structured interpretation content found.\n") - cat("Raw result structure:\n") - utils::str(x) - } - invisible(x) } From 056609c77754500a7a61c0618d4a62dc3db93e70 Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 02:28:10 +0800 Subject: [PATCH 5/7] refactor: migrate to aisdk structured output and update documentation --- NAMESPACE | 5 + R/interpret.R | 524 +++++++++++++++++++--------------- man/infer_model_id.Rd | 21 ++ man/interpret.Rd | 79 +++-- man/interpret_agent.Rd | 52 ++-- man/interpret_hierarchical.Rd | 30 +- 6 files changed, 423 insertions(+), 288 deletions(-) create mode 100644 man/infer_model_id.Rd diff --git a/NAMESPACE b/NAMESPACE index 9f81f09..29e974d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -129,6 +129,11 @@ importFrom(GOSemSim,godata) importFrom(GOSemSim,mgoSim) importFrom(GOSemSim,read.blast2go) importFrom(GOSemSim,read.gaf) +importFrom(aisdk,generate_object) +importFrom(aisdk,z_array) +importFrom(aisdk,z_enum) +importFrom(aisdk,z_object) 
+importFrom(aisdk,z_string) importFrom(dplyr,arrange) importFrom(dplyr,filter) importFrom(dplyr,group_by) diff --git a/R/interpret.R b/R/interpret.R index a09b7d1..96d6fd0 100644 --- a/R/interpret.R +++ b/R/interpret.R @@ -169,29 +169,29 @@ infer_model_id <- function(model) { if (grepl(":", model, fixed = TRUE)) { return(model) } - + provider <- NULL - + patterns <- list( - deepseek = "^deepseek-", - openai = "^(gpt-|o[0-9]|chatgpt-)", + deepseek = "^deepseek-", + openai = "^(gpt-|o[0-9]|chatgpt-)", anthropic = "claude", - gemini = "^gemini-", - bailian = "^(qwen-|qwq-|glm-)", + gemini = "^gemini-", + bailian = "^(qwen-|qwq-|glm-)", volcengine = "^(doubao-|deepseek-r1)", - stepfun = "^step-", - xai = "^grok-", - nvidia = "^(nvidia/|meta/llama)", + stepfun = "^step-", + xai = "^grok-", + nvidia = "^(nvidia/|meta/llama)", openrouter = "/" ) - + for (name in names(patterns)) { if (grepl(patterns[[name]], model, ignore.case = TRUE)) { provider <- name break } } - + if (is.null(provider)) { rlang::abort(c( paste0("Cannot infer provider for model: ", model), @@ -199,7 +199,7 @@ infer_model_id <- function(model) { "i" = "Example: 'gemini:gemini-3-flash-preview', 'openai:gpt-4o', 'deepseek:deepseek-chat'" )) } - + full_id <- paste0(provider, ":", model) rlang::warn(c( paste0("Inferred model ID as '", full_id, "'. Consider using the explicit format."), @@ -222,9 +222,11 @@ process_enrichment_input <- function(x, n_pathways) { } rlang::abort("Unsupported input type. 
Expected enrichResult, compareClusterResult, gseaResult, or data.frame.") } - + get_top_n <- function(df, n) { - if (nrow(df) == 0) return(df) + if (nrow(df) == 0) { + return(df) + } if ("p.adjust" %in% names(df)) { df <- df[order(df$p.adjust), ] } else if ("pvalue" %in% names(df)) { @@ -232,7 +234,7 @@ process_enrichment_input <- function(x, n_pathways) { } utils::head(df, n) } - + get_genes <- function(obj, cluster = NULL) { if (inherits(obj, "enrichResult")) { return(obj@gene) @@ -245,9 +247,9 @@ process_enrichment_input <- function(x, n_pathways) { } NULL } - + if (is.list(x) && !inherits(x, "enrichResult") && !inherits(x, "gseaResult") && - !inherits(x, "compareClusterResult") && !is.data.frame(x)) { + !inherits(x, "compareClusterResult") && !is.data.frame(x)) { if ("df" %in% names(x) && is.data.frame(x$df)) { return(list(Default = x)) } @@ -285,27 +287,36 @@ process_enrichment_input <- function(x, n_pathways) { #' @keywords internal .get_ppi_context_text <- function(genes, x = NULL, limit = 50) { - if (length(genes) == 0) return(NULL) - + if (length(genes) == 0) { + return(NULL) + } + input_for_ppi <- utils::head(genes, limit) current_taxID <- "auto" if (!is.null(x) && inherits(x, "enrichResult") && !is.list(x)) { current_taxID <- tryCatch(getTaxID(x@organism), error = function(e) "auto") } - - tryCatch({ - g <- getPPI(input_for_ppi, taxID = current_taxID, output = "igraph", network_type = "functional") - if (is.null(g)) return(NULL) - el <- igraph::as_data_frame(g, what = "edges") - if (nrow(el) == 0) return(NULL) - if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] - el_subset <- utils::head(el, limit) - edges_text <- apply(el_subset, 1, function(row) { - score_info <- if ("score" %in% names(row)) paste0(" (Score: ", row["score"], ")") else "" - paste0(row["from"], " -- ", row["to"], score_info) - }) - paste(edges_text, collapse = "\n") - }, error = function(e) NULL) + + tryCatch( + { + g <- getPPI(input_for_ppi, taxID = 
current_taxID, output = "igraph", network_type = "functional") + if (is.null(g)) { + return(NULL) + } + el <- igraph::as_data_frame(g, what = "edges") + if (nrow(el) == 0) { + return(NULL) + } + if ("score" %in% names(el)) el <- el[order(el$score, decreasing = TRUE), ] + el_subset <- utils::head(el, limit) + edges_text <- apply(el_subset, 1, function(row) { + score_info <- if ("score" %in% names(row)) paste0(" (Score: ", row["score"], ")") else "" + paste0(row["from"], " -- ", row["to"], score_info) + }) + paste(edges_text, collapse = "\n") + }, + error = function(e) NULL + ) } # ============================================================================ @@ -452,8 +463,10 @@ process_enrichment_input <- function(x, n_pathways) { #' p.adjust = c(0.01, 0.02), #' geneID = c("TP53/BAX", "MYC/CCND1/CDK4") #' ) -#' res <- interpret(df, model = "deepseek:deepseek-chat", -#' context = "Cancer proliferation study") +#' res <- interpret(df, +#' model = "deepseek:deepseek-chat", +#' context = "Cancer proliferation study" +#' ) #' print(res) #' } interpret <- function(x, @@ -468,31 +481,31 @@ interpret <- function(x, temperature = 0.3, verbose = FALSE) { if (missing(x)) rlang::abort("Enrichment result 'x' is required.") - + old_debug <- getOption("aisdk.debug", FALSE) if (isTRUE(verbose)) { options(aisdk.debug = TRUE) on.exit(options(aisdk.debug = old_debug), add = TRUE) } - + model <- infer_model_id(model) res_list <- process_enrichment_input(x, n_pathways) - + if (length(res_list) == 0) { return(structure( list(overview = "No significant pathways found to interpret.", confidence = "None"), class = c("interpretation", "list") )) } - + results <- lapply(names(res_list), function(name) { message(sprintf("Interpreting cluster: %s", name)) item <- res_list[[name]] df <- item$df genes <- item$genes - + top_genes_text <- .get_top_genes_text(genes, gene_fold_change) - + if (nrow(df) == 0 && is.null(top_genes_text)) { res <- list( cluster = name, @@ -502,21 +515,21 @@ interpret <- 
function(x, class(res) <- c("interpretation", "list") return(res) } - + pathway_text <- if (nrow(df) > 0) .format_pathway_text(df) else "No significant enriched pathways found." - + current_prior <- .resolve_prior(prior, name) ppi_text <- .get_ppi_if_requested(add_ppi, df, genes, x) fc_text <- .get_fc_text(gene_fold_change, df, genes) - + user_prompt <- .build_data_sections( pathway_text, context, ppi_text, fc_text, top_genes_text ) - + if (!is.null(current_prior) && nzchar(current_prior)) { user_prompt <- paste0(user_prompt, "\n\nPreliminary Annotation:\n", current_prior) } - + res <- .call_generate_object( model = model, task = task, @@ -526,13 +539,13 @@ interpret <- function(x, max_tokens = max_tokens, temperature = temperature ) - + res$cluster <- name .postprocess_network(res) }) - + names(results) <- names(res_list) - + if (length(results) == 1 && names(results)[1] == "Default") { return(results[[1]]) } @@ -561,7 +574,7 @@ interpret <- function(x, #' @param model The LLM model in `provider:model` format. #' @param add_ppi Logical, whether to query PPI data. Default FALSE. #' @param gene_fold_change Named numeric vector of log fold changes. -#' @param max_tokens Maximum tokens per agent call. Default 4096. +#' @param max_tokens Maximum tokens per agent call. Default 8192. #' @param temperature Sampling temperature. Default 0.3. 
#' @return An `interpretation` object with deep analysis fields plus #' regulatory_drivers, refined_network, and network_evidence from the @@ -569,8 +582,10 @@ interpret <- function(x, #' @export #' @examples #' \dontrun{ -#' res <- interpret_agent(df, model = "openai:gpt-4o", -#' context = "scRNA-seq of mouse MI day 3") +#' res <- interpret_agent(df, +#' model = "openai:gpt-4o", +#' context = "scRNA-seq of mouse MI day 3" +#' ) #' print(res) #' } interpret_agent <- function(x, @@ -579,33 +594,33 @@ interpret_agent <- function(x, model = "deepseek:deepseek-chat", add_ppi = FALSE, gene_fold_change = NULL, - max_tokens = 4096, + max_tokens = 8192, temperature = 0.3, verbose = FALSE) { if (missing(x)) rlang::abort("Enrichment result 'x' is required.") - + old_debug <- getOption("aisdk.debug", FALSE) if (isTRUE(verbose)) { options(aisdk.debug = TRUE) on.exit(options(aisdk.debug = old_debug), add = TRUE) } - + model <- infer_model_id(model) res_list <- process_enrichment_input(x, n_pathways) - + if (length(res_list) == 0) { return(structure( list(overview = "No significant pathways found to interpret."), class = c("interpretation", "list") )) } - + results <- lapply(names(res_list), function(name) { item <- res_list[[name]] df <- item$df original_genes <- item$genes fallback_mode <- FALSE - + if (nrow(df) == 0) { if (!is.null(original_genes) && length(original_genes) > 0) { fallback_mode <- TRUE @@ -622,12 +637,12 @@ interpret_agent <- function(x, } else { pathway_text <- .format_pathway_text(df) } - + ppi_text <- .get_ppi_if_requested(add_ppi, df, original_genes, x, fallback_mode) fc_text <- .get_fc_text(gene_fold_change, df, original_genes, fallback_mode) - + session <- ChatSession$new(model = model) - + # --- Agent 1: Cleaner --- cleaned_pathways <- pathway_text if (!fallback_mode) { @@ -645,25 +660,28 @@ interpret_agent <- function(x, ), model = model ) - + cleaner_prompt <- paste0( if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", "Raw Enriched 
Pathways:\n", pathway_text ) - - cleaner_res <- tryCatch({ - gen <- generate_object( - model = model, prompt = cleaner_prompt, - schema = .cleaner_schema(), schema_name = "cleaner_result", - system = cleaner$system_prompt, - temperature = temperature, max_tokens = max_tokens - ) - gen$object - }, error = function(e) { - rlang::warn(paste0("Agent Cleaner failed: ", e$message, ". Using unfiltered pathways.")) - NULL - }) - + + cleaner_res <- tryCatch( + { + gen <- generate_object( + model = model, prompt = cleaner_prompt, + schema = .cleaner_schema(), schema_name = "cleaner_result", + system = cleaner$system_prompt, + temperature = temperature, max_tokens = max_tokens + ) + gen$object + }, + error = function(e) { + rlang::warn(paste0("Agent Cleaner failed: ", e$message, ". Using unfiltered pathways.")) + NULL + } + ) + if (!is.null(cleaner_res) && !is.null(cleaner_res$kept_pathways)) { cleaned_pathways <- paste( "Selected Relevant Pathways (filtered by Agent Cleaner):", @@ -674,7 +692,7 @@ interpret_agent <- function(x, session$set_memory("cleaner_result", cleaner_res) } } - + # --- Agent 2: Detective --- message(sprintf("Processing cluster '%s' with Agent 2: The Detective...", name)) detective <- Agent$new( @@ -690,7 +708,7 @@ interpret_agent <- function(x, ), model = model ) - + detective_prompt <- paste0( if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", if (fallback_mode) "WARNING: No significant enriched pathways found. 
Analyzing RAW GENE LISTS.\n\n" else "", @@ -698,28 +716,31 @@ interpret_agent <- function(x, if (!is.null(ppi_text)) paste0("\n\nPPI Network Evidence:\n", ppi_text) else "", if (!is.null(fc_text)) paste0("\n\nGene Fold Changes:\n", fc_text) else "" ) - - detective_res <- tryCatch({ - gen <- generate_object( - model = model, prompt = detective_prompt, - schema = .detective_schema(), schema_name = "detective_result", - system = detective$system_prompt, - temperature = temperature, max_tokens = max_tokens - ) - gen$object - }, error = function(e) { - rlang::warn(paste0("Agent Detective failed: ", e$message)) - NULL - }) - + + detective_res <- tryCatch( + { + gen <- generate_object( + model = model, prompt = detective_prompt, + schema = .detective_schema(), schema_name = "detective_result", + system = detective$system_prompt, + temperature = temperature, max_tokens = max_tokens + ) + gen$object + }, + error = function(e) { + rlang::warn(paste0("Agent Detective failed: ", e$message)) + NULL + } + ) + if (!is.null(detective_res)) { session$set_memory("detective_result", detective_res) } - + # --- Agent 3: Synthesizer --- message(sprintf("Processing cluster '%s' with Agent 3: The Storyteller...", name)) detective_text <- .format_detective_report(detective_res) - + synthesizer_prompt <- paste0( if (!is.null(context)) paste0("Context: ", context, "\n\n") else "", if (fallback_mode) "WARNING: No significant enriched pathways found. Interpret with caution.\n\n" else "", @@ -727,7 +748,7 @@ interpret_agent <- function(x, "1. Relevant Pathways:\n", cleaned_pathways, "\n\n", "2. Detective's Report (Drivers & Modules):\n", detective_text ) - + synth_system <- paste0( "You are 'Agent Storyteller', a senior scientific writer.\n", "Synthesize the findings from previous agents into a coherent biological narrative.\n\n", @@ -737,20 +758,23 @@ interpret_agent <- function(x, "3. Formulate a Hypothesis.\n", "4. Draft a Narrative paragraph for a paper." 
) - - final_res <- tryCatch({ - gen <- generate_object( - model = model, prompt = synthesizer_prompt, - schema = .interpretation_schema(), schema_name = "synthesis_result", - system = synth_system, - temperature = temperature, max_tokens = max_tokens - ) - gen$object - }, error = function(e) { - rlang::warn(paste0("Agent Synthesizer failed: ", e$message)) - list(overview = "Agent Synthesizer failed to produce structured output.", confidence = "None") - }) - + + final_res <- tryCatch( + { + gen <- generate_object( + model = model, prompt = synthesizer_prompt, + schema = .interpretation_schema(), schema_name = "synthesis_result", + system = synth_system, + temperature = temperature, max_tokens = max_tokens + ) + gen$object + }, + error = function(e) { + rlang::warn(paste0("Agent Synthesizer failed: ", e$message)) + list(overview = "Agent Synthesizer failed to produce structured output.", confidence = "None") + } + ) + if (is.list(final_res)) { final_res$cluster <- name if (fallback_mode) final_res$data_source <- "gene_list_only" @@ -760,13 +784,13 @@ interpret_agent <- function(x, final_res$network_evidence <- detective_res$network_evidence } } - + .postprocess_network(final_res) }) - + results <- Filter(Negate(is.null), results) names(results) <- vapply(results, function(r) r$cluster %||% "Unknown", character(1)) - + if (length(results) == 1 && names(results)[1] == "Default") { return(results[[1]]) } @@ -788,7 +812,7 @@ interpret_agent <- function(x, #' @param mapping A named vector mapping sub-cluster IDs to major cluster IDs. #' @param model The LLM model in `provider:model` format. #' @param task Task type, default "cell_type". -#' @param max_tokens Maximum tokens. Default 4096. +#' @param max_tokens Maximum tokens. Default 8192. #' @param temperature Sampling temperature. Default 0.3. #' @return An `interpretation_list` object. 
#' @export @@ -797,17 +821,18 @@ interpret_hierarchical <- function(x_minor, mapping, model = "deepseek:deepseek-chat", task = "cell_type", - max_tokens = 4096, + max_tokens = 8192, temperature = 0.3) { message("Step 1: Interpreting Major Clusters to establish lineage context...") res_major <- interpret( - x_major, context = NULL, model = model, task = "cell_type", + x_major, + context = NULL, model = model, task = "cell_type", max_tokens = max_tokens, temperature = temperature ) - + message("Step 2: Interpreting Sub-clusters using hierarchical constraints...") res_list_minor <- process_enrichment_input(x_minor, n_pathways = 20) - + results <- lapply(names(res_list_minor), function(name) { specific_context <- NULL if (name %in% names(mapping)) { @@ -821,19 +846,20 @@ interpret_hierarchical <- function(x_minor, ) } } - + if (is.null(specific_context)) { rlang::warn(paste("No major lineage context found for sub-cluster:", name)) } - + res <- interpret( - res_list_minor[[name]], context = specific_context, model = model, + res_list_minor[[name]], + context = specific_context, model = model, task = task, max_tokens = max_tokens, temperature = temperature ) if (is.list(res)) res$cluster <- name res }) - + names(results) <- names(res_list_minor) class(results) <- c("interpretation_list", "list") results @@ -845,7 +871,7 @@ interpret_hierarchical <- function(x_minor, .diagnose_failure <- function(finish_reason, raw_text, max_tokens) { raw_len <- nchar(raw_text %||% "") - + if (finish_reason == "length" && raw_len == 0) { return(list( message = paste0( @@ -861,7 +887,7 @@ interpret_hierarchical <- function(x_minor, ) )) } - + if (finish_reason == "length" && raw_len > 0) { return(list( message = paste0( @@ -874,7 +900,7 @@ interpret_hierarchical <- function(x_minor, fallback_overview = raw_text )) } - + if (raw_len > 0) { return(list( message = "Model returned text but it could not be parsed as valid JSON.", @@ -882,7 +908,7 @@ interpret_hierarchical <- function(x_minor, 
fallback_overview = raw_text )) } - + list( message = "Model returned an empty response.", suggestion = "Try a different model, or check that your API key has access to this model.", @@ -891,7 +917,9 @@ interpret_hierarchical <- function(x_minor, } .get_top_genes_text <- function(genes, gene_fold_change) { - if (is.null(genes) || length(genes) == 0) return(NULL) + if (is.null(genes) || length(genes) == 0) { + return(NULL) + } if (!is.null(gene_fold_change)) { common <- intersect(genes, names(gene_fold_change)) if (length(common) > 0) { @@ -904,14 +932,22 @@ interpret_hierarchical <- function(x_minor, } .resolve_prior <- function(prior, name) { - if (is.null(prior)) return(NULL) - if (length(prior) == 1 && is.null(names(prior))) return(prior) - if (name %in% names(prior)) return(prior[[name]]) + if (is.null(prior)) { + return(NULL) + } + if (length(prior) == 1 && is.null(names(prior))) { + return(prior) + } + if (name %in% names(prior)) { + return(prior[[name]]) + } NULL } .get_ppi_if_requested <- function(add_ppi, df, genes, x, fallback_mode = FALSE) { - if (!add_ppi) return(NULL) + if (!add_ppi) { + return(NULL) + } if (fallback_mode) { all_genes <- genes } else { @@ -922,7 +958,9 @@ interpret_hierarchical <- function(x_minor, } .get_fc_text <- function(gene_fold_change, df, genes, fallback_mode = FALSE) { - if (is.null(gene_fold_change)) return(NULL) + if (is.null(gene_fold_change)) { + return(NULL) + } if (fallback_mode) { all_genes <- genes } else { @@ -930,7 +968,9 @@ interpret_hierarchical <- function(x_minor, if (length(all_genes) == 0 && !is.null(genes)) all_genes <- genes } common_genes <- intersect(all_genes, names(gene_fold_change)) - if (length(common_genes) == 0) return(NULL) + if (length(common_genes) == 0) { + return(NULL) + } fc_subset <- gene_fold_change[common_genes] fc_subset <- fc_subset[order(abs(fc_subset), decreasing = TRUE)] top_fc <- utils::head(fc_subset, 20) @@ -938,7 +978,7 @@ interpret_hierarchical <- function(x_minor, } 
.call_generate_object <- function(model, task, cluster_id, user_prompt, - has_prior = FALSE, max_tokens = 4096, + has_prior = FALSE, max_tokens = 8192, temperature = 0.3) { if (task %in% c("annotation", "cell_type")) { sys <- .annotation_system_prompt(cluster_id, has_prior) @@ -953,95 +993,112 @@ interpret_hierarchical <- function(x_minor, schema <- .interpretation_schema() schema_name <- "interpretation_result" } - + debug <- isTRUE(getOption("aisdk.debug", FALSE)) - + if (debug) { - message("[DEBUG] .call_generate_object: model=", model, - " task=", task, " cluster=", cluster_id, " max_tokens=", max_tokens) - } - - result <- tryCatch({ - gen <- generate_object( - model = model, prompt = user_prompt, schema = schema, - schema_name = schema_name, system = sys, - temperature = temperature, max_tokens = max_tokens + message( + "[DEBUG] .call_generate_object: model=", model, + " task=", task, " cluster=", cluster_id, " max_tokens=", max_tokens ) - - if (debug) { - message("[DEBUG] generate_object result:") - message("[DEBUG] finish_reason: ", gen$finish_reason %||% "NULL") - message("[DEBUG] raw_text length: ", nchar(gen$raw_text %||% ""), " chars") - message("[DEBUG] object is NULL: ", is.null(gen$object)) - if (!is.null(gen$usage)) { - message("[DEBUG] usage: prompt=", gen$usage$prompt_tokens, - " completion=", gen$usage$completion_tokens, - " total=", gen$usage$total_tokens) + } + + result <- tryCatch( + { + gen <- generate_object( + model = model, prompt = user_prompt, schema = schema, + schema_name = schema_name, system = sys, + temperature = temperature, max_tokens = max_tokens + ) + + if (debug) { + message("[DEBUG] generate_object result:") + message("[DEBUG] finish_reason: ", gen$finish_reason %||% "NULL") + message("[DEBUG] raw_text length: ", nchar(gen$raw_text %||% ""), " chars") + message("[DEBUG] object is NULL: ", is.null(gen$object)) + if (!is.null(gen$usage)) { + message( + "[DEBUG] usage: prompt=", gen$usage$prompt_tokens, + " completion=", 
gen$usage$completion_tokens, + " total=", gen$usage$total_tokens + ) + } + if (!is.null(gen$raw_text) && nchar(gen$raw_text) > 0) { + preview <- substr(gen$raw_text, 1, min(500, nchar(gen$raw_text))) + message( + "[DEBUG] raw_text preview:\n", preview, + if (nchar(gen$raw_text) > 500) "\n... [truncated]" else "" + ) + } } - if (!is.null(gen$raw_text) && nchar(gen$raw_text) > 0) { - preview <- substr(gen$raw_text, 1, min(500, nchar(gen$raw_text))) - message("[DEBUG] raw_text preview:\n", preview, - if (nchar(gen$raw_text) > 500) "\n... [truncated]" else "") + + if (!is.null(gen$object)) { + obj <- gen$object + class(obj) <- c("interpretation", class(obj)) + obj + } else { + raw <- gen$raw_text %||% "" + finish <- gen$finish_reason %||% "unknown" + raw_preview <- if (nchar(raw) > 200) paste0(substr(raw, 1, 200), "...") else raw + + diagnosis <- .diagnose_failure(finish, raw, max_tokens) + + warn_parts <- c( + paste0("generate_object() returned NULL for cluster '", cluster_id, "'."), + "i" = paste0("Model: ", model), + "i" = paste0("finish_reason: ", finish), + "i" = paste0( + "raw_text (", nchar(raw), " chars): ", + if (nzchar(raw_preview)) raw_preview else "" + ) + ) + warn_parts <- c(warn_parts, "!" 
= diagnosis$message) + if (!is.null(diagnosis$suggestion)) { + warn_parts <- c(warn_parts, ">" = diagnosis$suggestion) + } + rlang::warn(warn_parts) + + fallback_text <- if (nzchar(raw)) raw else diagnosis$fallback_overview + list( + overview = fallback_text, + confidence = "Low", + cluster = cluster_id + ) } - } - - if (!is.null(gen$object)) { - obj <- gen$object - class(obj) <- c("interpretation", class(obj)) - obj - } else { - raw <- gen$raw_text %||% "" - finish <- gen$finish_reason %||% "unknown" - raw_preview <- if (nchar(raw) > 200) paste0(substr(raw, 1, 200), "...") else raw - - diagnosis <- .diagnose_failure(finish, raw, max_tokens) - - warn_parts <- c( - paste0("generate_object() returned NULL for cluster '", cluster_id, "'."), + }, + error = function(e) { + rlang::warn(c( + paste0("LLM call failed for cluster '", cluster_id, "': ", e$message), "i" = paste0("Model: ", model), - "i" = paste0("finish_reason: ", finish), - "i" = paste0("raw_text (", nchar(raw), " chars): ", - if (nzchar(raw_preview)) raw_preview else "") - ) - warn_parts <- c(warn_parts, "!" 
= diagnosis$message) - if (!is.null(diagnosis$suggestion)) { - warn_parts <- c(warn_parts, ">" = diagnosis$suggestion) - } - rlang::warn(warn_parts) - - fallback_text <- if (nzchar(raw)) raw else diagnosis$fallback_overview - list( - overview = fallback_text, - confidence = "Low", + "i" = "Tip: Re-run with verbose=TRUE for full debug output" + )) + res <- list( + overview = paste0("LLM call failed: ", e$message), + confidence = "None", cluster = cluster_id ) + class(res) <- c("interpretation", "list") + res } - }, error = function(e) { - rlang::warn(c( - paste0("LLM call failed for cluster '", cluster_id, "': ", e$message), - "i" = paste0("Model: ", model), - "i" = "Tip: Re-run with verbose=TRUE for full debug output" - )) - res <- list( - overview = paste0("LLM call failed: ", e$message), - confidence = "None", - cluster = cluster_id - ) - class(res) <- c("interpretation", "list") - res - }) - + ) + result } .format_detective_report <- function(detective_res) { - if (is.null(detective_res) || !is.list(detective_res)) return("No detective report available.") + if (is.null(detective_res) || !is.list(detective_res)) { + return("No detective report available.") + } key_drivers <- if (!is.null(detective_res$key_drivers)) { paste(detective_res$key_drivers, collapse = ", ") - } else "None identified" + } else { + "None identified" + } modules <- if (!is.null(detective_res$functional_modules)) { paste(detective_res$functional_modules, collapse = ", ") - } else "None identified" + } else { + "None identified" + } evidence <- detective_res$network_evidence %||% "None provided" paste0( "Key Drivers: ", key_drivers, "\n", @@ -1056,16 +1113,19 @@ interpret_hierarchical <- function(x_minor, return(res) } rn <- res$refined_network - rn_df <- tryCatch({ - if (is.data.frame(rn)) { - rn - } else if (is.list(rn) && length(rn) > 0 && all(vapply(rn, is.list, logical(1)))) { - do.call(rbind, lapply(rn, function(r) as.data.frame(r, stringsAsFactors = FALSE))) - } else { - NULL - } - }, 
error = function(e) NULL) - + rn_df <- tryCatch( + { + if (is.data.frame(rn)) { + rn + } else if (is.list(rn) && length(rn) > 0 && all(vapply(rn, is.list, logical(1)))) { + do.call(rbind, lapply(rn, function(r) as.data.frame(r, stringsAsFactors = FALSE))) + } else { + NULL + } + }, + error = function(e) NULL + ) + if (!is.null(rn_df) && nrow(rn_df) > 0) { colnames(rn_df)[colnames(rn_df) == "source"] <- "from" colnames(rn_df)[colnames(rn_df) == "target"] <- "to" @@ -1084,8 +1144,12 @@ interpret_hierarchical <- function(x_minor, return(res_major[[major_id]]) } if (inherits(res_major, "interpretation")) { - if (!is.null(res_major$cluster) && res_major$cluster == major_id) return(res_major) - if (is.null(res_major$cluster)) return(res_major) + if (!is.null(res_major$cluster) && res_major$cluster == major_id) { + return(res_major) + } + if (is.null(res_major$cluster)) { + return(res_major) + } } NULL } @@ -1111,7 +1175,7 @@ print.interpretation <- function(x, ...) { cat("\n") return(invisible(x)) } - + if (!is.null(x$phenotype)) { cat("## Phenotype Characterization\n\n") if (!is.null(x$cluster)) cat(sprintf("### Group/Cluster: %s\n\n", x$cluster)) @@ -1125,15 +1189,15 @@ print.interpretation <- function(x, ...) { cat("\n") return(invisible(x)) } - + cat("## Interpretation Result\n\n") if (!is.null(x$cluster)) cat(sprintf("### Cluster: %s\n\n", x$cluster)) - + if (!is.null(x$overview)) { cat("### 1. Overview\n") cat(x$overview, "\n\n") } - + if (!is.null(x$regulatory_drivers)) { cat("### 2. Regulatory Drivers\n") drivers <- unlist(x$regulatory_drivers) @@ -1143,7 +1207,7 @@ print.interpretation <- function(x, ...) { cat(drivers, "\n\n") } } - + if (!is.null(x$key_mechanisms)) { cat("### 3. Key Mechanisms\n") if (is.list(x$key_mechanisms) && !is.null(names(x$key_mechanisms))) { @@ -1156,8 +1220,10 @@ print.interpretation <- function(x, ...) 
{ cat("**Pathways:** ", paste(mechanism$pathways, collapse = ", "), "\n") } if (!is.null(mechanism$genes)) { - cat("**Key Genes:** ", paste(utils::head(mechanism$genes, 10), collapse = ", "), - if (length(mechanism$genes) > 10) "..." else "", "\n") + cat( + "**Key Genes:** ", paste(utils::head(mechanism$genes, 10), collapse = ", "), + if (length(mechanism$genes) > 10) "..." else "", "\n" + ) } } else { cat(mechanism, "\n") @@ -1168,12 +1234,12 @@ print.interpretation <- function(x, ...) { cat(as.character(x$key_mechanisms), "\n\n") } } - + if (!is.null(x$crosstalk)) { cat("### 4. Crosstalk & Interactions\n") cat(x$crosstalk, "\n\n") } - + if (!is.null(x$hypothesis)) { cat("### 5. Hypothesis\n") if (is.list(x$hypothesis)) { @@ -1183,12 +1249,12 @@ print.interpretation <- function(x, ...) { cat(x$hypothesis, "\n\n") } } - + if (!is.null(x$narrative)) { cat("### 6. Narrative Draft\n") cat(x$narrative, "\n\n") } - + if (!is.null(x$network) && requireNamespace("igraph", quietly = TRUE) && inherits(x$network, "igraph")) { cat("### 7. Refined Regulatory Network\n") el <- igraph::as_data_frame(x$network, what = "edges") @@ -1206,7 +1272,7 @@ print.interpretation <- function(x, ...) { cat(x$network_evidence, "\n\n") } } - + invisible(x) } diff --git a/man/infer_model_id.Rd b/man/infer_model_id.Rd new file mode 100644 index 0000000..a3078c8 --- /dev/null +++ b/man/infer_model_id.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interpret.R +\name{infer_model_id} +\alias{infer_model_id} +\title{Infer Model ID} +\usage{ +infer_model_id(model) +} +\arguments{ +\item{model}{A model string, either bare (e.g., "deepseek-chat") or fully +qualified (e.g., "deepseek:deepseek-chat").} +} +\value{ +A string in `provider:model` format. +} +\description{ +Maps bare model names to the aisdk `provider:model` format for backward +compatibility. Emits a warning when guessing and suggests the explicit form. 
+If the model already contains a colon, it is returned as-is. +} +\keyword{internal} diff --git a/man/interpret.Rd b/man/interpret.Rd index 549edb4..3546f2e 100644 --- a/man/interpret.Rd +++ b/man/interpret.Rd @@ -2,53 +2,88 @@ % Please edit documentation in R/interpret.R \name{interpret} \alias{interpret} -\title{interpret} +\title{Interpret Enrichment Results Using LLMs} \usage{ interpret( x, context = NULL, n_pathways = 20, - model = "deepseek-chat", - provider = NULL, - api_key = NULL, + model = "deepseek:deepseek-chat", task = "interpretation", prior = NULL, add_ppi = FALSE, - gene_fold_change = NULL + gene_fold_change = NULL, + max_tokens = 8192, + temperature = 0.3, + verbose = FALSE ) } \arguments{ -\item{x}{An enrichment result object (e.g., `enrichResult` or `gseaResult`).} +\item{x}{An enrichment result object (`enrichResult`, `gseaResult`, +`compareClusterResult`, or a `data.frame` with pathway columns).} -\item{context}{A string describing the experimental background (e.g., "scRNA-seq of mouse myocardial infarction at day 3").} +\item{context}{A string describing the experimental background +(e.g., "scRNA-seq of mouse myocardial infarction at day 3").} -\item{n_pathways}{Number of top significant pathways to include in the analysis. Default is 20.} +\item{n_pathways}{Number of top significant pathways to include. Default 20.} -\item{model}{The LLM model to use. Default is "deepseek-chat". Supported models include "deepseek-chat", "glm-4", "qwen-turbo" etc.} +\item{model}{The LLM model in `provider:model` format +(e.g., "deepseek:deepseek-chat", "gemini:gemini-2.5-flash"). +Bare model names are supported with a warning (e.g., "deepseek-chat").} -\item{provider}{The LLM provider. Default is NULL (inferred from model or handled by aisdk).} +\item{task}{Task type: "interpretation" (default), "cell_type"/"annotation", +or "phenotype"/"phenotyping".} -\item{api_key}{The API key for the LLM. 
If NULL, it tries to fetch from `getOption('yulab_translate')` based on the model.} +\item{prior}{Optional prior knowledge or preliminary annotation to guide the task.} -\item{task}{Task type, default is "interpretation". Other options include "cell_type"/"annotation" and "phenotype"/"phenotyping".} +\item{add_ppi}{Logical, whether to query STRING PPI network data. Default FALSE.} -\item{prior}{Optional prior knowledge (e.g., a biological hypothesis) to guide the task.} +\item{gene_fold_change}{Named numeric vector of log fold changes for expression context.} -\item{add_ppi}{Boolean, whether to use PPI network integration.} +\item{max_tokens}{Maximum tokens for the LLM response. Default 8192. +Some models (especially reasoning models) may need much higher values +(e.g., 16384 or more) to produce complete structured output.} -\item{gene_fold_change}{Named vector of logFC for expression context.} +\item{temperature}{Sampling temperature. Default 0.3.} + +\item{verbose}{Logical, whether to print debug messages showing raw API +responses, token usage, and JSON parsing details. Default FALSE. +Equivalent to setting `options(aisdk.debug = TRUE)` for the call.} } \value{ -A character string containing the LLM-generated interpretation. +An `interpretation` object (list) with task-specific fields. + For "interpretation": overview, key_mechanisms, hypothesis, narrative, etc. + For "annotation": cell_type, confidence, reasoning, markers, etc. + For "phenotype": phenotype, confidence, reasoning, key_processes, etc. } \description{ -Interpret enrichment results using Large Language Models (LLM) +Functions for interpreting functional enrichment analysis results using +Large Language Models. Supports single-call interpretation, multi-agent +deep analysis, and hierarchical cluster strategies. + +Built on top of aisdk's `generate_object()` for reliable structured output, +and the Agent/Session system for multi-agent pipelines. 
+ +Sends enrichment results along with optional experimental context to an LLM +to generate a structured biological interpretation, hypothesis, and narrative +suitable for a publication. } \details{ -This function sends the enrichment results (top significant pathways) along with -an optional experimental context to an LLM (e.g., DeepSeek) to generate -a biological interpretation, hypothesis, and narrative suitable for a paper. +Uses `generate_object()` internally for reliable structured output with +automatic JSON repair, eliminating manual parsing failures. +} +\examples{ +\dontrun{ +# Basic usage with a data frame +df <- data.frame( + ID = c("GO:0006915", "GO:0008284"), + Description = c("apoptotic process", "positive regulation of proliferation"), + GeneRatio = c("10/100", "20/100"), + p.adjust = c(0.01, 0.02), + geneID = c("TP53/BAX", "MYC/CCND1/CDK4") +) +res <- interpret(df, model = "deepseek:deepseek-chat", + context = "Cancer proliferation study") +print(res) } -\author{ -Guangchuang Yu } diff --git a/man/interpret_agent.Rd b/man/interpret_agent.Rd index 287f1eb..0e5b778 100644 --- a/man/interpret_agent.Rd +++ b/man/interpret_agent.Rd @@ -2,49 +2,57 @@ % Please edit documentation in R/interpret.R \name{interpret_agent} \alias{interpret_agent} -\title{interpret_agent} +\title{Interpret enrichment results using a multi-agent pipeline (Deep Mode)} \usage{ interpret_agent( x, context = NULL, n_pathways = 50, - model = "deepseek-chat", - provider = NULL, - api_key = NULL, + model = "deepseek:deepseek-chat", add_ppi = FALSE, - gene_fold_change = NULL + gene_fold_change = NULL, + max_tokens = 4096, + temperature = 0.3, + verbose = FALSE ) } \arguments{ -\item{x}{An enrichment result object (e.g., `enrichResult` or `gseaResult`).} +\item{x}{An enrichment result object.} \item{context}{A string describing the experimental background.} -\item{n_pathways}{Number of top pathways to consider initially. 
Default is 50 (Agent 1 will filter them).} +\item{n_pathways}{Number of top pathways to consider initially. Default 50.} -\item{model}{The LLM model to use.} +\item{model}{The LLM model in `provider:model` format.} -\item{provider}{The LLM provider. Default is NULL (inferred from model or handled by aisdk).} +\item{add_ppi}{Logical, whether to query PPI data. Default FALSE.} -\item{api_key}{The API key for the LLM.} +\item{gene_fold_change}{Named numeric vector of log fold changes.} -\item{add_ppi}{Boolean, whether to use PPI network integration.} +\item{max_tokens}{Maximum tokens per agent call. Default 4096.} -\item{gene_fold_change}{Named vector of logFC for expression context.} +\item{temperature}{Sampling temperature. Default 0.3.} } \value{ -A detailed interpretation list. +An `interpretation` object with deep analysis fields plus + regulatory_drivers, refined_network, and network_evidence from the + detective agent. } \description{ -Interpret enrichment results using a multi-agent system (Deep Mode) +Employs three specialized AI agents in sequence for rigorous interpretation: +\enumerate{ + \item Agent Cleaner: Filters noise and selects relevant pathways. + \item Agent Detective: Identifies key regulators and functional modules. + \item Agent Synthesizer: Produces a coherent biological narrative. +} } \details{ -This function employs a multi-agent strategy to provide a more rigorous and comprehensive -biological interpretation. It breaks down the task into three specialized agents: -1. Agent Cleaner: Filters noise and selects relevant pathways. -2. Agent Detective: Identifies key regulators and functional modules using PPI/TF data. -3. Agent Synthesizer: Synthesizes findings into a coherent narrative. -} -\author{ -Guangchuang Yu +Uses aisdk's Agent and Session system for shared context across agents. 
+} +\examples{ +\dontrun{ +res <- interpret_agent(df, model = "openai:gpt-4o", + context = "scRNA-seq of mouse MI day 3") +print(res) +} } diff --git a/man/interpret_hierarchical.Rd b/man/interpret_hierarchical.Rd index ccc3431..fff2298 100644 --- a/man/interpret_hierarchical.Rd +++ b/man/interpret_hierarchical.Rd @@ -2,37 +2,37 @@ % Please edit documentation in R/interpret.R \name{interpret_hierarchical} \alias{interpret_hierarchical} -\title{interpret_hierarchical} +\title{Interpret enrichment results using a hierarchical strategy} \usage{ interpret_hierarchical( x_minor, x_major, mapping, - model = "deepseek-chat", - provider = NULL, - api_key = NULL, - task = "cell_type" + model = "deepseek:deepseek-chat", + task = "cell_type", + max_tokens = 4096, + temperature = 0.3 ) } \arguments{ -\item{x_minor}{Enrichment result for sub-clusters (e.g., compareClusterResult or list of enrichResult).} +\item{x_minor}{Enrichment result for sub-clusters.} \item{x_major}{Enrichment result for major clusters.} -\item{mapping}{A named vector mapping sub-cluster IDs (names in x_minor) to major cluster IDs (names in x_major).} +\item{mapping}{A named vector mapping sub-cluster IDs to major cluster IDs.} -\item{model}{LLM model.} +\item{model}{The LLM model in `provider:model` format.} -\item{api_key}{API key.} +\item{task}{Task type, default "cell_type".} -\item{task}{Task type, default is "cell_type".} +\item{max_tokens}{Maximum tokens. Default 4096.} + +\item{temperature}{Sampling temperature. Default 0.3.} } \value{ -A list of interpretation results. +An `interpretation_list` object. } \description{ -Interpret enrichment results using a hierarchical strategy (Major -> Minor clusters) -} -\author{ -Guangchuang Yu +First interprets major clusters to establish lineage context, then interprets +sub-clusters with hierarchical constraints from the major cluster annotations. 
} From 641a889290e19a88d27c253b12a7e173457de68d Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 02:32:50 +0800 Subject: [PATCH 6/7] feat: Replace fanyi with aisdk for LLM-based interpretation --- DESCRIPTION | 7 +++++-- man/interpret.Rd | 6 ++++-- man/interpret_agent.Rd | 10 ++++++---- man/interpret_hierarchical.Rd | 4 ++-- man/reexports.Rd | 4 ++-- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5f1c8e7..2179954 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,12 +39,12 @@ Imports: tidyr, utils, yulab.utils (>= 0.2.3), - ggplot2 + ggplot2, + aisdk Suggests: AnnotationHub, BiocManager, DOSE, - fanyi (>= 0.1.0), ggtangle, readr, org.Hs.eg.db, @@ -60,3 +60,6 @@ biocViews: Annotation, Clustering, GeneSetEnrichment, GO, KEGG, MultipleComparison, Pathways, Reactome, Visualization Encoding: UTF-8 RoxygenNote: 7.3.3 +Remotes: + YuLab-SMU/aisdk + diff --git a/man/interpret.Rd b/man/interpret.Rd index 3546f2e..c20b39b 100644 --- a/man/interpret.Rd +++ b/man/interpret.Rd @@ -82,8 +82,10 @@ df <- data.frame( p.adjust = c(0.01, 0.02), geneID = c("TP53/BAX", "MYC/CCND1/CDK4") ) -res <- interpret(df, model = "deepseek:deepseek-chat", - context = "Cancer proliferation study") +res <- interpret(df, + model = "deepseek:deepseek-chat", + context = "Cancer proliferation study" +) print(res) } } diff --git a/man/interpret_agent.Rd b/man/interpret_agent.Rd index 0e5b778..fa2dd88 100644 --- a/man/interpret_agent.Rd +++ b/man/interpret_agent.Rd @@ -11,7 +11,7 @@ interpret_agent( model = "deepseek:deepseek-chat", add_ppi = FALSE, gene_fold_change = NULL, - max_tokens = 4096, + max_tokens = 8192, temperature = 0.3, verbose = FALSE ) @@ -29,7 +29,7 @@ interpret_agent( \item{gene_fold_change}{Named numeric vector of log fold changes.} -\item{max_tokens}{Maximum tokens per agent call. Default 4096.} +\item{max_tokens}{Maximum tokens per agent call. Default 8192.} \item{temperature}{Sampling temperature. 
Default 0.3.} } @@ -51,8 +51,10 @@ Uses aisdk's Agent and Session system for shared context across agents. } \examples{ \dontrun{ -res <- interpret_agent(df, model = "openai:gpt-4o", - context = "scRNA-seq of mouse MI day 3") +res <- interpret_agent(df, + model = "openai:gpt-4o", + context = "scRNA-seq of mouse MI day 3" +) print(res) } } diff --git a/man/interpret_hierarchical.Rd b/man/interpret_hierarchical.Rd index fff2298..d5452c3 100644 --- a/man/interpret_hierarchical.Rd +++ b/man/interpret_hierarchical.Rd @@ -10,7 +10,7 @@ interpret_hierarchical( mapping, model = "deepseek:deepseek-chat", task = "cell_type", - max_tokens = 4096, + max_tokens = 8192, temperature = 0.3 ) } @@ -25,7 +25,7 @@ interpret_hierarchical( \item{task}{Task type, default "cell_type".} -\item{max_tokens}{Maximum tokens. Default 4096.} +\item{max_tokens}{Maximum tokens. Default 8192.} \item{temperature}{Sampling temperature. Default 0.3.} } diff --git a/man/reexports.Rd b/man/reexports.Rd index d4e448a..f079145 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -39,14 +39,14 @@ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ - \item{GOSemSim}{\code{\link[GOSemSim]{buildGOmap}}, \code{\link[GOSemSim]{get_organism}}, \code{\link[GOSemSim]{read.blast2go}}, \code{\link[GOSemSim:read-gaf]{read.gaf}}} - \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr:context]{n}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{select}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr]{summarise}}} \item{enrichit}{\code{\link[enrichit]{geneID}}, \code{\link[enrichit]{geneInCategory}}, \code{\link[enrichit]{gsfilter}}, \code{\link[enrichit]{setReadable}}} \item{enrichplot}{\code{\link[enrichplot:reexports]{cnetplot}}, \code{\link[enrichplot]{dotplot}}, \code{\link[enrichplot]{emapplot}}, \code{\link[enrichplot]{goplot}}, \code{\link[enrichplot]{gseaplot}}, \code{\link[enrichplot]{heatplot}}, \code{\link[enrichplot]{ridgeplot}}} + \item{GOSemSim}{\code{\link[GOSemSim]{buildGOmap}}, \code{\link[GOSemSim]{get_organism}}, \code{\link[GOSemSim]{read.blast2go}}, \code{\link[GOSemSim:read-gaf]{read.gaf}}} + \item{gson}{\code{\link[gson:read-gmt]{read.gmt}}, \code{\link[gson:read-gmt]{read.gmt.wp}}} \item{magrittr}{\code{\link[magrittr:compound]{\%<>\%}}, \code{\link[magrittr:pipe]{\%>\%}}} From 47160ec2947333951d5e7391a143e87cbe060c2f Mon Sep 17 00:00:00 2001 From: Young Sherlock Date: Sun, 1 Mar 2026 02:51:17 +0800 Subject: [PATCH 7/7] fix: address R CMD check warnings and notes (undocumented args, missing imports) --- NAMESPACE | 3 ++ R/interpret.R | 3 +- R/kegg-utilities.R | 63 +++++++++++++++++++++++------------------- man/interpret_agent.Rd | 2 ++ 4 files changed, 41 insertions(+), 30 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 29e974d..0a7f3db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -129,6 +129,8 @@ importFrom(GOSemSim,godata) importFrom(GOSemSim,mgoSim) importFrom(GOSemSim,read.blast2go) importFrom(GOSemSim,read.gaf) +importFrom(aisdk,Agent) +importFrom(aisdk,ChatSession) 
importFrom(aisdk,generate_object) importFrom(aisdk,z_array) importFrom(aisdk,z_enum) @@ -178,6 +180,7 @@ importFrom(rlang,check_installed) importFrom(rlang,quos) importFrom(rlang,sym) importFrom(stats,formula) +importFrom(stats,na.omit) importFrom(stats,setNames) importFrom(tidyr,gather) importFrom(tidyr,separate) diff --git a/R/interpret.R b/R/interpret.R index 96d6fd0..e70c7ca 100644 --- a/R/interpret.R +++ b/R/interpret.R @@ -12,7 +12,7 @@ NULL # ============================================================================ # Schema Definitions # ============================================================================ -#' @importFrom aisdk generate_object z_object z_string z_array z_enum +#' @importFrom aisdk generate_object z_object z_string z_array z_enum ChatSession Agent .interpretation_schema <- function() { z_object( overview = z_string("High-level summary of the key biological processes identified"), @@ -576,6 +576,7 @@ interpret <- function(x, #' @param gene_fold_change Named numeric vector of log fold changes. #' @param max_tokens Maximum tokens per agent call. Default 8192. #' @param temperature Sampling temperature. Default 0.3. +#' @param verbose Logical, whether to print debug messages. Default FALSE. #' @return An `interpretation` object with deep analysis fields plus #' regulatory_drivers, refined_network, and network_evidence from the #' detective agent. 
diff --git a/R/kegg-utilities.R b/R/kegg-utilities.R index 14f030c..ae886e0 100644 --- a/R/kegg-utilities.R +++ b/R/kegg-utilities.R @@ -1,6 +1,6 @@ #' add KEGG pathway category information #' -#' This function appends the KEGG pathway category information to KEGG enrichment result +#' This function appends the KEGG pathway category information to KEGG enrichment result #' (either output of 'enrichKEGG' or 'gseKEGG' #' @title append_kegg_category #' @param x KEGG enrichment result @@ -41,7 +41,7 @@ append_kegg_category <- function(x) { #' @export #' @author Guangchuang Yu browseKEGG <- function(x, pathID) { - url <- paste0("https://www.kegg.jp/kegg-bin/show_pathway?", pathID, '/', x[pathID, "geneID"]) + url <- paste0("https://www.kegg.jp/kegg-bin/show_pathway?", pathID, "/", x[pathID, "geneID"]) browseURL(url) invisible(url) } @@ -57,11 +57,11 @@ browseKEGG <- function(x, pathID) { #' @return data.frame #' @export #' @author Guangchuang Yu -search_kegg_organism <- function(str, by="scientific_name", ignore.case=FALSE, +search_kegg_organism <- function(str, by = "scientific_name", ignore.case = FALSE, use_internal_data = TRUE) { if (use_internal_data) { by <- match.arg(by, c("kegg_code", "scientific_name", "common_name")) - kegg_species <- kegg_species_data() + kegg_species <- kegg_species_data() # Message <- paste("You are using the internal data. 
", # "If you want to use the latest data", # "and your internet speed is fast enough, ", @@ -71,7 +71,7 @@ search_kegg_organism <- function(str, by="scientific_name", ignore.case=FALSE, kegg_species <- get_kegg_species() } idx <- grep(str, kegg_species[, by], ignore.case = ignore.case) - kegg_species[idx,] + kegg_species[idx, ] } @@ -90,10 +90,12 @@ get_cached_kegg_data <- function(type = "category") { type <- match.arg(type, c("category", "species")) basefile <- sprintf("kegg_%s", type) file <- sprintf("%s.rda", basefile) - urls <- c("https://yulab-smu.top/clusterProfiler", - "https://raw.githubusercontent.com/YuLab-SMU/clusterProfiler/gh-pages") - - d <- download_yulab_file(file, urls, gzfile=FALSE, appname="clusterProfiler") + urls <- c( + "https://yulab-smu.top/clusterProfiler", + "https://raw.githubusercontent.com/YuLab-SMU/clusterProfiler/gh-pages" + ) + + d <- download_yulab_file(file, urls, gzfile = FALSE, appname = "clusterProfiler") load(d, envir = get_cache()) get_cache_item(basefile) } @@ -102,21 +104,23 @@ get_kegg_species <- function(save = FALSE) { url <- "https://rest.kegg.jp/list/organism" species <- read.table(url, fill = TRUE, sep = "\t", header = F, quote = "") species <- species[, -1] - scientific_name <- gsub(" \\(.*", "", species[,2]) - common_name <- gsub(".*\\(", "", species[,2]) + scientific_name <- gsub(" \\(.*", "", species[, 2]) + common_name <- gsub(".*\\(", "", species[, 2]) common_name <- gsub("\\)", "", common_name) - kegg_species <- data.frame(kegg_code = species[, 1], - scientific_name = scientific_name, - common_name = common_name) - - file <- 'kegg_species.rda' - if (dir.exists('data')) file <- paste0('data/', file) + kegg_species <- data.frame( + kegg_code = species[, 1], + scientific_name = scientific_name, + common_name = common_name + ) + + file <- "kegg_species.rda" + if (dir.exists("data")) file <- paste0("data/", file) if (save) { message(sprintf("--> Number of species %s", nrow(kegg_species))) message(sprintf("--> Save 
to %s\n", file)) - save(kegg_species, file=file) + save(kegg_species, file = file) } - invisible(kegg_species) + invisible(kegg_species) } @@ -172,7 +176,7 @@ kegg_rest <- function(rest_url) { # f <- tempfile() # dl <- mydownload(rest_url, destfile = f) - # + # # if (is.null(dl)) { # message("fail to download KEGG data...") # return(NULL) @@ -181,9 +185,11 @@ kegg_rest <- function(rest_url) { # content <- readLines(f) content <- yread(rest_url) - content %<>% strsplit(., "\t") %>% do.call('rbind', .) - res <- data.frame(from=content[,1], - to=content[,2]) + content %<>% strsplit(., "\t") %>% do.call("rbind", .) + res <- data.frame( + from = content[, 1], + to = content[, 2] + ) return(res) } @@ -191,19 +197,19 @@ kegg_rest <- function(rest_url) { ## https://www.genome.jp/kegg/rest/keggapi.html ## kegg_link('hsa', 'pathway') kegg_link <- function(target_db, source_db) { - url <- paste0("https://rest.kegg.jp/link/", target_db, "/", source_db, collapse="") + url <- paste0("https://rest.kegg.jp/link/", target_db, "/", source_db, collapse = "") kegg_rest(url) } kegg_list <- function(db, species = NULL) { if (db == "pathway") { - url <- paste("https://rest.kegg.jp/list", db, species, sep="/") + url <- paste("https://rest.kegg.jp/list", db, species, sep = "/") } else { ## module do not need species - url <- paste("https://rest.kegg.jp/list", db, sep="/") + url <- paste("https://rest.kegg.jp/list", db, sep = "/") } - + kegg_rest(url) } @@ -213,6 +219,7 @@ kegg_list <- function(db, species = NULL) { #' @title ko2name #' @param ko ko ID #' @return data.frame +#' @importFrom stats na.omit #' @export #' @author guangchuang yu ko2name <- function(ko) { @@ -236,5 +243,3 @@ ko2name <- function(ko) { }) do.call(rbind, res) } - - diff --git a/man/interpret_agent.Rd b/man/interpret_agent.Rd index fa2dd88..f68c094 100644 --- a/man/interpret_agent.Rd +++ b/man/interpret_agent.Rd @@ -32,6 +32,8 @@ interpret_agent( \item{max_tokens}{Maximum tokens per agent call. 
Default 8192.} \item{temperature}{Sampling temperature. Default 0.3.} + +\item{verbose}{Logical, whether to print debug messages. Default FALSE.} } \value{ An `interpretation` object with deep analysis fields plus