diff --git a/DESCRIPTION b/DESCRIPTION index ea5d92de..80583ae0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: serodynamics Title: What the Package Does (One Line, Title Case) -Version: 0.0.0.9047 +Version: 0.0.0.9048 Authors@R: c( person("Peter", "Teunis", , "p.teunis@emory.edu", role = c("aut", "cph"), comment = "Author of the method and original code."), @@ -22,6 +22,7 @@ Imports: dplyr, ggmcmc (>= 1.5.1.2), ggplot2, + MASS, purrr, rlang, runjags, diff --git a/NAMESPACE b/NAMESPACE index 2db4d027..43c7eb27 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,case_data) +S3method(print,prior_predictive_summary) export(as_case_data) export(autoplot) export(expect_snapshot_data) @@ -13,6 +14,7 @@ export(plot_jags_dens) export(plot_jags_effect) export(plot_jags_trace) export(plot_predicted_curve) +export(plot_prior_predictive) export(post_summ) export(postprocess_jags_output) export(prep_data) @@ -21,6 +23,8 @@ export(run_mod) export(serodynamics_example) export(sim_case_data) export(sim_n_obs) +export(simulate_prior_predictive) +export(summarize_prior_predictive) importFrom(dplyr,.data) importFrom(dplyr,all_of) importFrom(dplyr,filter) @@ -45,6 +49,10 @@ importFrom(serocalculator,get_values) importFrom(serocalculator,ids) importFrom(serocalculator,ids_varname) importFrom(stats,complete.cases) +importFrom(stats,median) importFrom(stats,quantile) +importFrom(stats,rWishart) +importFrom(stats,rgamma) +importFrom(stats,rnorm) importFrom(tidyr,pivot_wider) importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 89199522..571ba9f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # serodynamics (development version) +* Added prior predictive check utilities for Bayesian workflow validation: + - `simulate_prior_predictive()` generates antibody trajectories from priors before model fitting + - `summarize_prior_predictive()` provides diagnostic summaries identifying invalid values and scale mismatches + - `plot_prior_predictive()` visualizes prior predictive trajectories with optional observed data overlay + - New vignette article demonstrating prior predictive checks as recommended workflow step + * Added dev container configuration for persistent, cached development environment that includes R, JAGS, and all dependencies preinstalled, making Copilot Workspace sessions much faster. diff --git a/R/plot_prior_predictive.R b/R/plot_prior_predictive.R new file mode 100644 index 00000000..de05b189 --- /dev/null +++ b/R/plot_prior_predictive.R @@ -0,0 +1,210 @@ +#' Plot prior predictive check trajectories +#' +#' @description +#' Visualizes antibody trajectories simulated from priors to assess whether +#' prior distributions generate realistic curves for the study context. +#' +#' @details +#' Creates plots showing: +#' - Simulated antibody trajectories over time +#' - Separate panels for each biomarker (faceted) +#' - Optional overlay of observed data for comparison +#' - Multiple trajectories (if multiple simulations provided) +#' +#' The plot uses log-scale antibody values by default (matching the model), +#' but can optionally show natural scale. +#' +#' @param sim_data A simulated `prepped_jags_data` object from +#' [simulate_prior_predictive()], or a [list] of such objects +#' @param original_data Optional original `prepped_jags_data` object from +#' [prep_data()] to overlay observed data +#' @param log_scale [logical] Whether to plot on log scale (default = TRUE) +#' @param max_traj [integer] Maximum number of trajectories to plot per subject +#' (default = 100). Useful when `sim_data` contains many simulations. +#' @param show_points [logical] Whether to show individual observation points +#' (default = TRUE) +#' @param alpha [numeric] Transparency for trajectory lines (default = 0.3) +#' +#' @returns A [ggplot2::ggplot()] object +#' +#' @export +#' @examples +#' # Prepare data and priors +#' set.seed(1) +#' raw_data <- serocalculator::typhoid_curves_nostrat_100 |> +#' sim_case_data(n = 5) +#' prepped_data <- prep_data(raw_data) +#' prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) +#' +#' # Simulate and plot +#' sim_data <- simulate_prior_predictive( +#' prepped_data, prepped_priors, n_sims = 20 +#' ) +#' plot_prior_predictive(sim_data, original_data = prepped_data) +plot_prior_predictive <- function(sim_data, + original_data = NULL, + log_scale = TRUE, + max_traj = 100, + show_points = TRUE, + alpha = 0.3) { + # Convert to list if single simulation + if (inherits(sim_data, "prepped_jags_data")) { + sim_list <- list(sim_data) + } else { + sim_list <- sim_data + } + + # Limit number of trajectories to plot + if (length(sim_list) > max_traj) { + sim_list <- sim_list[seq_len(max_traj)] + cli::cli_inform( + "Plotting {max_traj} of {length(sim_data)} simulations for clarity" + ) + } + + # Extract antigens from first simulation + antigens <- attr(sim_list[[1]], "antigens") + subject_ids <- attr(sim_list[[1]], "ids") + + # Convert simulated data to long format for plotting + sim_plot_data <- do.call( + rbind, + lapply(seq_along(sim_list), function(sim_idx) { + sim <- sim_list[[sim_idx]] + + do.call(rbind, lapply(seq_len(sim$nsubj), function(subj) { + do.call(rbind, lapply(seq_len(sim$nsmpl[subj]), function(obs) { + do.call(rbind, lapply(seq_along(antigens), function(k) { + data.frame( + subject = subject_ids[subj], + time = sim$smpl.t[subj, obs], + logy = sim$logy[subj, obs, k], + biomarker = antigens[k], + sim_id = sim_idx, + stringsAsFactors = FALSE + ) + })) + })) + })) + }) + ) + + # Remove NA values + sim_plot_data <- + sim_plot_data[ + !is.na(sim_plot_data$time) & + !is.na(sim_plot_data$logy), + ] + + # Transform to natural scale if requested + if (!log_scale) { + sim_plot_data$value <- exp(sim_plot_data$logy) + } else { + sim_plot_data$value <- sim_plot_data$logy + } + + # Create base plot + p <- ggplot2::ggplot( + sim_plot_data, + ggplot2::aes(x = .data$time, y = .data$value) + ) + + # Add observed data if provided + if (!is.null(original_data)) { + if (inherits(original_data, "prepped_jags_data")) { + obs_plot_data <- do.call( + rbind, + lapply(seq_len(original_data$nsubj), function(subj) { + do.call( + rbind, + lapply(seq_len(original_data$nsmpl[subj]), function(obs) { + do.call(rbind, lapply(seq_along(antigens), function(k) { + data.frame( + subject = subject_ids[subj], + time = original_data$smpl.t[subj, obs], + logy = original_data$logy[subj, obs, k], + biomarker = antigens[k], + stringsAsFactors = FALSE + ) + })) + }) + ) + }) + ) + + obs_plot_data <- + obs_plot_data[ + !is.na(obs_plot_data$time) & + !is.na(obs_plot_data$logy), + ] + + if (!log_scale) { + obs_plot_data$value <- exp(obs_plot_data$logy) + } else { + obs_plot_data$value <- obs_plot_data$logy + } + + # Add observed points + if (show_points) { + p <- p + + ggplot2::geom_point( + data = obs_plot_data, + ggplot2::aes(x = .data$time, y = .data$value), + color = "black", + size = 2, + alpha = 0.6 + ) + } + + # Add observed trajectories + p <- p + + ggplot2::geom_line( + data = obs_plot_data, + ggplot2::aes( + x = .data$time, + y = .data$value, + group = .data$subject + ), + color = "black", + linewidth = 0.5, + alpha = 0.4 + ) + } + } + + # Add simulated trajectories + p <- p + + ggplot2::geom_line( + ggplot2::aes( + group = interaction(.data$subject, .data$sim_id) + ), + color = "steelblue", + alpha = alpha + ) + + # Facet by biomarker + p <- p + + ggplot2::facet_wrap(~biomarker, scales = "free_y", ncol = 2) + + # Add labels and theme + y_label <- if (log_scale) "Log(Antibody Level)" else "Antibody Level" + + p <- p + + ggplot2::labs( + title = "Prior Predictive Check", + subtitle = if (!is.null(original_data)) { + "Blue = simulated from priors, Black = observed data" + } else { + "Simulated antibody trajectories from priors" + }, + x = "Time (days)", + y = y_label + ) + + ggplot2::theme_bw() + + ggplot2::theme( + strip.background = ggplot2::element_rect(fill = "grey90"), + strip.text = ggplot2::element_text(face = "bold") + ) + + return(p) +} diff --git a/R/simulate_prior_predictive.R b/R/simulate_prior_predictive.R new file mode 100644 index 00000000..6586a196 --- /dev/null +++ b/R/simulate_prior_predictive.R @@ -0,0 +1,252 @@ +#' Simulate antibody trajectories from priors only +#' +#' @description +#' Performs a prior predictive check by simulating antibody trajectories and +#' measurements using only the prior distributions, before fitting the model +#' to data. This is useful for assessing whether priors generate realistic +#' antibody values for a given pathogen and assay. +#' +#' @importFrom stats rWishart rgamma rnorm +#' +#' @details +#' This function: +#' 1. Draws kinetic parameters from the prior distributions specified by +#' `prep_priors()` +#' 2. Generates latent antibody trajectories using the same within-host antibody +#' model used in the JAGS model +#' 3. Applies measurement noise to simulate observed antibody values +#' 4. Preserves the original dataset structure (IDs, biomarkers, timepoints) +#' +#' The simulation follows the hierarchical model structure: +#' - Population-level parameters are drawn from hyperpriors +#' - Individual-level parameters are drawn from population distributions +#' - Observations are generated with log-normal measurement error +#' +#' @param prepped_data A `prepped_jags_data` object from [prep_data()] +#' @param prepped_priors A `curve_params_priors` object from [prep_priors()] +#' @param n_sims [integer] Number of prior predictive simulations to generate +#' (default = 1). If > 1, returns a list of simulated datasets. +#' @param seed [integer] Optional random seed for reproducibility +#' +#' @returns If `n_sims = 1`, a `prepped_jags_data` object with simulated +#' antibody values replacing the observed values. If `n_sims > 1`, a [list] +#' of such objects. +#' +#' @export +#' @examples +#' # Prepare data and priors +#' set.seed(1) +#' raw_data <- serocalculator::typhoid_curves_nostrat_100 |> +#' sim_case_data(n = 5) +#' prepped_data <- prep_data(raw_data) +#' prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) +#' +#' # Simulate from priors +#' sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) +#' +#' # Generate multiple simulations +#' sim_list <- simulate_prior_predictive( +#' prepped_data, prepped_priors, n_sims = 10 +#' ) +simulate_prior_predictive <- function(prepped_data, + prepped_priors, + n_sims = 1, + seed = NULL) { + # Input validation + if (!inherits(prepped_data, "prepped_jags_data")) { + cli::cli_abort( + c( + "{.arg prepped_data} must be a {.cls prepped_jags_data}", + "object from {.fn prep_data}" + ) + ) + } + + if (!inherits(prepped_priors, "curve_params_priors")) { + cli::cli_abort( + c( + "{.arg prepped_priors} must be a {.cls curve_params_priors}", + "object from {.fn prep_priors}" + ) + ) + } + + if (prepped_data$n_antigen_isos != nrow(prepped_priors$mu.hyp)) { + cli::cli_abort( + c( + "Mismatch between data and priors:", + "i" = paste( + "{.arg prepped_data} has", + "{prepped_data$n_antigen_isos} biomarkers" + ), + "i" = paste( + "{.arg prepped_priors} is configured for", + "{nrow(prepped_priors$mu.hyp)} biomarkers" + ) + ) + ) + } + + if (!is.null(seed)) { + set.seed(seed) + } + + # Generate simulations + if (n_sims == 1) { + return(simulate_one_prior_predictive(prepped_data, prepped_priors)) + } else { + return( + lapply(seq_len(n_sims), function(i) { + simulate_one_prior_predictive(prepped_data, prepped_priors) + }) + ) + } +} + +#' Simulate a single prior predictive dataset +#' +#' @param prepped_data A `prepped_jags_data` object from [prep_data()] +#' @param prepped_priors A `curve_params_priors` object from [prep_priors()] +#' +#' @returns A `prepped_jags_data` object with simulated antibody values +#' @noRd +simulate_one_prior_predictive <- function(prepped_data, prepped_priors) { + # Extract dimensions + nsubj <- prepped_data$nsubj + n_antigen_isos <- prepped_data$n_antigen_isos + n_params <- prepped_priors$n_params + + # Initialize arrays for parameters + mu_par <- array(NA, dim = c(n_antigen_isos, n_params)) + prec_par <- array(NA, dim = c(n_antigen_isos, n_params, n_params)) + prec_logy <- numeric(n_antigen_isos) + + # Sample hyperparameters for each biomarker + for (k in seq_len(n_antigen_isos)) { + # Sample population-level mean parameters from hyperpriors + mu_par[k, ] <- MASS::mvrnorm( + n = 1, + mu = prepped_priors$mu.hyp[k, ], + Sigma = solve(prepped_priors$prec.hyp[k, , ]) + ) + + # Sample precision matrix from Wishart prior + prec_par[k, , ] <- rWishart( + n = 1, + df = prepped_priors$wishdf[k], + Sigma = solve(prepped_priors$omega[k, , ]) + )[, , 1] + + # Sample measurement error precision + prec_logy[k] <- rgamma( + n = 1, + shape = prepped_priors$prec.logy.hyp[k, 1], + rate = prepped_priors$prec.logy.hyp[k, 2] + ) + } + + # Initialize arrays for subject-level parameters + params <- array(NA, dim = c(nsubj, n_antigen_isos, n_params)) + y0 <- array(NA, dim = c(nsubj, n_antigen_isos)) + y1 <- array(NA, dim = c(nsubj, n_antigen_isos)) + t1 <- array(NA, dim = c(nsubj, n_antigen_isos)) + alpha <- array(NA, dim = c(nsubj, n_antigen_isos)) + shape <- array(NA, dim = c(nsubj, n_antigen_isos)) + + # Sample subject-level parameters for each subject and biomarker + for (subj in seq_len(nsubj)) { + for (k in seq_len(n_antigen_isos)) { + # Sample individual parameters from population distribution + params[subj, k, ] <- MASS::mvrnorm( + n = 1, + mu = mu_par[k, ], + Sigma = solve(prec_par[k, , ]) + ) + + # Transform to natural scale (matching JAGS model) + y0[subj, k] <- exp(params[subj, k, 1]) + # params[,,2] is log(y1-y0) + y1[subj, k] <- y0[subj, k] + exp(params[subj, k, 2]) + t1[subj, k] <- exp(params[subj, k, 3]) + alpha[subj, k] <- exp(params[subj, k, 4]) + # params[,,5] is log(shape-1) + shape[subj, k] <- exp(params[subj, k, 5]) + 1 + } + } + + # Generate simulated antibody observations + logy_sim <- prepped_data$logy + smpl_t <- prepped_data$smpl.t + nsmpl <- prepped_data$nsmpl + + for (subj in seq_len(nsubj)) { + for (obs in seq_len(nsmpl[subj])) { + for (k in seq_len(n_antigen_isos)) { + # Calculate mean antibody level at this timepoint + t_obs <- smpl_t[subj, obs] + + # Skip if no timepoint + if (is.na(t_obs)) { + next + } + + # Calculate mu.logy using the antibody dynamics model + # This matches the JAGS model logic + beta <- log(y1[subj, k] / y0[subj, k]) / t1[subj, k] + + if (t_obs <= t1[subj, k]) { + # Active infection period (before peak) + mu_logy <- log(y0[subj, k]) + beta * t_obs + } else { + # Recovery period (after peak) + # Calculate the argument inside the log + log_arg <- y1[subj, k]^(1 - shape[subj, k]) - + (1 - shape[subj, k]) * alpha[subj, k] * (t_obs - t1[subj, k]) + + # Check if log_arg is positive and finite + if (is.finite(log_arg) && log_arg > 0) { + mu_logy <- (1 / (1 - shape[subj, k])) * log(log_arg) + } else { + # Set to NA if invalid + mu_logy <- NA + } + } + + # Add measurement noise (only if mu_logy is finite) + if (is.finite(mu_logy)) { + sd_logy <- 1 / sqrt(prec_logy[k]) + logy_sim[subj, obs, k] <- rnorm( + n = 1, + mean = mu_logy, + sd = sd_logy + ) + } else { + # Leave as NA if mu_logy is not finite + logy_sim[subj, obs, k] <- NA + } + } + } + } + + # Create output with same structure as input + to_return <- prepped_data + to_return$logy <- logy_sim + + # Add attributes to track that this is simulated data + to_return <- structure( + to_return, + simulated_from_priors = TRUE, + sim_params = list( + mu_par = mu_par, + prec_par = prec_par, + prec_logy = prec_logy, + y0 = y0, + y1 = y1, + t1 = t1, + alpha = alpha, + shape = shape + ) + ) + + return(to_return) +} diff --git a/R/summarize_prior_predictive.R b/R/summarize_prior_predictive.R new file mode 100644 index 00000000..2f484ae6 --- /dev/null +++ b/R/summarize_prior_predictive.R @@ -0,0 +1,275 @@ +#' Summarize prior predictive simulations +#' +#' @description +#' Provides diagnostic summaries of prior predictive simulations to identify +#' potential issues with prior specifications before fitting the model. +#' +#' @importFrom stats median quantile +#' +#' @details +#' This function checks for: +#' - Non-finite values (NaN, Inf, -Inf) +#' - Negative antibody values (which would be invalid on natural scale) +#' - Summary statistics by biomarker (min, max, median, IQR) +#' - Optional comparison to observed data ranges +#' +#' @param sim_data A simulated `prepped_jags_data` object from +#' [simulate_prior_predictive()], or a [list] of such objects +#' @param original_data Optional original `prepped_jags_data` object from +#' [prep_data()] to compare simulated vs observed ranges +#' +#' @returns A [list] containing: +#' - `n_sims`: Number of simulations summarized +#' - `validity_check`: [data.frame] with counts of finite, non-finite, +#' and negative values by biomarker +#' - `range_summary`: [data.frame] with min, max, median, and IQR of +#' simulated values by biomarker +#' - `observed_range`: (if `original_data` provided) [data.frame] with +#' observed data ranges for comparison +#' - `issues`: [character] [vector] describing any detected problems +#' +#' @export +#' @examples +#' # Prepare data and priors +#' set.seed(1) +#' raw_data <- serocalculator::typhoid_curves_nostrat_100 |> +#' sim_case_data(n = 5) +#' prepped_data <- prep_data(raw_data) +#' prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) +#' +#' # Simulate and summarize +#' sim_data <- simulate_prior_predictive( +#' prepped_data, prepped_priors, n_sims = 10 +#' ) +#' summary <- summarize_prior_predictive( +#' sim_data, original_data = prepped_data +#' ) +#' print(summary) +summarize_prior_predictive <- function(sim_data, original_data = NULL) { + # Handle list of simulations or single simulation + if (is.list(sim_data) && !inherits(sim_data, "prepped_jags_data")) { + # It's a list of simulations + n_sims <- length(sim_data) + antigens <- attr(sim_data[[1]], "antigens") + n_antigens <- sim_data[[1]]$n_antigen_isos + + # Extract values by biomarker across all simulations + logy_by_biomarker <- lapply(seq_len(n_antigens), function(k) { + vals <- do.call( + c, + lapply(sim_data, function(x) as.vector(x$logy[, , k])) + ) + vals[!is.na(vals)] + }) + } else { + # Single simulation + n_sims <- 1 + antigens <- attr(sim_data, "antigens") + n_antigens <- sim_data$n_antigen_isos + + logy_by_biomarker <- lapply(seq_len(n_antigens), function(k) { + vals <- as.vector(sim_data$logy[, , k]) + vals[!is.na(vals)] + }) + } + + # Validity check for each biomarker + validity_check <- data.frame( + biomarker = antigens, + n_finite = vapply( + logy_by_biomarker, + function(x) sum(is.finite(x)), + FUN.VALUE = integer(1) + ), + n_nonfinite = vapply( + logy_by_biomarker, + function(x) sum(!is.finite(x)), + FUN.VALUE = integer(1) + ), + n_negative = vapply( + logy_by_biomarker, + function(x) sum(is.finite(x) & x < log(0.01)), + FUN.VALUE = integer(1) + ), + stringsAsFactors = FALSE + ) + + # Range summary for each biomarker (on log scale) + range_summary <- data.frame( + biomarker = antigens, + min = vapply(logy_by_biomarker, function(x) { + if (length(x) > 0 && any(is.finite(x))) { + min(x[is.finite(x)]) + } else { + NA_real_ + } + }, FUN.VALUE = numeric(1)), + q25 = vapply(logy_by_biomarker, function(x) { + if (length(x) > 0 && any(is.finite(x))) { + quantile(x[is.finite(x)], 0.25) + } else { + NA_real_ + } + }, FUN.VALUE = numeric(1)), + median = vapply(logy_by_biomarker, function(x) { + if (length(x) > 0 && any(is.finite(x))) { + median(x[is.finite(x)]) + } else { + NA_real_ + } + }, FUN.VALUE = numeric(1)), + q75 = vapply(logy_by_biomarker, function(x) { + if (length(x) > 0 && any(is.finite(x))) { + quantile(x[is.finite(x)], 0.75) + } else { + NA_real_ + } + }, FUN.VALUE = numeric(1)), + max = vapply(logy_by_biomarker, function(x) { + if (length(x) > 0 && any(is.finite(x))) { + max(x[is.finite(x)]) + } else { + NA_real_ + } + }, FUN.VALUE = numeric(1)), + stringsAsFactors = FALSE + ) + + # Optional comparison with observed data + observed_range <- NULL + if (!is.null(original_data)) { + if (!inherits(original_data, "prepped_jags_data")) { + cli::cli_warn( + c( + "{.arg original_data} is not a {.cls prepped_jags_data}", + "object; skipping comparison" + ) + ) + } else { + obs_logy_by_biomarker <- lapply(seq_len(n_antigens), function(k) { + vals <- as.vector(original_data$logy[, , k]) + vals[!is.na(vals)] + }) + + observed_range <- data.frame( + biomarker = antigens, + obs_min = vapply(obs_logy_by_biomarker, function(x) { + if (length(x) > 0) min(x) else NA_real_ + }, FUN.VALUE = numeric(1)), + obs_median = vapply(obs_logy_by_biomarker, function(x) { + if (length(x) > 0) median(x) else NA_real_ + }, FUN.VALUE = numeric(1)), + obs_max = vapply(obs_logy_by_biomarker, function(x) { + if (length(x) > 0) max(x) else NA_real_ + }, FUN.VALUE = numeric(1)), + stringsAsFactors = FALSE + ) + } + } + + # Identify issues + issues <- character(0) + + if (any(validity_check$n_nonfinite > 0)) { + affected <- validity_check$biomarker[validity_check$n_nonfinite > 0] + issues <- c( + issues, + paste0( + "Non-finite values detected for biomarker(s): ", + paste(affected, collapse = ", ") + ) + ) + } + + if (any(validity_check$n_negative > 0)) { + affected <- validity_check$biomarker[validity_check$n_negative > 0] + issues <- c( + issues, + paste0( + "Very low/negative log-scale values detected for biomarker(s): ", + paste(affected, collapse = ", "), + " (may indicate prior-data scale mismatch)" + ) + ) + } + + # Check for scale mismatch with observed data + if (!is.null(observed_range)) { + for (i in seq_len(nrow(range_summary))) { + sim_range <- range_summary$max[i] - range_summary$min[i] + obs_range <- observed_range$obs_max[i] - observed_range$obs_min[i] + + # If simulated range is much larger than observed (factor of 10+) + if (is.finite(sim_range) && + is.finite(obs_range) && + sim_range > obs_range * 10) { + issues <- c( + issues, + paste0( + "Simulated range for ", + antigens[i], + " is much wider than observed data ", + "(may indicate over-dispersed priors)" + ) + ) + } + } + } + + if (length(issues) == 0) { + issues <- "No obvious issues detected" + } + + # Return summary + to_return <- list( + n_sims = n_sims, + validity_check = validity_check, + range_summary = range_summary, + observed_range = observed_range, + issues = issues + ) + + class(to_return) <- c("prior_predictive_summary", "list") + + return(to_return) +} + +#' Print method for prior_predictive_summary +#' +#' @param x A `prior_predictive_summary` object +#' @param ... Additional arguments (not used) +#' +#' @returns Invisibly returns `x` +#' @keywords internal +#' @export +print.prior_predictive_summary <- function(x, ...) { + cli::cli_h1("Prior Predictive Check Summary") + + cli::cli_text("Based on {.strong {x$n_sims}} simulation{?s}") + cli::cli_text("") + + cli::cli_h2("Validity Check") + print(x$validity_check) + cli::cli_text("") + + cli::cli_h2("Simulated Range Summary (log scale)") + print(x$range_summary) + cli::cli_text("") + + if (!is.null(x$observed_range)) { + cli::cli_h2("Observed Data Range (log scale)") + print(x$observed_range) + cli::cli_text("") + } + + cli::cli_h2("Issues Detected") + for (issue in x$issues) { + if (issue == "No obvious issues detected") { + cli::cli_inform(c("v" = issue)) + } else { + cli::cli_inform(c("!" = issue)) + } + } + + invisible(x) +} diff --git a/inst/WORDLIST b/inst/WORDLIST index 8ea3768d..3d3d609c 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -9,9 +9,12 @@ Iso JAGS's JAGs Lifecycle +MFI MSVC +PPCs Postprocess Rhat +Shigella TSI Vectorization Wasserstein @@ -52,5 +55,6 @@ strat stratifications tbl testthat +titers unstratified wishdf diff --git a/man/plot_prior_predictive.Rd b/man/plot_prior_predictive.Rd new file mode 100644 index 00000000..35013660 --- /dev/null +++ b/man/plot_prior_predictive.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_prior_predictive.R +\name{plot_prior_predictive} +\alias{plot_prior_predictive} +\title{Plot prior predictive check trajectories} +\usage{ +plot_prior_predictive( + sim_data, + original_data = NULL, + log_scale = TRUE, + max_traj = 100, + show_points = TRUE, + alpha = 0.3 +) +} +\arguments{ +\item{sim_data}{A simulated \code{prepped_jags_data} object from +\code{\link[=simulate_prior_predictive]{simulate_prior_predictive()}}, or a \link{list} of such objects} + +\item{original_data}{Optional original \code{prepped_jags_data} object from +\code{\link[=prep_data]{prep_data()}} to overlay observed data} + +\item{log_scale}{\link{logical} Whether to plot on log scale (default = TRUE)} + +\item{max_traj}{\link{integer} Maximum number of trajectories to plot per subject +(default = 100). Useful when \code{sim_data} contains many simulations.} + +\item{show_points}{\link{logical} Whether to show individual observation points +(default = TRUE)} + +\item{alpha}{\link{numeric} Transparency for trajectory lines (default = 0.3)} +} +\value{ +A \code{\link[ggplot2:ggplot]{ggplot2::ggplot()}} object +} +\description{ +Visualizes antibody trajectories simulated from priors to assess whether +prior distributions generate realistic curves for the study context. +} +\details{ +Creates plots showing: +\itemize{ +\item Simulated antibody trajectories over time +\item Separate panels for each biomarker (faceted) +\item Optional overlay of observed data for comparison +\item Multiple trajectories (if multiple simulations provided) +} + +The plot uses log-scale antibody values by default (matching the model), +but can optionally show natural scale. +} +\examples{ +# Prepare data and priors +set.seed(1) +raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 5) +prepped_data <- prep_data(raw_data) +prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + +# Simulate and plot +sim_data <- simulate_prior_predictive( + prepped_data, prepped_priors, n_sims = 20 +) +plot_prior_predictive(sim_data, original_data = prepped_data) +} diff --git a/man/print.prior_predictive_summary.Rd b/man/print.prior_predictive_summary.Rd new file mode 100644 index 00000000..11a67fe6 --- /dev/null +++ b/man/print.prior_predictive_summary.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize_prior_predictive.R +\name{print.prior_predictive_summary} +\alias{print.prior_predictive_summary} +\title{Print method for prior_predictive_summary} +\usage{ +\method{print}{prior_predictive_summary}(x, ...) +} +\arguments{ +\item{x}{A \code{prior_predictive_summary} object} + +\item{...}{Additional arguments (not used)} +} +\value{ +Invisibly returns \code{x} +} +\description{ +Print method for prior_predictive_summary +} +\keyword{internal} diff --git a/man/simulate_prior_predictive.Rd b/man/simulate_prior_predictive.Rd new file mode 100644 index 00000000..f7f81524 --- /dev/null +++ b/man/simulate_prior_predictive.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulate_prior_predictive.R +\name{simulate_prior_predictive} +\alias{simulate_prior_predictive} +\title{Simulate antibody trajectories from priors only} +\usage{ +simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 1, + seed = NULL +) +} +\arguments{ +\item{prepped_data}{A \code{prepped_jags_data} object from \code{\link[=prep_data]{prep_data()}}} + +\item{prepped_priors}{A \code{curve_params_priors} object from \code{\link[=prep_priors]{prep_priors()}}} + +\item{n_sims}{\link{integer} Number of prior predictive simulations to generate +(default = 1). If > 1, returns a list of simulated datasets.} + +\item{seed}{\link{integer} Optional random seed for reproducibility} +} +\value{ +If \code{n_sims = 1}, a \code{prepped_jags_data} object with simulated +antibody values replacing the observed values. If \code{n_sims > 1}, a \link{list} +of such objects. +} +\description{ +Performs a prior predictive check by simulating antibody trajectories and +measurements using only the prior distributions, before fitting the model +to data. This is useful for assessing whether priors generate realistic +antibody values for a given pathogen and assay. +} +\details{ +This function: +\enumerate{ +\item Draws kinetic parameters from the prior distributions specified by +\code{prep_priors()} +\item Generates latent antibody trajectories using the same within-host antibody +model used in the JAGS model +\item Applies measurement noise to simulate observed antibody values +\item Preserves the original dataset structure (IDs, biomarkers, timepoints) +} + +The simulation follows the hierarchical model structure: +\itemize{ +\item Population-level parameters are drawn from hyperpriors +\item Individual-level parameters are drawn from population distributions +\item Observations are generated with log-normal measurement error +} +} +\examples{ +# Prepare data and priors +set.seed(1) +raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 5) +prepped_data <- prep_data(raw_data) +prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + +# Simulate from priors +sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + +# Generate multiple simulations +sim_list <- simulate_prior_predictive( + prepped_data, prepped_priors, n_sims = 10 +) +} diff --git a/man/summarize_prior_predictive.Rd b/man/summarize_prior_predictive.Rd new file mode 100644 index 00000000..724858d5 --- /dev/null +++ b/man/summarize_prior_predictive.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize_prior_predictive.R +\name{summarize_prior_predictive} +\alias{summarize_prior_predictive} +\title{Summarize prior predictive simulations} +\usage{ +summarize_prior_predictive(sim_data, original_data = NULL) +} +\arguments{ +\item{sim_data}{A simulated \code{prepped_jags_data} object from +\code{\link[=simulate_prior_predictive]{simulate_prior_predictive()}}, or a \link{list} of such objects} + +\item{original_data}{Optional original \code{prepped_jags_data} object from +\code{\link[=prep_data]{prep_data()}} to compare simulated vs observed ranges} +} +\value{ +A \link{list} containing: +\itemize{ +\item \code{n_sims}: Number of simulations summarized +\item \code{validity_check}: \link{data.frame} with counts of finite, non-finite, +and negative values by biomarker +\item \code{range_summary}: \link{data.frame} with min, max, median, and IQR of +simulated values by biomarker +\item \code{observed_range}: (if \code{original_data} provided) \link{data.frame} with +observed data ranges for comparison +\item \code{issues}: \link{character} \link{vector} describing any detected problems +} +} +\description{ +Provides diagnostic summaries of prior predictive simulations to identify +potential issues with prior specifications before fitting the model. +} +\details{ +This function checks for: +\itemize{ +\item Non-finite values (NaN, Inf, -Inf) +\item Negative antibody values (which would be invalid on natural scale) +\item Summary statistics by biomarker (min, max, median, IQR) +\item Optional comparison to observed data ranges +} +} +\examples{ +# Prepare data and priors +set.seed(1) +raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 5) +prepped_data <- prep_data(raw_data) +prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + +# Simulate and summarize +sim_data <- simulate_prior_predictive( + prepped_data, prepped_priors, n_sims = 10 +) +summary <- summarize_prior_predictive( + sim_data, original_data = prepped_data +) +print(summary) +} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 61c98202..b9e2a10c 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -26,6 +26,11 @@ reference: contents: - prep_priors - initsfunction +- title: Prior predictive checks + contents: + - simulate_prior_predictive + - summarize_prior_predictive + - plot_prior_predictive - title: Model seroreponse contents: - run_mod @@ -53,6 +58,7 @@ articles: navbar: ~ contents: - articles/getting-started + - articles/prior-predictive-checks - title: Developer Notes contents: diff --git a/tests/testthat/_snaps/simulate_prior_predictive.md b/tests/testthat/_snaps/simulate_prior_predictive.md new file mode 100644 index 00000000..8fe23a84 --- /dev/null +++ b/tests/testthat/_snaps/simulate_prior_predictive.md @@ -0,0 +1,6 @@ +# simulate_prior_predictive result consistency + + list(class = c("prepped_jags_data", "list"), dims = list(logy = c(3L, + 9L, 5L), smpl_t = c(subjects = 3L, visit_number = 9L)), has_sim_params = TRUE, + simulated_from_priors = TRUE) + diff --git a/tests/testthat/_snaps/summarize_prior_predictive.md b/tests/testthat/_snaps/summarize_prior_predictive.md new file mode 100644 index 00000000..b92336b7 --- /dev/null +++ b/tests/testthat/_snaps/summarize_prior_predictive.md @@ -0,0 +1,7 @@ +# summarize_prior_predictive result consistency + + list(n_sims = 3L, validity_check_cols = c("biomarker", "n_finite", + "n_nonfinite", "n_negative"), range_summary_cols = c("biomarker", + "min", "q25", "median", "q75", "max"), has_observed_range = TRUE, + n_issues = 6L) + diff --git a/tests/testthat/test-plot_prior_predictive.R b/tests/testthat/test-plot_prior_predictive.R new file mode 100644 index 00000000..95a54172 --- /dev/null +++ b/tests/testthat/test-plot_prior_predictive.R @@ -0,0 +1,151 @@ +test_that("plot_prior_predictive basic functionality", { + withr::local_seed(123) + + # Create test data + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data) + + expect_s3_class(p, "ggplot") +}) + +test_that("plot_prior_predictive with multiple simulations", { + withr::local_seed(456) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 5 + ) + p <- plot_prior_predictive(sim_list) + + expect_s3_class(p, "ggplot") +}) + +test_that("plot_prior_predictive with observed data overlay", { + withr::local_seed(789) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data, original_data = prepped_data) + + expect_s3_class(p, "ggplot") + + # Check that the plot includes observed data in subtitle + expect_true(grepl("observed", p$labels$subtitle, ignore.case = TRUE)) +}) + +test_that("plot_prior_predictive on natural scale", { + withr::local_seed(101) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data, log_scale = FALSE) + + expect_s3_class(p, "ggplot") + expect_true(grepl("Antibody Level", p$labels$y)) + expect_false(grepl("Log", p$labels$y)) +}) + +test_that("plot_prior_predictive on log scale", { + withr::local_seed(202) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data, log_scale = TRUE) + + expect_s3_class(p, "ggplot") + expect_true(grepl("Log", p$labels$y)) +}) + +test_that("plot_prior_predictive respects max_traj parameter", { + withr::local_seed(303) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Generate many simulations + sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 200 + ) + + # Should inform about limiting trajectories + expect_message( + plot_prior_predictive(sim_list, max_traj = 50), + "Plotting 50 of 200 simulations" + ) +}) + +test_that("plot_prior_predictive without points", { + withr::local_seed(404) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive( + sim_data, + original_data = prepped_data, + show_points = FALSE + ) + + expect_s3_class(p, "ggplot") +}) + +test_that("plot_prior_predictive with custom alpha", { + withr::local_seed(505) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data, alpha = 0.1) + + expect_s3_class(p, "ggplot") +}) + +test_that("plot_prior_predictive plot structure", { + withr::local_seed(606) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + p <- plot_prior_predictive(sim_data) + + # Check basic plot structure + expect_true("FacetWrap" %in% class(p$facet)) + expect_equal(p$labels$x, "Time (days)") + expect_equal(p$labels$title, "Prior Predictive Check") +}) diff --git a/tests/testthat/test-simulate_prior_predictive.R b/tests/testthat/test-simulate_prior_predictive.R new file mode 100644 index 00000000..f8fb6596 --- /dev/null +++ b/tests/testthat/test-simulate_prior_predictive.R @@ -0,0 +1,155 @@ +test_that("simulate_prior_predictive basic functionality", { + withr::local_seed(123) + + # Create test data + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 3) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Test single simulation + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + + expect_s3_class(sim_data, "prepped_jags_data") + expect_true(attr(sim_data, "simulated_from_priors")) + expect_equal(dim(sim_data$logy), dim(prepped_data$logy)) + expect_equal(dim(sim_data$smpl.t), dim(prepped_data$smpl.t)) +}) + +test_that("simulate_prior_predictive with multiple simulations", { + withr::local_seed(456) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Test multiple simulations + n_sims <- 5 + sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = n_sims + ) + + expect_type(sim_list, "list") + expect_length(sim_list, n_sims) + expect_true(all(vapply( + sim_list, + inherits, + FUN.VALUE = logical(1), + "prepped_jags_data" + ))) +}) + +test_that("simulate_prior_predictive respects seed", { + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Generate two simulations with same seed + sim1 <- simulate_prior_predictive(prepped_data, prepped_priors, seed = 789) + sim2 <- simulate_prior_predictive(prepped_data, prepped_priors, seed = 789) + + expect_equal(sim1$logy, sim2$logy) +}) + +test_that("simulate_prior_predictive validates inputs", { + withr::local_seed(101) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Wrong class for prepped_data + expect_error( + simulate_prior_predictive(list(a = 1), prepped_priors), + "must be a.*prepped_jags_data" + ) + + # Wrong class for prepped_priors + expect_error( + simulate_prior_predictive(prepped_data, list(a = 1)), + "must be a.*curve_params_priors" + ) + + # Mismatch in number of biomarkers + wrong_priors <- prep_priors(max_antigens = 1) + expect_error( + simulate_prior_predictive(prepped_data, wrong_priors), + "Mismatch between data and priors" + ) +}) + +test_that("simulate_prior_predictive with custom priors", { + withr::local_seed(202) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + + # Custom priors + custom_priors <- prep_priors( + max_antigens = prepped_data$n_antigen_isos, + mu_hyp_param = c(1.0, 5.0, 0.5, -3.0, -2.0), + prec_hyp_param = c(0.5, 0.0001, 0.5, 0.002, 0.5), + omega_param = c(2.0, 30.0, 2.0, 8.0, 2.0), + wishdf_param = 15, + prec_logy_hyp_param = c(3.0, 0.8) + ) + + sim_data <- simulate_prior_predictive(prepped_data, custom_priors) + + expect_s3_class(sim_data, "prepped_jags_data") + expect_true(attr(sim_data, "simulated_from_priors")) + + # Check that sim_params are stored + sim_params <- attr(sim_data, "sim_params") + expect_type(sim_params, "list") + expect_true(all(c("y0", "y1", "t1", "alpha", "shape") %in% names(sim_params))) +}) + +test_that("simulate_prior_predictive produces finite values", { + withr::local_seed(303) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 3) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + + # Check that most values are finite (allowing for some NAs from structure) + all_values <- as.vector(sim_data$logy) + finite_values <- all_values[!is.na(all_values)] + + # At least 90% should be finite + expect_true(sum(is.finite(finite_values)) / length(finite_values) > 0.9) +}) + +test_that("simulate_prior_predictive result consistency", { + withr::local_seed(404) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + + # Snapshot the structure and some values + expect_snapshot_value( + list( + class = class(sim_data), + dims = list( + logy = dim(sim_data$logy), + smpl_t = dim(sim_data$smpl.t) + ), + has_sim_params = !is.null(attr(sim_data, "sim_params")), + simulated_from_priors = attr(sim_data, "simulated_from_priors") + ), + style = "deparse" + ) +}) diff --git a/tests/testthat/test-summarize_prior_predictive.R b/tests/testthat/test-summarize_prior_predictive.R new file mode 100644 index 00000000..0d7a533d --- /dev/null +++ b/tests/testthat/test-summarize_prior_predictive.R @@ -0,0 +1,200 @@ +test_that("summarize_prior_predictive basic functionality", { + withr::local_seed(123) + + # Create test data + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 3) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Single simulation + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + summary <- summarize_prior_predictive(sim_data) + + expect_s3_class(summary, "prior_predictive_summary") + expect_type(summary, "list") + expect_equal(summary$n_sims, 1) + expect_s3_class(summary$validity_check, "data.frame") + expect_s3_class(summary$range_summary, "data.frame") + expect_type(summary$issues, "character") +}) + +test_that("summarize_prior_predictive with multiple simulations", { + withr::local_seed(456) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + # Multiple simulations + n_sims <- 10 + sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = n_sims + ) + summary <- summarize_prior_predictive(sim_list) + + expect_equal(summary$n_sims, n_sims) + expect_true(nrow(summary$validity_check) > 0) + expect_true(nrow(summary$range_summary) > 0) +}) + +test_that("summarize_prior_predictive with observed data comparison", { + withr::local_seed(789) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 3) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + summary <- summarize_prior_predictive(sim_data, original_data = prepped_data) + + expect_s3_class(summary$observed_range, "data.frame") + expect_equal( + nrow(summary$observed_range), + nrow(summary$range_summary) + ) + expect_true( + all( + c("biomarker", "obs_min", "obs_median", "obs_max") %in% + names(summary$observed_range) + ) + ) +}) + +test_that("summarize_prior_predictive validity checks", { + withr::local_seed(101) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + summary <- summarize_prior_predictive(sim_data) + + # Check validity_check structure + expect_true( + all( + c("biomarker", "n_finite", "n_nonfinite", "n_negative") %in% + names(summary$validity_check) + ) + ) + + # Check that we have counts for each biomarker + expect_equal( + nrow(summary$validity_check), + prepped_data$n_antigen_isos + ) +}) + +test_that("summarize_prior_predictive range summary", { + withr::local_seed(202) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + summary <- summarize_prior_predictive(sim_data) + + # Check range_summary structure + expect_true( + all( + c("biomarker", "min", "q25", "median", "q75", "max") %in% + names(summary$range_summary) + ) + ) + + # Check that quantiles are ordered (where finite) + for (i in seq_len(nrow(summary$range_summary))) { + if (all(is.finite(c( + summary$range_summary$min[i], + summary$range_summary$q25[i], + summary$range_summary$median[i], + summary$range_summary$q75[i], + summary$range_summary$max[i] + )))) { + expect_true( + summary$range_summary$min[i] <= summary$range_summary$q25[i] + ) + expect_true( + summary$range_summary$q25[i] <= summary$range_summary$median[i] + ) + expect_true( + summary$range_summary$median[i] <= summary$range_summary$q75[i] + ) + expect_true( + summary$range_summary$q75[i] <= summary$range_summary$max[i] + ) + } + } +}) + +test_that("print method for prior_predictive_summary works", { + withr::local_seed(303) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + summary <- summarize_prior_predictive(sim_data) + + # Test that print works without error + expect_no_error(print(summary)) + + # Verify that output is printed (cli headers go to console, not captured) + # Just check that it doesn't error and returns invisibly + expect_invisible(print(summary)) +}) + +test_that("summarize_prior_predictive handles invalid inputs gracefully", { + withr::local_seed(404) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_data <- simulate_prior_predictive(prepped_data, prepped_priors) + + # Test with wrong type of original_data + expect_warning( + summarize_prior_predictive(sim_data, original_data = list(a = 1)), + "is not a.*prepped_jags_data" + ) +}) + +test_that("summarize_prior_predictive result consistency", { + withr::local_seed(505) + + raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 2) + prepped_data <- prep_data(raw_data) + prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + + sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 3 + ) + summary <- summarize_prior_predictive(sim_list, original_data = prepped_data) + + # Snapshot key parts of the summary + expect_snapshot_value( + list( + n_sims = summary$n_sims, + validity_check_cols = names(summary$validity_check), + range_summary_cols = names(summary$range_summary), + has_observed_range = !is.null(summary$observed_range), + n_issues = length(summary$issues) + ), + style = "deparse" + ) +}) diff --git a/vignettes/articles/getting-started.qmd b/vignettes/articles/getting-started.qmd index 75a55253..e7b51b51 100644 --- a/vignettes/articles/getting-started.qmd +++ b/vignettes/articles/getting-started.qmd @@ -119,6 +119,35 @@ autoplot(simulated_data) ## Running the Bayesian Model +### Prior Predictive Checks (Recommended) + +**Before fitting the model**, it's recommended to perform a prior predictive check to ensure your priors generate realistic antibody trajectories: + +```{r} +#| label: prior-predictive-check +# Prepare data and priors +prepped_data <- prep_data(simulated_data) +prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) + +# Simulate from priors +sim_prior <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 30, + seed = 456 +) + +# Summarize and check for issues +summarize_prior_predictive(sim_prior, original_data = prepped_data) + +# Visualize prior predictive trajectories +plot_prior_predictive(sim_prior, original_data = prepped_data) +``` + +If the prior predictive check shows issues (e.g., unrealistic scales, non-finite values), adjust the priors using `prep_priors()` with custom parameters before proceeding. See `vignette("prior-predictive-checks")` for detailed guidance. + +### Fitting the Model + The main function `run_mod()` fits a Bayesian MCMC model to estimate antibody dynamic curve parameters: - `y0`: Baseline antibody concentration diff --git a/vignettes/articles/prior-predictive-checks.qmd b/vignettes/articles/prior-predictive-checks.qmd new file mode 100644 index 00000000..86a9d72b --- /dev/null +++ b/vignettes/articles/prior-predictive-checks.qmd @@ -0,0 +1,241 @@ +--- +title: "Prior Predictive Checks" +format: html +--- + +```{r} +#| label: setup +#| include: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 8, + fig.height = 6 +) +``` + +## Introduction + +Prior predictive checks (PPCs) are a critical step in Bayesian workflow, allowing you to assess whether your prior distributions generate realistic data **before** fitting the model. This is especially important in serodynamics because: + +- Different pathogens and assays operate on **very different measurement scales** (e.g., Shigella MFI vs. ELISA OD or titers) +- Poorly scaled priors can generate unrealistic antibody trajectories +- Prior predictive checks help identify issues **before** expensive MCMC sampling + +This vignette demonstrates how to perform prior predictive checks using the `serodynamics` package. + +## Load Required Libraries + +```{r load-packages} +library(serodynamics) +library(dplyr) +set.seed(123) +``` + +## Basic Workflow + +The basic prior predictive check workflow involves three steps: + +1. **Simulate** antibody trajectories from priors using `simulate_prior_predictive()` +2. **Summarize** the simulated data using `summarize_prior_predictive()` +3. **Visualize** the trajectories using `plot_prior_predictive()` + +## Example: Typhoid Data + +Let's demonstrate with simulated typhoid data: + +### 1. Prepare Data and Priors + +First, prepare your data and specify priors: + +```{r prepare-data} +# Simulate some case data +raw_data <- serocalculator::typhoid_curves_nostrat_100 |> + sim_case_data(n = 10) + +# Prepare data for JAGS +prepped_data <- prep_data(raw_data) + +# Prepare default priors +prepped_priors <- prep_priors(max_antigens = prepped_data$n_antigen_isos) +``` + +### 2. Simulate from Priors + +Generate simulated data using only the prior distributions: + +```{r simulate} +# Generate a single simulation +sim_data <- simulate_prior_predictive( + prepped_data, + prepped_priors, + seed = 456 +) + +# Or generate multiple simulations for better coverage +sim_list <- simulate_prior_predictive( + prepped_data, + prepped_priors, + n_sims = 50, + seed = 456 +) +``` + +### 3. Summarize Simulated Data + +Check for potential issues with the priors: + +```{r summarize} +# Summarize the simulations +summary <- summarize_prior_predictive(sim_list, original_data = prepped_data) +print(summary) +``` + +The summary includes: + +- **Validity check**: Counts of finite, non-finite, and negative values by biomarker +- **Range summary**: Quantiles of simulated values (on log scale) +- **Observed range**: Comparison with actual data (when provided) +- **Issues detected**: Warnings about potential problems + +### 4. Visualize Prior Predictive Trajectories + +Plot the simulated trajectories to visually assess whether they look realistic: + +```{r plot-basic} +# Plot simulated trajectories only +plot_prior_predictive(sim_list) +``` + +#### Overlay with Observed Data + +Compare simulated trajectories with actual data to assess scale: + +```{r plot-with-observed} +# Plot with observed data overlay +plot_prior_predictive( + sim_list, + original_data = prepped_data, + max_traj = 30 # Limit number of trajectories for clarity +) +``` + +Blue lines show simulated trajectories, while black lines/points show the observed data. If the simulated trajectories have a very different scale or pattern than the observed data, your priors may need adjustment. + +#### Natural Scale vs. Log Scale + +By default, plots use the log scale (matching the model). You can also view on the natural scale: + +```{r plot-natural-scale} +# Plot on natural scale +plot_prior_predictive( + sim_list, + original_data = prepped_data, + log_scale = FALSE, + max_traj = 30 +) +``` + +## Custom Priors + +If the default priors don't generate realistic trajectories, you can adjust them: + +```{r custom-priors} +# Define custom priors with different parameter values +custom_priors <- prep_priors( + max_antigens = prepped_data$n_antigen_isos, + mu_hyp_param = c(1.0, 5.0, 0.5, -3.0, -2.0), # Adjust means + prec_hyp_param = c(0.5, 0.0001, 0.5, 0.002, 0.5), # Adjust precisions + omega_param = c(2.0, 30.0, 2.0, 8.0, 2.0), # Adjust variability + wishdf_param = 15, + prec_logy_hyp_param = c(3.0, 0.8) +) + +# Simulate with custom priors +custom_sim <- simulate_prior_predictive( + prepped_data, + custom_priors, + n_sims = 50, + seed = 789 +) + +# Check results +custom_summary <- summarize_prior_predictive(custom_sim, original_data = prepped_data) +print(custom_summary) +``` + +```{r plot-custom} +plot_prior_predictive( + custom_sim, + original_data = prepped_data, + max_traj = 30 +) +``` + +## Interpreting Results + +### Good Prior Predictive Check + +A well-specified prior should generate: + +- **Finite values**: All simulated antibody levels should be finite (no NaN, Inf, or -Inf) +- **Reasonable scale**: Simulated ranges should roughly overlap with observed data ranges +- **Realistic trajectories**: Curves should show plausible antibody kinetics (rise and decay) + +### Warning Signs + +Be concerned if you see: + +- **Non-finite values**: Indicates mathematical issues with parameter combinations +- **Extreme values**: Very large or very small antibody levels that are biologically implausible +- **Scale mismatch**: Simulated data ranges orders of magnitude different from observed data +- **Unrealistic patterns**: Trajectories that don't follow expected antibody kinetics + +## Integration with Model Fitting + +Once you're satisfied with the prior predictive check, proceed to model fitting: + +```{r fit-model, eval = FALSE} +# After confirming priors are reasonable, fit the model +fitted_model <- run_mod( + data = raw_data, + file_mod = serodynamics_example("model.jags"), + nchain = 4, + nadapt = 1000, + nburn = 1000, + nmc = 1000, + niter = 2000, + # Use the custom priors you validated + mu_hyp_param = c(1.0, 5.0, 0.5, -3.0, -2.0), + prec_hyp_param = c(0.5, 0.0001, 0.5, 0.002, 0.5), + omega_param = c(2.0, 30.0, 2.0, 8.0, 2.0), + wishdf_param = 15, + prec_logy_hyp_param = c(3.0, 0.8) +) +``` + +## Best Practices + +1. **Always run prior predictive checks** before fitting models, especially when: + - Working with a new pathogen or assay + - Using non-default priors + - Data are on an unfamiliar scale + +2. **Generate enough simulations** (50-100) to see the full range of prior-implied trajectories + +3. **Compare with observed data** to ensure priors are on the right scale + +4. **Iterate on priors** if simulations show problems, then re-run the prior predictive check + +5. **Document your choices** about why you selected particular prior values + +## Summary + +Prior predictive checks are a simple but powerful tool for: + +- Validating that priors generate realistic antibody trajectories +- Identifying scale mismatches before expensive MCMC fitting +- Building intuition about how priors translate to observable data +- Catching mathematical issues (e.g., negative values in log functions) + +By incorporating prior predictive checks into your workflow, you can avoid many common pitfalls in Bayesian modeling and ensure more robust inference.