diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index b7f960eb..c51a6940 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -67,7 +67,7 @@ jobs: - name: Upload test results if: failure() - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: coverage-test-failures path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index 64509010..17f04de1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TwoSampleMR Title: Two Sample MR Functions and Interface to MRC Integrative Epidemiology Unit OpenGWAS Database -Version: 0.6.30 +Version: 0.7.0 Authors@R: c( person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0920-1055")), diff --git a/NEWS.md b/NEWS.md index ef0a8d7e..18c5b815 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# TwoSampleMR v0.7.0 + +(Release date 2026-02-24) + +* Fixed a bug in the calculation in one of the code paths for the inferred p-value in `format_data()` (thanks @j-brody) +* Fixed the calculation of a p-value in `mr_rucker_internal()` +* Fixed a replacement length warning in `get_population_allele_frequency()` +* Reformatted code base with `air` + # TwoSampleMR v0.6.30 (Release date 2026-02-06) diff --git a/R/add_rsq.r b/R/add_rsq.r index 81a635a9..8c6c018b 100644 --- a/R/add_rsq.r +++ b/R/add_rsq.r @@ -351,7 +351,7 @@ get_population_allele_frequency <- function(af, prop, odds_ratio, prevalence) { co <- contingency(af[i], prop[i], odds_ratio[i]) af_controls <- co[1, 2] / (co[1, 2] + co[2, 2]) af_cases <- co[1, 1] / (co[1, 1] + co[2, 1]) - af[i] <- af_controls * (1 - prevalence) + af_cases * prevalence + af[i] <- af_controls * (1 - prevalence[i]) + af_cases * prevalence[i] } return(af) } diff --git a/R/format_mr_results2.R b/R/format_mr_results2.R index e60782c9..6be97cfe 100644 --- a/R/format_mr_results2.R +++ b/R/format_mr_results2.R @@ -163,8 +163,22 @@ combine_all_mrresults <- function( res <- merge(res, het, by = c("id.outcome", "id.exposure", "Method"), all.x = TRUE) res <- data.table::rbindlist( - list(res, sin[, c("exposure", "outcome", "id.exposure", "id.outcome", "SNP", "b", "se", "pval", "Method")]), - fill = TRUE, use.names = TRUE + list( + res, + sin[, c( + "exposure", + "outcome", + "id.exposure", + "id.outcome", + "SNP", + "b", + "se", + "pval", + "Method" + )] + ), + fill = TRUE, + use.names = TRUE ) data.table::setDF(res) diff --git a/R/harmonise.R b/R/harmonise.R index 1508aacd..54e9a253 100644 --- a/R/harmonise.R +++ b/R/harmonise.R @@ -114,7 +114,11 @@ harmonise_data <- function(exposure_dat, outcome_dat, action = 2) { # return(x) # }) - jlog <- data.table::rbindlist(lapply(fix.tab, function(x) attr(x, "log")), fill = TRUE, use.names = TRUE) + jlog <- data.table::rbindlist( + lapply(fix.tab, function(x) attr(x, "log")), + fill = TRUE, + use.names = TRUE + ) data.table::setDF(jlog) fix.tab <- data.table::rbindlist(fix.tab, fill = TRUE, use.names = TRUE) data.table::setDF(fix.tab) @@ -686,7 +690,8 @@ harmonise <- function(dat, tolerance, action) { as.data.frame(attr(d12, "log"), stringsAsFactors = FALSE), as.data.frame(attr(d11, "log"), stringsAsFactors = FALSE) ), - fill = TRUE, use.names = TRUE + fill = TRUE, + use.names = TRUE ) data.table::setDF(jlog) jlog <- cbind( diff --git a/R/instruments.R b/R/instruments.R index c2ba17b3..221e9182 100644 --- a/R/instruments.R +++ b/R/instruments.R @@ -22,9 +22,15 @@ extract_instruments <- function( opengwas_jwt = ieugwasr::get_opengwas_jwt(), force_server = FALSE ) { - if (!(clump %in% c(0, 1, FALSE, TRUE))) stop("The clump argument should be 0 or 1.") - if (clump) clump <- 1 - if (!clump) clump <- 0 + if (!(clump %in% c(0, 1, FALSE, TRUE))) { + stop("The clump argument should be 0 or 1.") + } + if (clump) { + clump <- 1 + } + if (!clump) { + clump <- 0 + } # .Deprecated("ieugwasr::tophits()") outcomes <- ieugwasr::legacy_ids(unique(outcomes)) diff --git a/R/knit.R b/R/knit.R index bfebacf1..1c5b724e 100644 --- a/R/knit.R +++ b/R/knit.R @@ -123,11 +123,14 @@ mr_report <- function( ) dat_dt <- data.table::as.data.table(dat) - combinations <- dat_dt[, .( - n = .N, - exposure = exposure[1], - outcome = outcome[1] - ), by = c("id.exposure", "id.outcome")] + combinations <- dat_dt[, + .( + n = .N, + exposure = exposure[1], + outcome = outcome[1] + ), + by = c("id.exposure", "id.outcome") + ] data.table::setDF(combinations) output_file <- array("", nrow(combinations)) diff --git a/R/leaveoneout.R b/R/leaveoneout.R index 5f44ce3d..bd724204 100644 --- a/R/leaveoneout.R +++ b/R/leaveoneout.R @@ -20,7 +20,7 @@ mr_leaveoneout <- function(dat, parameters = default_parameters(), method = mr_i dat_dt <- data.table::as.data.table(dat) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + results <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] @@ -104,7 +104,7 @@ mr_leaveoneout <- function(dat, parameters = default_parameters(), method = mr_i mr_leaveoneout_plot <- function(leaveoneout_results) { dat_dt <- data.table::as.data.table(leaveoneout_results) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + res <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] diff --git a/R/mr.R b/R/mr.R index 32fbee2a..ced8a4af 100644 --- a/R/mr.R +++ b/R/mr.R @@ -26,10 +26,10 @@ mr <- function( # Convert to data.table for efficient grouped operations dat_dt <- data.table::as.data.table(dat) - + # Get unique combinations of id.exposure and id.outcome combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + # Pre-compute method names once, outside the loop methl <- mr_method_list() method_names <- methl$name[match(method_list, methl$obj)] @@ -40,7 +40,7 @@ mr <- function( out_id <- combos$id.outcome[i] x1 <- dat_dt[id.exposure == exp_id & id.outcome == out_id] x <- x1[mr_keep == TRUE] - + if (nrow(x) == 0) { message( "No SNPs available for MR analysis of '", @@ -71,7 +71,7 @@ mr <- function( mr_tab <- mr_tab[!(is.na(mr_tab$b) & is.na(mr_tab$se) & is.na(mr_tab$pval)), ] return(mr_tab) }) - + mr_tab <- data.table::rbindlist(results, fill = TRUE, use.names = TRUE) data.table::setDF(mr_tab) @@ -662,12 +662,16 @@ mr_egger_regression_bootstrap <- function(b_exp, b_out, se_exp, se_out, paramete # Vectorized bootstrap: generate all random values at once # Matrix dimensions: nboot rows x nsnp columns - xs_mat <- matrix(stats::rnorm(nboot * nsnp, mean = rep(b_exp, each = nboot), - sd = rep(se_exp, each = nboot)), - nrow = nboot, ncol = nsnp) - ys_mat <- matrix(stats::rnorm(nboot * nsnp, mean = rep(b_out, each = nboot), - sd = rep(se_out, each = nboot)), - nrow = nboot, ncol = nsnp) + xs_mat <- matrix( + stats::rnorm(nboot * nsnp, mean = rep(b_exp, each = nboot), sd = rep(se_exp, each = nboot)), + nrow = nboot, + ncol = nsnp + ) + ys_mat <- matrix( + stats::rnorm(nboot * nsnp, mean = rep(b_out, each = nboot), sd = rep(se_out, each = nboot)), + nrow = nboot, + ncol = nsnp + ) # Apply sign correction for Egger regression (vectorized) ys_mat <- ys_mat * sign(xs_mat) @@ -676,15 +680,19 @@ mr_egger_regression_bootstrap <- function(b_exp, b_out, se_exp, se_out, paramete # Vectorized weighted linear regression for all bootstrap iterations # For each bootstrap iteration, compute weighted regression coefficients # Using the formula: bhat = cov(x*w, y*w) / var(x*w), ahat = mean(y) - mean(x)*bhat - res <- t(vapply(seq_len(nboot), function(i) { - xs <- xs_mat[i, ] - ys <- ys_mat[i, ] - xw <- xs * weights - yw <- ys * weights - bhat <- stats::cov(xw, yw, use = "pair") / stats::var(xw, na.rm = TRUE) - ahat <- mean(ys, na.rm = TRUE) - mean(xs, na.rm = TRUE) * bhat - c(ahat, bhat) - }, numeric(2))) + res <- t(vapply( + seq_len(nboot), + function(i) { + xs <- xs_mat[i, ] + ys <- ys_mat[i, ] + xw <- xs * weights + yw <- ys * weights + bhat <- stats::cov(xw, yw, use = "pair") / stats::var(xw, na.rm = TRUE) + ahat <- mean(ys, na.rm = TRUE) - mean(xs, na.rm = TRUE) * bhat + c(ahat, bhat) + }, + numeric(2) + )) return(list( b = mean(res[, 2], na.rm = TRUE), @@ -808,20 +816,28 @@ weighted_median_bootstrap <- function(b_exp, b_out, se_exp, se_out, weights, nbo # Vectorized bootstrap: generate all random values at once # Matrix dimensions: nboot rows x nsnp columns - b_exp_mat <- matrix(stats::rnorm(nboot * nsnp, mean = rep(b_exp, each = nboot), - sd = rep(se_exp, each = nboot)), - nrow = nboot, ncol = nsnp) - b_out_mat <- matrix(stats::rnorm(nboot * nsnp, mean = rep(b_out, each = nboot), - sd = rep(se_out, each = nboot)), - nrow = nboot, ncol = nsnp) + b_exp_mat <- matrix( + stats::rnorm(nboot * nsnp, mean = rep(b_exp, each = nboot), sd = rep(se_exp, each = nboot)), + nrow = nboot, + ncol = nsnp + ) + b_out_mat <- matrix( + stats::rnorm(nboot * nsnp, mean = rep(b_out, each = nboot), sd = rep(se_out, each = nboot)), + nrow = nboot, + ncol = nsnp + ) # Compute Wald ratios for all bootstrap samples (element-wise division) betaIV_mat <- b_out_mat / b_exp_mat # Apply weighted_median to each bootstrap iteration - med <- vapply(seq_len(nboot), function(i) { - weighted_median(betaIV_mat[i, ], weights) - }, numeric(1)) + med <- vapply( + seq_len(nboot), + function(i) { + weighted_median(betaIV_mat[i, ], weights) + }, + numeric(1) + ) return(stats::sd(med)) } @@ -1118,7 +1134,11 @@ mr_raps <- function(b_exp, b_out, se_exp, se_out, parameters = default_parameter } if (utils::packageVersion('mr.raps') < '0.4.3') { - message(paste("The version of mr.raps is", utils::packageVersion('mr.raps'), "please consider updating to version 0.4.3 or higher, e.g., install.packages('mr.raps', repos = c('https://mrcieu.r-universe.dev', 'https://cloud.r-project.org')) ")) + message(paste( + "The version of mr.raps is", + utils::packageVersion('mr.raps'), + "please consider updating to version 0.4.3 or higher, e.g., install.packages('mr.raps', repos = c('https://mrcieu.r-universe.dev', 'https://cloud.r-project.org')) " + )) } data <- data.frame( diff --git a/R/mr_mode.R b/R/mr_mode.R index 1281af9b..f4de06ab 100644 --- a/R/mr_mode.R +++ b/R/mr_mode.R @@ -62,11 +62,13 @@ mr_mode <- function(dat, parameters = default_parameters(), mode_method = "all") #mean and sd recycle in lockstep (both length n), filling column-by-column BetaIV.boot_mat <- matrix( stats::rnorm(nboot * n, mean = BetaIV.in, sd = seBetaIV.in[, 1]), - nrow = nboot, ncol = n + nrow = nboot, + ncol = n ) BetaIV.boot_NOME_mat <- matrix( stats::rnorm(nboot * n, mean = BetaIV.in, sd = seBetaIV.in[, 2]), - nrow = nboot, ncol = n + nrow = nboot, + ncol = n ) #Pre-compute constants diff --git a/R/read_data.R b/R/read_data.R index c3c354d4..342df17f 100644 --- a/R/read_data.R +++ b/R/read_data.R @@ -451,10 +451,11 @@ format_data <- function( if (any(is.na(dat$pval.outcome))) { if ("beta.outcome" %in% names(dat) && "se.outcome" %in% names(dat)) { index <- is.na(dat$pval.outcome) - dat$pval.outcome[index] <- stats::pnorm( - abs(dat$beta.outcome[index]) / dat$se.outcome[index], - lower.tail = FALSE - ) + dat$pval.outcome[index] <- 2 * + stats::pnorm( + abs(dat$beta.outcome[index]) / dat$se.outcome[index], + lower.tail = FALSE + ) dat$pval_origin.outcome[index] <- "inferred" } } diff --git a/R/rucker.R b/R/rucker.R index e5752e06..5660c7e3 100644 --- a/R/rucker.R +++ b/R/rucker.R @@ -189,9 +189,10 @@ mr_rucker_internal <- function(dat, parameters = default_parameters()) { se0_egger_re <- stats::coefficients(mod_egger)[1, 2] if (parameters$test_dist == "z") { pval0_egger_re <- stats::pnorm( - stats::coefficients(mod_egger)[1, 1] / stats::coefficients(mod_egger)[1, 2], + abs(stats::coefficients(mod_egger)[1, 1]) / stats::coefficients(mod_egger)[1, 2], lower.tail = FALSE - ) + ) * + 2 } else { pval0_egger_re <- stats::coefficients(mod_egger)[1, 4] } @@ -290,11 +291,13 @@ mr_rucker_bootstrap <- function(dat, parameters = default_parameters()) { # Pre-generate all random values as matrices (nboot x nsnp) boot_exp <- matrix( stats::rnorm(nboot * nsnp, mean = dat$beta.exposure, sd = dat$se.exposure), - nrow = nboot, ncol = nsnp + nrow = nboot, + ncol = nsnp ) boot_out <- matrix( stats::rnorm(nboot * nsnp, mean = dat$beta.outcome, sd = dat$se.outcome), - nrow = nboot, ncol = nsnp + nrow = nboot, + ncol = nsnp ) dat2 <- dat @@ -456,7 +459,11 @@ mr_rucker_jackknife_internal <- function(dat, parameters = default_parameters()) l[[i]] <- mr_rucker_internal(dat2, parameters) } - modsel <- data.table::rbindlist(lapply(l, function(x) x$selected), fill = TRUE, use.names = TRUE) + modsel <- data.table::rbindlist( + lapply(l, function(x) x$selected), + fill = TRUE, + use.names = TRUE + ) data.table::setDF(modsel) modsel$model <- vapply(l, function(x) x$res, character(1)) diff --git a/R/singlesnp.R b/R/singlesnp.R index 82c9810d..1a6cc531 100644 --- a/R/singlesnp.R +++ b/R/singlesnp.R @@ -26,13 +26,17 @@ mr_singlesnp <- function( dat_dt <- data.table::as.data.table(dat) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + # Pre-compute method names outside the loop method_list <- mr_method_list() - method_names <- vapply(all_method, function(m) { - paste0("All - ", method_list$name[method_list$obj == m]) - }, character(1)) - + method_names <- vapply( + all_method, + function(m) { + paste0("All - ", method_list$name[method_list$obj == m]) + }, + character(1) + ) + results <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] @@ -108,7 +112,7 @@ mr_singlesnp <- function( mr_forest_plot <- function(singlesnp_results, exponentiate = FALSE) { dat_dt <- data.table::as.data.table(singlesnp_results) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + res <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] @@ -219,7 +223,7 @@ mr_density_plot <- function( ) { dat_dt <- data.table::as.data.table(singlesnp_results) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + res <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] @@ -264,7 +268,7 @@ mr_density_plot <- function( mr_funnel_plot <- function(singlesnp_results) { dat_dt <- data.table::as.data.table(singlesnp_results) combos <- unique(dat_dt[, .(id.exposure, id.outcome)]) - + res <- lapply(seq_len(nrow(combos)), function(i) { exp_id <- combos$id.exposure[i] out_id <- combos$id.outcome[i] diff --git a/tests/testthat/test_format_data.R b/tests/testthat/test_format_data.R index 7fc8eca2..c3ee9997 100644 --- a/tests/testthat/test_format_data.R +++ b/tests/testthat/test_format_data.R @@ -50,3 +50,44 @@ test_that("format_data() should not cause a stack overflow", { a <- data.table::data.table(x = sample(1:1e6)) expect_error(do.call(format_data, list(a))) }) + +test_that("Inferred p-value calculation is correct for outcome", { + df2 <- data.frame( + "SNP" = c("9_69001927_C_T", "9_69459263_A_G"), + "effect_allele" = c("T", "G"), + "other_allele" = c("C", "A"), + "se" = c(0.6, 0.3), + "beta" = c(1.1, 0.5), + "eaf" = c(0.319894, 0.39234), + "pval" = c(NA, NA), + "pheno_id" = c("traita", "traita") + ) + + suppressWarnings({ + datformat <- format_data(df2, type = "outcome") + }) + pvals <- 2 * stats::pnorm(abs(datformat$beta.outcome) / datformat$se.outcome, lower.tail = FALSE) + expect_equal(datformat$pval.outcome, pvals) + expect_equal(datformat$pval_origin.outcome, c("inferred", "inferred")) +}) + +test_that("Inferred p-value calculation is correct for exposure", { + df3 <- data.frame( + "SNP" = c("9_69001927_C_T", "9_69459263_A_G"), + "effect_allele" = c("T", "G"), + "other_allele" = c("C", "A"), + "se" = c(0.6, 0.3), + "beta" = c(1.1, 0.5), + "eaf" = c(0.319894, 0.39234), + "pval" = c(NA, NA), + "pheno_id" = c("traita", "traita") + ) + + suppressWarnings({ + datformat2 <- format_data(df3, type = "exposure") + }) + pvals2 <- 2 * + stats::pnorm(abs(datformat2$beta.exposure) / datformat2$se.exposure, lower.tail = FALSE) + expect_equal(datformat2$pval.exposure, pvals2) + expect_equal(datformat2$pval_origin.exposure, c("inferred", "inferred")) +})