diff --git a/R/ROI_Extraction.R b/R/ROI_Extraction.R index 102223c..06681b9 100644 --- a/R/ROI_Extraction.R +++ b/R/ROI_Extraction.R @@ -145,6 +145,87 @@ PerformROIExtraction <- start.time <- Sys.time(); function.name <- "ROI_extract"; + parallel_param <- BiocParallel::bpparam(); + + combine_msnexp_list <- function(msnexp_list) { + if (length(msnexp_list) == 1) { + return(msnexp_list[[1]]) + } + + assayData_env <- new.env(parent = emptyenv()) + feature_data_list <- list() + pheno_data_list <- list() + assay_name_vec <- character() + + for (idx in seq_along(msnexp_list)) { + obj <- msnexp_list[[idx]] + ad_names <- ls(obj@assayData) + new_ad_names <- sprintf("F%d.S%03d", idx, seq_along(ad_names)) + assay_name_vec <- c(assay_name_vec, new_ad_names) + + pheno_row <- obj@phenoData@data + if (is.null(rownames(pheno_row))) { + rownames(pheno_row) <- pheno_row$sample_name + } + pheno_data_list[[idx]] <- pheno_row + + fd <- obj@featureData@data + if (nrow(fd) == length(new_ad_names)) { + rownames(fd) <- new_ad_names + } + feature_data_list[[idx]] <- fd + + for (nm_idx in seq_along(ad_names)) { + spectrum_obj <- obj@assayData[[ad_names[nm_idx]]] + if (!is.null(slotNames(spectrum_obj)) && "fromFile" %in% slotNames(spectrum_obj)) { + spectrum_obj@fromFile <- as.integer(idx) + } + assign(new_ad_names[nm_idx], spectrum_obj, envir = assayData_env) + } + } + + pd_data <- do.call(rbind, pheno_data_list) + pd_data <- data.frame(pd_data, + row.names = as.character(seq_len(nrow(pd_data))), + stringsAsFactors = FALSE) + combined_pd <- new("AnnotatedDataFrame", data = pd_data) + attr(combined_pd@data, "row.names") <- seq_len(nrow(pd_data)) + + combined_fd <- new("AnnotatedDataFrame", + data = do.call(rbind, feature_data_list), + varMetadata = msnexp_list[[1]]@featureData@varMetadata) + combined_fd <- combined_fd[assay_name_vec, , drop = FALSE] + + processing_data <- msnexp_list[[1]]@processingData + processing_data@files <- unname(unlist(lapply(msnexp_list, function(x) x@processingData@files))) + + combined_experiment <- msnexp_list[[1]]@experimentData + exp_slots <- slotNames(combined_experiment) + file_count <- length(processing_data@files) + replicate_slots <- c("analyser", "detectorType", + "instrumentManufacturer", "instrumentModel", "ionSource") + for (sn in exp_slots) { + val <- slot(combined_experiment, sn) + if (sn %in% replicate_slots && is.vector(val) && length(val) == 1 && file_count > 1) { + slot(combined_experiment, sn) <- rep(val[1], file_count) + } + } + + protocol_df <- data.frame(row.names = character(0), + stringsAsFactors = FALSE) + protocol_data <- new("AnnotatedDataFrame", + data = protocol_df, + dimLabels = c("sampleNames", "sampleColumns")) + attr(protocol_data@data, "row.names") <- integer(0) + + new("MSnExp", + assayData = assayData_env, + phenoData = combined_pd, + featureData = combined_fd, + experimentData = combined_experiment, + protocolData = protocol_data, + processingData = processing_data) + } if (is.null(running.controller)) { c1 <- c2 <- c3 <- c4 <- c5 <- TRUE; @@ -285,31 +366,41 @@ PerformROIExtraction <- pd <- data.frame( - sample_name = sub( - basename(dda_file1), - pattern = ".mzXML", - replacement = "", - fixed = TRUE - ), + sample_name = tools::file_path_sans_ext(basename(dda_file1)), stringsAsFactors = FALSE ) + pdata_class <- if (isClass("NAnnotatedDataFrame")) "NAnnotatedDataFrame" else "AnnotatedDataFrame" - MessageOutput("Data Loading...", "\n", NULL) + MessageOutput("Data Loading in parallel...", "\n", NULL) - raw_data <- - tryCatch(read.MSdata( - dda_file1, - pdata = new("NAnnotatedDataFrame", pd), - msLevel. = 1L, - mode = "inMemory" - ), error = function(e) {e}) + raw_data_list <- + BiocParallel::bplapply(seq_along(dda_file1), + FUN = function(idx) { + tryCatch( + read.MSdata( + dda_file1[idx], + pdata = new(pdata_class, pd[idx, , drop = FALSE]), + msLevel. = 1L, + mode = "inMemory" + ), + error = function(e) { + e + } + ) + }, + BPPARAM = parallel_param) - if (!is(raw_data,"MSnExp")) { + invalid_data <- which(!vapply(raw_data_list, function(x) { + is(x, "MSnExp") + }, logical(1))) + + if (length(invalid_data)) { + first_error <- raw_data_list[[invalid_data[1]]] MessageOutput( mes = paste0( "", "\nERROR:", - raw_data$message, + first_error$message, "" ), ecol = "\n", @@ -321,53 +412,25 @@ PerformROIExtraction <- MessageOutput("Data Loaded !", "\n", NULL) if (.running.as.plan) { - cache.save(raw_data, paste0(function.name, "_c1")); + cache.save(raw_data_list, paste0(function.name, "_c1")); marker_record(paste0(function.name, "_c1")); } } else { mSet <- new("mSet"); mSet@rawfiles <- files; - raw_data <- cache.read(function.name, "c1"); + raw_data_list <- cache.read(function.name, "c1"); + pd <- + data.frame( + sample_name = tools::file_path_sans_ext(basename(mSet@rawfiles)), + stringsAsFactors = FALSE + ) + pdata_class <- if (isClass("NAnnotatedDataFrame")) "NAnnotatedDataFrame" else "AnnotatedDataFrame" marker_record(paste0(function.name, "_c1")); } if (c2) { if (!mode == "none") { - ## Data Trim - # a <- - # suppressMessages(unlist(lapply( - # ls(raw_data@assayData), - # FUN = function(x) { - # unlockBinding(sym = x, env = raw_data@assayData) - # } - # ))); - - ms_list <- - sapply( - ls(raw_data@assayData), - FUN = function(x) - raw_data@assayData[[x]]@mz - ); - - MessageOutput("Empty Scan detecting...", "\n", NULL) - emptyScan <- vector() - - for (i in seq_along(ms_list)) { - if (identical(ms_list[[i]], numeric(0))) { - emptyScan <- c(emptyScan, i) - } - } - - if (!identical(emptyScan, logical(0))) { - MessageOutput(paste0("Empty Scans Found. Removing..."), "\n", NULL); - raw_data <- .emptyscan.remove(raw_data, ms_list); - } else { - MessageOutput(paste0("No Empty scan found !"), "\n", NULL); - } - - MessageOutput("Identifying regions of interest (ROI)...", "\n", NULL); - if (missing(rt.idx)) { rt.idx <- 0.9 # include the 90% RT range @@ -379,63 +442,213 @@ PerformROIExtraction <- tmp_mes <- try(suppressWarnings(load("params.rda")),silent = TRUE); } else { tmp_mes <- 0; + peakParams <- NULL; } - if(c5){ + MessageOutput("Empty Scan detecting...", "\n", NULL) + + prepare_single_file <- function(raw_data_single) { + ms_list <- + sapply( + ls(raw_data_single@assayData), + FUN = function(x) + raw_data_single@assayData[[x]]@mz + ); + + emptyScan <- vector() + + for (i in seq_along(ms_list)) { + if (identical(ms_list[[i]], numeric(0))) { + emptyScan <- c(emptyScan, i) + } + } + + if (!identical(emptyScan, logical(0))) { + raw_data_single <- .emptyscan.remove(raw_data_single, ms_list); + ms_list <- + sapply( + ls(raw_data_single@assayData), + FUN = function(x) + raw_data_single@assayData[[x]]@mz + ); + } if(is(tmp_mes,"try-error") | rmConts){ - raw_data <- ContaminatsRemoval(raw_data, ms_list); - raw_data <- .emptyscan.remove(raw_data, ms_list); + raw_data_single <- ContaminatsRemoval(raw_data_single, ms_list); + raw_data_single <- .emptyscan.remove(raw_data_single, ms_list); } else if (.on.public.web()) { if(exists("peakParams")){ if(peakParams[["rmConts"]]){ - raw_data <- ContaminatsRemoval(raw_data, ms_list); - raw_data <- .emptyscan.remove(raw_data, ms_list); + raw_data_single <- ContaminatsRemoval(raw_data_single, ms_list); + raw_data_single <- .emptyscan.remove(raw_data_single, ms_list); } } else { - raw_data <- ContaminatsRemoval(raw_data, ms_list); - raw_data <- .emptyscan.remove(raw_data, ms_list); + raw_data_single <- ContaminatsRemoval(raw_data_single, ms_list); + raw_data_single <- .emptyscan.remove(raw_data_single, ms_list); } } + return(raw_data_single) + } + + if(c5){ + raw_data_list <- BiocParallel::bplapply(raw_data_list, + FUN = prepare_single_file, + BPPARAM = parallel_param) + if (.running.as.plan) { - cache.save(raw_data, paste0(function.name, "_c5")); + cache.save(raw_data_list, paste0(function.name, "_c5")); marker_record(paste0(function.name, "_c5")); } } else { - raw_data <- cache.read(function.name, "c5"); + raw_data_list <- cache.read(function.name, "c5"); marker_record(paste0(function.name, "_c5")); } - - if (mode == "ssm") { - trimed_MSnExp <- ssm_trim(raw_data, ms_list, rt.idx = rt.idx); + compute_global_params <- function(raw_list, rt.idx) { + spectra_mz_all <- unlist(lapply(raw_list, function(rd) unlist(lapply(MSnbase::spectra(rd), function(sp) MSnbase::mz(sp))))) + spectra_int_all <- unlist(lapply(raw_list, function(rd) unlist(lapply(MSnbase::spectra(rd), function(sp) MSnbase::intensity(sp))))) + + bins.boundary <- seq(from = min(spectra_mz_all), to = max(spectra_mz_all), length.out = 5) + + if (length(spectra_mz_all) > 1000000) { + rannum <- seq(from = 1, to = length(spectra_mz_all), by = 50) + } else if (length(spectra_mz_all) > 100000) { + rannum <- seq(from = 1, to = length(spectra_mz_all), by = 5) + } else { + rannum <- seq(length(spectra_mz_all)) + } + + spectra_mz_sample <- spectra_mz_all[rannum] + spectra_abundance_sample <- spectra_int_all[rannum] + + spectra_mz_set <- lapply(seq_len(4), function(ii) { + spectra_mz_sample[spectra_mz_sample > bins.boundary[ii] & spectra_mz_sample < bins.boundary[ii + 1]] + }) + + spectra_abundance_set <- lapply(seq_len(4), function(ii) { + spectra_abundance_sample[spectra_mz_sample > bins.boundary[ii] & spectra_mz_sample < bins.boundary[ii + 1]] + }) + + good.bins.list <- lapply( + seq_len(4), + function(i) { + binsb.low <- bins.boundary[i] + binsb.high <- bins.boundary[i + 1] + w <- 1 + bins.width <- (binsb.high - binsb.low) * 0.1 + wind.up <- wind.down <- 0 + inten.sum.set <- numeric() + + while (!wind.up > binsb.high) { + wind.down <- binsb.low + (w - 1) + wind.up <- binsb.low + bins.width + (w - 1) + + inten.sum.set <- c( + inten.sum.set, + sum(spectra_abundance_set[[i]][spectra_mz_set[[i]] > wind.down & + spectra_mz_set[[i]] < wind.up]) + ) + w <- w + 1 + } + + selected.bin.down <- binsb.low + which(max(inten.sum.set) == inten.sum.set) - 1 + selected.bin.up <- binsb.low + which(max(inten.sum.set) == inten.sum.set) - 1 + bins.width + + list(selected.bin.down, selected.bin.up) + } + ) + + rt_vec <- unlist(lapply(raw_list, function(rd) { + scan_names <- ls(rd@assayData) + sapply(scan_names, function(nm) rd@assayData[[nm]]@rt) + })) + + tic_vec <- unlist(lapply(raw_list, function(rd) { + scan_names <- ls(rd@assayData) + sapply(scan_names, function(nm) rd@assayData[[nm]]@tic) + })) + + rt.bin.width <- (max(rt_vec) - min(rt_vec)) * rt.idx + w <- 0 + rt.window.min <- min(rt_vec) + tic.sum <- numeric() + + while (!rt.window.min > max(rt_vec)) { + rt.window.min <- min(rt_vec) + w * 0.75 + rt.window.max <- min(rt_vec) + rt.bin.width + w * 0.75 + idx <- which(rt_vec > rt.window.min & rt_vec < rt.window.max) + if (length(idx)) { + tic.sum <- c(tic.sum, sum(tic_vec[idx])) + } + w <- w + 1 + } + + rt.boundary.lowlimit <- min(rt_vec) + which(max(tic.sum) == tic.sum)[ceiling(length(which(max(tic.sum) == tic.sum)) / 2)] * 0.75 + rt.boundary.uplimit <- min(rt_vec) + which(max(tic.sum) == tic.sum)[ceiling(length(which(max(tic.sum) == tic.sum)) / 2)] * 0.75 + rt.bin.width + + list(good.bins.list = good.bins.list, + rt.boundary = c(rt.boundary.lowlimit, rt.boundary.uplimit)) } + + global_params <- compute_global_params(raw_data_list, rt.idx) - if (mode == "mz_random") { - try(trimed_MSnExp <- mz.trim_random(raw_data, ms_list), silent = TRUE); - } + MessageOutput("Identifying regions of interest (ROI)...", "\n", NULL); - if (mode == "rt_random") { - try(trimed_MSnExp <- rt.trim_random(raw_data, ms_list), silent = TRUE) + trim_single_file <- function(raw_data_single) { + ms_list <- + sapply( + ls(raw_data_single@assayData), + FUN = function(x) + raw_data_single@assayData[[x]]@mz + ); + + if (mode == "ssm") { + trimed_MSnExp <- ssm_trim(raw_data_single, + ms_list, + rt.idx = rt.idx, + good.bins.list = global_params$good.bins.list, + rt.boundary = global_params$rt.boundary); + } + + if (mode == "mz_random") { + try(trimed_MSnExp <- mz.trim_random(raw_data_single, ms_list), silent = TRUE); + } + + if (mode == "rt_random") { + try(trimed_MSnExp <- rt.trim_random(raw_data_single, ms_list), silent = TRUE) + } + + if (mode == "mz_specific") { + trimed_MSnExp <- mz.trim_specific(raw_data_single, ms_list, mz, mzdiff = mzdiff) + } + + if (mode == "rt_specific") { + trimed_MSnExp <- rt.trim_specific(raw_data_single, ms_list, rt, rtdiff = rtdiff) + } + + trimed_MSnExp <- .emptyscan.remove(trimed_MSnExp, ms_list); + return(trimed_MSnExp) } - if (mode == "mz_specific") { - trimed_MSnExp <- mz.trim_specific(raw_data, ms_list, mz, mzdiff = mzdiff) - } + trimed_list <- BiocParallel::bplapply(raw_data_list, + FUN = trim_single_file, + BPPARAM = parallel_param) - if (mode == "rt_specific") { - trimed_MSnExp <- rt.trim_specific(raw_data, ms_list, rt, rtdiff = rtdiff) + invalid_trim <- which(!vapply(trimed_list, function(x) { + is(x, "MSnExp") + }, logical(1))) + + if (length(invalid_trim)) { + stop("EXCEPTION POINT CODE: TRIM1") } - # remove the empty scans in the ms data - trimed_MSnExp <- .emptyscan.remove(trimed_MSnExp, ms_list); + trimed_MSnExp <- combine_msnexp_list(trimed_list); } else{ - # keep the data without trimming. - trimed_MSnExp <- raw_data; + trimed_MSnExp <- combine_msnexp_list(raw_data_list); } if (.running.as.plan) { @@ -492,7 +705,7 @@ PerformROIExtraction <- } } - + MessageOutput(paste0("Identification on ROIs Finished!"), ecol = "\n", NULL); @@ -500,6 +713,18 @@ PerformROIExtraction <- MessageOutput("Optimization will be started soon...", "\n", NULL); } + pheno_tmp <- new("AnnotatedDataFrame", + data = pData(trimed_MSnExp), + varMetadata = varMetadata(phenoData(trimed_MSnExp)), + dimLabels = phenoData(trimed_MSnExp)@dimLabels) + attr(pheno_tmp@data, "row.names") <- seq_len(nrow(pData(trimed_MSnExp))) + trimed_MSnExp@phenoData <- pheno_tmp + protocol_tmp <- new("AnnotatedDataFrame", + data = data.frame(row.names = character(0), + stringsAsFactors = FALSE), + dimLabels = c("sampleNames", "sampleColumns")) + trimed_MSnExp@protocolData <- protocol_tmp + attr(trimed_MSnExp@protocolData@data, "row.names") <- integer(0) mSet@rawInMemory <- trimed_MSnExp; if(.on.public.web()){ @@ -527,7 +752,7 @@ PerformROIExtraction <- #' Mcgill University #' License: GNU GPL (>= 2) -ssm_trim <- function(raw_data, ms_list, rt.idx){ +ssm_trim <- function(raw_data, ms_list, rt.idx, good.bins.list = NULL, rt.boundary = NULL){ # mzdata_points<-unique(unlist(ms_list)) @@ -547,91 +772,76 @@ ssm_trim <- function(raw_data, ms_list, rt.idx){ ms_list <- ms_list[-empty_toRemove]; } - spectra_mz <- unlist(lapply(MSnbase::spectra(raw_data),mz)) - - highestmz<-max(spectra_mz) - lowestmz<-min(spectra_mz) - - # Split MS into 5 large districts - bins.boundary<-seq(from=lowestmz, to= highestmz,length.out = 5) - - - if (length(spectra_mz)>1000000){ - # set.seed(222); - # rannum<- sort(sample(length(spectra_mz), - # ceiling(length(spectra_mz)/50)), - # decreasing = FALSE) - rannum <- seq(from = 1, - to = length(spectra_mz), - by = 50) - } else if (length(spectra_mz)>100000){ - # set.seed(222) - # rannum<-sort(sample(length(spectra_mz), - # ceiling(length(spectra_mz)/5)), - # decreasing = FALSE) - rannum <- seq(from = 1, - to = length(spectra_mz), - by = 5) - } else { - rannum<-seq(length(spectra_mz)) - } - - spectra_abundance <- unlist(lapply(MSnbase::spectra(raw_data),intensity))[rannum] - spectra_mz <- spectra_mz[rannum] - - spectra_mz_set <- sapply(seq_len(4),FUN=function(ii){ + if (is.null(good.bins.list)) { + spectra_mz <- unlist(lapply(MSnbase::spectra(raw_data),mz)) - spectra_mz[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]] + highestmz<-max(spectra_mz) + lowestmz<-min(spectra_mz) - }) - - spectra_abundance_set <- sapply(seq_len(4),FUN=function(ii){ + # Split MS into 5 large districts + bins.boundary<-seq(from=lowestmz, to= highestmz,length.out = 5) - spectra_abundance[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]] - }) - - rm(spectra_abundance); rm(spectra_mz) - - good.bins.list<-bplapply(seq_len(4), - FUN = function(i, spectra_abundance_set, spectra_mz_set, bins.boundary){ - - binsb.low<-bins.boundary[i]; - binsb.high<-bins.boundary[i+1]; - w<-1; - bins.width<-(binsb.high-binsb.low)*0.1; - wind.up<-wind.down<-0; - inten.sum.set<-numeric(); - - while (!wind.up>binsb.high) { - wind.down<-binsb.low+(w-1); - wind.up<-binsb.low+bins.width+(w-1); - #inten.sum<-numeric() + if (length(spectra_mz)>1000000){ + rannum <- seq(from = 1, + to = length(spectra_mz), + by = 50) + } else if (length(spectra_mz)>100000){ + rannum <- seq(from = 1, + to = length(spectra_mz), + by = 5) + } else { + rannum<-seq(length(spectra_mz)) + } + + spectra_abundance <- unlist(lapply(MSnbase::spectra(raw_data),intensity))[rannum] + spectra_mz <- spectra_mz[rannum] + + spectra_mz_set <- sapply(seq_len(4),FUN=function(ii){ + + spectra_mz[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]] + + }) + + spectra_abundance_set <- sapply(seq_len(4),FUN=function(ii){ + + spectra_abundance[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]] + + }) + + rm(spectra_abundance); rm(spectra_mz) + + good.bins.list<-bplapply(seq_len(4), + FUN = function(i, spectra_abundance_set, spectra_mz_set, bins.boundary){ + + binsb.low<-bins.boundary[i]; + binsb.high<-bins.boundary[i+1]; + w<-1; + bins.width<-(binsb.high-binsb.low)*0.1; + wind.up<-wind.down<-0; + inten.sum.set<-numeric(); - inten.sum.set<-c(inten.sum.set, - sum (spectra_abundance_set[[i]] - [spectra_mz_set[[i]] > wind.down & spectra_mz_set[[i]] < wind.up])); + while (!wind.up>binsb.high) { + wind.down<-binsb.low+(w-1); + wind.up<-binsb.low+bins.width+(w-1); + + inten.sum.set<-c(inten.sum.set, + sum (spectra_abundance_set[[i]] + [spectra_mz_set[[i]] > wind.down & spectra_mz_set[[i]] < wind.up])); + w<-w+1; + } - #for (j in 1:length(ms_list_s)){ - # mz.set<-raw_data@assayData[[ms_list_s[j]]]@mz - # a1<-which(sapply(mz.set, FUN=function(x){x>wind.down && xmax(rt_set)) { - rt.window.min<-min(rt_set)+w*0.75 - rt.window.max<-min(rt_set)+rt.bin.width+w*0.75 - rt.name.set<-names(which(sapply(rt_set,FUN = function(x){x>rt.window.min && xmax(rt_set)) { + rt.window.min<-min(rt_set)+w*0.75 + rt.window.max<-min(rt_set)+rt.bin.width+w*0.75 + rt.name.set<-names(which(sapply(rt_set,FUN = function(x){x>rt.window.min && x #include #include #include