diff --git a/bin/aggregate_logs.R b/bin/aggregate_logs.R index 4de5e04..ce3c96e 100755 --- a/bin/aggregate_logs.R +++ b/bin/aggregate_logs.R @@ -15,9 +15,19 @@ suppressPackageStartupMessages(library(parallel)) # Function to read and aggregate filter_blat logs aggregate_filter_blat_logs <- function(log_files, num_cores) { + + expected_steps_keywords <- c("initial", "e-value", "max hits") + # Read all logs into a list of data.tables in parallel and remove "Elapsed time (s)" column log_list <- mclapply(log_files, function(log_file) { log.dt <- fread(log_file, sep = "\t", header = TRUE) + + missing_steps <- expected_steps_keywords[!sapply(expected_steps_keywords, function(p) any(str_detect(log.dt$Step, fixed(p))))] + + if (length(missing_steps) > 0) { + stop(paste("Log file", log_file, "is missing required Step(s): ", paste(missing_steps, collapse = ", "))) + } + elapsed_time_col <- "Elapsed time (s)" if (elapsed_time_col %in% colnames(log.dt)) { log.dt[, (elapsed_time_col) := NULL] # Remove time column @@ -50,9 +60,26 @@ aggregate_filter_blat_logs <- function(log_files, num_cores) { # Function to read and aggregate identify_hybrids logs aggregate_identify_hybrids_logs <- function(log_files, num_cores) { + + expected_types <- c( + "initial_read_count", + "strong_contiguous_match_to_a_single_gene", + "excessive_overlap_in_query_mappings", + "excessive_gap_between_query_mappings", + "excessive_overlap_in_subject_mappings", + "remaining_read_count" + ) + # Read all logs into a list of data.tables in parallel log_list <- mclapply(log_files, function(log_file) { log.dt <- fread(log_file, sep = "\t", header = TRUE) + + # Check all expected types are present + missing_types <- setdiff(expected_types, log.dt$type) + if (length(missing_types) > 0) { + stop(paste("Log file", log_file, "is missing expected types: ", paste(missing_types, collapse = ", "))) + } + return(log.dt) }, mc.cores = num_cores) @@ -61,6 +88,7 @@ aggregate_identify_hybrids_logs <- function(log_files, num_cores) { stop("The list of logs is empty.") } + # If only one log file, return it directly if (length(log_list) == 1) { aggregated.dt <- log_list[[1]] diff --git a/bin/identify_hybrids.R b/bin/identify_hybrids.R index dae1b46..2b434d1 100755 --- a/bin/identify_hybrids.R +++ b/bin/identify_hybrids.R @@ -90,10 +90,24 @@ option_list <- list(make_option(c("-b", "--blast8"), action = "store", type = "c opt_parser = OptionParser(option_list = option_list) opt <- parse_args(opt_parser) +# Prepare data for logging +discarded_reasons <- c( + "strong_contiguous_match_to_a_single_gene", + "excessive_overlap_in_query_mappings", + "excessive_gap_between_query_mappings", + "excessive_overlap_in_subject_mappings" +) + if(is.na(readLines(opt$blast8)[1])) { + message("Empty blast8 input — writing empty output and default log.") fwrite(data.table(), file = opt$output, sep = "\t", col.names = TRUE) # Accounts for empty filtered blast file - fwrite(data.table(), file = opt$log, sep = "\t", col.names = FALSE) + + log.dt <- data.table( + type = c("initial_read_count", discarded_reasons, "remaining_read_count"), + count = c(0L, rep(0L, length(discarded_reasons)), 0L) + ) + fwrite(log.dt, file = opt$log, sep = "\t") } else { @@ -108,7 +122,7 @@ if(is.na(readLines(opt$blast8)[1])) { hybrids.list <- parLapply(cl = cl, blast.list, function(x) get_valid_hybrids(blast.query.dt = x)) stopCluster(cl) - # get_valid_hybrids returns a data.table if valid, or a character string with a discard reason if not. + # get_valid_hybrids returns a data.table if valid, or a character string with a discard reason if not discarded_bools <- sapply(hybrids.list, is.character) # message(sum(S4Vectors::elementNROWS(hybrids.list) == 0), " out of ", length(hybrids.list), " reads did not have hybrids") @@ -126,31 +140,24 @@ if(is.na(readLines(opt$blast8)[1])) { fwrite(valid.hybrids.dt, file = opt$output, sep = "\t", col.names = TRUE) # Prepare data for logging - discarded_reasons <- c( - "strong_contiguous_match_to_a_single_gene", - "excessive_overlap_in_query_mappings", - "excessive_gap_between_query_mappings", - "excessive_overlap_in_subject_mappings" - ) discarded_reads.list <- hybrids.list[discarded_bools] if (length(unlist(discarded_reads.list)) > 0) { - discarded_reads_log.dt <- as.data.table(table(unlist(discarded_reads.list))) + message("Logging discard reasons...") + discarded_vector <- factor(unlist(discarded_reads.list), levels = discarded_reasons) # force factor with full levels + discarded_reads_log.dt <- as.data.table(table(discarded_vector)) # table() will include the unused levels with count = 0 setnames(discarded_reads_log.dt, c("type", "count")) } else { + message("No reads were discarded.") discarded_reads_log.dt <- data.table(type = discarded_reasons, count = rep(0L, length(discarded_reasons))) } - print(head(discarded_reads_log.dt)) - log.dt <- rbind( data.table(type = "initial_read_count", count = length(blast.list)), discarded_reads_log.dt, data.table(type = "remaining_read_count", count = length(hybrids.list[!discarded_bools])) ) - print(head(log.dt)) - fwrite(log.dt, file = opt$log, sep = "\t") } \ No newline at end of file