Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions bin/aggregate_logs.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,19 @@ suppressPackageStartupMessages(library(parallel))

# Function to read and aggregate filter_blat logs
aggregate_filter_blat_logs <- function(log_files, num_cores) {

expected_steps_keywords <- c("initial", "e-value", "max hits")

# Read all logs into a list of data.tables in parallel and remove "Elapsed time (s)" column
log_list <- mclapply(log_files, function(log_file) {
log.dt <- fread(log_file, sep = "\t", header = TRUE)

missing_steps <- expected_steps_keywords[!sapply(expected_steps_keywords, function(p) any(str_detect(log.dt$Step, fixed(p))))]

if (length(missing_steps) > 0) {
stop(paste("Log file", log_file, "is missing required Step(s): ", paste(missing_steps, collapse = ", ")))
}

elapsed_time_col <- "Elapsed time (s)"
if (elapsed_time_col %in% colnames(log.dt)) {
log.dt[, (elapsed_time_col) := NULL] # Remove time column
Expand Down Expand Up @@ -50,9 +60,26 @@ aggregate_filter_blat_logs <- function(log_files, num_cores) {

# Function to read and aggregate identify_hybrids logs
aggregate_identify_hybrids_logs <- function(log_files, num_cores) {

expected_types <- c(
"initial_read_count",
"strong_contiguous_match_to_a_single_gene",
"excessive_overlap_in_query_mappings",
"excessive_gap_between_query_mappings",
"excessive_overlap_in_subject_mappings",
"remaining_read_count"
)

# Read all logs into a list of data.tables in parallel
log_list <- mclapply(log_files, function(log_file) {
log.dt <- fread(log_file, sep = "\t", header = TRUE)

# Check all expected types are present
missing_types <- setdiff(expected_types, log.dt$type)
if (length(missing_types) > 0) {
stop(paste("Log file", log_file, "is missing expected types: ", paste(missing_types, collapse = ", ")))
}

return(log.dt)
}, mc.cores = num_cores)

Expand All @@ -61,6 +88,7 @@ aggregate_identify_hybrids_logs <- function(log_files, num_cores) {
stop("The list of logs is empty.")
}


# If only one log file, return it directly
if (length(log_list) == 1) {
aggregated.dt <- log_list[[1]]
Expand Down
33 changes: 20 additions & 13 deletions bin/identify_hybrids.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,24 @@ option_list <- list(make_option(c("-b", "--blast8"), action = "store", type = "c
opt_parser = OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Prepare data for logging
discarded_reasons <- c(
"strong_contiguous_match_to_a_single_gene",
"excessive_overlap_in_query_mappings",
"excessive_gap_between_query_mappings",
"excessive_overlap_in_subject_mappings"
)

if(is.na(readLines(opt$blast8)[1])) {

message("Empty blast8 input — writing empty output and default log.")
fwrite(data.table(), file = opt$output, sep = "\t", col.names = TRUE) # Accounts for empty filtered blast file
fwrite(data.table(), file = opt$log, sep = "\t", col.names = FALSE)

log.dt <- data.table(
type = c("initial_read_count", discarded_reasons, "remaining_read_count"),
count = c(0L, rep(0L, length(discarded_reasons)), 0L)
)
fwrite(log.dt, file = opt$log, sep = "\t")

} else {

Expand All @@ -108,7 +122,7 @@ if(is.na(readLines(opt$blast8)[1])) {
hybrids.list <- parLapply(cl = cl, blast.list, function(x) get_valid_hybrids(blast.query.dt = x))
stopCluster(cl)

# get_valid_hybrids returns a data.table if valid, or a character string with a discard reason if not.
# get_valid_hybrids returns a data.table if valid, or a character string with a discard reason if not
discarded_bools <- sapply(hybrids.list, is.character)

# message(sum(S4Vectors::elementNROWS(hybrids.list) == 0), " out of ", length(hybrids.list), " reads did not have hybrids")
Expand All @@ -126,31 +140,24 @@ if(is.na(readLines(opt$blast8)[1])) {
fwrite(valid.hybrids.dt, file = opt$output, sep = "\t", col.names = TRUE)

# Prepare data for logging
discarded_reasons <- c(
"strong_contiguous_match_to_a_single_gene",
"excessive_overlap_in_query_mappings",
"excessive_gap_between_query_mappings",
"excessive_overlap_in_subject_mappings"
)
discarded_reads.list <- hybrids.list[discarded_bools]

if (length(unlist(discarded_reads.list)) > 0) {
discarded_reads_log.dt <- as.data.table(table(unlist(discarded_reads.list)))
message("Logging discard reasons...")
discarded_vector <- factor(unlist(discarded_reads.list), levels = discarded_reasons) # force factor with full levels
discarded_reads_log.dt <- as.data.table(table(discarded_vector)) # table() will include the unused levels with count = 0
setnames(discarded_reads_log.dt, c("type", "count"))
} else {
message("No reads were discarded.")
discarded_reads_log.dt <- data.table(type = discarded_reasons, count = rep(0L, length(discarded_reasons)))
}

print(head(discarded_reads_log.dt))

log.dt <- rbind(
data.table(type = "initial_read_count", count = length(blast.list)),
discarded_reads_log.dt,
data.table(type = "remaining_read_count", count = length(hybrids.list[!discarded_bools]))
)

print(head(log.dt))

fwrite(log.dt, file = opt$log, sep = "\t")

}