From db8e15b1536e55a5818f9af7559da3f9acfff000 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sat, 21 Feb 2026 00:36:22 -0500 Subject: [PATCH 1/4] Refactor phases 0-7 (allowed paths only) --- Cargo.toml | 1 + docs/architecture-map.md | 77 ++ scripts/refactor_safety_suite.sh | 8 + scripts/test_simpleaf_local_integration.sh | 14 + src/atac/index.rs | 48 +- src/atac/process.rs | 412 +++++----- src/core.rs | 5 + src/core/context.rs | 54 ++ src/core/exec.rs | 54 ++ src/core/index_meta.rs | 163 ++++ src/core/io.rs | 63 ++ src/core/runtime.rs | 36 + src/main.rs | 10 +- src/simpleaf_commands/chemistry.rs | 455 ++++++++--- src/simpleaf_commands/indexing.rs | 281 ++++--- src/simpleaf_commands/quant.rs | 712 ++++++++-------- src/simpleaf_commands/workflow.rs | 155 ++-- src/utils/workflow_utils.rs | 757 ++++++++++++++---- tests/cli_help_snapshots.rs | 116 +++ tests/cli_smoke.rs | 69 ++ tests/phase1_regressions.rs | 59 ++ tests/snapshots/cli-help/simpleaf___help.txt | 18 + .../cli-help/simpleaf_atac___help.txt | 13 + .../cli-help/simpleaf_atac_index___help.txt | 26 + .../cli-help/simpleaf_atac_process___help.txt | 86 ++ .../cli-help/simpleaf_chemistry___help.txt | 16 + .../simpleaf_chemistry_add___help.txt | 28 + .../simpleaf_chemistry_clean___help.txt | 8 + .../simpleaf_chemistry_fetch___help.txt | 11 + .../simpleaf_chemistry_lookup___help.txt | 9 + .../simpleaf_chemistry_refresh___help.txt | 9 + .../simpleaf_chemistry_remove___help.txt | 10 + .../cli-help/simpleaf_index___help.txt | 61 ++ .../cli-help/simpleaf_inspect___help.txt | 7 + .../cli-help/simpleaf_quant___help.txt | 78 ++ .../simpleaf_refresh_prog_info___help.txt | 7 + .../cli-help/simpleaf_set_paths___help.txt | 11 + .../cli-help/simpleaf_workflow___help.txt | 18 + .../cli-help/simpleaf_workflow_get___help.txt | 9 + .../simpleaf_workflow_list___help.txt | 7 + .../simpleaf_workflow_patch___help.txt | 22 + .../simpleaf_workflow_refresh___help.txt | 7 + .../cli-help/simpleaf_workflow_run___help.txt | 28 + 43 files changed, 2971 insertions(+), 1067 deletions(-) create mode 100644 docs/architecture-map.md create mode 100755 scripts/refactor_safety_suite.sh create mode 100755 scripts/test_simpleaf_local_integration.sh create mode 100644 src/core.rs create mode 100644 src/core/context.rs create mode 100644 src/core/exec.rs create mode 100644 src/core/index_meta.rs create mode 100644 src/core/io.rs create mode 100644 src/core/runtime.rs create mode 100644 tests/cli_help_snapshots.rs create mode 100644 tests/cli_smoke.rs create mode 100644 tests/phase1_regressions.rs create mode 100644 tests/snapshots/cli-help/simpleaf___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_atac___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_atac_index___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_atac_process___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_index___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_inspect___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_quant___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_set_paths___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow_get___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow_list___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow_refresh___help.txt create mode 100644 tests/snapshots/cli-help/simpleaf_workflow_run___help.txt diff --git a/Cargo.toml b/Cargo.toml index 7744979..abeac5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ homepage = "https://simpleaf.readthedocs.io" #documentation = "https://fry.readthedocs.io/en/latest/" include = [ "/src/*.rs", + "/src/core/*.rs", "/src/utils/*.rs", "/src/utils/af_utils/*.rs", "/src/atac/*.rs", diff --git a/docs/architecture-map.md b/docs/architecture-map.md new file mode 100644 index 0000000..5c2e110 --- /dev/null +++ b/docs/architecture-map.md @@ -0,0 +1,77 @@ +# simpleaf Architecture Map + +This document summarizes the post-refactor module boundaries and where new code should live. + +## CLI Entry and Dispatch +- `src/main.rs` + - Parses CLI arguments and dispatches to command handlers. + - Should remain thin: argument parsing, high-level routing, and top-level logging only. +- `src/simpleaf_commands.rs` + - Defines CLI option structs and subcommands. + - Avoid business logic here; keep it as schema/contract for command-line interfaces. + +## Command Implementations +- `src/simpleaf_commands/indexing.rs` + - RNA reference/index orchestration. + - Uses staged outputs for readability and testability. +- `src/simpleaf_commands/quant.rs` + - RNA mapping/quantification orchestration. + - Stage-level helpers produce typed intermediate outputs. +- `src/atac/process.rs` + - ATAC processing pipeline (map/gpl/sort/macs). + - Stage decomposition mirrors RNA command structure. +- `src/simpleaf_commands/workflow.rs` + - Workflow command front-end (run/list/get/patch/refresh). + - Handles run-input resolution before delegating planning/execution. +- `src/simpleaf_commands/chemistry.rs` + - Chemistry registry and permit-list lifecycle operations. + - Contains registry merge/update logic and dry-run-safe file operations. + +## Core Shared Infrastructure +- `src/core/context.rs` + - Runtime context loading/validation (e.g., AF home, tool info). +- `src/core/exec.rs` + - Checked command execution helpers. +- `src/core/index_meta.rs` + - Shared index metadata discovery and parsing. +- `src/core/runtime.rs` + - Runtime helpers (e.g., thread capping). +- `src/core/io.rs` + - JSON file read/write and atomic write helpers. + +## Utility Layer +- `src/utils/workflow_utils.rs` + - Workflow manifest validation, planning, command queue building, execution, and logging. + - Contains typed workflow execution errors and log schema construction. +- `src/utils/chem_utils.rs` + - Chemistry model definitions and registry parsing helpers. +- `src/utils/prog_utils.rs` + - Program/tool discovery and generic process/IO utilities. +- `src/utils/af_utils.rs` + - Domain utilities for geometry and command-level support logic. + +## Testing Layout +- `tests/cli_help_snapshots.rs` + - Guards user-facing CLI help contract. +- `tests/cli_smoke.rs` + - Lightweight argument/dispatch integration checks. +- `tests/phase1_regressions.rs` + - High-value regression checks for prior production bugs. +- Module-local unit tests in `src/**` + - Stage/helper-focused tests close to implementation. + +## CI and Validation Paths +- Strict quality gate: + - `cargo fmt --check` + - `cargo clippy --all-targets -- -D warnings` + - `cargo test --all-targets` +- Deterministic local integration path: + - `scripts/test_simpleaf_local_integration.sh` +- Full e2e toy dataset path (heavier): + - `scripts/test_simpleaf.sh` + +## Extension Guidelines +- Add reusable primitives under `src/core/` before duplicating orchestration logic. +- Keep command modules focused on stage orchestration and user-facing flow. +- Prefer typed errors (`thiserror`) where multiple internal failure variants matter. +- Use atomic writes for registry/log/state files that are critical for resume/recovery flows. diff --git a/scripts/refactor_safety_suite.sh b/scripts/refactor_safety_suite.sh new file mode 100755 index 0000000..50e6fb6 --- /dev/null +++ b/scripts/refactor_safety_suite.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${ROOT_DIR}" + +echo "[refactor-safety] running CLI help snapshot and parse-smoke tests" +cargo test --test cli_help_snapshots --test cli_smoke diff --git a/scripts/test_simpleaf_local_integration.sh b/scripts/test_simpleaf_local_integration.sh new file mode 100755 index 0000000..418e99d --- /dev/null +++ b/scripts/test_simpleaf_local_integration.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${ROOT_DIR}" + +echo "[local-integration] running deterministic integration tests" + +# These tests avoid remote/networked resources and external large datasets. +cargo test --test cli_help_snapshots --test cli_smoke --test phase1_regressions + +# Targeted workflow and chemistry regressions that cover refactor-critical paths. +cargo test --all-targets test_execute_commands_external_failure_reports_status_and_stderr +cargo test --all-targets clean_chemistries_dry_run_is_non_destructive_then_removes_unused diff --git a/src/atac/index.rs b/src/atac/index.rs index 92614b7..1f108e2 100644 --- a/src/atac/index.rs +++ b/src/atac/index.rs @@ -1,25 +1,21 @@ use crate::atac::commands::IndexOpts; -use crate::utils::{ - prog_utils, - prog_utils::{CommandVerbosityLevel, ReqProgs}, -}; +use crate::core::{context, exec, io, runtime}; +use crate::utils::{prog_utils, prog_utils::ReqProgs}; use anyhow; -use anyhow::{bail, Context}; +use anyhow::Context; use cmd_lib::run_fun; -use serde_json::{json, Value}; +use serde_json::json; use std::path::Path; use std::time::Instant; use tracing::{info, warn}; pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "piscem", @@ -60,18 +56,13 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res piscem_index_cmd.arg("--overwrite"); } - let mut threads = opts.threads; - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } piscem_index_cmd @@ -94,14 +85,9 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res info!("piscem build cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut piscem_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke piscem index command"); + exec::run_checked(&mut piscem_index_cmd, "atac piscem index command")?; let index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("piscem index failed to build succesfully {:?}", cres.status); - } - let index_json_file = output_index_dir.join("simpleaf_index.json"); let index_json = json!({ "cmd" : index_cmd_string, @@ -117,11 +103,7 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res "ref" : &opts.input } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; Ok(()) } diff --git a/src/atac/process.rs b/src/atac/process.rs index 47b4ec1..118c059 100644 --- a/src/atac/process.rs +++ b/src/atac/process.rs @@ -1,20 +1,39 @@ use crate::atac::commands::ProcessOpts; +use crate::core::{context, exec, index_meta, io, runtime}; use crate::utils::chem_utils::get_single_custom_chem_from_file; use crate::utils::chem_utils::ExpectedOri; use crate::utils::chem_utils::QueryInRegistry; use crate::utils::constants::CHEMISTRIES_PATH; -use crate::utils::{ - prog_utils, - prog_utils::{CommandVerbosityLevel, ReqProgs}, -}; +use crate::utils::{prog_utils, prog_utils::ReqProgs}; use anyhow; use anyhow::{bail, Context}; -use serde_json::{json, Value}; +use serde_json::json; use std::io::BufReader; use std::path::{Path, PathBuf}; use std::time::Instant; use tracing::{info, warn}; +pub(crate) struct MapStageOutput { + pub map_output: PathBuf, + pub map_duration_secs: f64, + pub map_cmd: String, +} + +struct GplStageOutput { + gpl_duration_secs: f64, + gpl_cmd: String, +} + +struct SortStageOutput { + sort_duration_secs: f64, + sort_cmd: String, +} + +struct MacsStageOutput { + macs_duration_secs: f64, + macs_cmd: String, +} + fn push_advanced_piscem_options( piscem_map_cmd: &mut std::process::Command, opts: &ProcessOpts, @@ -63,7 +82,7 @@ fn add_read_args(map_cmd: &mut std::process::Command, opts: &ProcessOpts) -> any let reads2 = opts .reads2 .as_ref() - .expect("since reads1 files is given, read2 files must be provided."); + .context("read2 files must be provided when read1 files are provided.")?; let barcode_reads: &Vec = &opts.barcode_reads; if reads1.len() != reads2.len() || reads1.len() != barcode_reads.len() { bail!( @@ -99,9 +118,9 @@ fn add_read_args(map_cmd: &mut std::process::Command, opts: &ProcessOpts) -> any .join(","); map_cmd.arg("--barcode").arg(bc_str); } else { - let reads = opts.reads.as_ref().expect( - "since reads1 and reads2 are not provided, the single-end reads must be provided.", - ); + let reads = opts.reads.as_ref().context( + "single-end reads must be provided when read1/read2 files are not provided.", + )?; let barcode_reads: &Vec = &opts.barcode_reads; if reads.len() != barcode_reads.len() { bail!( @@ -136,14 +155,12 @@ pub(crate) fn check_progs>( opts: &ProcessOpts, ) -> anyhow::Result<()> { let af_home_path = af_home_path.as_ref(); - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "alevin-fry", @@ -157,7 +174,7 @@ pub(crate) fn check_progs>( let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "piscem", @@ -172,7 +189,9 @@ pub(crate) fn check_progs>( let macs_prog_info = rp .macs .as_ref() - .expect("macs3 program should be properly set if using the `--call-peaks` option"); + .context( + "macs3 program info is missing; please run `simpleaf set-paths` before using `--call-peaks`.", + )?; match prog_utils::check_version_constraints( "macs3", ">=3.0.2, <4.0.0", @@ -188,86 +207,15 @@ pub(crate) fn check_progs>( // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); - - // figure out what type of index we expect - let index_base; - - let mut index = opts.index.clone(); - // If the user built the index using simpleaf, there are - // 2 possibilities here: - // 1. They are passing in the directory containing the index - // 2. They are passing in the prefix stem of the index files - // The code below is to check, in both cases, if we can automatically - // detect if the index was constructed with simpleaf. - - // If we are in case 1., the passed in path is a directory and - // we can check for the simpleaf_index.json file directly, - // Otherwise if the path is not a directory, we check if it - // ends in piscem_idx (the suffix that simpleaf uses when - // making a piscem index). Then we test the directory we - // get after stripping off this suffix. - let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { - // remove the piscem_idx part - index.pop(); - true - } else { - false - }; + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; - let index_json_path = index.join("simpleaf_index.json"); - match index_json_path.try_exists() { - Ok(true) => { - // we have the simpleaf_index.json file, so parse it. - let index_json_file = std::fs::File::open(&index_json_path).with_context({ - || format!("Could not open file {}", index_json_path.display()) - })?; - - let index_json_reader = BufReader::new(&index_json_file); - let v: Value = serde_json::from_reader(index_json_reader)?; - - let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; - - // here, set the index type based on what we found as the - // value for the `index_type` key. - match index_type_str.as_ref() { - "piscem" => { - // here, either the user has provided us with just - // the directory containing the piscem index, or - // we have "popped" off the "piscem_idx" suffix, so - // add it (back). - index_base = index.join("piscem_idx"); - } - _ => { - bail!( - "unknown index type {} present in simpleaf_index.json", - index_type_str, - ); - } - } - } - Ok(false) => { - // at this point, we have inferred that simpleaf wasn't - // used to construct the index, so fall back to what the user - // requested directly. - // if we have previously removed the piscem_idx suffix, add it back - if removed_piscem_idx_suffix { - index.push("piscem_idx"); - } - index_base = index; - } - Err(e) => { - bail!(e); - } - } + let index_base = index_meta::resolve_atac_piscem_index_base(opts.index.clone())?; let input_files = vec![ index_base.with_extension("ctab"), @@ -292,17 +240,13 @@ pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Resu .arg(opts.bin_overlap.to_string()); // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } // location of output directory, number of threads @@ -339,19 +283,9 @@ being used by simpleaf"#, info!("map command : {}", map_cmd_string); let map_start = Instant::now(); - let map_proc_out = - prog_utils::execute_command(&mut piscem_map_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::map]"); + exec::run_checked(&mut piscem_map_cmd, "[atac::map]")?; let map_duration = map_start.elapsed(); - - if !map_proc_out.status.success() { - bail!( - "atac::map failed with exit status {:?}", - map_proc_out.status - ); - } else { - info!("mapping completed successfully in {:#?}", map_duration); - } + info!("mapping completed successfully in {:#?}", map_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let af_process_info = json!({ @@ -370,25 +304,23 @@ being used by simpleaf"#, // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully mapped reads and generated output RAD file."); - Ok(()) + Ok(MapStageOutput { + map_output, + map_duration_secs: map_duration.as_secs_f64(), + map_cmd: map_cmd_string, + }) } -fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let macs_prog_info = rp .macs .as_ref() - .expect("macs program info should be properly set."); + .context("macs program info is missing; please run `simpleaf set-paths`.")?; let gpl_dir = opts.output.join("af_process"); let bedsuf = if opts.compress { ".bed.gz" } else { ".bed" }; @@ -418,18 +350,9 @@ fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<() info!("macs3 command : {}", macs_cmd_string); let macs_start = Instant::now(); - let macs_proc_out = prog_utils::execute_command(&mut macs_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::macs]"); + exec::run_checked(&mut macs_cmd, "[atac::macs]")?; let macs_duration = macs_start.elapsed(); - - if !macs_proc_out.status.success() { - bail!( - "atac::macs failed with exit status {:?}", - macs_proc_out.status - ); - } else { - info!("macs completed successfully in {:#?}", macs_duration); - } + info!("macs completed successfully in {:#?}", macs_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ -448,35 +371,40 @@ fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<() // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully called peaks using macs3."); - Ok(()) + Ok(MacsStageOutput { + macs_duration_secs: macs_duration.as_secs_f64(), + macs_cmd: macs_cmd_string, + }) } pub(crate) fn gen_bed(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - af_gpl(af_home_path, opts)?; - af_sort(af_home_path, opts)?; - macs_call_peaks(af_home_path, opts)?; + let gpl = af_gpl(af_home_path, opts)?; + let sort = af_sort(af_home_path, opts)?; + let macs = macs_call_peaks(af_home_path, opts)?; + info!( + "ATAC downstream stages completed (gpl: {:.2}s, sort: {:.2}s, macs: {:.2}s).", + gpl.gpl_duration_secs, sort.sort_duration_secs, macs.macs_duration_secs + ); + info!( + "ATAC commands: gpl=`{}`, sort=`{}`, macs=`{}`", + gpl.gpl_cmd, sort.sort_cmd, macs.macs_cmd + ); Ok(()) } // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; let gpl_dir = opts.output.join("af_process"); let rad_dir = opts.output.join("af_map"); @@ -490,17 +418,13 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { .arg(rad_dir); // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } af_sort.arg("--threads").arg(threads.to_string()); @@ -512,18 +436,9 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { info!("sort command : {}", sort_cmd_string); let af_sort_start = Instant::now(); - let af_sort_proc_out = prog_utils::execute_command(&mut af_sort, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::af_sort]"); + exec::run_checked(&mut af_sort, "[atac::af_sort]")?; let af_sort_duration = af_sort_start.elapsed(); - - if !af_sort_proc_out.status.success() { - bail!( - "atac::sort failed with exit status {:?}", - af_sort_proc_out.status - ); - } else { - info!("sort completed successfully in {:#?}", af_sort_duration); - } + info!("sort completed successfully in {:#?}", af_sort_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ -542,27 +457,24 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully sorted and deduplicated records and created the output BED file."); - Ok(()) + Ok(SortStageOutput { + sort_duration_secs: af_sort_duration.as_secs_f64(), + sort_cmd: sort_cmd_string, + }) } // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; let filter_meth_opt; @@ -693,17 +605,13 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { } // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } af_gpl.arg("--threads").arg(format!("{}", threads)); @@ -711,21 +619,12 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { info!("gpl command : {}", gpl_cmd_string); let af_gpl_start = Instant::now(); - let af_gpl_proc_out = prog_utils::execute_command(&mut af_gpl, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::af_gpl]"); + exec::run_checked(&mut af_gpl, "[atac::af_gpl]")?; let af_gpl_duration = af_gpl_start.elapsed(); - - if !af_gpl_proc_out.status.success() { - bail!( - "atac::gpl failed with exit status {:?}", - af_gpl_proc_out.status - ); - } else { - info!( - "permit list generation completed successfully in {:#?}", - af_gpl_duration - ); - } + info!( + "permit list generation completed successfully in {:#?}", + af_gpl_duration + ); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ -744,12 +643,99 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully performed cell barcode detection and correction."); - Ok(()) + Ok(GplStageOutput { + gpl_duration_secs: af_gpl_duration.as_secs_f64(), + gpl_cmd: gpl_cmd_string, + }) +} + +#[cfg(test)] +mod tests { + use std::fs; + + use crate::atac::commands::{AtacChemistry, Macs3GenomeSize}; + + use super::*; + + fn base_process_opts() -> ProcessOpts { + ProcessOpts { + index: PathBuf::from("/tmp/index"), + reads1: None, + reads2: None, + reads: None, + barcode_reads: vec![], + chemistry: AtacChemistry::TenxV2, + barcode_length: 16, + output: PathBuf::from("/tmp/out"), + threads: 1, + call_peaks: false, + permit_barcode_ori: None, + unfiltered_pl: None, + min_reads: 10, + compress: false, + ignore_ambig_hits: false, + no_poison: false, + use_chr: false, + thr: 0.8, + bin_size: 50, + bin_overlap: 2, + no_tn5_shift: false, + check_kmer_orphan: false, + max_ec_card: 4096, + max_hit_occ: 64, + max_hit_occ_recover: 1024, + max_read_occ: 250, + gsize: Macs3GenomeSize::KnownOpt("hs"), + qvalue: 0.1, + extsize: 50, + } + } + + #[test] + fn add_read_args_succeeds_for_paired_end_inputs() { + let td = tempfile::tempdir().expect("failed to create tempdir"); + let r1 = td.path().join("r1.fastq"); + let r2 = td.path().join("r2.fastq"); + let bc = td.path().join("bc.fastq"); + fs::write(&r1, "").expect("failed to write r1"); + fs::write(&r2, "").expect("failed to write r2"); + fs::write(&bc, "").expect("failed to write bc"); + + let mut opts = base_process_opts(); + opts.reads1 = Some(vec![r1]); + opts.reads2 = Some(vec![r2]); + opts.barcode_reads = vec![bc]; + + let mut cmd = std::process::Command::new("echo"); + add_read_args(&mut cmd, &opts).expect("expected add_read_args to succeed"); + } + + #[test] + fn add_read_args_fails_for_mismatched_read_counts() { + let td = tempfile::tempdir().expect("failed to create tempdir"); + let r1 = td.path().join("r1.fastq"); + let r2a = td.path().join("r2a.fastq"); + let r2b = td.path().join("r2b.fastq"); + let bc = td.path().join("bc.fastq"); + fs::write(&r1, "").expect("failed to write r1"); + fs::write(&r2a, "").expect("failed to write r2a"); + fs::write(&r2b, "").expect("failed to write r2b"); + fs::write(&bc, "").expect("failed to write bc"); + + let mut opts = base_process_opts(); + opts.reads1 = Some(vec![r1]); + opts.reads2 = Some(vec![r2a, r2b]); + opts.barcode_reads = vec![bc]; + + let mut cmd = std::process::Command::new("echo"); + let err = add_read_args(&mut cmd, &opts).expect_err("expected mismatch to fail"); + assert!( + format!("{:#}", err).contains("Cannot proceed"), + "unexpected error: {:#}", + err + ); + } } diff --git a/src/core.rs b/src/core.rs new file mode 100644 index 0000000..eaeed1f --- /dev/null +++ b/src/core.rs @@ -0,0 +1,5 @@ +pub mod context; +pub mod exec; +pub mod index_meta; +pub mod io; +pub mod runtime; diff --git a/src/core/context.rs b/src/core/context.rs new file mode 100644 index 0000000..e14024b --- /dev/null +++ b/src/core/context.rs @@ -0,0 +1,54 @@ +use std::path::Path; + +use anyhow::Context; +use serde_json::Value; + +use crate::utils::prog_utils::{self, ReqProgs}; + +pub struct RuntimeContext { + pub progs: ReqProgs, +} + +pub fn load_required_programs(af_home_path: &Path) -> anyhow::Result { + let v: Value = prog_utils::inspect_af_home(af_home_path)?; + serde_json::from_value(v["prog_info"].clone()) + .with_context(|| "Could not deserialize required program metadata from simpleaf_info.json") +} + +pub fn load_runtime_context(af_home_path: &Path) -> anyhow::Result { + Ok(RuntimeContext { + progs: load_required_programs(af_home_path)?, + }) +} + +#[cfg(test)] +mod tests { + use std::fs; + + use serde_json::json; + use tempfile::tempdir; + + use super::load_runtime_context; + + #[test] + fn load_runtime_context_reads_prog_info() { + let td = tempdir().expect("failed to create tempdir"); + let af_info = json!({ + "prog_info": { + "salmon": null, + "piscem": {"exe_path": "/bin/echo", "version": "0.12.2"}, + "alevin_fry": {"exe_path": "/bin/echo", "version": "0.11.2"}, + "macs": null + } + }); + fs::write( + td.path().join("simpleaf_info.json"), + serde_json::to_string_pretty(&af_info).expect("failed to serialize json"), + ) + .expect("failed to write simpleaf_info.json"); + + let ctx = load_runtime_context(td.path()).expect("failed to load runtime context"); + assert!(ctx.progs.piscem.is_some()); + assert!(ctx.progs.alevin_fry.is_some()); + } +} diff --git a/src/core/exec.rs b/src/core/exec.rs new file mode 100644 index 0000000..b550f7c --- /dev/null +++ b/src/core/exec.rs @@ -0,0 +1,54 @@ +use anyhow::{bail, Context}; + +use crate::utils::prog_utils::{self, CommandVerbosityLevel}; + +pub fn run_capture( + cmd: &mut std::process::Command, + context: &str, +) -> anyhow::Result { + prog_utils::execute_command(cmd, CommandVerbosityLevel::Quiet) + .with_context(|| format!("failed to execute {}", context)) +} + +pub fn ensure_success(output: &std::process::Output, context: &str) -> anyhow::Result<()> { + if output.status.success() { + return Ok(()); + } + + let stderr = String::from_utf8_lossy(&output.stderr); + bail!( + "{} failed with exit status {}. stderr: {}", + context, + output.status, + stderr.trim() + ); +} + +pub fn run_checked( + cmd: &mut std::process::Command, + context: &str, +) -> anyhow::Result { + let output = run_capture(cmd, context)?; + ensure_success(&output, context)?; + Ok(output) +} + +#[cfg(test)] +mod tests { + use super::run_checked; + + #[test] + fn run_checked_succeeds_for_true_command() { + let mut cmd = std::process::Command::new("sh"); + cmd.arg("-c").arg("true"); + run_checked(&mut cmd, "sh true").expect("expected true command to succeed"); + } + + #[test] + fn run_checked_errors_for_false_command() { + let mut cmd = std::process::Command::new("sh"); + cmd.arg("-c").arg("false"); + let err = run_checked(&mut cmd, "sh false").expect_err("expected false command to fail"); + assert!(format!("{:#}", err).contains("failed with exit status")); + } +} diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs new file mode 100644 index 0000000..af53247 --- /dev/null +++ b/src/core/index_meta.rs @@ -0,0 +1,163 @@ +use std::path::PathBuf; + +use anyhow::bail; +use serde_json::Value; + +use crate::core::io; +use crate::utils::af_utils::IndexType; + +pub struct QuantIndexMetadata { + pub index_type: IndexType, + pub inferred_t2g: Option, + pub inferred_gene_id_to_name: Option, +} + +pub fn resolve_quant_index( + index: Option, + use_piscem: bool, +) -> anyhow::Result { + let mut inferred_t2g = None; + let mut inferred_gene_id_to_name = None; + let index_type; + + if let Some(mut index) = index { + let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { + index.pop(); + true + } else { + false + }; + + let index_json_path = index.join("simpleaf_index.json"); + match index_json_path.try_exists() { + Ok(true) => { + let v: Value = io::read_json_file(&index_json_path)?; + + let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; + index_type = match index_type_str.as_ref() { + "salmon" => IndexType::Salmon(index.clone()), + "piscem" => IndexType::Piscem(index.join("piscem_idx")), + _ => { + bail!( + "unknown index type {} present in simpleaf_index.json", + index_type_str + ); + } + }; + + let t2g_opt: Option = serde_json::from_value(v["t2g_file"].clone())?; + if let Some(t2g_rel) = t2g_opt { + inferred_t2g = Some(index.join(t2g_rel)); + } + + if index.join("gene_id_to_name.tsv").exists() { + inferred_gene_id_to_name = Some(index.join("gene_id_to_name.tsv")); + } else if let Some(index_parent) = index.parent() { + let gene_name_path = index_parent.join("ref").join("gene_id_to_name.tsv"); + if gene_name_path.exists() && gene_name_path.is_file() { + inferred_gene_id_to_name = Some(gene_name_path); + } + } + } + Ok(false) => { + if removed_piscem_idx_suffix { + index.push("piscem_idx"); + } + index_type = if use_piscem { + IndexType::Piscem(index) + } else { + IndexType::Salmon(index) + }; + } + Err(e) => bail!(e), + } + } else { + index_type = IndexType::NoIndex; + } + + Ok(QuantIndexMetadata { + index_type, + inferred_t2g, + inferred_gene_id_to_name, + }) +} + +pub fn resolve_atac_piscem_index_base(mut index: PathBuf) -> anyhow::Result { + let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { + index.pop(); + true + } else { + false + }; + + let index_json_path = index.join("simpleaf_index.json"); + match index_json_path.try_exists() { + Ok(true) => { + let v: Value = io::read_json_file(&index_json_path)?; + + let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; + match index_type_str.as_ref() { + "piscem" => Ok(index.join("piscem_idx")), + _ => bail!( + "unknown index type {} present in simpleaf_index.json", + index_type_str + ), + } + } + Ok(false) => { + if removed_piscem_idx_suffix { + index.push("piscem_idx"); + } + Ok(index) + } + Err(e) => bail!(e), + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use serde_json::json; + use tempfile::tempdir; + + use crate::utils::af_utils::IndexType; + + use super::{resolve_atac_piscem_index_base, resolve_quant_index}; + + #[test] + fn resolve_quant_index_reads_simpleaf_index_metadata() { + let td = tempdir().expect("failed to create tempdir"); + let idx_dir = td.path().join("index"); + fs::create_dir_all(&idx_dir).expect("failed to create index dir"); + fs::write( + idx_dir.join("simpleaf_index.json"), + serde_json::to_string_pretty(&json!({ + "index_type":"salmon", + "t2g_file":"t2g_3col.tsv" + })) + .expect("failed to serialize json"), + ) + .expect("failed to write simpleaf_index.json"); + fs::write(idx_dir.join("gene_id_to_name.tsv"), "g1\tn1\n") + .expect("failed to write gene_id_to_name.tsv"); + + let meta = resolve_quant_index(Some(idx_dir.clone()), false) + .expect("failed to resolve quant index"); + assert!(matches!(meta.index_type, IndexType::Salmon(_))); + assert_eq!(meta.inferred_t2g, Some(idx_dir.join("t2g_3col.tsv"))); + assert_eq!( + meta.inferred_gene_id_to_name, + Some(idx_dir.join("gene_id_to_name.tsv")) + ); + } + + #[test] + fn resolve_atac_index_base_accepts_plain_prefix() { + let td = tempdir().expect("failed to create tempdir"); + let idx = td.path().join("foo").join("piscem_idx"); + let resolved = + resolve_atac_piscem_index_base(idx.clone()).expect("failed to resolve atac index"); + assert_eq!(resolved, idx); + } +} diff --git a/src/core/io.rs b/src/core/io.rs new file mode 100644 index 0000000..88b6279 --- /dev/null +++ b/src/core/io.rs @@ -0,0 +1,63 @@ +use std::fs; +use std::path::Path; + +use anyhow::Context; +use serde::Serialize; +use serde_json::Value; + +pub fn read_json_file(path: &Path) -> anyhow::Result { + let json_file = std::fs::File::open(path) + .with_context(|| format!("Could not open JSON file {}.", path.display()))?; + let v: Value = serde_json::from_reader(json_file) + .with_context(|| format!("Could not parse JSON file {}.", path.display()))?; + Ok(v) +} + +pub fn write_json_pretty(path: &Path, value: &T) -> anyhow::Result<()> { + let payload = serde_json::to_string_pretty(value) + .with_context(|| format!("Could not serialize JSON for {}.", path.display()))?; + fs::write(path, payload).with_context(|| format!("could not write {}", path.display())) +} + +pub fn write_json_pretty_atomic(path: &Path, value: &T) -> anyhow::Result<()> { + let payload = serde_json::to_string_pretty(value) + .with_context(|| format!("Could not serialize JSON for {}.", path.display()))?; + let tmp_path = path.with_extension("tmp"); + fs::write(&tmp_path, payload) + .with_context(|| format!("could not write temporary file {}", tmp_path.display()))?; + fs::rename(&tmp_path, path).with_context(|| { + format!( + "could not atomically rename {} to {}", + tmp_path.display(), + path.display() + ) + }) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + use tempfile::tempdir; + + use super::{read_json_file, write_json_pretty, write_json_pretty_atomic}; + + #[test] + fn write_and_read_json_roundtrip() { + let td = tempdir().expect("failed to create tempdir"); + let path = td.path().join("a.json"); + let v = json!({"key":"value","n":1}); + write_json_pretty(&path, &v).expect("failed to write json"); + let read_back = read_json_file(&path).expect("failed to read json"); + assert_eq!(read_back, v); + } + + #[test] + fn write_json_pretty_atomic_persists_content() { + let td = tempdir().expect("failed to create tempdir"); + let path = td.path().join("b.json"); + let v = json!({"ok":true}); + write_json_pretty_atomic(&path, &v).expect("failed to write json atomically"); + let read_back = read_json_file(&path).expect("failed to read json"); + assert_eq!(read_back, v); + } +} diff --git a/src/core/runtime.rs b/src/core/runtime.rs new file mode 100644 index 0000000..3329337 --- /dev/null +++ b/src/core/runtime.rs @@ -0,0 +1,36 @@ +pub fn cap_threads(requested: u32) -> (u32, Option) { + cap_threads_with_limit( + requested, + std::thread::available_parallelism() + .ok() + .map(|n| n.get() as u32), + ) +} + +fn cap_threads_with_limit(requested: u32, limit: Option) -> (u32, Option) { + if let Some(max_threads) = limit { + if requested > max_threads { + return (max_threads, Some(max_threads)); + } + } + (requested, None) +} + +#[cfg(test)] +mod tests { + use super::cap_threads_with_limit; + + #[test] + fn caps_requested_threads_when_over_limit() { + let (effective, capped_at) = cap_threads_with_limit(32, Some(8)); + assert_eq!(effective, 8); + assert_eq!(capped_at, Some(8)); + } + + #[test] + fn keeps_requested_threads_when_within_limit() { + let (effective, capped_at) = cap_threads_with_limit(8, Some(32)); + assert_eq!(effective, 8); + assert_eq!(capped_at, None); + } +} diff --git a/src/main.rs b/src/main.rs index 64efefe..c144bbe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use chemistry::{ add_chemistry, clean_chemistries, fetch_chemistries, lookup_chemistry, refresh_chemistries, remove_chemistry, }; +use tracing::info; use tracing_subscriber::{filter::LevelFilter, fmt, prelude::*, EnvFilter}; use anyhow::bail; @@ -10,6 +11,7 @@ use clap::{crate_version, Parser}; use std::env; use std::path::PathBuf; +mod core; mod defaults; mod utils; @@ -109,7 +111,13 @@ fn main() -> anyhow::Result<()> { // validate versions atac::process::check_progs(&af_home_path, &process_opts)?; // first we map the reads - atac::process::map_reads(af_home_path.as_path(), &process_opts)?; + let map_stage = atac::process::map_reads(af_home_path.as_path(), &process_opts)?; + info!( + "ATAC map stage complete: output={} duration={:.2}s cmd={}", + map_stage.map_output.display(), + map_stage.map_duration_secs, + map_stage.map_cmd + ); // then we generate the permit list and sort the file atac::process::gen_bed(af_home_path.as_path(), &process_opts) } diff --git a/src/simpleaf_commands/chemistry.rs b/src/simpleaf_commands/chemistry.rs index 22d6918..418b551 100644 --- a/src/simpleaf_commands/chemistry.rs +++ b/src/simpleaf_commands/chemistry.rs @@ -1,3 +1,4 @@ +use crate::core::io::write_json_pretty_atomic; use crate::utils::chem_utils::{ custom_chem_hm_into_json, get_custom_chem_hm, get_single_custom_chem_from_file, CustomChemistry, CustomChemistryMap, ExpectedOri, LOCAL_PL_PATH_KEY, REMOTE_PL_URL_KEY, @@ -10,15 +11,128 @@ use regex::Regex; use anyhow::{bail, Context, Result}; use semver::Version; use serde_json::json; +use serde_json::{Map, Value}; use std::collections::HashSet; use std::fs; -use std::io::{Seek, Write}; use std::os::unix::fs::MetadataExt; use std::path::{Path, PathBuf}; use tracing::{debug, info, warn}; use utils::prog_utils::read_json_from_remote_url; use utils::remote::is_remote_url; +fn removable_permit_lists( + used_pls: &HashSet, + present_pls: &HashSet, +) -> HashSet { + present_pls - used_pls +} + +/// Parse a chemistry version string with additional operation context. +fn parse_chemistry_version(version: &str, context: &str) -> Result { + Version::parse(version).with_context(|| { + format!( + "Could not parse version {} while {}. Please follow https://semver.org/ (e.g. 0.1.0).", + version, context + ) + }) +} + +/// Return `true` if a new chemistry entry should replace an existing one. +/// +/// Replacement happens when `force` is set or when the incoming version is +/// strictly newer than the existing version. +fn should_replace_registry_entry(existing: &Value, incoming: &Value, force: bool) -> Result { + if force { + return Ok(true); + } + let current_version = existing + .get("version") + .and_then(Value::as_str) + .with_context(|| "Chemistry should have a string `version` field.")?; + let new_version = incoming + .get("version") + .and_then(Value::as_str) + .with_context(|| "Chemistry should have a string `version` field.")?; + Ok( + parse_chemistry_version(new_version, "comparing incoming chemistry")? + > parse_chemistry_version(current_version, "comparing existing chemistry")?, + ) +} + +/// Merge incoming registry entries into an existing registry object. +/// +/// Missing keys are inserted. Existing keys are replaced only when +/// `should_replace_registry_entry` permits it. +fn merge_registry_entries( + existing: &mut Map, + incoming: &Map, + force: bool, + dry_run_pref: &str, +) -> Result<()> { + for (k, v) in incoming { + match existing.get_mut(k) { + None => { + existing.insert(k.clone(), v.clone()); + } + Some(curr) => { + if should_replace_registry_entry(curr, v, force)? { + info!("{}updating {}", dry_run_pref, k); + existing.insert(k.clone(), v.clone()); + } + } + } + } + Ok(()) +} + +/// Merge entries from deprecated `custom_chemistries.json` into the main registry. +/// +/// Only missing entries are inserted, and only when the deprecated value is a +/// valid geometry string. +fn merge_deprecated_registry_entries( + existing: &mut Map, + deprecated: &Map, + dry_run_pref: &str, +) { + for (k, v) in deprecated { + if existing.contains_key(k) { + warn!( + "{}The main registry already contained the chemistry \"{}\"; Ignored the one from the deprecated registry.", + dry_run_pref, k + ); + } else if let Value::String(geom) = v { + if validate_geometry(geom).is_ok() { + let new_ent = json!({ + "geometry": geom, + "expected_ori": "both", + "version" : CustomChemistry::default_version(), + }); + existing.insert(k.to_owned(), new_ent); + info!( + "{}Successfully inserted chemistry \"{}\" from the deprecated registry into the main registry.", + dry_run_pref, k + ); + } else { + warn!( + "{}The chemistry \"{}\" in the deprecated registry is not a valid geometry string; Skipped.", + dry_run_pref, k + ); + } + } else { + warn!( + "{}The chemistry \"{}\" in the deprecated registry is not a string; Skipped.", + dry_run_pref, k + ); + } + } +} + +/// Persist a JSON value as pretty-printed text to disk. +fn write_json_pretty(path: &Path, value: &Value) -> Result<()> { + write_json_pretty_atomic(path, value) + .with_context(|| format!("Could not write {}", path.display())) +} + /// Attempt to get the chemistry definition from the provided JSON file /// Check if the JSON file is local or remote. If remote, fetch the file first. /// Parse the JSON file, and look for the specific chemistry with the requested name. @@ -160,7 +274,7 @@ pub fn add_chemistry( let version = add_opts .version .unwrap_or(CustomChemistry::default_version()); - let add_ver = Version::parse(version.as_ref()).with_context(|| format!("Could not parse version {}. Please follow https://semver.org/. A valid example is 0.1.0", version))?; + let add_ver = parse_chemistry_version(version.as_ref(), "adding chemistry")?; let name = add_opts.name; @@ -169,7 +283,10 @@ pub fn add_chemistry( if let Some(existing_entry) = get_single_custom_chem_from_file(&chem_p, &name)? { let existing_ver_str = existing_entry.version(); - let existing_ver = Version::parse(existing_ver_str).with_context( || format!("Could not parse version {} found in existing chemistries.json file. Please correct this entry", existing_ver_str))?; + let existing_ver = parse_chemistry_version( + existing_ver_str, + "reading existing chemistry version from chemistries.json", + )?; if add_ver <= existing_ver { info!("Attempting to add chemistry with version {:#} which is <= than the existing version ({:#}) for this chemistry; Skipping addition.", add_ver, existing_ver); return Ok(()); @@ -302,15 +419,7 @@ pub fn add_chemistry( // convert the custom chemistry hashmap to json let v = custom_chem_hm_into_json(chem_hm)?; - - // write out the new custom chemistry file - let mut custom_chem_file = std::fs::File::create(&chem_p) - .with_context(|| format!("Could not create {}", chem_p.display()))?; - custom_chem_file.rewind()?; - - custom_chem_file - .write_all(serde_json::to_string_pretty(&v).unwrap().as_bytes()) - .with_context(|| format!("Could not write {}", chem_p.display()))?; + write_json_pretty(&chem_p, &v)?; Ok(()) } @@ -377,44 +486,8 @@ pub fn refresh_chemistries( prog_utils::download_to_file(CHEMISTRIES_URL, &tmp_chem_path)?; if let Some(existing_chem) = parse_resource_json_file(&chem_path, None)?.as_object_mut() { if let Some(new_chem) = parse_resource_json_file(&tmp_chem_path, None)?.as_object() { - for (k, v) in new_chem.iter() { - match existing_chem.get_mut(k) { - None => { - existing_chem.insert(k.clone(), v.clone()); - } - Some(ev) => { - let curr_ver = Version::parse( - ev.get("version") - .expect("Chemistry should have a version field") - .as_str() - .expect("Version should be a string"), - )?; - let new_ver = Version::parse( - v.get("version") - .expect("Chemistry should have a version field") - .as_str() - .expect("Version should be a string"), - )?; - if refresh_opts.force || new_ver > curr_ver { - info!("{}updating {}", dry_run_pref, k); - existing_chem.insert(k.clone(), v.clone()); - } - } - } - } - - // write out the merged chemistry file - let mut chem_file = std::fs::File::create(&chem_path) - .with_context(|| format!("could not create {}", chem_path.display()))?; - chem_file.rewind()?; - - chem_file - .write_all( - serde_json::to_string_pretty(&existing_chem) - .unwrap() - .as_bytes(), - ) - .with_context(|| format!("could not write {}", chem_path.display()))?; + merge_registry_entries(existing_chem, new_chem, refresh_opts.force, dry_run_pref)?; + write_json_pretty(&chem_path, &Value::Object(existing_chem.clone()))?; // remove the temp file std::fs::remove_file(tmp_chem_path)?; @@ -434,34 +507,8 @@ pub fn refresh_chemistries( if let Some(old_custom_chem) = parse_resource_json_file(&custom_chem_file, None)?.as_object() { - for (k, v) in old_custom_chem.iter() { - if new_chem.contains_key(k) { - warn!("{}The main registry already contained the chemistry \"{}\"; Ignored the one from the deprecated registry.", dry_run_pref, k); - } else if let serde_json::Value::String(v) = v { - if validate_geometry(v).is_ok() { - let new_ent = json!({ - "geometry": v, - "expected_ori": "both", - "version" : CustomChemistry::default_version(), - }); - new_chem.insert(k.to_owned(), new_ent); - info!("{}Successfully inserted chemistry \"{}\" from the deprecated registry into the main registry.", dry_run_pref, k); - } else { - warn!("{}The chemistry \"{}\" in the deprecated registry is not a valid geometry string; Skipped.", dry_run_pref, k); - } - } else { - warn!("{}The chemistry \"{}\" in the deprecated registry is not a string; Skipped.", dry_run_pref, k); - } - } - - // write out the merged chemistry file - let mut chem_file = std::fs::File::create(&chem_path) - .with_context(|| format!("could not create {}", chem_path.display()))?; - chem_file.rewind()?; - - chem_file - .write_all(serde_json::to_string_pretty(&new_chem).unwrap().as_bytes()) - .with_context(|| format!("could not write {}", chem_path.display()))?; + merge_deprecated_registry_entries(new_chem, old_custom_chem, dry_run_pref); + write_json_pretty(&chem_path, &Value::Object(new_chem.clone()))?; let backup = custom_chem_file.with_extension("json.bak"); std::fs::rename(custom_chem_file, backup)?; @@ -531,17 +578,17 @@ pub fn clean_chemistries( }) .collect::>(); - let rem_pls = &present_pls - &used_pls; + let rem_pls = removable_permit_lists(&used_pls, &present_pls); // check if the chemistry already exists and log if dry_run { if rem_pls.is_empty() { info!("[dry_run] : No permit list files in the cache directory are currently unused; Nothing to clean."); } else { - info!("[dry_run] : The following files in the permit list directory do not match any registered chemistries and would be removed: {:#?}", present_pls); + info!("[dry_run] : The following files in the permit list directory do not match any registered chemistries and would be removed: {:#?}", rem_pls); } } else { - for pl in rem_pls { + for pl in &rem_pls { info!("removing file from {}", pl.display()); std::fs::remove_file(pl)?; } @@ -593,15 +640,7 @@ pub fn remove_chemistry( } else if !remove_opts.dry_run { // convert the custom chemistry hashmap to json let v = custom_chem_hm_into_json(chem_hm)?; - - // write out the new custom chemistry file - let mut custom_chem_file = std::fs::File::create(&chem_p) - .with_context(|| format!("Could not create {}", chem_p.display()))?; - custom_chem_file.rewind()?; - - custom_chem_file - .write_all(serde_json::to_string_pretty(&v).unwrap().as_bytes()) - .with_context(|| format!("Could not write {}", chem_p.display()))?; + write_json_pretty(&chem_p, &v)?; } Ok(()) @@ -760,3 +799,239 @@ pub fn fetch_chemistries( Ok(()) } + +#[cfg(test)] +mod tests { + use super::{ + add_chemistry, clean_chemistries, merge_deprecated_registry_entries, + merge_registry_entries, parse_chemistry_version, removable_permit_lists, remove_chemistry, + }; + use crate::simpleaf_commands::{ChemistryAddOpts, ChemistryCleanOpts, ChemistryRemoveOpts}; + use crate::utils::constants::CHEMISTRIES_PATH; + use serde_json::{json, Map, Value}; + use std::collections::HashSet; + use std::fs; + use std::path::PathBuf; + use tempfile::tempdir; + + /// Write a JSON value to `/chemistries.json` for registry tests. + fn write_registry(af_home: &std::path::Path, value: &Value) { + fs::write( + af_home.join(CHEMISTRIES_PATH), + serde_json::to_string_pretty(value).unwrap(), + ) + .unwrap(); + } + + /// Read `/chemistries.json` as JSON. + fn read_registry(af_home: &std::path::Path) -> Value { + serde_json::from_str(&fs::read_to_string(af_home.join(CHEMISTRIES_PATH)).unwrap()).unwrap() + } + + #[test] + fn removable_permit_list_set_only_contains_unused_files() { + let used = HashSet::from([PathBuf::from("plist/a"), PathBuf::from("plist/b")]); + let present = HashSet::from([ + PathBuf::from("plist/a"), + PathBuf::from("plist/b"), + PathBuf::from("plist/c"), + ]); + + let removable = removable_permit_lists(&used, &present); + assert_eq!(removable, HashSet::from([PathBuf::from("plist/c")])); + } + + #[test] + fn merge_registry_entries_prefers_newer_versions_unless_forced() { + let mut existing = Map::new(); + existing.insert( + "chem_a".to_string(), + json!({"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"1.0.0"}), + ); + let mut incoming = Map::new(); + incoming.insert( + "chem_a".to_string(), + json!({"geometry":"1{b[16]u[10]x:}2{r:}","expected_ori":"both","version":"0.9.0"}), + ); + incoming.insert( + "chem_b".to_string(), + json!({"geometry":"1{b[16]u[12]x:}2{r[50]x:}","expected_ori":"both","version":"0.1.0"}), + ); + + merge_registry_entries(&mut existing, &incoming, false, "").unwrap(); + assert_eq!(existing["chem_a"]["version"], json!("1.0.0")); + assert_eq!(existing["chem_b"]["version"], json!("0.1.0")); + + merge_registry_entries(&mut existing, &incoming, true, "").unwrap(); + assert_eq!(existing["chem_a"]["version"], json!("0.9.0")); + } + + #[test] + fn merge_deprecated_registry_entries_inserts_only_valid_missing_entries() { + let mut existing = Map::new(); + existing.insert( + "already".to_string(), + json!({"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"1.0.0"}), + ); + let deprecated = Map::from_iter([ + ( + "already".to_string(), + Value::String("1{b[10]}2{r:}".to_string()), + ), + ( + "valid_new".to_string(), + Value::String("1{b[16]u[12]x:}2{r:}".to_string()), + ), + ( + "invalid_new".to_string(), + Value::String("bad-geom".to_string()), + ), + ]); + + merge_deprecated_registry_entries(&mut existing, &deprecated, ""); + + assert!(existing.contains_key("already")); + assert!(existing.contains_key("valid_new")); + assert!(!existing.contains_key("invalid_new")); + } + + #[test] + fn add_chemistry_only_updates_when_version_increases() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "mychem": { + "geometry": "1{b[16]u[12]x:}2{r:}", + "expected_ori": "both", + "version": "1.2.0" + } + }), + ); + + add_chemistry( + tmp.path().to_path_buf(), + ChemistryAddOpts { + name: "mychem".to_string(), + geometry: Some("1{b[16]u[10]x:}2{r:}".to_string()), + expected_ori: Some("both".to_string()), + local_url: None, + remote_url: None, + version: Some("1.1.0".to_string()), + from_json: None, + }, + ) + .unwrap(); + let registry_after_older = read_registry(tmp.path()); + assert_eq!(registry_after_older["mychem"]["version"], json!("1.2.0")); + assert_eq!( + registry_after_older["mychem"]["geometry"], + json!("1{b[16]u[12]x:}2{r:}") + ); + + add_chemistry( + tmp.path().to_path_buf(), + ChemistryAddOpts { + name: "mychem".to_string(), + geometry: Some("1{b[16]u[10]x:}2{r:}".to_string()), + expected_ori: Some("both".to_string()), + local_url: None, + remote_url: None, + version: Some("1.3.0".to_string()), + from_json: None, + }, + ) + .unwrap(); + let registry_after_newer = read_registry(tmp.path()); + assert_eq!(registry_after_newer["mychem"]["version"], json!("1.3.0")); + assert_eq!( + registry_after_newer["mychem"]["geometry"], + json!("1{b[16]u[10]x:}2{r:}") + ); + } + + #[test] + fn remove_chemistry_respects_dry_run_and_regex_removal() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "foo_chem": {"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"0.1.0"}, + "bar_chem": {"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"0.1.0"} + }), + ); + + remove_chemistry( + tmp.path().to_path_buf(), + ChemistryRemoveOpts { + name: "foo_.*".to_string(), + dry_run: true, + }, + ) + .unwrap(); + let after_dry_run = read_registry(tmp.path()); + assert!(after_dry_run.get("foo_chem").is_some()); + assert!(after_dry_run.get("bar_chem").is_some()); + + remove_chemistry( + tmp.path().to_path_buf(), + ChemistryRemoveOpts { + name: "foo_.*".to_string(), + dry_run: false, + }, + ) + .unwrap(); + let after_remove = read_registry(tmp.path()); + assert!(after_remove.get("foo_chem").is_none()); + assert!(after_remove.get("bar_chem").is_some()); + } + + #[test] + fn clean_chemistries_dry_run_is_non_destructive_then_removes_unused() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "chem": { + "geometry":"1{b[16]u[12]x:}2{r:}", + "expected_ori":"both", + "version":"0.1.0", + "plist_name":"keep_pl" + } + }), + ); + + let plist_dir = tmp.path().join("plist"); + fs::create_dir_all(&plist_dir).unwrap(); + let keep = plist_dir.join("keep_pl"); + let remove = plist_dir.join("remove_pl"); + fs::write(&keep, "keep").unwrap(); + fs::write(&remove, "remove").unwrap(); + + clean_chemistries( + tmp.path().to_path_buf(), + ChemistryCleanOpts { dry_run: true }, + ) + .unwrap(); + assert!(keep.exists()); + assert!(remove.exists()); + + clean_chemistries( + tmp.path().to_path_buf(), + ChemistryCleanOpts { dry_run: false }, + ) + .unwrap(); + assert!(keep.exists()); + assert!(!remove.exists()); + } + + #[test] + fn parse_chemistry_version_rejects_invalid_versions() { + let err = parse_chemistry_version("not-a-version", "unit test").unwrap_err(); + assert!( + format!("{:#}", err).contains("Could not parse version"), + "unexpected error: {:#}", + err + ); + } +} diff --git a/src/simpleaf_commands/indexing.rs b/src/simpleaf_commands/indexing.rs index 3556db1..98dd124 100644 --- a/src/simpleaf_commands/indexing.rs +++ b/src/simpleaf_commands/indexing.rs @@ -1,12 +1,12 @@ +use crate::core::{context, exec, io, runtime}; use crate::utils::af_utils::create_dir_if_absent; use crate::utils::prog_utils; -use crate::utils::prog_utils::{CommandVerbosityLevel, ReqProgs}; +use crate::utils::prog_utils::ReqProgs; use anyhow::{anyhow, bail, Context}; use roers; use serde::Deserialize; use serde_json::json; -use serde_json::Value; use std::collections::HashSet; use std::fs::File; use std::io::{BufWriter, Write}; @@ -16,6 +16,100 @@ use tracing::{error, info, warn}; use super::{IndexOpts, ReferenceType}; +struct ReferenceStageOutput { + ref_seq: PathBuf, + min_seq_len: Option, + t2g: Option, + gene_id_to_name: Option, + roers_duration: Option, + roers_cmd: Option, +} + +struct IndexBuildStageOutput { + index_duration: std::time::Duration, + index_cmd_string: String, +} + +fn derive_kmer_and_minimizer( + min_seq_len: Option, + default_kmer_length: u32, + default_minimizer_length: u32, +) -> anyhow::Result<(u32, u32)> { + if let Some(msl) = min_seq_len { + if msl < 10 { + bail!("The reference sequences are too short for indexing. Please provide sequences with a minimum length of at least 10 bases."); + } + if (msl / 2) < default_kmer_length && default_kmer_length == 31 { + let kmer_length = msl / 2; + let minimizer_length = (kmer_length as f32 / 1.8).ceil() as u32 + 1; + info!( + "Using kmer_length = {} and minimizer_length = {} because the default values are too big for the reference sequences.", + kmer_length, minimizer_length + ); + Ok((kmer_length, minimizer_length)) + } else { + Ok((default_kmer_length, default_minimizer_length)) + } + } else { + Ok((default_kmer_length, default_minimizer_length)) + } +} + +fn write_index_log_stage( + output: &Path, + reference_stage: &ReferenceStageOutput, + index_stage: &IndexBuildStageOutput, +) -> anyhow::Result<()> { + let index_log_file = output.join("simpleaf_index_log.json"); + let index_log_info = if let Some(roers_duration) = reference_stage.roers_duration { + json!({ + "time_info" : { + "roers_time" : roers_duration, + "index_time" : index_stage.index_duration + }, + "cmd_info" : { + "roers_cmd" : reference_stage.roers_cmd, + "index_cmd" : index_stage.index_cmd_string, + } + }) + } else { + json!({ + "time_info" : { + "index_time" : index_stage.index_duration + }, + "cmd_info" : { + "index_cmd" : index_stage.index_cmd_string, + } + }) + }; + + io::write_json_pretty_atomic(&index_log_file, &index_log_info) +} + +#[cfg(test)] +mod tests { + use super::derive_kmer_and_minimizer; + + #[test] + fn derive_kmer_and_minimizer_fails_for_short_reference() { + let err = derive_kmer_and_minimizer(Some(9), 31, 19) + .expect_err("expected short sequence length to fail"); + assert!( + format!("{:#}", err).contains("minimum length of at least 10"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn derive_kmer_and_minimizer_adjusts_defaults_for_short_sequences() { + let (k, m) = + derive_kmer_and_minimizer(Some(20), 31, 19).expect("expected adjusted k/m values"); + assert_eq!(k, 10); + assert_eq!(m, 7); + } +} + fn validate_index_type_opts(opts: &IndexOpts) -> anyhow::Result<()> { let mut bail = false; if opts.use_piscem && opts.sparse { @@ -230,9 +324,7 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu validate_index_type_opts(&opts)?; let mut threads = opts.threads; let output = opts.output; - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - // Read the JSON contents of the file as an instance of `User`. - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; rp.issue_recommended_version_messages(); // we are building a custom spliced+intronic reference @@ -435,40 +527,29 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu // _gene_id_to_name = Some(id_to_name_path); } - std::fs::write( - &info_file, - serde_json::to_string_pretty(&index_info).unwrap(), - ) - .with_context(|| format!("could not write {}", info_file.display()))?; - - let ref_seq = reference_sequence.with_context(|| - "Reference sequence should either be generated from --fasta with reftype spliced+intronic / spliced+unspliced or set with --ref-seq", - )?; + io::write_json_pretty(&info_file, &index_info)?; + + let ref_seq = reference_sequence.with_context( + || { + "Reference sequence should either be generated from --fasta with reftype spliced+intronic / spliced+unspliced or set with --ref-seq" + }, + )?; + let reference_stage = ReferenceStageOutput { + ref_seq: ref_seq.clone(), + min_seq_len, + t2g, + gene_id_to_name, + roers_duration, + roers_cmd: roers_aug_ref_opt, + }; - let input_files = vec![ref_seq.clone()]; + let input_files = vec![reference_stage.ref_seq.clone()]; prog_utils::check_files_exist(&input_files)?; - - let kmer_length: u32; - let minimizer_length: u32; - if let Some(msl) = min_seq_len { - if msl < 10 { - bail!("The reference sequences are too short for indexing. Please provide sequences with a minimum length of at least 10 bases."); - } - // only if the value is not the default value - if (msl / 2) < opts.kmer_length && opts.kmer_length == 31 { - kmer_length = msl / 2; - // https://github.com/COMBINE-lab/protocol-estuary/blob/2ecc65f1891ebfafff2a4a17460550e4dd1f4bb6/utils/simpleaf_workflow_utils.libsonnet#L232 - minimizer_length = (kmer_length as f32 / 1.8).ceil() as u32 + 1; - - info!("Using kmer_length = {} and minimizer_length = {} because the default values are too big for the reference sequences.", kmer_length, minimizer_length); - } else { - kmer_length = opts.kmer_length; - minimizer_length = opts.minimizer_length; - }; - } else { - kmer_length = opts.kmer_length; - minimizer_length = opts.minimizer_length; - } + let (kmer_length, minimizer_length) = derive_kmer_and_minimizer( + reference_stage.min_seq_len, + opts.kmer_length, + opts.minimizer_length, + )?; let output_index_dir = output.join("index"); let index_duration; @@ -481,10 +562,9 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu Please either set a path using the `set-paths` command, or ensure the `PISCEM` environment variable is set properly."); } - let piscem_prog_info = rp - .piscem - .as_ref() - .expect("piscem program info should be properly set."); + let piscem_prog_info = rp.piscem.as_ref().context( + "The construction of a piscem index was requested, but piscem program info is missing.", + )?; let mut piscem_index_cmd = std::process::Command::new(format!("{}", piscem_prog_info.exe_path.display())); @@ -501,7 +581,7 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu .arg("-o") .arg(&output_index_stem) .arg("-s") - .arg(&ref_seq) + .arg(&reference_stage.ref_seq) .arg("--seed") .arg(opts.hash_seed.to_string()) .arg("-w") @@ -513,18 +593,15 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu piscem_index_cmd.arg("--overwrite"); } - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (capped_threads, capped_at) = runtime::cap_threads(threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, threads + ); + warn!("setting number of threads to {}", max_threads); } + threads = capped_threads; piscem_index_cmd .arg("--threads") @@ -561,17 +638,12 @@ simpleaf"#, info!("piscem build cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut piscem_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke piscem index command"); + let _cres = exec::run_checked(&mut piscem_index_cmd, "piscem index command")?; index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("piscem index failed to build succesfully {:?}", cres.status); - } - // copy over the t2g file to the index let mut t2g_out_path: Option = None; - if let Some(t2g_file) = t2g { + if let Some(t2g_file) = reference_stage.t2g.clone() { let index_t2g_path = output_index_dir.join("t2g_3col.tsv"); t2g_out_path = Some(PathBuf::from("t2g_3col.tsv")); std::fs::copy(t2g_file, index_t2g_path)?; @@ -579,7 +651,7 @@ simpleaf"#, // copy over the gene_id_to_name.tsv file to the index let mut gene_id_to_name_out_path: Option = None; - if let Some(gene_id_to_name_file) = gene_id_to_name { + if let Some(gene_id_to_name_file) = reference_stage.gene_id_to_name.clone() { let index_id2name_path = output_index_dir.join("gene_id_to_name.tsv"); gene_id_to_name_out_path = Some(PathBuf::from("gene_id_to_name.tsv")); std::fs::copy(gene_id_to_name_file, index_id2name_path)?; @@ -596,14 +668,10 @@ simpleaf"#, "m" : minimizer_length, "overwrite" : opts.overwrite, "threads" : threads, - "ref" : ref_seq + "ref" : reference_stage.ref_seq } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; } else { // ensure we have piscem if rp.salmon.is_none() { @@ -611,8 +679,11 @@ simpleaf"#, Please either set a path using the `simpleaf set-paths` command, or ensure the `SALMON` environment variable is set properly."); } + let salmon_prog_info = rp.salmon.as_ref().context( + "The construction of a salmon index was requested, but salmon program info is missing.", + )?; let mut salmon_index_cmd = - std::process::Command::new(format!("{}", rp.salmon.unwrap().exe_path.display())); + std::process::Command::new(format!("{}", salmon_prog_info.exe_path.display())); salmon_index_cmd .arg("index") @@ -621,7 +692,7 @@ simpleaf"#, .arg("-i") .arg(&output_index_dir) .arg("-t") - .arg(&ref_seq); + .arg(&reference_stage.ref_seq); // overwrite doesn't do anything special for the salmon index, so mention this to // the user. @@ -640,18 +711,15 @@ simpleaf"#, salmon_index_cmd.arg("--keepDuplicates"); } - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (capped_threads, capped_at) = runtime::cap_threads(threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, threads + ); + warn!("setting number of threads to {}", max_threads); } + threads = capped_threads; salmon_index_cmd .arg("--threads") @@ -662,17 +730,12 @@ simpleaf"#, info!("salmon index cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut salmon_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke salmon index command"); + let _cres = exec::run_checked(&mut salmon_index_cmd, "salmon index command")?; index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("salmon index failed to build succesfully {:?}", cres.status); - } - // copy over the t2g file to the index let mut t2g_out_path: Option = None; - if let Some(t2g_file) = t2g { + if let Some(t2g_file) = reference_stage.t2g.clone() { let index_t2g_path = output_index_dir.join("t2g_3col.tsv"); t2g_out_path = Some(PathBuf::from("t2g_3col.tsv")); std::fs::copy(t2g_file, index_t2g_path)?; @@ -680,8 +743,8 @@ simpleaf"#, // copy over the gene_id_to_name.tsv file to the index let mut gene_id_to_name_out_path: Option = None; - info!("{:?}", gene_id_to_name); - if let Some(gene_id_to_name_file) = gene_id_to_name { + info!("{:?}", reference_stage.gene_id_to_name); + if let Some(gene_id_to_name_file) = reference_stage.gene_id_to_name.clone() { let index_id2name_path = output_index_dir.join("gene_id_to_name.tsv"); gene_id_to_name_out_path = Some(PathBuf::from("gene_id_to_name.tsv")); std::fs::copy(gene_id_to_name_file, index_id2name_path)?; @@ -699,43 +762,15 @@ simpleaf"#, "sparse" : opts.sparse, "keep_duplicates" : opts.keep_duplicates, "threads" : threads, - "ref" : ref_seq + "ref" : reference_stage.ref_seq } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; } - - let index_log_file = output.join("simpleaf_index_log.json"); - let index_log_info = if let Some(roers_duration) = roers_duration { - // if we ran make-splici - json!({ - "time_info" : { - "roers_time" : roers_duration, - "index_time" : index_duration - }, - "cmd_info" : { - "roers_cmd" : roers_aug_ref_opt, - "index_cmd" : index_cmd_string, } - }) - } else { - // if we indexed provided sequences directly - json!({ - "time_info" : { - "index_time" : index_duration - }, - "cmd_info" : { - "index_cmd" : index_cmd_string, } - }) + let index_stage = IndexBuildStageOutput { + index_duration, + index_cmd_string, }; - - std::fs::write( - &index_log_file, - serde_json::to_string_pretty(&index_log_info).unwrap(), - ) - .with_context(|| format!("could not write {}", index_log_file.display()))?; + write_index_log_stage(&output, &reference_stage, &index_stage)?; Ok(()) } diff --git a/src/simpleaf_commands/quant.rs b/src/simpleaf_commands/quant.rs index 7cd8896..1a1a1ca 100644 --- a/src/simpleaf_commands/quant.rs +++ b/src/simpleaf_commands/quant.rs @@ -1,12 +1,12 @@ use crate::utils::af_utils::*; +use crate::core::{context, exec, index_meta, io, runtime}; use crate::utils::prog_parsing_utils; use crate::utils::prog_utils; -use crate::utils::prog_utils::{CommandVerbosityLevel, ReqProgs}; +use crate::utils::prog_utils::ReqProgs; use anyhow::{bail, Context}; use serde_json::json; -use serde_json::Value; use std::collections::HashMap; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; @@ -52,8 +52,10 @@ impl CBListInfo { .lines() .take(NUM_SAMPLE_LINES) // don't read the whole file in the single-coumn case .map(|l| { - l.unwrap_or_else(|_| panic!("Could not open permitlist file {}", pl_file.display())) + l.with_context(|| format!("Could not open permitlist file {}", pl_file.display())) }) + .collect::>>()? + .into_iter() .any(|l| !l.contains('\t')); // if single column, we are good. Otherwise, we need to write the first column to the final file @@ -236,140 +238,65 @@ fn validate_map_and_quant_opts(opts: &MapQuantOpts) -> anyhow::Result<()> { Ok(()) } -pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result<()> { - validate_map_and_quant_opts(&opts)?; - - let mut t2g_map = opts.t2g_map.clone(); - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; - - rp.issue_recommended_version_messages(); - - let mut gene_id_to_name_opt: Option = None; - - // figure out what type of index we expect - let index_type; - - if let Some(mut index) = opts.index.clone() { - // If the user built the index using simpleaf, and they are using - // piscem, then they are not required to pass the --use-piscem flag - // to the quant step (though they *can* pass it if they wish). - // Thus, if they built the piscem index using simpleaf, there are - // 2 possibilities here: - // 1. They are passing in the directory containing the index - // 2. They are passing in the prefix stem of the index files - // The code below is to check, in both cases, if we can automatically - // detect if the index was constructed with simpleaf, so that we can - // then automatically infer other files, like the t2g files. - - // If we are in case 1., the passed in path is a directory and - // we can check for the simpleaf_index.json file directly, - // Otherwise if the path is not a directory, we check if it - // ends in piscem_idx (the suffix that simpleaf uses when - // making a piscem index). Then we test the directory we - // get after stripping off this suffix. - let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { - // remove the piscem_idx part - index.pop(); - true - } else { - false - }; - - let index_json_path = index.join("simpleaf_index.json"); - match index_json_path.try_exists() { - Ok(true) => { - // we have the simpleaf_index.json file, so parse it. - let index_json_file = std::fs::File::open(&index_json_path).with_context({ - || format!("Could not open file {}", index_json_path.display()) - })?; +#[derive(Debug)] +struct QuantSetup { + rp: ReqProgs, + index_type: IndexType, + t2g_map_file: PathBuf, + gene_id_to_name_opt: Option, + chem: Chemistry, + ori: ExpectedOri, + filter_meth: CellFilterMethod, + threads: u32, +} - let index_json_reader = BufReader::new(&index_json_file); - let v: Value = serde_json::from_reader(index_json_reader)?; +#[derive(Debug)] +struct MappingStageOutput { + sc_mapper: String, + map_cmd_string: String, + map_output: PathBuf, + map_duration: Duration, +} - let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; +#[derive(Debug)] +struct QuantStageOutput { + gpl_output: PathBuf, + gpl_cmd_string: String, + collate_cmd_string: String, + quant_cmd_string: String, + gpl_duration: Duration, + collate_duration: Duration, + quant_duration: Duration, +} - // here, set the index type based on what we found as the - // value for the `index_type` key. - match index_type_str.as_ref() { - "salmon" => { - index_type = IndexType::Salmon(index.clone()); - } - "piscem" => { - // here, either the user has provided us with just - // the directory containing the piscem index, or - // we have "popped" off the "piscem_idx" suffix, so - // add it (back). - index_type = IndexType::Piscem(index.join("piscem_idx")); - } - _ => { - bail!( - "unknown index type {} present in simpleaf_index.json", - index_type_str, - ); - } - } - // if the user didn't pass in a t2g_map, try and populate it - // automatically here - if t2g_map.is_none() { - let t2g_opt: Option = serde_json::from_value(v["t2g_file"].clone())?; - if let Some(t2g_val) = t2g_opt { - let t2g_loc = index.join(t2g_val); - info!("found local t2g file at {}, will attempt to use this since none was provided explicitly", t2g_loc.display()); - t2g_map = Some(t2g_loc); - } - } +fn resolve_quant_setup( + af_home_path: &Path, + opts: &MapQuantOpts, +) -> anyhow::Result<(QuantSetup, CBListInfo)> { + let mut t2g_map = opts.t2g_map.clone(); + let ctx = context::load_runtime_context(af_home_path)?; + let rp: ReqProgs = ctx.progs; + rp.issue_recommended_version_messages(); - // if the user used simpleaf for index construction, then we also built the - // reference and populated the gene_id_to_name.tsv file. See if we can grab - // that as well. - if index.join("gene_id_to_name.tsv").exists() { - gene_id_to_name_opt = Some(index.join("gene_id_to_name.tsv")); - } else if let Some(index_parent) = index.parent() { - // we are doing index_dir/../ref/gene_id_to_name.tsv - let gene_name_path = index_parent.join("ref").join("gene_id_to_name.tsv"); - if gene_name_path.exists() && gene_name_path.is_file() { - gene_id_to_name_opt = Some(gene_name_path); - } - } - } - Ok(false) => { - // at this point, we have inferred that simpleaf wasn't - // used to construct the index, so fall back to what the user - // requested directly. - // if we have previously removed the piscem_idx suffix, add it back - if removed_piscem_idx_suffix { - index.push("piscem_idx"); - } - if opts.use_piscem { - // the user passed the use-piscem flag, so treat the provided - // path as the *prefix stem* to the piscem index - index_type = IndexType::Piscem(index); - } else { - // if the user didn't pass use-piscem and there - // is no simpleaf index json file to check, then - // it's assumed they are using a salmon index. - index_type = IndexType::Salmon(index); - } - } - Err(e) => { - bail!(e); - } + let index_meta = index_meta::resolve_quant_index(opts.index.clone(), opts.use_piscem)?; + if t2g_map.is_none() { + if let Some(t2g_loc) = index_meta.inferred_t2g.clone() { + info!( + "found local t2g file at {}, will attempt to use this since none was provided explicitly", + t2g_loc.display() + ); + t2g_map = Some(t2g_loc); } - } else { - index_type = IndexType::NoIndex; } + let index_type = index_meta.index_type; + let gene_id_to_name_opt = index_meta.inferred_gene_id_to_name; - // at this point make sure we have a t2g value let t2g_map_file = t2g_map.context( "A transcript-to-gene map (t2g) file was not provided via `--t2g-map`|`-m` and could \ not be inferred from the index. Please provide a t2g map explicitly to the quant command.", )?; - prog_utils::check_files_exist(&[t2g_map_file.clone()])?; + prog_utils::check_files_exist(std::slice::from_ref(&t2g_map_file))?; - // make sure we have an program matching the - // appropriate index type match index_type { IndexType::Piscem(_) => { if rp.piscem.is_none() { @@ -378,54 +305,35 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< } IndexType::Salmon(_) => { if rp.salmon.is_none() { - bail!("A salmon index is being used, but no piscem executable is provided. Please set one with `simpleaf set-paths`."); + bail!("A salmon index is being used, but no salmon executable is provided. Please set one with `simpleaf set-paths`."); } } IndexType::NoIndex => {} } - // the chemistries file let custom_chem_p = af_home_path.join(CHEMISTRIES_PATH); - let chem = Chemistry::from_str(&index_type, &custom_chem_p, &opts.chemistry)?; - - let ori: ExpectedOri; - // if the user set the orientation, then - // use that explicitly - if let Some(o) = &opts.expected_ori { - ori = ExpectedOri::from_str(o).with_context(|| { + let ori = if let Some(o) = &opts.expected_ori { + ExpectedOri::from_str(o).with_context(|| { format!( "Could not parse orientation {}. It must be one of the following: {:?}", o, ExpectedOri::all_to_str().join(", ") ) - })?; + })? } else { - ori = chem.expected_ori(); - } + chem.expected_ori() + }; let mut filter_meth_opt = None; let mut pl_info = CBListInfo::new(); - - // based on the filtering method if let Some(ref pl_file) = opts.unfiltered_pl { - // NOTE: unfiltered_pl is of type Option> so being in here - // tells us nothing about the inner option. We handle that now. - - // if the -u flag is passed and some file is provided, then the inner - // Option is Some(PathBuf) if let Some(pl_file) = pl_file { - // the user has explicily passed a file along, so try - // to use that if pl_file.is_file() { - // we read the file to see if there is additional columns separated by \t. - // unwrap is safe here cuz we defined it above pl_info.init(pl_file, &opts.output)?; - - let min_cells = opts.min_reads; filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList( pl_info.final_file.to_string_lossy().into_owned(), - min_cells, + opts.min_reads, )); } else { bail!( @@ -434,20 +342,13 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< ); } } else { - // here, the -u flag is provided - // but no file is provided, then the - // inner option is None and we will try to get the permit list automatically if - // using 10xv2, 10xv3, or 10xv4 - - // check the chemistry let pl_res = get_permit_if_absent(af_home_path, &chem)?; - let min_cells = opts.min_reads; match pl_res { PermitListResult::DownloadSuccessful(p) | PermitListResult::AlreadyPresent(p) => { pl_info.init(&p, &opts.output)?; filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList( pl_info.final_file.to_string_lossy().into_owned(), - min_cells, + opts.min_reads, )); } PermitListResult::UnregisteredChemistry => { @@ -472,63 +373,57 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< filter_meth_opt = Some(CellFilterMethod::ExpectCells(*num_expected)); }; } - // otherwise it must have been knee; if opts.knee { filter_meth_opt = Some(CellFilterMethod::KneeFinding); } - if filter_meth_opt.is_none() { - bail!("No valid filtering strategy was provided!"); - } - - // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } - // here we must be safe to unwrap - let filter_meth = filter_meth_opt.unwrap(); - - let sc_mapper: String; - let map_cmd_string: String; - let map_output: PathBuf; - let map_duration: Duration; + let setup = QuantSetup { + rp, + index_type, + t2g_map_file, + gene_id_to_name_opt, + chem, + ori, + filter_meth: filter_meth_opt.context("No valid filtering strategy was provided!")?, + threads, + }; + Ok((setup, pl_info)) +} - // if we are mapping against an index +fn run_mapping_stage( + opts: &MapQuantOpts, + setup: &QuantSetup, +) -> anyhow::Result { if let Some(index) = opts.index.clone() { - let reads1 = opts - .reads1 - .as_ref() - .expect("since mapping against an index is requested, read1 files must be provided."); - let reads2 = opts - .reads2 - .as_ref() - .expect("since mapping against an index is requested, read2 files must be provided."); - assert_eq!( - reads1.len(), - reads2.len(), - "{} read1 files and {} read2 files were given; Cannot proceed!", - reads1.len(), - reads2.len() - ); - - match index_type { - IndexType::Piscem(ref index_base) => { - let piscem_prog_info = rp - .piscem - .as_ref() - .expect("piscem program info should be properly set."); + let reads1 = opts.reads1.as_ref().context( + "Mapping against an index was requested, but read1 files were not provided.", + )?; + let reads2 = opts.reads2.as_ref().context( + "Mapping against an index was requested, but read2 files were not provided.", + )?; + if reads1.len() != reads2.len() { + bail!( + "{} read1 files and {} read2 files were given; Cannot proceed!", + reads1.len(), + reads2.len() + ); + } - // using a piscem index + match &setup.index_type { + IndexType::Piscem(index_base) => { + let piscem_prog_info = + setup.rp.piscem.as_ref().context( + "A piscem index is being used, but piscem program info is missing.", + )?; let mut piscem_quant_cmd = std::process::Command::new(format!("{}", &piscem_prog_info.exe_path.display())); let index_path = format!("{}", index_base.display()); @@ -537,22 +432,19 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< .arg("--index") .arg(index_path); - // location of output directory, number of threads - map_output = opts.output.join("af_map"); + let map_output = opts.output.join("af_map"); piscem_quant_cmd .arg("--threads") - .arg(format!("{}", threads)) + .arg(format!("{}", setup.threads)) .arg("-o") .arg(&map_output); - // if the user is requesting a mapping option that required - // piscem version >= 0.7.0, ensure we have that if let Ok(_piscem_ver) = prog_utils::check_version_constraints( "piscem", ">=0.7.0, <1.0.0", &piscem_prog_info.version, ) { - push_advanced_piscem_options(&mut piscem_quant_cmd, &opts)?; + push_advanced_piscem_options(&mut piscem_quant_cmd, opts)?; } else { info!( r#" @@ -564,21 +456,16 @@ being used by simpleaf"#, ); } - // we get the final geometry we want to pass to piscem - // check if we can parse the geometry directly, or if we are dealing with a - // "complex" geometry. let frag_lib_xform = add_or_transform_fragment_library( MapperType::Piscem, - chem.fragment_geometry_str(), + setup.chem.fragment_geometry_str(), reads1, reads2, &mut piscem_quant_cmd, )?; - map_cmd_string = prog_utils::get_cmd_line_string(&piscem_quant_cmd); + let map_cmd_string = prog_utils::get_cmd_line_string(&piscem_quant_cmd); info!("piscem map-sc cmd : {}", map_cmd_string); - sc_mapper = String::from("piscem"); - let mut input_files = vec![ index_base.with_extension("ctab"), index_base.with_extension("refinfo"), @@ -586,21 +473,12 @@ being used by simpleaf"#, ]; input_files.extend_from_slice(reads1); input_files.extend_from_slice(reads2); - prog_utils::check_files_exist(&input_files)?; let map_start = Instant::now(); - let cres = prog_utils::execute_command( - &mut piscem_quant_cmd, - CommandVerbosityLevel::Quiet, - ) - .expect("failed to execute piscem [mapping phase]"); - - // if we had to filter the reads through a fifo - // wait for the thread feeding the fifo to finish + exec::run_checked(&mut piscem_quant_cmd, "piscem [mapping phase]")?; match frag_lib_xform { FragmentTransformationType::TransformedIntoFifo(xform_data) => { - // wait for it to join match xform_data.join_handle.join() { Ok(join_res) => { let xform_stats = join_res?; @@ -616,25 +494,23 @@ being used by simpleaf"#, } } } - FragmentTransformationType::Identity => { - // nothing to do. - } + FragmentTransformationType::Identity => {} } - map_duration = map_start.elapsed(); - - if !cres.status.success() { - bail!("piscem mapping failed with exit status {:?}", cres.status); - } + Ok(MappingStageOutput { + sc_mapper: String::from("piscem"), + map_cmd_string, + map_output, + map_duration: map_start.elapsed(), + }) } - IndexType::Salmon(ref index_base) => { - // using a salmon index - let mut salmon_quant_cmd = std::process::Command::new(format!( - "{}", - rp.salmon.unwrap().exe_path.display() - )); - - // set the input index and library type + IndexType::Salmon(index_base) => { + let salmon_prog_info = + setup.rp.salmon.as_ref().context( + "A salmon index is being used, but salmon program info is missing.", + )?; + let mut salmon_quant_cmd = + std::process::Command::new(format!("{}", salmon_prog_info.exe_path.display())); let index_path = format!("{}", index_base.display()); salmon_quant_cmd .arg("alevin") @@ -643,55 +519,37 @@ being used by simpleaf"#, .arg("-l") .arg("A"); - // check if we can parse the geometry directly, or if we are dealing with a - // "complex" geometry. let frag_lib_xform = add_or_transform_fragment_library( MapperType::Salmon, - chem.fragment_geometry_str(), + setup.chem.fragment_geometry_str(), reads1, reads2, &mut salmon_quant_cmd, )?; - // location of output directory, number of threads - map_output = opts.output.join("af_map"); + let map_output = opts.output.join("af_map"); salmon_quant_cmd .arg("--threads") - .arg(format!("{}", threads)) + .arg(format!("{}", setup.threads)) .arg("-o") .arg(&map_output); - - // if the user explicitly requested to use selective-alignment - // then enable that if opts.use_selective_alignment { salmon_quant_cmd.arg("--rad"); } else { - // otherwise default to sketch mode salmon_quant_cmd.arg("--sketch"); } - map_cmd_string = prog_utils::get_cmd_line_string(&salmon_quant_cmd); + let map_cmd_string = prog_utils::get_cmd_line_string(&salmon_quant_cmd); info!("salmon alevin cmd : {}", map_cmd_string); - sc_mapper = String::from("salmon"); - let mut input_files = vec![index]; input_files.extend_from_slice(reads1); input_files.extend_from_slice(reads2); - prog_utils::check_files_exist(&input_files)?; let map_start = Instant::now(); - let cres = prog_utils::execute_command( - &mut salmon_quant_cmd, - CommandVerbosityLevel::Quiet, - ) - .expect("failed to execute salmon [mapping phase]"); - - // if we had to filter the reads through a fifo - // wait for the thread feeding the fifo to finish + exec::run_checked(&mut salmon_quant_cmd, "salmon [mapping phase]")?; match frag_lib_xform { FragmentTransformationType::TransformedIntoFifo(xform_data) => { - // wait for it to join match xform_data.join_handle.join() { Ok(join_res) => { let xform_stats = join_res?; @@ -707,35 +565,40 @@ being used by simpleaf"#, } } } - FragmentTransformationType::Identity => { - // nothing to do. - } + FragmentTransformationType::Identity => {} } - map_duration = map_start.elapsed(); - - if !cres.status.success() { - bail!("salmon mapping failed with exit status {:?}", cres.status); - } + Ok(MappingStageOutput { + sc_mapper: String::from("salmon"), + map_cmd_string, + map_output, + map_duration: map_start.elapsed(), + }) } IndexType::NoIndex => { bail!("Cannot perform mapping an quantification without known (piscem or salmon) index!"); } } } else { - map_cmd_string = String::from(""); - sc_mapper = String::from(""); - map_output = opts - .map_dir - .expect("map-dir must be provided, since index, read1 and read2 were not."); - map_duration = Duration::new(0, 0); + Ok(MappingStageOutput { + sc_mapper: String::new(), + map_cmd_string: String::new(), + map_output: opts + .map_dir + .clone() + .context("map-dir must be provided, since index, read1 and read2 were not.")?, + map_duration: Duration::new(0, 0), + }) } +} - let map_output_string = map_output.display().to_string(); - - // make the quant directory +fn run_quant_stage( + opts: &MapQuantOpts, + setup: &QuantSetup, + mapping: &MappingStageOutput, + pl_info: &mut CBListInfo, +) -> anyhow::Result { let gpl_output = opts.output.join("af_quant"); - std::fs::create_dir_all(&gpl_output).with_context(|| { format!( "Failed to create quantification output directory {}", @@ -743,16 +606,13 @@ being used by simpleaf"#, ) })?; - // attempt to get the relevant information from map_info to propagate forward to the - // quantification directory - //get_mapping_info - let mapping_log = match index_type { + let mapping_log = match &setup.index_type { IndexType::Piscem(_) => { - let piscem_map_log_path = map_output.join("map_info.json"); + let piscem_map_log_path = mapping.map_output.join("map_info.json"); prog_parsing_utils::construct_json_from_piscem_log(piscem_map_log_path)? } IndexType::Salmon(_) => { - let salmon_log_path = map_output.join("logs").join("salmon_quant.log"); + let salmon_log_path = mapping.map_output.join("logs").join("salmon_quant.log"); prog_parsing_utils::construct_json_from_salmon_log(salmon_log_path)? } IndexType::NoIndex => { @@ -765,153 +625,119 @@ being used by simpleaf"#, }) } }; - let map_info_path = gpl_output.join("simpleaf_map_info.json"); let map_info_file = std::fs::File::create(map_info_path)?; serde_json::to_writer(map_info_file, &mapping_log)?; - let alevin_fry = rp.alevin_fry.unwrap().exe_path; - // alevin-fry generate permit list + let alevin_fry = setup + .rp + .alevin_fry + .as_ref() + .context("Alevin-fry program info is missing; please run `simpleaf set-paths`.")? + .exe_path + .clone(); let mut alevin_gpl_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - - let gpl_threads = threads.min(8); + let gpl_threads = setup.threads.min(8); alevin_gpl_cmd.arg("generate-permit-list"); - alevin_gpl_cmd.arg("-i").arg(&map_output); - alevin_gpl_cmd.arg("-d").arg(ori.as_str()); + alevin_gpl_cmd.arg("-i").arg(&mapping.map_output); + alevin_gpl_cmd.arg("-d").arg(setup.ori.as_str()); alevin_gpl_cmd.arg("-t").arg(format!("{}", gpl_threads)); - - // add the filter mode - filter_meth.add_to_args(&mut alevin_gpl_cmd); - + setup.filter_meth.add_to_args(&mut alevin_gpl_cmd); alevin_gpl_cmd.arg("-o").arg(&gpl_output); - - info!( - "alevin-fry generate-permit-list cmd : {}", - prog_utils::get_cmd_line_string(&alevin_gpl_cmd) - ); - let input_files = vec![map_output.clone()]; + let gpl_cmd_string = prog_utils::get_cmd_line_string(&alevin_gpl_cmd); + info!("alevin-fry generate-permit-list cmd : {}", gpl_cmd_string); + let input_files = vec![mapping.map_output.clone()]; prog_utils::check_files_exist(&input_files)?; - let gpl_start = Instant::now(); - let gpl_proc_out = - prog_utils::execute_command(&mut alevin_gpl_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [generate permit list]"); + exec::run_checked(&mut alevin_gpl_cmd, "[generate permit list]")?; let gpl_duration = gpl_start.elapsed(); - if !gpl_proc_out.status.success() { - bail!( - "alevin-fry generate-permit-list failed with exit status {:?}", - gpl_proc_out.status - ); - } - - // - // collate - // let mut alevin_collate_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - alevin_collate_cmd.arg("collate"); alevin_collate_cmd.arg("-i").arg(&gpl_output); - alevin_collate_cmd.arg("-r").arg(&map_output); - alevin_collate_cmd.arg("-t").arg(format!("{}", threads)); - - info!( - "alevin-fry collate cmd : {}", - prog_utils::get_cmd_line_string(&alevin_collate_cmd) - ); - let input_files = vec![gpl_output.clone(), map_output]; + alevin_collate_cmd.arg("-r").arg(&mapping.map_output); + alevin_collate_cmd + .arg("-t") + .arg(format!("{}", setup.threads)); + let collate_cmd_string = prog_utils::get_cmd_line_string(&alevin_collate_cmd); + info!("alevin-fry collate cmd : {}", collate_cmd_string); + let input_files = vec![gpl_output.clone(), mapping.map_output.clone()]; prog_utils::check_files_exist(&input_files)?; - let collate_start = Instant::now(); - let collate_proc_out = - prog_utils::execute_command(&mut alevin_collate_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [collate]"); + exec::run_checked(&mut alevin_collate_cmd, "[collate]")?; let collate_duration = collate_start.elapsed(); - if !collate_proc_out.status.success() { - bail!( - "alevin-fry collate failed with exit status {:?}", - collate_proc_out.status - ); - } - - // - // quant - // let mut alevin_quant_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - alevin_quant_cmd .arg("quant") .arg("-i") .arg(&gpl_output) .arg("-o") .arg(&gpl_output); - alevin_quant_cmd.arg("-t").arg(format!("{}", threads)); - alevin_quant_cmd.arg("-m").arg(t2g_map_file.clone()); - alevin_quant_cmd.arg("-r").arg(opts.resolution); - + alevin_quant_cmd.arg("-t").arg(format!("{}", setup.threads)); + alevin_quant_cmd.arg("-m").arg(setup.t2g_map_file.clone()); + alevin_quant_cmd.arg("-r").arg(&opts.resolution); + let quant_cmd_string = prog_utils::get_cmd_line_string(&alevin_quant_cmd); info!("cmd : {:?}", alevin_quant_cmd); - - let input_files = vec![gpl_output.clone(), t2g_map_file]; + let input_files = vec![gpl_output.clone(), setup.t2g_map_file.clone()]; prog_utils::check_files_exist(&input_files)?; - let quant_start = Instant::now(); - let quant_proc_out = - prog_utils::execute_command(&mut alevin_quant_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [quant]"); + exec::run_checked(&mut alevin_quant_cmd, "[quant]")?; let quant_duration = quant_start.elapsed(); - if !quant_proc_out.status.success() { - bail!("quant failed with exit status {:?}", quant_proc_out.status); - } - - // If we had a gene_id_to_name.tsv file handy, copy it over into the - // quantification directory. - if let Some(gene_name_path) = gene_id_to_name_opt { + if let Some(gene_name_path) = &setup.gene_id_to_name_opt { let target_path = gpl_output.join("gene_id_to_name.tsv"); - match std::fs::copy(&gene_name_path, &target_path) { + match std::fs::copy(gene_name_path, &target_path) { Ok(_) => { info!("successfully copied the gene_name_to_id.tsv file into the quantification directory."); } Err(err) => { - warn!("could not successfully copy gene_id_to_name file from {:?} to {:?} because of {:?}", - gene_name_path, target_path, err - ); + warn!( + "could not successfully copy gene_id_to_name file from {:?} to {:?} because of {:?}", + gene_name_path, target_path, err + ); } } } - // If a permit/explit list with auxilary info was provided, - // we add the auxilary info to the barcodes.tsv file. let quants_mat_rows_p = gpl_output.join("alevin").join("quants_mat_rows.txt"); pl_info.update_af_quant_barcodes_tsv(&quants_mat_rows_p)?; - let mut convert_duration = None; - if opts.anndata_out { - let convert_start = Instant::now(); - let opath = gpl_output.join("alevin").join("quants.h5ad"); - af_anndata::convert_csr_to_anndata(&gpl_output, &opath)?; - convert_duration = Some(convert_start.elapsed()); - } + Ok(QuantStageOutput { + gpl_output, + gpl_cmd_string, + collate_cmd_string, + quant_cmd_string, + gpl_duration, + collate_duration, + quant_duration, + }) +} +fn write_quant_log( + opts: &MapQuantOpts, + mapping: &MappingStageOutput, + quant_stage: &QuantStageOutput, + convert_duration: Option, +) -> anyhow::Result<()> { let af_quant_info_file = opts.output.join("simpleaf_quant_log.json"); let mut af_quant_info = json!({ "time_info" : { - "map_time" : map_duration, - "gpl_time" : gpl_duration, - "collate_time" : collate_duration, - "quant_time" : quant_duration + "map_time" : mapping.map_duration, + "gpl_time" : quant_stage.gpl_duration, + "collate_time" : quant_stage.collate_duration, + "quant_time" : quant_stage.quant_duration }, "cmd_info" : { - "map_cmd" : map_cmd_string, - "gpl_cmd" : prog_utils::get_cmd_line_string(&alevin_gpl_cmd), - "collate_cmd" : prog_utils::get_cmd_line_string(&alevin_collate_cmd), - "quant_cmd" : prog_utils::get_cmd_line_string(&alevin_quant_cmd) + "map_cmd" : mapping.map_cmd_string, + "gpl_cmd" : quant_stage.gpl_cmd_string, + "collate_cmd" : quant_stage.collate_cmd_string, + "quant_cmd" : quant_stage.quant_cmd_string }, "map_info" : { - "mapper" : sc_mapper, - "map_cmd" : map_cmd_string, - "map_outdir": map_output_string + "mapper" : mapping.sc_mapper, + "map_cmd" : mapping.map_cmd_string, + "map_outdir": mapping.map_output.display().to_string() } }); @@ -919,12 +745,106 @@ being used by simpleaf"#, af_quant_info["time_info"]["conversion_time"] = json!(ctime); } - // write the relevant info about - // our run to file. - std::fs::write( - &af_quant_info_file, - serde_json::to_string_pretty(&af_quant_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_quant_info_file.display()))?; + io::write_json_pretty_atomic(&af_quant_info_file, &af_quant_info)?; Ok(()) } + +pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result<()> { + validate_map_and_quant_opts(&opts)?; + let (setup, mut pl_info) = resolve_quant_setup(af_home_path, &opts)?; + let mapping = run_mapping_stage(&opts, &setup)?; + let quant_stage = run_quant_stage(&opts, &setup, &mapping, &mut pl_info)?; + + let mut convert_duration = None; + if opts.anndata_out { + let convert_start = Instant::now(); + let opath = quant_stage.gpl_output.join("alevin").join("quants.h5ad"); + af_anndata::convert_csr_to_anndata(&quant_stage.gpl_output, &opath)?; + convert_duration = Some(convert_start.elapsed()); + } + + write_quant_log(&opts, &mapping, &quant_stage, convert_duration) +} + +#[cfg(test)] +mod tests { + use clap::Parser; + + use crate::utils::af_utils::RnaChemistry; + use crate::{Cli, Commands}; + + use super::*; + + fn parse_quant_opts(args: &[&str]) -> MapQuantOpts { + let mut cli_args = vec!["simpleaf"]; + cli_args.extend_from_slice(args); + match Cli::parse_from(cli_args).command { + Commands::Quant(opts) => opts, + cmd => panic!("expected quant command, found {:?}", cmd), + } + } + + fn minimal_no_index_setup() -> QuantSetup { + QuantSetup { + rp: ReqProgs { + salmon: None, + piscem: None, + alevin_fry: None, + macs: None, + }, + index_type: IndexType::NoIndex, + t2g_map_file: PathBuf::from("/tmp/t2g.tsv"), + gene_id_to_name_opt: None, + chem: Chemistry::Rna(RnaChemistry::TenxV3), + ori: ExpectedOri::Forward, + filter_meth: CellFilterMethod::KneeFinding, + threads: 1, + } + } + + #[test] + fn mapping_stage_no_index_uses_map_dir() { + let opts = parse_quant_opts(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + ]); + let setup = minimal_no_index_setup(); + let stage = run_mapping_stage(&opts, &setup).expect("mapping stage should succeed"); + assert_eq!(stage.map_output, PathBuf::from("/tmp/mapped")); + assert_eq!(stage.sc_mapper, ""); + } + + #[test] + fn mapping_stage_no_index_fails_without_map_dir() { + let mut opts = parse_quant_opts(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + ]); + opts.map_dir = None; + opts.index = None; + + let setup = minimal_no_index_setup(); + let err = run_mapping_stage(&opts, &setup).expect_err("expected missing map-dir to fail"); + assert!( + format!("{:#}", err).contains("map-dir must be provided"), + "unexpected error: {:#}", + err + ); + } +} diff --git a/src/simpleaf_commands/workflow.rs b/src/simpleaf_commands/workflow.rs index 02ec4a2..9dad7a8 100644 --- a/src/simpleaf_commands/workflow.rs +++ b/src/simpleaf_commands/workflow.rs @@ -21,6 +21,85 @@ struct WorkflowTemplate { version: String, } +/// Fully resolved workflow run inputs after parsing either a manifest or template. +struct ResolvedWorkflowRun { + /// Parsed executable manifest. + instantiated_manifest: serde_json::Value, + /// Source path of the provided manifest/template (for logging context). + source_path: std::path::PathBuf, + /// Resolved output path extracted from the instantiated manifest. + output_path: std::path::PathBuf, +} + +/// Resolve `workflow run` inputs into an instantiated manifest and paths. +/// +/// This stage is responsible for parse/instantiate + output-path resolution. +fn resolve_workflow_run_inputs>( + af_home_path: T, + template: Option, + manifest: Option, + output_opt: Option, + jpaths: &Option>, + ext_codes: &Option>, +) -> anyhow::Result { + if let Some(manifest) = manifest { + info!("Loading and executing manifest."); + let instantiated_manifest = workflow_utils::parse_manifest(&manifest)?; + let output_path = workflow_utils::get_output_path(&instantiated_manifest)?; + return Ok(ResolvedWorkflowRun { + instantiated_manifest, + source_path: manifest, + output_path, + }); + } + + if let Some(template) = template { + if !template.exists() || !template.is_file() { + bail!("the path of the given workflow template file doesn't exist; Cannot proceed.") + } + + info!("Processing simpleaf template to produce and execute manifest."); + let workflow_json_string = workflow_utils::instantiate_workflow_template( + af_home_path.as_ref(), + template.as_path(), + output_opt.clone(), + jpaths, + ext_codes, + )?; + + let instantiated_manifest: Value = serde_json::from_str(workflow_json_string.as_str())?; + let output_path = workflow_utils::get_output_path(&instantiated_manifest)?; + + // If both command-line and template outputs are set, warn if template wins. + if let Some(requested_output_path) = output_opt { + if requested_output_path != output_path { + warn!( + r#"The output path {} was requested via the command line, but + the output path {} was resolved from the workflow template. + In this case, since the output variable is not used when instantiating + the template, the value ({}) present in the template must be used. + Please be aware that {} will not be used for output!"#, + requested_output_path.display(), + output_path.display(), + output_path.display(), + requested_output_path.display() + ); + } + } + + return Ok(ResolvedWorkflowRun { + instantiated_manifest, + source_path: template, + output_path, + }); + } + + bail!(concat!( + "You must have one of a manifest or template, ", + "but provided neither; this shouldn't happen" + )); +} + pub fn refresh_protocol_estuary>(af_home_path: T) -> anyhow::Result<()> { workflow_utils::get_protocol_estuary( af_home_path.as_ref(), @@ -384,71 +463,17 @@ pub fn run_workflow>( skip_step, ext_codes, } => { - // designate that output is optional - let output_opt = output; - - let instantiated_manifest: serde_json::Value; - let source_path: std::path::PathBuf; - let output_path: std::path::PathBuf; - if let Some(manifest) = manifest { - // If the user passed a fully-instantiated - // manifest to execute - info!("Loading and executing manifest."); - // iterate json files and parse records to commands - // convert files into json string vector - instantiated_manifest = workflow_utils::parse_manifest(&manifest)?; - output_path = workflow_utils::get_output_path(&instantiated_manifest)?; - source_path = manifest.clone(); - } else if let Some(template) = template { - // check the validity of the file - if !template.exists() || !template.is_file() { - bail!("the path of the given workflow template file doesn't exist; Cannot proceed.") - } - - info!("Processing simpleaf template to produce and execute manifest."); - - // iterate json files and parse records to commands - // convert files into json string vector - let workflow_json_string = workflow_utils::instantiate_workflow_template( - af_home_path.as_ref(), - template.as_path(), - output_opt.clone(), - &jpaths, - &ext_codes, - )?; - - // write complete workflow (i.e. the manifest) JSON to output folder - instantiated_manifest = serde_json::from_str(workflow_json_string.as_str())?; - output_path = workflow_utils::get_output_path(&instantiated_manifest)?; - - // check if the output path we read from the instantiated template matches - // the output path requested by the user (if the user passed one in). If - // they do not match, issue an obnoxious warning. - // @DongzeHe : We should also probably log this warning to the output - // log for subsequent inspection. - if let Some(requested_output_path) = output_opt { - if requested_output_path != output_path { - warn!( - r#"The output path {} was requested via the command line, but - the output path {} was resolved from the workflow template. - In this case, since the output variable is not used when instantiating - the template, the value ({}) present in the template must be used. - Please be aware that {} will not be used for output!"#, - requested_output_path.display(), - output_path.display(), - output_path.display(), - requested_output_path.display() - ); - } - } - - source_path = template.clone(); - } else { - bail!(concat!( - "You must have one of a manifest or template, ", - "but provided neither; this shouldn't happen" - )); - } + let resolved = resolve_workflow_run_inputs( + af_home_path.as_ref(), + template, + manifest, + output, + &jpaths, + &ext_codes, + )?; + let instantiated_manifest = resolved.instantiated_manifest; + let source_path = resolved.source_path; + let output_path = resolved.output_path; // recursively make the output directory, which at this point // has been resolved as the one used in the template or manifest diff --git a/src/utils/workflow_utils.rs b/src/utils/workflow_utils.rs index 7be22e8..3212c35 100644 --- a/src/utils/workflow_utils.rs +++ b/src/utils/workflow_utils.rs @@ -6,6 +6,7 @@ use anyhow::{anyhow, bail, Context}; use chrono::{DateTime, Local}; use clap::Parser; // use cmd_lib::run_cmd; +use crate::core::io::write_json_pretty_atomic; use serde_json::{json, Map, Value}; use std::boxed::Box; use std::collections::HashMap; @@ -33,6 +34,42 @@ pub enum WFCommand { ExternalCommand(std::process::Command), } +/// Structured errors returned when executing workflow commands. +#[derive(Debug, thiserror::Error)] +pub enum WorkflowExecutionError { + /// A `simpleaf` command in the workflow failed. + #[error("Workflow step {step} ({program}) failed while executing `{command}`: {source}")] + SimpleafStepFailed { + step: u64, + program: String, + command: String, + #[source] + source: anyhow::Error, + }, + /// An external workflow command exited with a non-zero status. + #[error( + "Workflow step {step} ({program}) failed executing external command `{command}` with exit status {status}. stderr: {stderr}" + )] + ExternalCommandNonZero { + step: u64, + program: String, + command: String, + status: String, + stderr: String, + }, + /// An external workflow command could not be launched. + #[error( + "Workflow step {step} ({program}) could not launch external command `{command}`: {source}" + )] + ExternalCommandSpawnFailed { + step: u64, + program: String, + command: String, + #[source] + source: std::io::Error, + }, +} + #[allow(dead_code)] enum ColumnTypeTag { String, @@ -66,7 +103,7 @@ impl PatchCollection { self.patches.push(p); } - pub fn iter(&self) -> std::slice::Iter { + pub fn iter(&self) -> std::slice::Iter<'_, JsonPatch> { self.patches.iter() } } @@ -285,6 +322,123 @@ be certain you intend to do this.", Ok(patches) } +/// Validate workflow structure before planning or execution. +/// +/// This ensures command entries contain both `step` and `program_name`, +/// verifies expected types for core fields, and rejects unsupported +/// `simpleaf` command variants with actionable messages. +pub fn validate_manifest_structure(workflow: &Value) -> anyhow::Result<()> { + fn validate_node(node: &Value, path: &str) -> anyhow::Result<()> { + match node { + Value::Object(obj) => { + let has_step = obj.get(SystemFields::Step.as_str()).is_some(); + let has_program = obj.get(SystemFields::ProgramName.as_str()).is_some(); + if has_step ^ has_program { + bail!( + "Malformed workflow entry at {}: command records must contain both `step` and `program_name`.", + path + ); + } + + if has_step { + let step_val = obj.get(SystemFields::Step.as_str()).with_context(|| { + format!( + "Malformed workflow entry at {}: missing `step` value.", + path + ) + })?; + let step = step_val.as_u64().with_context(|| { + format!( + "Malformed workflow entry at {}: `step` must be a positive integer, found {:?}.", + path, step_val + ) + })?; + if step == 0 { + bail!( + "Malformed workflow entry at {}: `step` must be >= 1, found 0.", + path + ); + } + + if let Some(active_val) = obj.get(SystemFields::Active.as_str()) { + if active_val.as_bool().is_none() { + bail!( + "Malformed workflow entry at {}: `active` must be a boolean, found {:?}.", + path, + active_val + ); + } + } + + let program_name_val = obj + .get(SystemFields::ProgramName.as_str()) + .with_context(|| { + format!( + "Malformed workflow entry at {}: missing `program_name` value.", + path + ) + })?; + let program_name = program_name_val.as_str().with_context(|| { + format!( + "Malformed workflow entry at {}: `program_name` must be a string, found {:?}.", + path, program_name_val + ) + })?; + + if program_name.starts_with("simpleaf") + && !program_name.ends_with("index") + && !program_name.ends_with("quant") + { + bail!( + "Unsupported workflow command `{}` at {}. Only `simpleaf index` and `simpleaf quant` are currently supported.", + program_name, + path + ); + } + + if !program_name.starts_with("simpleaf") { + let args_val = obj + .get(SystemFields::ExternalArguments.as_str()) + .with_context(|| { + format!( + "Malformed external command at {}: missing `arguments` array.", + path + ) + })?; + if args_val.as_array().is_none() { + bail!( + "Malformed external command at {}: `arguments` must be an array, found {:?}.", + path, + args_val + ); + } + } + } + + // Recurse to validate nested workflow sections. + for (k, v) in obj { + let child_path = if path == "$" { + format!("$.{}", k) + } else { + format!("{}.{}", path, k) + }; + validate_node(v, &child_path)?; + } + Ok(()) + } + Value::Array(arr) => { + for (idx, v) in arr.iter().enumerate() { + validate_node(v, &format!("{}[{}]", path, idx))?; + } + Ok(()) + } + _ => Ok(()), + } + } + + validate_node(workflow, "$") +} + pub fn get_output_path(manifest: &serde_json::Value) -> anyhow::Result { // we assume that the path we want is /meta_info/output, and it *must* exist // as a key! @@ -366,16 +520,11 @@ pub fn duration_to_dhms(d: chrono::Duration) -> String { execution_elapsed_time } -/// This function updates the start_at variable -/// if --resume is provided.\ -/// It finds the workflow_info.json exported by -/// simpleaf workflow from the previous run and -/// grab the "Succeed" and "Execution Terminated Step" -/// fields.\ -/// If the previous run was succeed, then we report an error -/// saying nothing to resume -/// If Execution Terminated Step is a negative number, that -/// means there was no previous execution: +/// Compute the effective `start_at` for `--resume` runs from a prior workflow log. +/// +/// This reads `"Succeed"` and `"Latest Run"."Execution Terminated Step"` from the +/// previous `simpleaf_workflow_log.json` payload and returns the step to resume from. +/// If the previous run already succeeded, resuming is rejected. pub fn update_start_at(v: &Value) -> anyhow::Result { let latest_run = v.get("Latest Run").with_context(|| { "Could not get the `Latest Run` field from the `simpleaf_workflow_log.json`; Cannot proceed" @@ -404,6 +553,9 @@ pub fn update_start_at(v: &Value) -> anyhow::Result { } } +/// Load the previous workflow metadata log used by `--resume`. +/// +/// The expected file is `simpleaf_workflow_log.json` in the provided output directory. pub fn get_previous_log>(output: T) -> anyhow::Result { // the path to the expected log file let exec_log_path = output.as_ref().join("simpleaf_workflow_log.json"); @@ -436,6 +588,10 @@ pub fn get_previous_log>(output: T) -> anyhow::Result { } } +/// Execute a planned workflow queue and update the workflow log after each successful step. +/// +/// Commands are run in queue order (already sorted by step during planning). +/// On the first command failure, this writes a failed log state and returns the error. pub fn execute_commands_in_workflow>( simpleaf_workflow: SimpleafWorkflow, af_home_path: T, @@ -454,6 +610,7 @@ pub fn execute_commands_in_workflow>( match cr.cmd { WFCommand::SimpleafCommand(cmd) => { + let command_string = pn.to_string(); let exec_result = match *cmd { Commands::Index(index_opts) => { crate::indexing::build_ref_and_index(af_home_path.as_ref(), index_opts) @@ -462,12 +619,23 @@ pub fn execute_commands_in_workflow>( Commands::Quant(quant_opts) => { crate::quant::map_and_quant(af_home_path.as_ref(), quant_opts) } - _ => todo!(), + unsupported_cmd => { + bail!( + "Unsupported simpleaf workflow command variant: {:?}. Only `index` and `quant` are currently supported in workflow execution.", + unsupported_cmd + ) + } }; if let Err(e) = exec_result { workflow_log.write(false)?; info!("Execution terminated at {} command for step {}", pn, step); - return Err(e); + return Err(WorkflowExecutionError::SimpleafStepFailed { + step, + program: pn.to_string(), + command: command_string, + source: e, + } + .into()); } else { info!("Successfully ran {} command for step {}", pn, step); workflow_log.update(&cr.field_trajectory_vec[..])?; @@ -490,23 +658,29 @@ pub fn execute_commands_in_workflow>( workflow_log.update(&cr.field_trajectory_vec[..])?; } else { workflow_log.write(false)?; - let cmd_stderr = std::str::from_utf8(&cres.stderr[..])?; - let msg = format!("{} command at step {} failed to exit with code 0 under the shell.\n\ - The exit status was: {}.\n\ - The stderr of the invocation was: {}.", pn, step, cres.status, cmd_stderr); - warn!(msg); - bail!(msg); + let cmd_stderr = + String::from_utf8_lossy(&cres.stderr).trim().to_string(); + let err = WorkflowExecutionError::ExternalCommandNonZero { + step, + program: pn.to_string(), + command: cmd_string, + status: cres.status.to_string(), + stderr: cmd_stderr, + }; + warn!("{:#}", err); + return Err(err.into()); } } Err(e) => { workflow_log.write(false)?; - let msg = format!( - "{} command at step {} failed to execute under the shell.\n\ - The returned error was: {:?}.\n", - pn, step, e - ); - warn!(msg); - bail!(msg); + let err = WorkflowExecutionError::ExternalCommandSpawnFailed { + step, + program: pn.to_string(), + command: cmd_string, + source: e, + }; + warn!("{:#}", err); + return Err(err.into()); } // TODO: use this in the log somewhere. } // invoke external cmd } @@ -527,6 +701,9 @@ pub fn initialize_workflow>( skip_step: Vec, resume: bool, ) -> anyhow::Result<(SimpleafWorkflow, WorkflowLog)> { + // Validate manifest shape before planning command execution. + validate_manifest_structure(&workflow_json_value)?; + // Instantiate a workflow log struct let mut wl = WorkflowLog::new( output.as_ref(), @@ -621,7 +798,7 @@ impl SimpleafWorkflow { .with_context(|| { format!( "Cannot parse step field value, {:?}, as an integer", - field.get(SystemFields::Step.as_str()).unwrap() + field.get(SystemFields::Step.as_str()) ) })?; @@ -633,7 +810,7 @@ impl SimpleafWorkflow { v.as_bool().with_context(|| { format!( "Cannot parse active field value, {:?}, as a boolean", - field.get(SystemFields::Active.as_str()).unwrap() + field.get(SystemFields::Active.as_str()) ) })? } else { @@ -644,37 +821,45 @@ impl SimpleafWorkflow { let cmd_field = workflow_log.get_mut_cmd_field(&curr_field_trajectory_vec)?; cmd_field[SystemFields::Active.as_str()] = json!(active); - // The field must contains a program_name - if let Some(program_name) = field.get(SystemFields::ProgramName.as_str()) { - pn = ProgramName::from_str(program_name.as_str().with_context(|| { + // The command record must contain a program_name field. + let program_name = field + .get(SystemFields::ProgramName.as_str()) + .with_context(|| { + format!( + "Malformed workflow command at step {}: missing `program_name` field.", + step + ) + })?; + pn = + ProgramName::from_str(program_name.as_str().with_context(|| { "Cannot create ProgramName struct from a program name" })?); - // if active, then push to execution queue - if active { - info!("Parsing {} command for step {}", pn, step); - // The `step` will be used for sorting the cmd_queue vector. - // all commands must have a valid `step`. - let cmd = match pn.create_cmd(field) { - Ok(v) => v, - Err(e) => { - if pn.is_external() { - bail!("Could not parse external command {} for step {}. The error message was: {}", pn, step, e); - } else { - bail!("Could not parse simpleaf command {} for step {}. The error message was: {}", pn, step, e); - } + + // if active, then push to execution queue + if active { + info!("Parsing {} command for step {}", pn, step); + // The `step` will be used for sorting the cmd_queue vector. + // all commands must have a valid `step`. + let cmd = match pn.create_cmd(field) { + Ok(v) => v, + Err(e) => { + if pn.is_external() { + bail!("Could not parse external command {} for step {}. The error message was: {}", pn, step, e); + } else { + bail!("Could not parse simpleaf command {} for step {}. The error message was: {}", pn, step, e); } - }; - cmd_queue.push(CommandRecord { - step, - active, - program_name: pn, - cmd, - field_trajectory_vec: curr_field_trajectory_vec, - }); - } else { - info!("Skipping {} command for step {}", pn, step); - } // if active - } // if have ProgramName + } + }; + cmd_queue.push(CommandRecord { + step, + active, + program_name: pn, + cmd, + field_trajectory_vec: curr_field_trajectory_vec, + }); + } else { + info!("Skipping {} command for step {}", pn, step); + } // if active } else { // If this is not a command record, we move to the next level // recursively calling this function on the current field. @@ -729,6 +914,7 @@ pub struct WorkflowLog { // this vector records all field names in the complete workflow. // This is used for locating the step for each command field_id_to_name: Vec, + field_name_to_id: HashMap, // TODO: the trajectory vector in each CommandRecord can be // move here as a long vector, and in each CommandRecord // we just need to record the start pos and the length of its trajectory. @@ -772,12 +958,12 @@ impl WorkflowLog { let workflow_name = template .as_ref() .file_stem() - .unwrap_or_else(|| { - panic!( + .with_context(|| { + format!( "Cannot parse file name of file {}", template.as_ref().display() ) - }) + })? .to_string_lossy() .into_owned(); @@ -804,6 +990,7 @@ impl WorkflowLog { value: workflow_json_value.clone(), cmd_runtime_records: Map::new(), field_id_to_name: Vec::new(), + field_name_to_id: HashMap::new(), // cmds_field_id_trajectory: Vec::new() previous_log, }) @@ -816,13 +1003,8 @@ impl WorkflowLog { }); } - /// Write log to the output path. - /// 1. an execution log file includes the whole workflow, - /// in which succeffully invoked commands have - /// a negative `step` - /// 2. an info log file records runtime, workflow name, - /// output path etc. - pub fn write(&self, succeed: bool) -> anyhow::Result<()> { + /// Build the metadata payload written to `simpleaf_workflow_log.json`. + fn build_meta_info_payload(&self, succeed: bool) -> anyhow::Result { // initiate meta_info let workflow_meta_info = if let Some(workflow_meta_info) = &self.workflow_meta_info { workflow_meta_info.to_owned() @@ -868,7 +1050,7 @@ impl WorkflowLog { let d = Local::now().signed_duration_since(self.workflow_start_time); let execution_elapsed_time = duration_to_dhms(d); - let meta_info = json!( + Ok(json!( { "Workflow Name": self.workflow_name, "Workflow Meta Info": workflow_meta_info, @@ -883,28 +1065,25 @@ impl WorkflowLog { "Command Runtime by Step": Value::from(self.cmd_runtime_records.clone()), }, "Previous Runs": previous_runs - }); + })) + } - // execution log - std::fs::write( - self.meta_info_path.as_path(), - serde_json::to_string_pretty(&meta_info) - .with_context(|| ("Cannot convert json value to string."))?, - ) - .with_context(|| { + /// Write log to the output path. + /// 1. an execution log file includes the whole workflow, + /// in which succeffully invoked commands have + /// a negative `step` + /// 2. an info log file records runtime, workflow name, + /// output path etc. + pub fn write(&self, succeed: bool) -> anyhow::Result<()> { + let meta_info = self.build_meta_info_payload(succeed)?; + write_json_pretty_atomic(self.meta_info_path.as_path(), &meta_info).with_context(|| { format!( "could not write workflow meta info JSON file to {}", self.meta_info_path.display() ) })?; - // execution log - std::fs::write( - self.exec_log_path.as_path(), - serde_json::to_string_pretty(&self.value) - .with_context(|| "Could not convert json value to string.")?, - ) - .with_context(|| { + write_json_pretty_atomic(self.exec_log_path.as_path(), &self.value).with_context(|| { format!( "could not write complete simpleaf workflow JSON file to {}", self.exec_log_path.display() @@ -916,11 +1095,13 @@ impl WorkflowLog { /// Get the index corresponds to the field name in the field_id_to_name vector. pub fn get_field_id(&mut self, field_name: &String) -> usize { - if let Ok(pos) = self.field_id_to_name.binary_search(field_name) { - pos + if let Some(pos) = self.field_name_to_id.get(field_name) { + *pos } else { + let pos = self.field_id_to_name.len(); self.field_id_to_name.push(field_name.to_owned()); - self.field_id_to_name.len() - 1 + self.field_name_to_id.insert(field_name.to_owned(), pos); + pos } } @@ -943,7 +1124,7 @@ impl WorkflowLog { curr_field_name = self .field_id_to_name .get(*curr_field_id) - .expect("Cannot map field ID to name."); + .with_context(|| format!("Cannot map field ID {} to name.", curr_field_id))?; // let curr_pos = field_trajectory_vec.first().expect("Cannot get the first element"); curr_field = curr_field .get_mut(curr_field_name) @@ -1047,6 +1228,13 @@ impl ProgramName { matches!(self, &ProgramName::External(_)) } + /// Return true if an argument contains shell operators and therefore + /// requires shell interpretation. + fn has_shell_metachar(arg: &str) -> bool { + arg.chars() + .any(|c| ['|', '&', ';', '<', '>', '$', '`', '(', ')'].contains(&c)) + } + /// Create a valid simpleaf command object using the arguments recoreded in the field. /// step and program name will be ignored in this procedure pub fn create_simpleaf_cmd(&self, value: &Value) -> anyhow::Result { @@ -1123,10 +1311,18 @@ impl ProgramName { })? ); } - // make Command struct for the command - let external_cmd = shell(arg_vec.join(" ")); + let requires_shell = arg_vec.iter().any(|s| Self::has_shell_metachar(s)); - Ok(WFCommand::ExternalCommand(external_cmd)) + if requires_shell { + // Preserve shell semantics for advanced command forms (e.g. pipes/redirection). + let external_cmd = shell(arg_vec.join(" ")); + Ok(WFCommand::ExternalCommand(external_cmd)) + } else { + // Prefer direct argv invocation for robust argument handling. + let mut external_cmd = std::process::Command::new(&arg_vec[0]); + external_cmd.args(&arg_vec[1..]); + Ok(WFCommand::ExternalCommand(external_cmd)) + } } pub fn create_cmd(&self, value: &Value) -> anyhow::Result { @@ -1367,12 +1563,15 @@ mod tests { use crate::{ utils::{ prog_utils::{get_cmd_line_string, shell}, - workflow_utils::{initialize_workflow, WorkflowLog}, + workflow_utils::{ + execute_commands_in_workflow, initialize_workflow, validate_manifest_structure, + CommandRecord, SimpleafWorkflow, WorkflowLog, + }, }, Commands, }; - use core::panic; - use std::path::PathBuf; + use std::path::{Path, PathBuf}; + use tempfile::tempdir; #[test] fn test_workflow_command() { @@ -1482,69 +1681,70 @@ mod tests { ) .unwrap(); - match &wl { - WorkflowLog { - meta_info_path, + let WorkflowLog { + meta_info_path, + exec_log_path, + workflow_start_time: _, + command_runtime, + num_succ, + start_at, + workflow_name, + workflow_meta_info, + value, + cmd_runtime_records, + field_id_to_name, + field_name_to_id, + skip_step, + previous_log: _, + } = &wl; + { + // test wl + // check JSON log output json + assert_eq!( exec_log_path, - workflow_start_time: _, - command_runtime, - num_succ, - start_at, - workflow_name, - workflow_meta_info, - value, - cmd_runtime_records, - field_id_to_name, - skip_step, - previous_log: _, - } => { - // test wl - // check JSON log output json - assert_eq!( - exec_log_path, - &PathBuf::from("output_dir/workflow_execution_log.json") - ); - assert_eq!( - meta_info_path, - &PathBuf::from("output_dir/simpleaf_workflow_log.json") - ); + &PathBuf::from("output_dir/workflow_execution_log.json") + ); + assert_eq!( + meta_info_path, + &PathBuf::from("output_dir/simpleaf_workflow_log.json") + ); - assert_eq!(workflow_name, &String::from("fake_config")); + assert_eq!(workflow_name, &String::from("fake_config")); - assert_eq!( - workflow_meta_info, - &Some( - workflow_json_value - .get(SystemFields::MetaInfo.as_str()) - .unwrap() - .to_owned() - ) - ); + assert_eq!( + workflow_meta_info, + &Some( + workflow_json_value + .get(SystemFields::MetaInfo.as_str()) + .unwrap() + .to_owned() + ) + ); - let mut new_value = value.to_owned(); - new_value["rna"]["simpleaf_index"]["active"] = json!(true); - new_value["external-commands"]["HTO ref gunzip"]["active"] = json!(true); + let mut new_value = value.to_owned(); + new_value["rna"]["simpleaf_index"]["active"] = json!(true); + new_value["external-commands"]["HTO ref gunzip"]["active"] = json!(true); - assert_eq!(new_value, workflow_json_value); + assert_eq!(new_value, workflow_json_value); - assert_eq!(cmd_runtime_records, &Map::new()); + assert_eq!(cmd_runtime_records, &Map::new()); - assert_eq!(start_at, &2u64); - assert_eq!(skip_step, &vec![3]); - assert!( - field_id_to_name.contains(&"rna".to_string()) - && field_id_to_name.contains(&"meta_info".to_string()) - && field_id_to_name.contains(&"simpleaf_index".to_string()) - && field_id_to_name.contains(&"simpleaf_quant".to_string()) - && field_id_to_name.contains(&"external-commands".to_string()) - && field_id_to_name.contains(&"HTO ref gunzip".to_string()) - && field_id_to_name.contains(&"ADT ref gunzip".to_string()) - ); + assert_eq!(start_at, &2u64); + assert_eq!(skip_step, &vec![3]); + assert!( + field_id_to_name.contains(&"rna".to_string()) + && field_id_to_name.contains(&"meta_info".to_string()) + && field_id_to_name.contains(&"simpleaf_index".to_string()) + && field_id_to_name.contains(&"simpleaf_quant".to_string()) + && field_id_to_name.contains(&"external-commands".to_string()) + && field_id_to_name.contains(&"HTO ref gunzip".to_string()) + && field_id_to_name.contains(&"ADT ref gunzip".to_string()) + ); + assert_eq!(field_id_to_name.len(), field_name_to_id.len()); - assert!(command_runtime.is_none()); + assert!(command_runtime.is_none()); - assert_eq!(num_succ, &0); - } + assert_eq!(num_succ, &0); } // we started at 2, and skipped 3. So there are two commands @@ -1656,4 +1856,269 @@ mod tests { e => panic!("expected SimpleafCommand, found {:?}", e), }; } + + #[test] + fn test_get_field_id_is_stable_for_unsorted_insertions() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let id_z_1 = wl.get_field_id(&"z_field".to_string()); + let _id_a = wl.get_field_id(&"a_field".to_string()); + let id_z_2 = wl.get_field_id(&"z_field".to_string()); + + assert_eq!(id_z_1, id_z_2); + } + + #[test] + fn test_execute_commands_unsupported_simpleaf_command_returns_error() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let sw = SimpleafWorkflow { + af_home_path: tmp.path().to_path_buf(), + cmd_queue: vec![CommandRecord { + step: 1, + active: true, + program_name: ProgramName::Index, + cmd: WFCommand::SimpleafCommand(Box::new(Commands::Inspect {})), + field_trajectory_vec: vec![], + }], + }; + + let err = execute_commands_in_workflow(sw, tmp.path(), &mut wl).unwrap_err(); + assert!( + format!("{:#}", err).contains("Unsupported simpleaf workflow command variant"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn test_validate_manifest_structure_rejects_missing_program_name() { + let manifest = json!({ + "external": { + "broken": { + "step": 1, + "arguments": ["-l"] + } + } + }); + + let err = validate_manifest_structure(&manifest).unwrap_err(); + assert!( + format!("{:#}", err).contains("must contain both `step` and `program_name`"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn test_validate_manifest_structure_rejects_unsupported_simpleaf_command() { + let manifest = json!({ + "bad_simpleaf": { + "step": 1, + "program_name": "simpleaf inspect", + "arguments": [] + } + }); + + let err = validate_manifest_structure(&manifest).unwrap_err(); + assert!( + format!("{:#}", err) + .contains("Only `simpleaf index` and `simpleaf quant` are currently supported"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn test_initialize_workflow_resume_uses_previous_terminated_step() { + let tmp = tempdir().unwrap(); + let previous_log = json!({ + "Succeed": false, + "Latest Run": { + "Execution Terminated Step": 2 + } + }); + std::fs::write( + tmp.path().join("simpleaf_workflow_log.json"), + serde_json::to_string_pretty(&previous_log).unwrap(), + ) + .unwrap(); + + let manifest = json!({ + "external": { + "step1": { + "step": 1, + "program_name": "echo", + "arguments": ["one"] + }, + "step2": { + "step": 2, + "program_name": "echo", + "arguments": ["two"] + } + } + }); + + let (sw, wl) = initialize_workflow( + tmp.path(), + Path::new("resume_template.jsonnet"), + tmp.path(), + manifest, + 1, + vec![], + true, + ) + .unwrap(); + + assert_eq!(wl.start_at, 2); + assert_eq!(sw.cmd_queue.len(), 1); + assert_eq!(sw.cmd_queue[0].step, 2); + } + + #[test] + fn test_initialize_workflow_skip_step_excludes_skipped_command() { + let tmp = tempdir().unwrap(); + let manifest = json!({ + "external": { + "step1": { + "step": 1, + "program_name": "echo", + "arguments": ["one"] + }, + "step2": { + "step": 2, + "program_name": "echo", + "arguments": ["two"] + } + } + }); + + let (sw, wl) = initialize_workflow( + tmp.path(), + Path::new("skip_template.jsonnet"), + tmp.path(), + manifest, + 1, + vec![1], + false, + ) + .unwrap(); + + assert_eq!(wl.skip_step, vec![1]); + assert_eq!(sw.cmd_queue.len(), 1); + assert_eq!(sw.cmd_queue[0].step, 2); + } + + #[test] + fn test_external_command_builder_uses_shell_only_when_required() { + let direct = ProgramName::from_str("echo") + .create_external_cmd(&json!({"arguments": ["hello"]})) + .unwrap(); + let shell_required = ProgramName::from_str("echo") + .create_external_cmd(&json!({"arguments": ["hello", ">", "out.txt"]})) + .unwrap(); + + match direct { + WFCommand::ExternalCommand(cmd) => { + let cmd_line = get_cmd_line_string(&cmd); + assert!( + !cmd_line.contains(" -c "), + "expected direct argv invocation, saw: {}", + cmd_line + ); + } + _ => panic!("expected external command for direct argv case"), + } + + match shell_required { + WFCommand::ExternalCommand(cmd) => { + let cmd_line = get_cmd_line_string(&cmd); + assert!( + cmd_line.contains(" -c "), + "expected shell invocation for redirection case, saw: {}", + cmd_line + ); + } + _ => panic!("expected external command for shell case"), + } + } + + #[test] + fn test_execute_commands_external_failure_reports_status_and_stderr() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let sw = SimpleafWorkflow { + af_home_path: tmp.path().to_path_buf(), + cmd_queue: vec![CommandRecord { + step: 1, + active: true, + program_name: ProgramName::External("sh".to_string()), + cmd: WFCommand::ExternalCommand(shell("echo workflow_phase6_err 1>&2; exit 7")), + field_trajectory_vec: vec![], + }], + }; + + let err = execute_commands_in_workflow(sw, tmp.path(), &mut wl).unwrap_err(); + let rendered = format!("{:#}", err); + assert!( + rendered.contains("failed executing external command"), + "unexpected error: {}", + rendered + ); + assert!( + rendered.contains("exit status"), + "unexpected error: {}", + rendered + ); + assert!( + rendered.contains("workflow_phase6_err"), + "unexpected error: {}", + rendered + ); + } + + #[test] + fn test_workflow_log_write_creates_both_json_logs() { + let tmp = tempdir().unwrap(); + let wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + wl.write(true).unwrap(); + assert!(tmp.path().join("simpleaf_workflow_log.json").is_file()); + assert!(tmp.path().join("workflow_execution_log.json").is_file()); + } } diff --git a/tests/cli_help_snapshots.rs b/tests/cli_help_snapshots.rs new file mode 100644 index 0000000..6d0c5e3 --- /dev/null +++ b/tests/cli_help_snapshots.rs @@ -0,0 +1,116 @@ +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command; + +use tempfile::tempdir; + +fn snapshot_cases() -> Vec<(&'static str, Vec<&'static str>)> { + vec![ + ("simpleaf___help.txt", vec!["--help"]), + ("simpleaf_index___help.txt", vec!["index", "--help"]), + ("simpleaf_quant___help.txt", vec!["quant", "--help"]), + ("simpleaf_chemistry___help.txt", vec!["chemistry", "--help"]), + ( + "simpleaf_chemistry_add___help.txt", + vec!["chemistry", "add", "--help"], + ), + ( + "simpleaf_chemistry_remove___help.txt", + vec!["chemistry", "remove", "--help"], + ), + ( + "simpleaf_chemistry_clean___help.txt", + vec!["chemistry", "clean", "--help"], + ), + ( + "simpleaf_chemistry_lookup___help.txt", + vec!["chemistry", "lookup", "--help"], + ), + ( + "simpleaf_chemistry_refresh___help.txt", + vec!["chemistry", "refresh", "--help"], + ), + ( + "simpleaf_chemistry_fetch___help.txt", + vec!["chemistry", "fetch", "--help"], + ), + ("simpleaf_inspect___help.txt", vec!["inspect", "--help"]), + ("simpleaf_set_paths___help.txt", vec!["set-paths", "--help"]), + ( + "simpleaf_refresh_prog_info___help.txt", + vec!["refresh-prog-info", "--help"], + ), + ("simpleaf_workflow___help.txt", vec!["workflow", "--help"]), + ( + "simpleaf_workflow_run___help.txt", + vec!["workflow", "run", "--help"], + ), + ( + "simpleaf_workflow_get___help.txt", + vec!["workflow", "get", "--help"], + ), + ( + "simpleaf_workflow_patch___help.txt", + vec!["workflow", "patch", "--help"], + ), + ( + "simpleaf_workflow_list___help.txt", + vec!["workflow", "list", "--help"], + ), + ( + "simpleaf_workflow_refresh___help.txt", + vec!["workflow", "refresh", "--help"], + ), + ("simpleaf_atac___help.txt", vec!["atac", "--help"]), + ( + "simpleaf_atac_index___help.txt", + vec!["atac", "index", "--help"], + ), + ( + "simpleaf_atac_process___help.txt", + vec!["atac", "process", "--help"], + ), + ] +} + +fn snapshots_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("snapshots") + .join("cli-help") +} + +#[test] +fn cli_help_outputs_match_snapshots() { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("unable to create temp ALEVIN_FRY_HOME"); + + for (snapshot_file, args) in snapshot_cases() { + let output = Command::new(binary) + .args(&args) + .env("ALEVIN_FRY_HOME", af_home.path()) + .env("COLUMNS", "100") + .output() + .expect("failed to execute simpleaf"); + + assert!( + output.status.success(), + "expected success for args {:?}, got status {:?}\nstderr:\n{}", + &args, + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + + let actual = String::from_utf8(output.stdout).expect("stdout was not valid UTF-8"); + let expected_path = snapshots_dir().join(snapshot_file); + let expected = fs::read_to_string(&expected_path).unwrap_or_else(|e| { + panic!("failed reading snapshot {}: {}", expected_path.display(), e) + }); + + assert_eq!( + actual, expected, + "help output drifted for args {:?} against snapshot {}", + &args, snapshot_file + ); + } +} diff --git a/tests/cli_smoke.rs b/tests/cli_smoke.rs new file mode 100644 index 0000000..fa016bd --- /dev/null +++ b/tests/cli_smoke.rs @@ -0,0 +1,69 @@ +use std::process::{Command, Output}; + +use tempfile::tempdir; + +fn run_simpleaf(args: &[&str]) -> Output { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("unable to create temp ALEVIN_FRY_HOME"); + + Command::new(binary) + .args(args) + .env("ALEVIN_FRY_HOME", af_home.path()) + .output() + .expect("failed to execute simpleaf") +} + +fn assert_parse_error(args: &[&str]) { + let output = run_simpleaf(args); + assert!( + !output.status.success(), + "expected parse error for args {:?}", + args + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Usage:"), + "expected clap usage in stderr for {:?}, got:\n{}", + args, + stderr + ); +} + +#[test] +fn quant_requires_input_type_group() { + assert_parse_error(&[ + "quant", "-c", "10xv3", "-o", "/tmp/out", "-r", "cr-like", "--knee", + ]); +} + +#[test] +fn quant_rejects_conflicting_map_inputs() { + assert_parse_error(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + "--index", + "/tmp/index", + "-1", + "r1.fastq", + "-2", + "r2.fastq", + ]); +} + +#[test] +fn index_requires_gtf_when_fasta_is_provided() { + assert_parse_error(&["index", "-f", "genome.fa", "-o", "/tmp/index_out"]); +} + +#[test] +fn atac_command_requires_subcommand() { + assert_parse_error(&["atac"]); +} diff --git a/tests/phase1_regressions.rs b/tests/phase1_regressions.rs new file mode 100644 index 0000000..58bce8e --- /dev/null +++ b/tests/phase1_regressions.rs @@ -0,0 +1,59 @@ +use std::fs; +use std::process::Command; + +use serde_json::json; +use tempfile::tempdir; + +#[test] +fn salmon_missing_error_mentions_salmon_executable() { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("failed to create temp af home"); + let t2g = af_home.path().join("t2g.tsv"); + fs::write(&t2g, "tx1\tgene1\n").expect("failed to write t2g"); + + let af_info = json!({ + "prog_info": { + "salmon": null, + "piscem": null, + "alevin_fry": {"exe_path": "/bin/echo", "version": "0.11.2"}, + "macs": null + } + }); + fs::write( + af_home.path().join("simpleaf_info.json"), + serde_json::to_string_pretty(&af_info).expect("failed to serialize af info"), + ) + .expect("failed to write simpleaf_info.json"); + + let output = Command::new(binary) + .env("ALEVIN_FRY_HOME", af_home.path()) + .args([ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "-i", + "/tmp/fake_index", + "--no-piscem", + "-1", + "r1.fastq", + "-2", + "r2.fastq", + "-m", + &t2g.to_string_lossy(), + ]) + .output() + .expect("failed to run simpleaf"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("no salmon executable is provided"), + "stderr did not mention salmon executable:\n{}", + stderr + ); +} diff --git a/tests/snapshots/cli-help/simpleaf___help.txt b/tests/snapshots/cli-help/simpleaf___help.txt new file mode 100644 index 0000000..d722166 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf___help.txt @@ -0,0 +1,18 @@ +A rust framework to make using alevin-fry and alevin-fry-ATAC even simpler. + +Usage: simpleaf + +Commands: + index build the (expanded) reference index + chemistry operate on or inspect the chemistry registry + inspect inspect the current configuration + quant quantify a sample + set-paths set paths to the programs that simpleaf will use + refresh-prog-info refreshes version information associated with programs used by simpleaf + atac run a sub-command dealing with atac-seq data + workflow simpleaf workflow related command set + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_atac___help.txt b/tests/snapshots/cli-help/simpleaf_atac___help.txt new file mode 100644 index 0000000..b0bb92f --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac___help.txt @@ -0,0 +1,13 @@ +run a sub-command dealing with atac-seq data + +Usage: simpleaf atac + +Commands: + index build a piscem index over the genome for scATAC-seq mapping + process process a scATAC-seq sample by performing mapping, barcode correction, and sorted + (deduplicated) BED file generation + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_atac_index___help.txt b/tests/snapshots/cli-help/simpleaf_atac_index___help.txt new file mode 100644 index 0000000..e2a9344 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac_index___help.txt @@ -0,0 +1,26 @@ +build a piscem index over the genome for scATAC-seq mapping + +Usage: simpleaf atac index [OPTIONS] --input --output + +Options: + -i, --input target sequences (provide target sequences directly; avoid expanded + reference construction) + -o, --output path to output directory (will be created if it doesn't exist) + -t, --threads number of threads to use when running [default: 16] + --work-dir working directory where temporary files should be placed [default: + ./workdir.noindex] + --seed seed value to use in SSHash index construction (try changing this in + the rare event index build fails) [default: 1] + --overwrite overwrite existing files if the output directory is already populated + -h, --help Print help + -V, --version Print version + +Index Configuration Options: + -k, --kmer-length + the value of k to be used to construct the index [default: 25] + -m, --minimizer-length + the value of m to be used to construct the piscem index (must be < k) [default: 17] + +Piscem Index Options: + --decoy-paths path to (optional) decoy sequence used to insert poison k-mer + information into the index (only if using piscem >= 0.7) diff --git a/tests/snapshots/cli-help/simpleaf_atac_process___help.txt b/tests/snapshots/cli-help/simpleaf_atac_process___help.txt new file mode 100644 index 0000000..6399d38 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac_process___help.txt @@ -0,0 +1,86 @@ +process a scATAC-seq sample by performing mapping, barcode correction, and sorted (deduplicated) BED +file generation + +Usage: simpleaf atac process [OPTIONS] --index --barcode-reads --chemistry --output + +Options: + -c, --chemistry chemistry [possible values: 10x-v1, 10x-v2, 10x-multi] + -t, --threads number of threads to use when running [default: 16] + --output + --call-peaks do peak calling after generating the bed file + -h, --help Print help + -V, --version Print version + +Mapping Options: + -i, --index + path to index + -1, --reads1 + comma-separated list of paths to read 1 files + -2, --reads2 + comma-separated list of paths to read 2 files + -r, --reads + path to the read files containing single-end reads + -b, --barcode-reads + path to the read files containing the cell barcodes + --barcode-length + the length of the barcode read from which to extract the barcode (usually this is the + length of the entire read, and reads shorter than this will be discarded) [default: 16] + +Permit List Generation Options: + --permit-barcode-ori + The expected orientation of the barcodes in the permit list relative to the barcodes + extracted from the reads. If this is "fw", it is expected that the sequences will match + directly, if "rc" it is expected the reverse complement of the permit-list sequence will + match the reads' barcodes. If provided, this value will be used, if not provided, simpleaf + will attempt to look up the appropriate orientation in the chemistry registry + -u, --unfiltered-pl + Use the provided file as the unfiltered permit list (i.e. whitelist). This argument only + needs to be provided if you are providing the permit list explicitly, overriding the + default permit list for the provided chemistry + --min-reads + minimum read count threshold for a cell to be retained/processed; only used with + --unfiltered-pl [default: 10] + +Advanced Options: + --compress + compress the output mapping bed file + --ignore-ambig-hits + skip checking of the equivalence classes of k-mers that were too ambiguous to be otherwise + considered (passing this flag can speed up mapping slightly, but may reduce specificity) + --no-poison + do not consider poison k-mers, even if the underlying index contains them. In this case, + the mapping results will be identical to those obtained as if no poison table was added to + the index + --use-chr + use chromosomes as color + --thr + threshold to be considered for pseudoalignment [default: 0.7] + --bin-size + size of virtual color intervals [default: 1000] + --bin-overlap + size for virtual color interval overlap [default: 300] + --no-tn5-shift + do not apply Tn5 shift to mapped positions + --check-kmer-orphan + Check if any mapping kmer exist for a mate which is not mapped, but there exists mapping + for the other read. If set to true and a mapping kmer exists, then the pair would not be + mapped + --max-ec-card + determines the maximum cardinality equivalence class (number of (txp, orientation status) + pairs) to examine (cannot be used with --ignore-ambig-hits) [default: 4096] + --max-hit-occ + in the first pass, consider only k-mers having <= --max-hit-occ hits [default: 256] + --max-hit-occ-recover + if all k-mers have > --max-hit-occ hits, then make a second pass and consider k-mers + having <= --max-hit-occ-recover hits [default: 1024] + --max-read-occ + reads with more than this number of mappings will not have their mappings reported + [default: 2500] + +Peak Caller Options: + --gsize The value to be passed to the `macs3` `--gsize` (genome size) option. + Possible values are "hs", "mm", "ce", "dm" or an unsigned integer + [default: hs] + --qvalue The value to be passed to the `macs3` `--qvalue` (minimum FDR cutoff) + option [default: 0.1] + --extsize The value to be passed to the `macs3` `--extsize` option [default: 50] diff --git a/tests/snapshots/cli-help/simpleaf_chemistry___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry___help.txt new file mode 100644 index 0000000..b0c9a42 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry___help.txt @@ -0,0 +1,16 @@ +operate on or inspect the chemistry registry + +Usage: simpleaf chemistry + +Commands: + refresh Update the local chemistry registry according to the upstream repository + add Add a new or update an existing chemistry in the local registry + remove Remove chemistries from the local chemistry registry + clean Remove cached permit list files that do not belong to any registered chemistries + lookup Look up chemistries in the local registry and print the details + fetch Download the permit list files for registered chemistries + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt new file mode 100644 index 0000000..e2c6824 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt @@ -0,0 +1,28 @@ +Add a new or update an existing chemistry in the local registry + +Usage: simpleaf chemistry add [OPTIONS] --name + +Options: + -n, --name The name to give to the chemistry + -g, --geometry A quoted string representing the geometry to which the + chemistry maps + -e, --expected-ori The direction of the first (most upstream) mappable biological + sequence [possible values: fw, rc, both] + --local-url The (fully-qualified) path to a local permit list file that + will be copied into the ALEVIN_FRY_HOME directory for future + use + --remote-url The url of a remote file that will be downloaded (on demand) to + provide a permit list for use with this chemistry. This file + should be obtainable with the equivalent of `wget `. + The file will only be downloaded the first time it is needed + and will be locally cached in ALEVIN_FRY_HOME after that + --version A semver format version tag, e.g., `0.1.0`, indicating the + version of the chemistry definition. To update a registered + chemistry, please provide a higher version number, e.g., + `0.2.0` [default: 0.0.0] + --from-json Instead of providing the chemistry directly on the command + line, use the chemistry definition provided in the provided + JSON file. This JSON file can be local or remote, but it must + contain a valid JSON object with the provided `--name` as the + key of the chemistry you wish to add + -h, --help Print help diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt new file mode 100644 index 0000000..126a384 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt @@ -0,0 +1,8 @@ +Remove cached permit list files that do not belong to any registered chemistries + +Usage: simpleaf chemistry clean [OPTIONS] + +Options: + -d, --dry-run Print the permit list file(s) that will be removed without removing them + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt new file mode 100644 index 0000000..5cff394 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt @@ -0,0 +1,11 @@ +Download the permit list files for registered chemistries + +Usage: simpleaf chemistry fetch [OPTIONS] --name + +Options: + -n, --name A comma-separated list of chemistry names to fetch (or a *single* regex pattern + for matching multiple chemistries). Use '.*' to fetch for all registered + chemistries + -d, --dry-run Print the permit list file(s) that will be downloaded without downloading them + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt new file mode 100644 index 0000000..9bbd2fa --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt @@ -0,0 +1,9 @@ +Look up chemistries in the local registry and print the details + +Usage: simpleaf chemistry lookup --name + +Options: + -n, --name The name of a registered chemistry, or a regex pattern for matching registered + chemistries' names + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt new file mode 100644 index 0000000..ef471ac --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt @@ -0,0 +1,9 @@ +Update the local chemistry registry according to the upstream repository + +Usage: simpleaf chemistry refresh [OPTIONS] + +Options: + -f, --force overwrite existing chemistries even if the versions aren't newer + -d, --dry-run print the chemistries that will be added or updated without modifying the local + registry + -h, --help Print help diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt new file mode 100644 index 0000000..b704273 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt @@ -0,0 +1,10 @@ +Remove chemistries from the local chemistry registry + +Usage: simpleaf chemistry remove [OPTIONS] --name + +Options: + -n, --name A chemistry name or a regex pattern matching the names of chemistries in the + registry to remove + -d, --dry-run Print the chemistries that would be removed without removing them + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_index___help.txt b/tests/snapshots/cli-help/simpleaf_index___help.txt new file mode 100644 index 0000000..29c9d44 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_index___help.txt @@ -0,0 +1,61 @@ +build the (expanded) reference index + +Usage: simpleaf index [OPTIONS] --output <--fasta |--ref-seq |--probe-csv |--feature-csv > + +Options: + -o, --output Path to output directory (will be created if it doesn't exist) + -t, --threads Number of threads to use when running [default: 16] + -k, --kmer-length The value of k to be used to construct the index [default: 31] + --gff3-format Denotes that the input annotation is a GFF3 (instead of GTF) file + --keep-duplicates Keep duplicated identical sequences when constructing the index + --overwrite Overwrite existing files if the output directory is already + populated + -h, --help Print help + -V, --version Print version + +Expanded Reference Options: + --ref-type Specify whether an expanded reference, spliced+intronic (or splici) + or spliced+unspliced (or spliceu), should be built [default: + spliced+intronic] + -f, --fasta Path to a reference genome to be used for the expanded reference + construction + -g, --gtf Path to a reference GTF/GFF3 file to be used for the expanded + reference construction + -r, --rlen The Read length used in roers to add flanking lengths to intronic + sequences + --dedup Deduplicate identical sequences in roers when building the expanded + reference + --spliced Path to a FASTA file with extra spliced sequence to add to the index + --unspliced Path to a FASTA file with extra unspliced sequence to add to the + index + +Direct Reference Options: + --feature-csv Path to a CSV file containing feature barcode sequences to use + for direct reference indexing. The file must follow the format of + 10x Feature Reference CSV. Currently, only three columns are + used: id, name, and sequence + --probe-csv Path to a CSV file containing probe sequences to use for direct + reference indexing. The file must follow the format of 10x Probe + Set Reference v2 CSV, containing four mandatory columns: gene_id, + probe_seq, probe_id, and included (TRUE or FALSE), and an + optional column: region (spliced or unspliced) + --ref-seq Path to a FASTA file containing reference sequences to directly + build index on, and avoid expanded reference construction + +Piscem Index Options: + -m, --minimizer-length + Minimizer length to be used to construct the piscem index (must be < k) [default: 19] + --decoy-paths + Paths to decoy sequence FASTA files used to insert poison k-mer information into the index + (only if using piscem >= 0.7) + --seed + The seed value to use in SSHash index construction (try changing this in the rare event + index build fails) [default: 1] + --work-dir + The working directory where temporary files should be placed [default: ./workdir.noindex] + --use-piscem + Use piscem instead of salmon for indexing and mapping (default) + +Alternative salmon-alevin Index Options: + -p, --sparse If this flag is passed, build the sparse rather than dense index for mapping + --no-piscem Don't use the default piscem mapper, instead, use salmon-alevin diff --git a/tests/snapshots/cli-help/simpleaf_inspect___help.txt b/tests/snapshots/cli-help/simpleaf_inspect___help.txt new file mode 100644 index 0000000..b4fd152 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_inspect___help.txt @@ -0,0 +1,7 @@ +inspect the current configuration + +Usage: simpleaf inspect + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_quant___help.txt b/tests/snapshots/cli-help/simpleaf_quant___help.txt new file mode 100644 index 0000000..da4cc31 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_quant___help.txt @@ -0,0 +1,78 @@ +quantify a sample + +Usage: simpleaf quant [OPTIONS] --chemistry --output --resolution <--expect-cells |--explicit-pl |--forced-cells |--knee|--unfiltered-pl []> <--index |--map-dir > + +Options: + -c, --chemistry The name of a registered chemistry or a quoted string representing a + custom geometry specification + -o, --output Path to the output directory + -t, --threads Number of threads to use when running [default: 16] + -h, --help Print help + -V, --version Print version + +Mapping Options: + -i, --index Path to a folder containing the index files + -1, --reads1 Comma-separated list of paths to read 1 files. The order must match + the read 2 files + -2, --reads2 Comma-separated list of paths to read 2 files. The order must match + the read 1 files + --no-piscem Don't use the default piscem mapper, instead, use salmon-alevin + --use-piscem Use piscem for mapping (requires that index points to the piscem + index) + -s, --use-selective-alignment Use selective-alignment for mapping (only if using salmon alevin as + the underlying mapper) + --map-dir Path to a mapped output directory containing a RAD file to skip + mapping + +Piscem Mapping Options: + --struct-constraints + If piscem >= 0.7.0, enable structural constraints + --ignore-ambig-hits + Skip checking of the equivalence classes of k-mers that were too ambiguous to be otherwise + considered (passing this flag can speed up mapping slightly, but may reduce specificity) + --no-poison + Do not consider poison k-mers, even if the underlying index contains them. In this case, + the mapping results will be identical to those obtained as if no poison table was added to + the index + --skipping-strategy + The skipping strategy to use for k-mer collection [default: permissive] [possible values: + permissive, strict] + --max-ec-card + Determines the maximum cardinality equivalence class (number of (txp, orientation status) + pairs) to examine (cannot be used with --ignore-ambig-hits) [default: 4096] + --max-hit-occ + In the first pass, consider only collected and matched k-mers of a read having <= + --max-hit-occ hits [default: 256] + --max-hit-occ-recover + If all collected and matched k-mers of a read have > --max-hit-occ hits, then make a + second pass and consider k-mers having <= --max-hit-occ-recover hits [default: 1024] + --max-read-occ + Threshold for discarding reads with too many mappings [default: 2500] + +Permit List Generation Options: + -k, --knee + Use knee filtering mode + -u, --unfiltered-pl [] + Use unfiltered permit list + -f, --forced-cells + Use forced number of cells + -x, --explicit-pl + Use a filtered, explicit permit list + -e, --expect-cells + Use expected number of cells + -d, --expected-ori + The expected direction/orientation of alignments in the chemistry being processed. If not + provided, will default to `fw` for 10xv2/10xv3, otherwise `both` [possible values: fw, rc, + both] + --min-reads + Minimum read count threshold for a cell to be retained/processed; only use with + --unfiltered-pl [default: 10] + +UMI Resolution Options: + -m, --t2g-map Path to a transcript to gene map file + -r, --resolution UMI resolution mode [possible values: cr-like, cr-like-em, + parsimony, parsimony-em, parsimony-gene, parsimony-gene-em] + +Output Options: + --anndata-out Generate an anndata (h5ad format) count matrix from the standard (matrix-market + format) output diff --git a/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt b/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt new file mode 100644 index 0000000..5125ed6 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt @@ -0,0 +1,7 @@ +refreshes version information associated with programs used by simpleaf + +Usage: simpleaf refresh-prog-info + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_set_paths___help.txt b/tests/snapshots/cli-help/simpleaf_set_paths___help.txt new file mode 100644 index 0000000..03bc425 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_set_paths___help.txt @@ -0,0 +1,11 @@ +set paths to the programs that simpleaf will use + +Usage: simpleaf set-paths [OPTIONS] + +Options: + -s, --salmon path to salmon to use + -p, --piscem path to piscem to use + -a, --alevin-fry path to alein-fry to use + -m, --macs path to macs3 to use + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow___help.txt b/tests/snapshots/cli-help/simpleaf_workflow___help.txt new file mode 100644 index 0000000..a769590 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow___help.txt @@ -0,0 +1,18 @@ +simpleaf workflow related command set + +Usage: simpleaf workflow + simpleaf workflow + +Commands: + list Print a summary of the currently available workflows + refresh Update the local copy of protocol esturary to the latest version + get Get the workflow template and related files of a registered workflow + patch Patch a workflow template or instantiated manifest with a subset of parameters to produce + a series of workflow manifests + run Parse and instantiate a workflow template and invoke the workflow commands, or run an + instantiated manifest directly + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt new file mode 100644 index 0000000..2546441 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt @@ -0,0 +1,9 @@ +Get the workflow template and related files of a registered workflow + +Usage: simpleaf workflow get --name --output + +Options: + -o, --output path to dump the folder containing the workflow related files + -n, --name name of the queried workflow + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt new file mode 100644 index 0000000..3d2835c --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt @@ -0,0 +1,7 @@ +Print a summary of the currently available workflows + +Usage: simpleaf workflow list + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt new file mode 100644 index 0000000..a007158 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt @@ -0,0 +1,22 @@ +Patch a workflow template or instantiated manifest with a subset of parameters to produce a series +of workflow manifests + +Usage: simpleaf workflow patch [OPTIONS] --patch <--manifest |--template