diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 314331e..d9aa406 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -# This file was autogenerated by cargo-dist: https://opensource.axo.dev/cargo-dist/ +# This file was autogenerated by dist: https://axodotdev.github.io/cargo-dist # # Copyright 2022-2024, axodotdev # SPDX-License-Identifier: MIT or Apache-2.0 @@ -6,7 +6,7 @@ # CI that: # # * checks for a Git Tag that looks like a release -# * builds artifacts with cargo-dist (archives, installers, hashes) +# * builds artifacts with dist (archives, installers, hashes) # * uploads those artifacts to temporary workflow zip # * on success, uploads the artifacts to a GitHub Release # @@ -24,10 +24,10 @@ permissions: # must be a Cargo-style SemVer Version (must have at least major.minor.patch). # # If PACKAGE_NAME is specified, then the announcement will be for that -# package (erroring out if it doesn't have the given version or isn't cargo-dist-able). +# package (erroring out if it doesn't have the given version or isn't dist-able). # # If PACKAGE_NAME isn't specified, then the announcement will be for all -# (cargo-dist-able) packages in the workspace with that version (this mode is +# (dist-able) packages in the workspace with that version (this mode is # intended for workspaces with only one dist-able package, or with all dist-able # packages versioned/released in lockstep). 
# @@ -45,9 +45,9 @@ on: - '**[0-9]+.[0-9]+.[0-9]+*' jobs: - # Run 'cargo dist plan' (or host) to determine what tasks we need to do + # Run 'dist plan' (or host) to determine what tasks we need to do plan: - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" outputs: val: ${{ steps.plan.outputs.manifest }} tag: ${{ !github.event.pull_request && github.ref_name || '' }} @@ -56,19 +56,20 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: + persist-credentials: false submodules: recursive - - name: Install cargo-dist + - name: Install dist # we specify bash to get pipefail; it guards against the `curl` command # failing. otherwise `sh` won't catch that `curl` returned non-0 shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.21.1/cargo-dist-installer.sh | sh" - - name: Cache cargo-dist - uses: actions/upload-artifact@v4 + run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.31.0/cargo-dist-installer.sh | sh" + - name: Cache dist + uses: actions/upload-artifact@v6 with: name: cargo-dist-cache - path: ~/.cargo/bin/cargo-dist + path: ~/.cargo/bin/dist # sure would be cool if github gave us proper conditionals... # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible # functionality based on whether this is a pull_request, and whether it's from a fork. @@ -76,12 +77,12 @@ jobs: # but also really annoying to build CI around when it needs secrets to work right.) 
- id: plan run: | - cargo dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json - echo "cargo dist ran successfully" + dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json + echo "dist ran successfully" cat plan-dist-manifest.json echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: artifacts-plan-dist-manifest path: plan-dist-manifest.json @@ -95,18 +96,19 @@ jobs: if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }} strategy: fail-fast: false - # Target platforms/runners are computed by cargo-dist in create-release. + # Target platforms/runners are computed by dist in create-release. 
# Each member of the matrix has the following arguments: # # - runner: the github runner - # - dist-args: cli flags to pass to cargo dist - # - install-dist: expression to run to install cargo-dist on the runner + # - dist-args: cli flags to pass to dist + # - install-dist: expression to run to install dist on the runner # # Typically there will be: # - 1 "global" task that builds universal installers # - N "local" tasks that build each platform's binaries and platform-specific installers matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }} runs-on: ${{ matrix.runner }} + container: ${{ matrix.container && matrix.container.image || null }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json @@ -114,14 +116,22 @@ jobs: - name: enable windows longpaths run: | git config --global core.longpaths true - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: + persist-credentials: false submodules: recursive - - name: Install cargo-dist - run: ${{ matrix.install_dist }} + - name: Install Rust non-interactively if not already installed + if: ${{ matrix.container }} + run: | + if ! 
command -v cargo > /dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + fi + - name: Install dist + run: ${{ matrix.install_dist.run }} # Get the dist-manifest - name: Fetch local artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: pattern: artifacts-* path: target/distrib/ @@ -132,8 +142,8 @@ jobs: - name: Build artifacts run: | # Actually do builds and make zips and whatnot - cargo dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json + echo "dist ran successfully" - id: cargo-dist name: Post-build # We force bash here just because github makes it really hard to get values up @@ -143,12 +153,12 @@ jobs: run: | # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" - jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" + dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" cp dist-manifest.json "$BUILD_MANIFEST_NAME" - name: "Upload artifacts" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: artifacts-build-local-${{ join(matrix.targets, '_') }} path: | @@ -160,23 +170,24 @@ jobs: needs: - plan - build-local-artifacts - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: + persist-credentials: false submodules: recursive - - name: Install cached cargo-dist - uses: actions/download-artifact@v4 + - name: Install cached dist + uses: actions/download-artifact@v7 with: name: 
cargo-dist-cache path: ~/.cargo/bin/ - - run: chmod +x ~/.cargo/bin/cargo-dist + - run: chmod +x ~/.cargo/bin/dist # Get all the local artifacts for the global tasks to use (for e.g. checksums) - name: Fetch local artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: pattern: artifacts-* path: target/distrib/ @@ -184,8 +195,8 @@ jobs: - id: cargo-dist shell: bash run: | - cargo dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json + echo "dist ran successfully" # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" @@ -194,7 +205,7 @@ jobs: cp dist-manifest.json "$BUILD_MANIFEST_NAME" - name: "Upload artifacts" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: artifacts-build-global path: | @@ -206,26 +217,27 @@ jobs: - plan - build-local-artifacts - build-global-artifacts - # Only run if we're "publishing", and only if local and global didn't fail (skipped is fine) - if: ${{ always() && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} + # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine) + if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" outputs: val: ${{ 
steps.host.outputs.manifest }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: + persist-credentials: false submodules: recursive - - name: Install cached cargo-dist - uses: actions/download-artifact@v4 + - name: Install cached dist + uses: actions/download-artifact@v7 with: name: cargo-dist-cache path: ~/.cargo/bin/ - - run: chmod +x ~/.cargo/bin/cargo-dist + - run: chmod +x ~/.cargo/bin/dist # Fetch artifacts from scratch-storage - name: Fetch artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: pattern: artifacts-* path: target/distrib/ @@ -233,19 +245,19 @@ jobs: - id: host shell: bash run: | - cargo dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json + dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json echo "artifacts uploaded and released successfully" cat dist-manifest.json echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: # Overwrite the previous copy name: artifacts-dist-manifest path: dist-manifest.json # Create a GitHub Release while uploading all files to it - name: "Download GitHub Artifacts" - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: pattern: artifacts-* path: artifacts @@ -274,10 +286,11 @@ jobs: # still allowing individual publish jobs to skip themselves (for prereleases). # "host" however must run to completion, no skipping allowed! 
if: ${{ always() && needs.host.result == 'success' }} - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: + persist-credentials: false submodules: recursive diff --git a/.gitignore b/.gitignore index e6a88f4..270b9d5 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ TEST_REF TEST_REF_CLAPV4 TEST_REF_ROERS blarg + +# macOS Finder metadata +**/.DS_Store diff --git a/Cargo.lock b/Cargo.lock index 5e9fd85..60f74e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -912,6 +912,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "file-requirements" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab1300e0345c97d6d5ed20b90d53ea83e710e971215b47590ac5c6c69847b65" +dependencies = [ + "thiserror 2.0.12", +] + [[package]] name = "flate2" version = "1.1.1" @@ -3577,6 +3586,7 @@ dependencies = [ "clap", "cmd_lib", "csv", + "file-requirements", "flate2", "jrsonnet-cli", "jrsonnet-evaluator", diff --git a/Cargo.toml b/Cargo.toml index 7744979..70acc7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "simpleaf" version = "0.19.5" -edition = "2021" +edition = "2024" authors = [ "Rob Patro ", "Dongze He ", @@ -15,6 +15,7 @@ homepage = "https://simpleaf.readthedocs.io" #documentation = "https://fry.readthedocs.io/en/latest/" include = [ "/src/*.rs", + "/src/core/*.rs", "/src/utils/*.rs", "/src/utils/af_utils/*.rs", "/src/atac/*.rs", @@ -85,6 +86,7 @@ regex = { version = "1.11.1", default-features = false, features = [ tempfile = "3.19.1" ureq = { version = "3.0.11", features = ["json"] } af-anndata = { version = "0.3.3", git = "https://github.com/COMBINE-lab/af-anndata" } +file-requirements = "0.1.0" [profile.release] lto = "thin" @@ -94,27 +96,3 @@ opt-level = 
3 [profile.dist] inherits = "release" lto = "thin" - -# Config for 'cargo dist' -[workspace.metadata.dist] -# The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax) -cargo-dist-version = "0.21.1" -# CI backends to support -ci = "github" -# The installers to generate for each app -installers = ["shell"] -# Target platforms to build apps for (Rust target-triple syntax) -targets = [ - "aarch64-apple-darwin", - "x86_64-apple-darwin", - "x86_64-unknown-linux-gnu", -] -# Which actions to run on pull requests -pr-run-mode = "plan" -# Whether to install an updater program -install-updater = true -# Path that installers should place binaries in -install-path = "CARGO_HOME" - -[workspace.metadata.dist.github-custom-runners] -aarch64-apple-darwin = "macos-14" diff --git a/dist-workspace.toml b/dist-workspace.toml new file mode 100644 index 0000000..6d6b111 --- /dev/null +++ b/dist-workspace.toml @@ -0,0 +1,22 @@ +[workspace] +members = ["cargo:."] + +# Config for 'dist' +[dist] +# The preferred dist version to use in CI (Cargo.toml SemVer syntax) +cargo-dist-version = "0.31.0" +# CI backends to support +ci = "github" +# The installers to generate for each app +installers = ["shell"] +# Target platforms to build apps for (Rust target-triple syntax) +targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu"] +# Which actions to run on pull requests +pr-run-mode = "plan" +# Whether to install an updater program +install-updater = true +# Path that installers should place binaries in +install-path = "CARGO_HOME" + +[dist.github-custom-runners] +aarch64-apple-darwin = "macos-14" diff --git a/docs/architecture-map.md b/docs/architecture-map.md new file mode 100644 index 0000000..5c2e110 --- /dev/null +++ b/docs/architecture-map.md @@ -0,0 +1,77 @@ +# simpleaf Architecture Map + +This document summarizes the post-refactor module boundaries and where new code should live. 
+ +## CLI Entry and Dispatch +- `src/main.rs` + - Parses CLI arguments and dispatches to command handlers. + - Should remain thin: argument parsing, high-level routing, and top-level logging only. +- `src/simpleaf_commands.rs` + - Defines CLI option structs and subcommands. + - Avoid business logic here; keep it as schema/contract for command-line interfaces. + +## Command Implementations +- `src/simpleaf_commands/indexing.rs` + - RNA reference/index orchestration. + - Uses staged outputs for readability and testability. +- `src/simpleaf_commands/quant.rs` + - RNA mapping/quantification orchestration. + - Stage-level helpers produce typed intermediate outputs. +- `src/atac/process.rs` + - ATAC processing pipeline (map/gpl/sort/macs). + - Stage decomposition mirrors RNA command structure. +- `src/simpleaf_commands/workflow.rs` + - Workflow command front-end (run/list/get/patch/refresh). + - Handles run-input resolution before delegating planning/execution. +- `src/simpleaf_commands/chemistry.rs` + - Chemistry registry and permit-list lifecycle operations. + - Contains registry merge/update logic and dry-run-safe file operations. + +## Core Shared Infrastructure +- `src/core/context.rs` + - Runtime context loading/validation (e.g., AF home, tool info). +- `src/core/exec.rs` + - Checked command execution helpers. +- `src/core/index_meta.rs` + - Shared index metadata discovery and parsing. +- `src/core/runtime.rs` + - Runtime helpers (e.g., thread capping). +- `src/core/io.rs` + - JSON file read/write and atomic write helpers. + +## Utility Layer +- `src/utils/workflow_utils.rs` + - Workflow manifest validation, planning, command queue building, execution, and logging. + - Contains typed workflow execution errors and log schema construction. +- `src/utils/chem_utils.rs` + - Chemistry model definitions and registry parsing helpers. +- `src/utils/prog_utils.rs` + - Program/tool discovery and generic process/IO utilities. 
+- `src/utils/af_utils.rs` + - Domain utilities for geometry and command-level support logic. + +## Testing Layout +- `tests/cli_help_snapshots.rs` + - Guards user-facing CLI help contract. +- `tests/cli_smoke.rs` + - Lightweight argument/dispatch integration checks. +- `tests/phase1_regressions.rs` + - High-value regression checks for prior production bugs. +- Module-local unit tests in `src/**` + - Stage/helper-focused tests close to implementation. + +## CI and Validation Paths +- Strict quality gate: + - `cargo fmt --check` + - `cargo clippy --all-targets -- -D warnings` + - `cargo test --all-targets` +- Deterministic local integration path: + - `scripts/test_simpleaf_local_integration.sh` +- Full e2e toy dataset path (heavier): + - `scripts/test_simpleaf.sh` + +## Extension Guidelines +- Add reusable primitives under `src/core/` before duplicating orchestration logic. +- Keep command modules focused on stage orchestration and user-facing flow. +- Prefer typed errors (`thiserror`) where multiple internal failure variants matter. +- Use atomic writes for registry/log/state files that are critical for resume/recovery flows. diff --git a/release.sh b/release.sh new file mode 100755 index 0000000..b7442a1 --- /dev/null +++ b/release.sh @@ -0,0 +1,284 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: release.sh [--push] [--dry-run] + +Creates a release by: +1. Validating as semantic versioning (SemVer 2.0.0) +2. Ensuring is greater than the current Cargo.toml package version +3. Ensuring tag v does not already exist +4. Updating Cargo.toml [package].version +5. Committing Cargo.toml and creating tag v +6. Optionally pushing the tag when --push is provided +7. 
With --dry-run, print planned actions without changing git state/files +USAGE +} + +err() { + echo "Error: $*" >&2 + exit 1 +} + +is_valid_semver() { + local version="$1" + local semver_regex='^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9A-Za-z-]*[A-Za-z-][0-9A-Za-z-]*)(\.(0|[1-9][0-9]*|[0-9A-Za-z-]*[A-Za-z-][0-9A-Za-z-]*))*))?(\+([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?$' + printf '%s\n' "$version" | grep -Eq "$semver_regex" +} + +split_semver() { + local version="$1" + local core="$version" + local pre="" + local major minor patch + + if [[ "$core" == *+* ]]; then + core="${core%%+*}" + fi + + if [[ "$core" == *-* ]]; then + pre="${core#*-}" + core="${core%%-*}" + fi + + IFS='.' read -r major minor patch <<<"$core" + printf '%s\t%s\t%s\t%s\n' "$major" "$minor" "$patch" "$pre" +} + +compare_prerelease() { + local left="$1" + local right="$2" + + if [[ -z "$left" && -z "$right" ]]; then + echo 0 + return + fi + if [[ -z "$left" ]]; then + echo 1 + return + fi + if [[ -z "$right" ]]; then + echo -1 + return + fi + + local -a lparts=() + local -a rparts=() + IFS='.' read -r -a lparts <<<"$left" + IFS='.' 
read -r -a rparts <<<"$right" + + local i + local lnum rnum + for ((i = 0; i < ${#lparts[@]} || i < ${#rparts[@]}; i++)); do + if ((i >= ${#lparts[@]})); then + echo -1 + return + fi + if ((i >= ${#rparts[@]})); then + echo 1 + return + fi + + if [[ "${lparts[$i]}" == "${rparts[$i]}" ]]; then + continue + fi + + if [[ "${lparts[$i]}" =~ ^(0|[1-9][0-9]*)$ ]]; then + lnum=1 + else + lnum=0 + fi + if [[ "${rparts[$i]}" =~ ^(0|[1-9][0-9]*)$ ]]; then + rnum=1 + else + rnum=0 + fi + + if ((lnum == 1 && rnum == 1)); then + if ((10#${lparts[$i]} > 10#${rparts[$i]})); then + echo 1 + else + echo -1 + fi + return + fi + if ((lnum == 1 && rnum == 0)); then + echo -1 + return + fi + if ((lnum == 0 && rnum == 1)); then + echo 1 + return + fi + + if [[ "${lparts[$i]}" > "${rparts[$i]}" ]]; then + echo 1 + else + echo -1 + fi + return + done + + echo 0 +} + +compare_semver() { + local left="$1" + local right="$2" + + local lmaj lmin lpat lpre + local rmaj rmin rpat rpre + IFS=$'\t' read -r lmaj lmin lpat lpre <<<"$(split_semver "$left")" + IFS=$'\t' read -r rmaj rmin rpat rpre <<<"$(split_semver "$right")" + + if ((10#$lmaj > 10#$rmaj)); then + echo 1 + return + fi + if ((10#$lmaj < 10#$rmaj)); then + echo -1 + return + fi + + if ((10#$lmin > 10#$rmin)); then + echo 1 + return + fi + if ((10#$lmin < 10#$rmin)); then + echo -1 + return + fi + + if ((10#$lpat > 10#$rpat)); then + echo 1 + return + fi + if ((10#$lpat < 10#$rpat)); then + echo -1 + return + fi + + compare_prerelease "$lpre" "$rpre" +} + +push_tag=false +dry_run=false +version="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --push) + push_tag=true + shift + ;; + --dry-run) + dry_run=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + -*) + err "Unknown option: $1" + ;; + *) + if [[ -n "$version" ]]; then + err "Provide exactly one version argument." 
+ fi + version="$1" + shift + ;; + esac +done + +[[ -n "$version" ]] || { usage >&2; exit 1; } + +[[ -f "Cargo.toml" ]] || err "Cargo.toml not found in current directory." +git rev-parse --is-inside-work-tree >/dev/null 2>&1 || err "Current directory is not a git repository." + +is_valid_semver "$version" || err "Version '$version' is not valid SemVer 2.0.0." + +current_version="$( + awk ' + /^\[package\]/ { in_package=1; next } + in_package && /^\[/ { in_package=0 } + in_package && /^[[:space:]]*version[[:space:]]*=/ { + match($0, /"[^"]+"/) + if (RSTART > 0) { + print substr($0, RSTART + 1, RLENGTH - 2) + exit + } + } + ' Cargo.toml +)" + +[[ -n "$current_version" ]] || err "Could not find [package].version in Cargo.toml." +is_valid_semver "$current_version" || err "Current Cargo.toml version '$current_version' is not valid SemVer." + +cmp="$(compare_semver "$version" "$current_version")" +if [[ "$cmp" != "1" ]]; then + err "Version '$version' must be greater than current version '$current_version'." +fi + +tag="v${version}" +if git rev-parse -q --verify "refs/tags/${tag}" >/dev/null 2>&1; then + err "Tag '${tag}' already exists locally." +fi +if git remote get-url origin >/dev/null 2>&1; then + if git ls-remote --exit-code --tags origin "refs/tags/${tag}" >/dev/null 2>&1; then + err "Tag '${tag}' already exists on remote 'origin'." + fi +fi + +tmp_file="$(mktemp)" +cleanup() { + rm -f "$tmp_file" +} +trap cleanup EXIT + +awk -v new_version="$version" ' + BEGIN { in_package=0; updated=0 } + /^\[package\]/ { in_package=1; print; next } + in_package && /^\[/ { in_package=0 } + in_package && !updated && /^[[:space:]]*version[[:space:]]*=/ { + print "version = \"" new_version "\"" + updated=1 + next + } + { print } + END { + if (!updated) { + exit 2 + } + } +' Cargo.toml > "$tmp_file" || err "Failed to update Cargo.toml." 
+ +if [[ "$dry_run" == true ]]; then + echo "Dry run successful:" + echo "- Current version: ${current_version}" + echo "- New version: ${version}" + echo "- Tag to create: ${tag}" + echo "- Would update Cargo.toml [package].version to ${version}" + echo "- Would commit: Release ${tag}" + echo "- Would create tag: ${tag}" + if [[ "$push_tag" == true ]]; then + git remote get-url origin >/dev/null 2>&1 || err "No 'origin' remote configured, cannot push tag." + echo "- Would push tag to origin: ${tag}" + fi + exit 0 +fi + +mv "$tmp_file" Cargo.toml + +git add Cargo.toml +git commit -m "Release ${tag}" -- Cargo.toml +git tag "${tag}" + +if [[ "$push_tag" == true ]]; then + git remote get-url origin >/dev/null 2>&1 || err "No 'origin' remote configured, cannot push tag." + git push origin "${tag}" +fi + +echo "Released ${tag} (updated Cargo.toml from ${current_version} to ${version})." diff --git a/scripts/refactor_safety_suite.sh b/scripts/refactor_safety_suite.sh new file mode 100755 index 0000000..50e6fb6 --- /dev/null +++ b/scripts/refactor_safety_suite.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${ROOT_DIR}" + +echo "[refactor-safety] running CLI help snapshot and parse-smoke tests" +cargo test --test cli_help_snapshots --test cli_smoke diff --git a/scripts/test_simpleaf_local_integration.sh b/scripts/test_simpleaf_local_integration.sh new file mode 100755 index 0000000..418e99d --- /dev/null +++ b/scripts/test_simpleaf_local_integration.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${ROOT_DIR}" + +echo "[local-integration] running deterministic integration tests" + +# These tests avoid remote/networked resources and external large datasets. 
+cargo test --test cli_help_snapshots --test cli_smoke --test phase1_regressions + +# Targeted workflow and chemistry regressions that cover refactor-critical paths. +cargo test --all-targets test_execute_commands_external_failure_reports_status_and_stderr +cargo test --all-targets clean_chemistries_dry_run_is_non_destructive_then_removes_unused diff --git a/src/atac/commands.rs b/src/atac/commands.rs index 00e9bd6..9f1ee42 100644 --- a/src/atac/commands.rs +++ b/src/atac/commands.rs @@ -2,8 +2,8 @@ use crate::atac::defaults::{AtacIndexParams, DefaultAtacParams}; use crate::defaults::{DefaultMappingParams, DefaultParams}; use crate::utils::chem_utils::{ExpectedOri, QueryInRegistry}; use clap::{ - builder::{ArgPredicate, PossibleValue}, Args, Subcommand, ValueEnum, + builder::{ArgPredicate, PossibleValue}, }; use std::fmt; use std::path::PathBuf; diff --git a/src/atac/index.rs b/src/atac/index.rs index 92614b7..1f108e2 100644 --- a/src/atac/index.rs +++ b/src/atac/index.rs @@ -1,25 +1,21 @@ use crate::atac::commands::IndexOpts; -use crate::utils::{ - prog_utils, - prog_utils::{CommandVerbosityLevel, ReqProgs}, -}; +use crate::core::{context, exec, io, runtime}; +use crate::utils::{prog_utils, prog_utils::ReqProgs}; use anyhow; -use anyhow::{bail, Context}; +use anyhow::Context; use cmd_lib::run_fun; -use serde_json::{json, Value}; +use serde_json::json; use std::path::Path; use std::time::Instant; use tracing::{info, warn}; pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. 
- let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "piscem", @@ -60,18 +56,13 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res piscem_index_cmd.arg("--overwrite"); } - let mut threads = opts.threads; - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } piscem_index_cmd @@ -94,14 +85,9 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res info!("piscem build cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut piscem_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke piscem index command"); + exec::run_checked(&mut piscem_index_cmd, "atac piscem index command")?; let index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("piscem index failed to build succesfully {:?}", cres.status); - } - let index_json_file = output_index_dir.join("simpleaf_index.json"); let index_json = json!({ "cmd" : index_cmd_string, @@ 
-117,11 +103,7 @@ pub(crate) fn piscem_index(af_home_path: &Path, opts: &IndexOpts) -> anyhow::Res "ref" : &opts.input } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; Ok(()) } diff --git a/src/atac/process.rs b/src/atac/process.rs index 47b4ec1..069e965 100644 --- a/src/atac/process.rs +++ b/src/atac/process.rs @@ -1,20 +1,39 @@ use crate::atac::commands::ProcessOpts; -use crate::utils::chem_utils::get_single_custom_chem_from_file; +use crate::core::{context, exec, index_meta, io, runtime}; use crate::utils::chem_utils::ExpectedOri; use crate::utils::chem_utils::QueryInRegistry; +use crate::utils::chem_utils::get_single_custom_chem_from_file; use crate::utils::constants::CHEMISTRIES_PATH; -use crate::utils::{ - prog_utils, - prog_utils::{CommandVerbosityLevel, ReqProgs}, -}; +use crate::utils::{prog_utils, prog_utils::ReqProgs}; use anyhow; -use anyhow::{bail, Context}; -use serde_json::{json, Value}; +use anyhow::{Context, bail}; +use serde_json::json; use std::io::BufReader; use std::path::{Path, PathBuf}; use std::time::Instant; use tracing::{info, warn}; +pub(crate) struct MapStageOutput { + pub map_output: PathBuf, + pub map_duration_secs: f64, + pub map_cmd: String, +} + +struct GplStageOutput { + gpl_duration_secs: f64, + gpl_cmd: String, +} + +struct SortStageOutput { + sort_duration_secs: f64, + sort_cmd: String, +} + +struct MacsStageOutput { + macs_duration_secs: f64, + macs_cmd: String, +} + fn push_advanced_piscem_options( piscem_map_cmd: &mut std::process::Command, opts: &ProcessOpts, @@ -63,7 +82,7 @@ fn add_read_args(map_cmd: &mut std::process::Command, opts: &ProcessOpts) -> any let reads2 = opts .reads2 .as_ref() - .expect("since reads1 files is given, read2 files must be provided."); + .context("read2 files must be provided when read1 files are 
provided.")?; let barcode_reads: &Vec = &opts.barcode_reads; if reads1.len() != reads2.len() || reads1.len() != barcode_reads.len() { bail!( @@ -99,9 +118,9 @@ fn add_read_args(map_cmd: &mut std::process::Command, opts: &ProcessOpts) -> any .join(","); map_cmd.arg("--barcode").arg(bc_str); } else { - let reads = opts.reads.as_ref().expect( - "since reads1 and reads2 are not provided, the single-end reads must be provided.", - ); + let reads = opts.reads.as_ref().context( + "single-end reads must be provided when read1/read2 files are not provided.", + )?; let barcode_reads: &Vec = &opts.barcode_reads; if reads.len() != barcode_reads.len() { bail!( @@ -136,14 +155,12 @@ pub(crate) fn check_progs>( opts: &ProcessOpts, ) -> anyhow::Result<()> { let af_home_path = af_home_path.as_ref(); - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "alevin-fry", @@ -157,7 +174,7 @@ pub(crate) fn check_progs>( let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; match prog_utils::check_version_constraints( "piscem", @@ -172,7 +189,9 @@ pub(crate) fn check_progs>( let macs_prog_info = rp .macs .as_ref() - .expect("macs3 program should be properly set if using the `--call-peaks` option"); + .context( + "macs3 program info is missing; please run `simpleaf set-paths` before using `--call-peaks`.", + )?; match prog_utils::check_version_constraints( "macs3", ">=3.0.2, <4.0.0", @@ -188,93 +207,16 @@ pub(crate) fn 
check_progs>( // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let piscem_prog_info = rp .piscem .as_ref() - .expect("piscem program info should be properly set."); - - // figure out what type of index we expect - let index_base; - - let mut index = opts.index.clone(); - // If the user built the index using simpleaf, there are - // 2 possibilities here: - // 1. They are passing in the directory containing the index - // 2. They are passing in the prefix stem of the index files - // The code below is to check, in both cases, if we can automatically - // detect if the index was constructed with simpleaf. - - // If we are in case 1., the passed in path is a directory and - // we can check for the simpleaf_index.json file directly, - // Otherwise if the path is not a directory, we check if it - // ends in piscem_idx (the suffix that simpleaf uses when - // making a piscem index). Then we test the directory we - // get after stripping off this suffix. - let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { - // remove the piscem_idx part - index.pop(); - true - } else { - false - }; - - let index_json_path = index.join("simpleaf_index.json"); - match index_json_path.try_exists() { - Ok(true) => { - // we have the simpleaf_index.json file, so parse it. 
- let index_json_file = std::fs::File::open(&index_json_path).with_context({ - || format!("Could not open file {}", index_json_path.display()) - })?; - - let index_json_reader = BufReader::new(&index_json_file); - let v: Value = serde_json::from_reader(index_json_reader)?; - - let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; - - // here, set the index type based on what we found as the - // value for the `index_type` key. - match index_type_str.as_ref() { - "piscem" => { - // here, either the user has provided us with just - // the directory containing the piscem index, or - // we have "popped" off the "piscem_idx" suffix, so - // add it (back). - index_base = index.join("piscem_idx"); - } - _ => { - bail!( - "unknown index type {} present in simpleaf_index.json", - index_type_str, - ); - } - } - } - Ok(false) => { - // at this point, we have inferred that simpleaf wasn't - // used to construct the index, so fall back to what the user - // requested directly. 
- // if we have previously removed the piscem_idx suffix, add it back - if removed_piscem_idx_suffix { - index.push("piscem_idx"); - } - index_base = index; - } - Err(e) => { - bail!(e); - } - } + .context("piscem program info is missing; please run `simpleaf set-paths`.")?; - let input_files = vec![ - index_base.with_extension("ctab"), - index_base.with_extension("refinfo"), - index_base.with_extension("sshash"), - ]; - prog_utils::check_files_exist(&input_files)?; + let index_base = index_meta::resolve_atac_piscem_index_base(opts.index.clone())?; + prog_utils::check_piscem_index_files(index_base.as_path())?; // using a piscem index let mut piscem_map_cmd = @@ -292,17 +234,13 @@ pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Resu .arg(opts.bin_overlap.to_string()); // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } // location of output directory, number of threads @@ -318,40 +256,33 @@ pub(crate) fn map_reads(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Resu // if the user is requesting a mapping option that required // piscem version >= 0.7.0, ensure we have that - if let Ok(_piscem_ver) = prog_utils::check_version_constraints( + match prog_utils::check_version_constraints( "piscem", ">=0.11.0, <1.0.0", &piscem_prog_info.version, ) { - 
push_advanced_piscem_options(&mut piscem_map_cmd, opts)?; - } else { - info!( - r#" + Ok(_piscem_ver) => { + push_advanced_piscem_options(&mut piscem_map_cmd, opts)?; + } + Err(_) => { + info!( + r#" Simpleaf is currently using piscem version {}, but you must be using version >= 0.11.0 in order to use the mapping options specific to this, or later versions. If you wish to use these options, please upgrade your piscem version or, if you believe you have a sufficiently new version installed, update the executable being used by simpleaf"#, - &piscem_prog_info.version - ); + &piscem_prog_info.version + ); + } } let map_cmd_string = prog_utils::get_cmd_line_string(&piscem_map_cmd); info!("map command : {}", map_cmd_string); let map_start = Instant::now(); - let map_proc_out = - prog_utils::execute_command(&mut piscem_map_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::map]"); + exec::run_checked(&mut piscem_map_cmd, "[atac::map]")?; let map_duration = map_start.elapsed(); - - if !map_proc_out.status.success() { - bail!( - "atac::map failed with exit status {:?}", - map_proc_out.status - ); - } else { - info!("mapping completed successfully in {:#?}", map_duration); - } + info!("mapping completed successfully in {:#?}", map_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let af_process_info = json!({ @@ -370,25 +301,23 @@ being used by simpleaf"#, // write the relevant info about // our run to file. 
- std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully mapped reads and generated output RAD file."); - Ok(()) + Ok(MapStageOutput { + map_output, + map_duration_secs: map_duration.as_secs_f64(), + map_cmd: map_cmd_string, + }) } -fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let macs_prog_info = rp .macs .as_ref() - .expect("macs program info should be properly set."); + .context("macs program info is missing; please run `simpleaf set-paths`.")?; let gpl_dir = opts.output.join("af_process"); let bedsuf = if opts.compress { ".bed.gz" } else { ".bed" }; @@ -418,18 +347,9 @@ fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<() info!("macs3 command : {}", macs_cmd_string); let macs_start = Instant::now(); - let macs_proc_out = prog_utils::execute_command(&mut macs_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::macs]"); + exec::run_checked(&mut macs_cmd, "[atac::macs]")?; let macs_duration = macs_start.elapsed(); - - if !macs_proc_out.status.success() { - bail!( - "atac::macs failed with exit status {:?}", - macs_proc_out.status - ); - } else { - info!("macs completed successfully in {:#?}", macs_duration); - } + info!("macs completed successfully in {:#?}", macs_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ 
-448,35 +368,40 @@ fn macs_call_peaks(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<() // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully called peaks using macs3."); - Ok(()) + Ok(MacsStageOutput { + macs_duration_secs: macs_duration.as_secs_f64(), + macs_cmd: macs_cmd_string, + }) } pub(crate) fn gen_bed(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - af_gpl(af_home_path, opts)?; - af_sort(af_home_path, opts)?; - macs_call_peaks(af_home_path, opts)?; + let gpl = af_gpl(af_home_path, opts)?; + let sort = af_sort(af_home_path, opts)?; + let macs = macs_call_peaks(af_home_path, opts)?; + info!( + "ATAC downstream stages completed (gpl: {:.2}s, sort: {:.2}s, macs: {:.2}s).", + gpl.gpl_duration_secs, sort.sort_duration_secs, macs.macs_duration_secs + ); + info!( + "ATAC commands: gpl=`{}`, sort=`{}`, macs=`{}`", + gpl.gpl_cmd, sort.sort_cmd, macs.macs_cmd + ); Ok(()) } // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. 
- let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; let gpl_dir = opts.output.join("af_process"); let rad_dir = opts.output.join("af_map"); @@ -490,17 +415,13 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { .arg(rad_dir); // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } af_sort.arg("--threads").arg(threads.to_string()); @@ -512,18 +433,9 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { info!("sort command : {}", sort_cmd_string); let af_sort_start = Instant::now(); - let af_sort_proc_out = prog_utils::execute_command(&mut af_sort, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::af_sort]"); + exec::run_checked(&mut af_sort, "[atac::af_sort]")?; let af_sort_duration = af_sort_start.elapsed(); - - if !af_sort_proc_out.status.success() { - bail!( - "atac::sort failed with exit status {:?}", - af_sort_proc_out.status - ); - 
} else { - info!("sort completed successfully in {:#?}", af_sort_duration); - } + info!("sort completed successfully in {:#?}", af_sort_duration); let af_process_info_file = opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ -542,27 +454,24 @@ fn af_sort(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully sorted and deduplicated records and created the output BED file."); - Ok(()) + Ok(SortStageOutput { + sort_duration_secs: af_sort_duration.as_secs_f64(), + sort_cmd: sort_cmd_string, + }) } // NOTE: we assume that check_progs has already been called and so version constraints have // already been checked. -fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { - // Read the JSON contents of the file as an instance of `User`. 
- let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; +fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result { + let rp: ReqProgs = context::load_required_programs(af_home_path)?; let af_prog_info = rp .alevin_fry .as_ref() - .expect("alevin-fry program info should be properly set."); + .context("alevin-fry program info is missing; please run `simpleaf set-paths`.")?; let filter_meth_opt; @@ -654,15 +563,22 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { warn!("barcode_ori \"{}\" is unknown; assuming forward.", s); } None => { - warn!("couldn't interpret value associated with \"barcode_ori\" as a string; assuming forward."); + warn!( + "couldn't interpret value associated with \"barcode_ori\" as a string; assuming forward." + ); } } } else { - warn!("No meta field present for the chemistry so can't check if barcodes should be reverse complemented."); + warn!( + "No meta field present for the chemistry so can't check if barcodes should be reverse complemented." 
+ ); } } } else { - warn!("Couldn't find expected chemistry registry {} so can't check if barcodes should be reverse complemented.", custom_chem_p.display()); + warn!( + "Couldn't find expected chemistry registry {} so can't check if barcodes should be reverse complemented.", + custom_chem_p.display() + ); } pbco }; @@ -693,17 +609,13 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { } // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } af_gpl.arg("--threads").arg(format!("{}", threads)); @@ -711,21 +623,12 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { info!("gpl command : {}", gpl_cmd_string); let af_gpl_start = Instant::now(); - let af_gpl_proc_out = prog_utils::execute_command(&mut af_gpl, CommandVerbosityLevel::Quiet) - .expect("could not execute [atac::af_gpl]"); + exec::run_checked(&mut af_gpl, "[atac::af_gpl]")?; let af_gpl_duration = af_gpl_start.elapsed(); - - if !af_gpl_proc_out.status.success() { - bail!( - "atac::gpl failed with exit status {:?}", - af_gpl_proc_out.status - ); - } else { - info!( - "permit list generation completed successfully in {:#?}", - af_gpl_duration - ); - } + info!( + "permit list generation completed successfully in {:#?}", + af_gpl_duration + ); let af_process_info_file = 
opts.output.join("simpleaf_process_log.json"); let json_file = std::fs::File::open(af_process_info_file.clone()) @@ -744,12 +647,99 @@ fn af_gpl(af_home_path: &Path, opts: &ProcessOpts) -> anyhow::Result<()> { // write the relevant info about // our run to file. - std::fs::write( - &af_process_info_file, - serde_json::to_string_pretty(&af_process_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_process_info_file.display()))?; + io::write_json_pretty_atomic(&af_process_info_file, &af_process_info)?; info!("successfully performed cell barcode detection and correction."); - Ok(()) + Ok(GplStageOutput { + gpl_duration_secs: af_gpl_duration.as_secs_f64(), + gpl_cmd: gpl_cmd_string, + }) +} + +#[cfg(test)] +mod tests { + use std::fs; + + use crate::atac::commands::{AtacChemistry, Macs3GenomeSize}; + + use super::*; + + fn base_process_opts() -> ProcessOpts { + ProcessOpts { + index: PathBuf::from("/tmp/index"), + reads1: None, + reads2: None, + reads: None, + barcode_reads: vec![], + chemistry: AtacChemistry::TenxV2, + barcode_length: 16, + output: PathBuf::from("/tmp/out"), + threads: 1, + call_peaks: false, + permit_barcode_ori: None, + unfiltered_pl: None, + min_reads: 10, + compress: false, + ignore_ambig_hits: false, + no_poison: false, + use_chr: false, + thr: 0.8, + bin_size: 50, + bin_overlap: 2, + no_tn5_shift: false, + check_kmer_orphan: false, + max_ec_card: 4096, + max_hit_occ: 64, + max_hit_occ_recover: 1024, + max_read_occ: 250, + gsize: Macs3GenomeSize::KnownOpt("hs"), + qvalue: 0.1, + extsize: 50, + } + } + + #[test] + fn add_read_args_succeeds_for_paired_end_inputs() { + let td = tempfile::tempdir().expect("failed to create tempdir"); + let r1 = td.path().join("r1.fastq"); + let r2 = td.path().join("r2.fastq"); + let bc = td.path().join("bc.fastq"); + fs::write(&r1, "").expect("failed to write r1"); + fs::write(&r2, "").expect("failed to write r2"); + fs::write(&bc, "").expect("failed to write bc"); + + let mut opts = 
base_process_opts(); + opts.reads1 = Some(vec![r1]); + opts.reads2 = Some(vec![r2]); + opts.barcode_reads = vec![bc]; + + let mut cmd = std::process::Command::new("echo"); + add_read_args(&mut cmd, &opts).expect("expected add_read_args to succeed"); + } + + #[test] + fn add_read_args_fails_for_mismatched_read_counts() { + let td = tempfile::tempdir().expect("failed to create tempdir"); + let r1 = td.path().join("r1.fastq"); + let r2a = td.path().join("r2a.fastq"); + let r2b = td.path().join("r2b.fastq"); + let bc = td.path().join("bc.fastq"); + fs::write(&r1, "").expect("failed to write r1"); + fs::write(&r2a, "").expect("failed to write r2a"); + fs::write(&r2b, "").expect("failed to write r2b"); + fs::write(&bc, "").expect("failed to write bc"); + + let mut opts = base_process_opts(); + opts.reads1 = Some(vec![r1]); + opts.reads2 = Some(vec![r2a, r2b]); + opts.barcode_reads = vec![bc]; + + let mut cmd = std::process::Command::new("echo"); + let err = add_read_args(&mut cmd, &opts).expect_err("expected mismatch to fail"); + assert!( + format!("{:#}", err).contains("Cannot proceed"), + "unexpected error: {:#}", + err + ); + } } diff --git a/src/core.rs b/src/core.rs new file mode 100644 index 0000000..eaeed1f --- /dev/null +++ b/src/core.rs @@ -0,0 +1,5 @@ +pub mod context; +pub mod exec; +pub mod index_meta; +pub mod io; +pub mod runtime; diff --git a/src/core/context.rs b/src/core/context.rs new file mode 100644 index 0000000..e14024b --- /dev/null +++ b/src/core/context.rs @@ -0,0 +1,54 @@ +use std::path::Path; + +use anyhow::Context; +use serde_json::Value; + +use crate::utils::prog_utils::{self, ReqProgs}; + +pub struct RuntimeContext { + pub progs: ReqProgs, +} + +pub fn load_required_programs(af_home_path: &Path) -> anyhow::Result { + let v: Value = prog_utils::inspect_af_home(af_home_path)?; + serde_json::from_value(v["prog_info"].clone()) + .with_context(|| "Could not deserialize required program metadata from simpleaf_info.json") +} + +pub fn 
load_runtime_context(af_home_path: &Path) -> anyhow::Result { + Ok(RuntimeContext { + progs: load_required_programs(af_home_path)?, + }) +} + +#[cfg(test)] +mod tests { + use std::fs; + + use serde_json::json; + use tempfile::tempdir; + + use super::load_runtime_context; + + #[test] + fn load_runtime_context_reads_prog_info() { + let td = tempdir().expect("failed to create tempdir"); + let af_info = json!({ + "prog_info": { + "salmon": null, + "piscem": {"exe_path": "/bin/echo", "version": "0.12.2"}, + "alevin_fry": {"exe_path": "/bin/echo", "version": "0.11.2"}, + "macs": null + } + }); + fs::write( + td.path().join("simpleaf_info.json"), + serde_json::to_string_pretty(&af_info).expect("failed to serialize json"), + ) + .expect("failed to write simpleaf_info.json"); + + let ctx = load_runtime_context(td.path()).expect("failed to load runtime context"); + assert!(ctx.progs.piscem.is_some()); + assert!(ctx.progs.alevin_fry.is_some()); + } +} diff --git a/src/core/exec.rs b/src/core/exec.rs new file mode 100644 index 0000000..1abaee3 --- /dev/null +++ b/src/core/exec.rs @@ -0,0 +1,54 @@ +use anyhow::{Context, bail}; + +use crate::utils::prog_utils::{self, CommandVerbosityLevel}; + +pub fn run_capture( + cmd: &mut std::process::Command, + context: &str, +) -> anyhow::Result { + prog_utils::execute_command(cmd, CommandVerbosityLevel::Quiet) + .with_context(|| format!("failed to execute {}", context)) +} + +pub fn ensure_success(output: &std::process::Output, context: &str) -> anyhow::Result<()> { + if output.status.success() { + return Ok(()); + } + + let stderr = String::from_utf8_lossy(&output.stderr); + bail!( + "{} failed with exit status {}. 
stderr: {}", + context, + output.status, + stderr.trim() + ); +} + +pub fn run_checked( + cmd: &mut std::process::Command, + context: &str, +) -> anyhow::Result { + let output = run_capture(cmd, context)?; + ensure_success(&output, context)?; + Ok(output) +} + +#[cfg(test)] +mod tests { + use super::run_checked; + + #[test] + fn run_checked_succeeds_for_true_command() { + let mut cmd = std::process::Command::new("sh"); + cmd.arg("-c").arg("true"); + run_checked(&mut cmd, "sh true").expect("expected true command to succeed"); + } + + #[test] + fn run_checked_errors_for_false_command() { + let mut cmd = std::process::Command::new("sh"); + cmd.arg("-c").arg("false"); + let err = run_checked(&mut cmd, "sh false").expect_err("expected false command to fail"); + assert!(format!("{:#}", err).contains("failed with exit status")); + } +} diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs new file mode 100644 index 0000000..af53247 --- /dev/null +++ b/src/core/index_meta.rs @@ -0,0 +1,163 @@ +use std::path::PathBuf; + +use anyhow::bail; +use serde_json::Value; + +use crate::core::io; +use crate::utils::af_utils::IndexType; + +pub struct QuantIndexMetadata { + pub index_type: IndexType, + pub inferred_t2g: Option, + pub inferred_gene_id_to_name: Option, +} + +pub fn resolve_quant_index( + index: Option, + use_piscem: bool, +) -> anyhow::Result { + let mut inferred_t2g = None; + let mut inferred_gene_id_to_name = None; + let index_type; + + if let Some(mut index) = index { + let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { + index.pop(); + true + } else { + false + }; + + let index_json_path = index.join("simpleaf_index.json"); + match index_json_path.try_exists() { + Ok(true) => { + let v: Value = io::read_json_file(&index_json_path)?; + + let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; + index_type = match index_type_str.as_ref() { + "salmon" => IndexType::Salmon(index.clone()), + "piscem" => 
IndexType::Piscem(index.join("piscem_idx")), + _ => { + bail!( + "unknown index type {} present in simpleaf_index.json", + index_type_str + ); + } + }; + + let t2g_opt: Option = serde_json::from_value(v["t2g_file"].clone())?; + if let Some(t2g_rel) = t2g_opt { + inferred_t2g = Some(index.join(t2g_rel)); + } + + if index.join("gene_id_to_name.tsv").exists() { + inferred_gene_id_to_name = Some(index.join("gene_id_to_name.tsv")); + } else if let Some(index_parent) = index.parent() { + let gene_name_path = index_parent.join("ref").join("gene_id_to_name.tsv"); + if gene_name_path.exists() && gene_name_path.is_file() { + inferred_gene_id_to_name = Some(gene_name_path); + } + } + } + Ok(false) => { + if removed_piscem_idx_suffix { + index.push("piscem_idx"); + } + index_type = if use_piscem { + IndexType::Piscem(index) + } else { + IndexType::Salmon(index) + }; + } + Err(e) => bail!(e), + } + } else { + index_type = IndexType::NoIndex; + } + + Ok(QuantIndexMetadata { + index_type, + inferred_t2g, + inferred_gene_id_to_name, + }) +} + +pub fn resolve_atac_piscem_index_base(mut index: PathBuf) -> anyhow::Result { + let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { + index.pop(); + true + } else { + false + }; + + let index_json_path = index.join("simpleaf_index.json"); + match index_json_path.try_exists() { + Ok(true) => { + let v: Value = io::read_json_file(&index_json_path)?; + + let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; + match index_type_str.as_ref() { + "piscem" => Ok(index.join("piscem_idx")), + _ => bail!( + "unknown index type {} present in simpleaf_index.json", + index_type_str + ), + } + } + Ok(false) => { + if removed_piscem_idx_suffix { + index.push("piscem_idx"); + } + Ok(index) + } + Err(e) => bail!(e), + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use serde_json::json; + use tempfile::tempdir; + + use crate::utils::af_utils::IndexType; + + use 
super::{resolve_atac_piscem_index_base, resolve_quant_index}; + + #[test] + fn resolve_quant_index_reads_simpleaf_index_metadata() { + let td = tempdir().expect("failed to create tempdir"); + let idx_dir = td.path().join("index"); + fs::create_dir_all(&idx_dir).expect("failed to create index dir"); + fs::write( + idx_dir.join("simpleaf_index.json"), + serde_json::to_string_pretty(&json!({ + "index_type":"salmon", + "t2g_file":"t2g_3col.tsv" + })) + .expect("failed to serialize json"), + ) + .expect("failed to write simpleaf_index.json"); + fs::write(idx_dir.join("gene_id_to_name.tsv"), "g1\tn1\n") + .expect("failed to write gene_id_to_name.tsv"); + + let meta = resolve_quant_index(Some(idx_dir.clone()), false) + .expect("failed to resolve quant index"); + assert!(matches!(meta.index_type, IndexType::Salmon(_))); + assert_eq!(meta.inferred_t2g, Some(idx_dir.join("t2g_3col.tsv"))); + assert_eq!( + meta.inferred_gene_id_to_name, + Some(idx_dir.join("gene_id_to_name.tsv")) + ); + } + + #[test] + fn resolve_atac_index_base_accepts_plain_prefix() { + let td = tempdir().expect("failed to create tempdir"); + let idx = td.path().join("foo").join("piscem_idx"); + let resolved = + resolve_atac_piscem_index_base(idx.clone()).expect("failed to resolve atac index"); + assert_eq!(resolved, idx); + } +} diff --git a/src/core/io.rs b/src/core/io.rs new file mode 100644 index 0000000..88b6279 --- /dev/null +++ b/src/core/io.rs @@ -0,0 +1,63 @@ +use std::fs; +use std::path::Path; + +use anyhow::Context; +use serde::Serialize; +use serde_json::Value; + +pub fn read_json_file(path: &Path) -> anyhow::Result { + let json_file = std::fs::File::open(path) + .with_context(|| format!("Could not open JSON file {}.", path.display()))?; + let v: Value = serde_json::from_reader(json_file) + .with_context(|| format!("Could not parse JSON file {}.", path.display()))?; + Ok(v) +} + +pub fn write_json_pretty(path: &Path, value: &T) -> anyhow::Result<()> { + let payload = 
serde_json::to_string_pretty(value) + .with_context(|| format!("Could not serialize JSON for {}.", path.display()))?; + fs::write(path, payload).with_context(|| format!("could not write {}", path.display())) +} + +pub fn write_json_pretty_atomic(path: &Path, value: &T) -> anyhow::Result<()> { + let payload = serde_json::to_string_pretty(value) + .with_context(|| format!("Could not serialize JSON for {}.", path.display()))?; + let tmp_path = path.with_extension("tmp"); + fs::write(&tmp_path, payload) + .with_context(|| format!("could not write temporary file {}", tmp_path.display()))?; + fs::rename(&tmp_path, path).with_context(|| { + format!( + "could not atomically rename {} to {}", + tmp_path.display(), + path.display() + ) + }) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + use tempfile::tempdir; + + use super::{read_json_file, write_json_pretty, write_json_pretty_atomic}; + + #[test] + fn write_and_read_json_roundtrip() { + let td = tempdir().expect("failed to create tempdir"); + let path = td.path().join("a.json"); + let v = json!({"key":"value","n":1}); + write_json_pretty(&path, &v).expect("failed to write json"); + let read_back = read_json_file(&path).expect("failed to read json"); + assert_eq!(read_back, v); + } + + #[test] + fn write_json_pretty_atomic_persists_content() { + let td = tempdir().expect("failed to create tempdir"); + let path = td.path().join("b.json"); + let v = json!({"ok":true}); + write_json_pretty_atomic(&path, &v).expect("failed to write json atomically"); + let read_back = read_json_file(&path).expect("failed to read json"); + assert_eq!(read_back, v); + } +} diff --git a/src/core/runtime.rs b/src/core/runtime.rs new file mode 100644 index 0000000..ac38487 --- /dev/null +++ b/src/core/runtime.rs @@ -0,0 +1,36 @@ +pub fn cap_threads(requested: u32) -> (u32, Option) { + cap_threads_with_limit( + requested, + std::thread::available_parallelism() + .ok() + .map(|n| n.get() as u32), + ) +} + +fn 
cap_threads_with_limit(requested: u32, limit: Option) -> (u32, Option) { + if let Some(max_threads) = limit + && requested > max_threads + { + return (max_threads, Some(max_threads)); + } + (requested, None) +} + +#[cfg(test)] +mod tests { + use super::cap_threads_with_limit; + + #[test] + fn caps_requested_threads_when_over_limit() { + let (effective, capped_at) = cap_threads_with_limit(32, Some(8)); + assert_eq!(effective, 8); + assert_eq!(capped_at, Some(8)); + } + + #[test] + fn keeps_requested_threads_when_within_limit() { + let (effective, capped_at) = cap_threads_with_limit(8, Some(32)); + assert_eq!(effective, 8); + assert_eq!(capped_at, None); + } +} diff --git a/src/main.rs b/src/main.rs index 64efefe..e5bcbc8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,14 +2,16 @@ use chemistry::{ add_chemistry, clean_chemistries, fetch_chemistries, lookup_chemistry, refresh_chemistries, remove_chemistry, }; -use tracing_subscriber::{filter::LevelFilter, fmt, prelude::*, EnvFilter}; +use tracing::info; +use tracing_subscriber::{EnvFilter, filter::LevelFilter, fmt, prelude::*}; use anyhow::bail; -use clap::{crate_version, Parser}; +use clap::{Parser, crate_version}; use std::env; use std::path::PathBuf; +mod core; mod defaults; mod utils; @@ -109,7 +111,13 @@ fn main() -> anyhow::Result<()> { // validate versions atac::process::check_progs(&af_home_path, &process_opts)?; // first we map the reads - atac::process::map_reads(af_home_path.as_path(), &process_opts)?; + let map_stage = atac::process::map_reads(af_home_path.as_path(), &process_opts)?; + info!( + "ATAC map stage complete: output={} duration={:.2}s cmd={}", + map_stage.map_output.display(), + map_stage.map_duration_secs, + map_stage.map_cmd + ); // then we generate the permit list and sort the file atac::process::gen_bed(af_home_path.as_path(), &process_opts) } diff --git a/src/simpleaf_commands.rs b/src/simpleaf_commands.rs index 9bc72f3..b226f6d 100644 --- a/src/simpleaf_commands.rs +++ 
b/src/simpleaf_commands.rs @@ -24,7 +24,7 @@ pub use self::workflow::{ pub use crate::atac::commands::AtacCommand; pub use crate::defaults::{DefaultMappingParams, DefaultParams}; -use clap::{builder::ArgPredicate, ArgAction, ArgGroup, Args, Subcommand}; +use clap::{ArgAction, ArgGroup, Args, Subcommand, builder::ArgPredicate}; use std::path::PathBuf; /// The type of references we might create diff --git a/src/simpleaf_commands/chemistry.rs b/src/simpleaf_commands/chemistry.rs index 22d6918..91b7924 100644 --- a/src/simpleaf_commands/chemistry.rs +++ b/src/simpleaf_commands/chemistry.rs @@ -1,24 +1,138 @@ +use crate::core::io::write_json_pretty_atomic; use crate::utils::chem_utils::{ - custom_chem_hm_into_json, get_custom_chem_hm, get_single_custom_chem_from_file, CustomChemistry, CustomChemistryMap, ExpectedOri, LOCAL_PL_PATH_KEY, REMOTE_PL_URL_KEY, + custom_chem_hm_into_json, get_custom_chem_hm, get_single_custom_chem_from_file, }; use crate::utils::constants::*; use crate::utils::prog_utils::{self, download_to_file_compute_hash}; use crate::utils::{self, af_utils::*}; use regex::Regex; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result, bail}; use semver::Version; use serde_json::json; +use serde_json::{Map, Value}; use std::collections::HashSet; use std::fs; -use std::io::{Seek, Write}; use std::os::unix::fs::MetadataExt; use std::path::{Path, PathBuf}; use tracing::{debug, info, warn}; use utils::prog_utils::read_json_from_remote_url; use utils::remote::is_remote_url; +fn removable_permit_lists( + used_pls: &HashSet, + present_pls: &HashSet, +) -> HashSet { + present_pls - used_pls +} + +/// Parse a chemistry version string with additional operation context. +fn parse_chemistry_version(version: &str, context: &str) -> Result { + Version::parse(version).with_context(|| { + format!( + "Could not parse version {} while {}. Please follow https://semver.org/ (e.g. 
0.1.0).", + version, context + ) + }) +} + +/// Return `true` if a new chemistry entry should replace an existing one. +/// +/// Replacement happens when `force` is set or when the incoming version is +/// strictly newer than the existing version. +fn should_replace_registry_entry(existing: &Value, incoming: &Value, force: bool) -> Result { + if force { + return Ok(true); + } + let current_version = existing + .get("version") + .and_then(Value::as_str) + .with_context(|| "Chemistry should have a string `version` field.")?; + let new_version = incoming + .get("version") + .and_then(Value::as_str) + .with_context(|| "Chemistry should have a string `version` field.")?; + Ok( + parse_chemistry_version(new_version, "comparing incoming chemistry")? + > parse_chemistry_version(current_version, "comparing existing chemistry")?, + ) +} + +/// Merge incoming registry entries into an existing registry object. +/// +/// Missing keys are inserted. Existing keys are replaced only when +/// `should_replace_registry_entry` permits it. +fn merge_registry_entries( + existing: &mut Map, + incoming: &Map, + force: bool, + dry_run_pref: &str, +) -> Result<()> { + for (k, v) in incoming { + match existing.get_mut(k) { + None => { + existing.insert(k.clone(), v.clone()); + } + Some(curr) => { + if should_replace_registry_entry(curr, v, force)? { + info!("{}updating {}", dry_run_pref, k); + existing.insert(k.clone(), v.clone()); + } + } + } + } + Ok(()) +} + +/// Merge entries from deprecated `custom_chemistries.json` into the main registry. +/// +/// Only missing entries are inserted, and only when the deprecated value is a +/// valid geometry string. 
+fn merge_deprecated_registry_entries( + existing: &mut Map, + deprecated: &Map, + dry_run_pref: &str, +) { + for (k, v) in deprecated { + if existing.contains_key(k) { + warn!( + "{}The main registry already contained the chemistry \"{}\"; Ignored the one from the deprecated registry.", + dry_run_pref, k + ); + } else if let Value::String(geom) = v { + if validate_geometry(geom).is_ok() { + let new_ent = json!({ + "geometry": geom, + "expected_ori": "both", + "version" : CustomChemistry::default_version(), + }); + existing.insert(k.to_owned(), new_ent); + info!( + "{}Successfully inserted chemistry \"{}\" from the deprecated registry into the main registry.", + dry_run_pref, k + ); + } else { + warn!( + "{}The chemistry \"{}\" in the deprecated registry is not a valid geometry string; Skipped.", + dry_run_pref, k + ); + } + } else { + warn!( + "{}The chemistry \"{}\" in the deprecated registry is not a string; Skipped.", + dry_run_pref, k + ); + } + } +} + +/// Persist a JSON value as pretty-printed text to disk. +fn write_json_pretty(path: &Path, value: &Value) -> Result<()> { + write_json_pretty_atomic(path, value) + .with_context(|| format!("Could not write {}", path.display())) +} + /// Attempt to get the chemistry definition from the provided JSON file /// Check if the JSON file is local or remote. If remote, fetch the file first. /// Parse the JSON file, and look for the specific chemistry with the requested name. @@ -139,7 +253,8 @@ pub fn add_chemistry( anyhow::Result::Err(e) => { bail!( "failed to obtain the chemistry definition from the provided JSON source : {}. Error :: {:#}", - json_src, e + json_src, + e ); } } @@ -160,7 +275,7 @@ pub fn add_chemistry( let version = add_opts .version .unwrap_or(CustomChemistry::default_version()); - let add_ver = Version::parse(version.as_ref()).with_context(|| format!("Could not parse version {}. Please follow https://semver.org/. 
A valid example is 0.1.0", version))?; + let add_ver = parse_chemistry_version(version.as_ref(), "adding chemistry")?; let name = add_opts.name; @@ -169,9 +284,15 @@ pub fn add_chemistry( if let Some(existing_entry) = get_single_custom_chem_from_file(&chem_p, &name)? { let existing_ver_str = existing_entry.version(); - let existing_ver = Version::parse(existing_ver_str).with_context( || format!("Could not parse version {} found in existing chemistries.json file. Please correct this entry", existing_ver_str))?; + let existing_ver = parse_chemistry_version( + existing_ver_str, + "reading existing chemistry version from chemistries.json", + )?; if add_ver <= existing_ver { - info!("Attempting to add chemistry with version {:#} which is <= than the existing version ({:#}) for this chemistry; Skipping addition.", add_ver, existing_ver); + info!( + "Attempting to add chemistry with version {:#} which is <= than the existing version ({:#}) for this chemistry; Skipping addition.", + add_ver, existing_ver + ); return Ok(()); } else { info!( @@ -187,7 +308,11 @@ pub fn add_chemistry( let metadata = std::fs::metadata(&local_url)?; let flen = metadata.size(); if flen > 4_294_967_296 { - warn!("The file provided to local-url ({}) is {:.1} GB. This file will be *copied* into the ALEVIN_FRY_HOME directory", local_url.display(), flen / 1_073_741_824); + warn!( + "The file provided to local-url ({}) is {:.1} GB. This file will be *copied* into the ALEVIN_FRY_HOME directory", + local_url.display(), + flen / 1_073_741_824 + ); } let mut hasher = blake3::Hasher::new(); @@ -252,8 +377,8 @@ pub fn add_chemistry( // check if the file already exists if local_plist_path.is_file() { info!( - "Found a cached, content-equivalent permit list file; will use the existing file." - ); + "Found a cached, content-equivalent permit list file; will use the existing file." 
+ ); // remove what we just downloaded fs::remove_file(tmpfile)?; @@ -287,7 +412,16 @@ pub fn add_chemistry( // check if the chemistry already exists and log if let Some(cc) = chem_hm.get(custom_chem.name()) { - info!("Chemistry {} is already registered, with geometry {} the one recorded: {}; overwriting geometry specification.", custom_chem.name(), if cc.geometry() == custom_chem.geometry() {"same as"} else {"different with"}, cc.geometry()); + info!( + "Chemistry {} is already registered, with geometry {} the one recorded: {}; overwriting geometry specification.", + custom_chem.name(), + if cc.geometry() == custom_chem.geometry() { + "same as" + } else { + "different with" + }, + cc.geometry() + ); chem_hm .entry(custom_chem.name().to_string()) .and_modify(|e| *e = custom_chem); @@ -302,15 +436,7 @@ pub fn add_chemistry( // convert the custom chemistry hashmap to json let v = custom_chem_hm_into_json(chem_hm)?; - - // write out the new custom chemistry file - let mut custom_chem_file = std::fs::File::create(&chem_p) - .with_context(|| format!("Could not create {}", chem_p.display()))?; - custom_chem_file.rewind()?; - - custom_chem_file - .write_all(serde_json::to_string_pretty(&v).unwrap().as_bytes()) - .with_context(|| format!("Could not write {}", chem_p.display()))?; + write_json_pretty(&chem_p, &v)?; Ok(()) } @@ -333,7 +459,10 @@ pub fn refresh_chemistries( // but read it in and attempt to populate. 
let custom_chem_file = af_home.join(CUSTOM_CHEMISTRIES_PATH); let merge_custom_chem = if custom_chem_file.exists() { - warn!("{}Found deprecated chemistry registry file \"{}\"; Attempting to merge the chemistries defined in this file into the main registry.", dry_run_pref, CUSTOM_CHEMISTRIES_PATH); + warn!( + "{}Found deprecated chemistry registry file \"{}\"; Attempting to merge the chemistries defined in this file into the main registry.", + dry_run_pref, CUSTOM_CHEMISTRIES_PATH + ); true } else { false @@ -377,49 +506,16 @@ pub fn refresh_chemistries( prog_utils::download_to_file(CHEMISTRIES_URL, &tmp_chem_path)?; if let Some(existing_chem) = parse_resource_json_file(&chem_path, None)?.as_object_mut() { if let Some(new_chem) = parse_resource_json_file(&tmp_chem_path, None)?.as_object() { - for (k, v) in new_chem.iter() { - match existing_chem.get_mut(k) { - None => { - existing_chem.insert(k.clone(), v.clone()); - } - Some(ev) => { - let curr_ver = Version::parse( - ev.get("version") - .expect("Chemistry should have a version field") - .as_str() - .expect("Version should be a string"), - )?; - let new_ver = Version::parse( - v.get("version") - .expect("Chemistry should have a version field") - .as_str() - .expect("Version should be a string"), - )?; - if refresh_opts.force || new_ver > curr_ver { - info!("{}updating {}", dry_run_pref, k); - existing_chem.insert(k.clone(), v.clone()); - } - } - } - } - - // write out the merged chemistry file - let mut chem_file = std::fs::File::create(&chem_path) - .with_context(|| format!("could not create {}", chem_path.display()))?; - chem_file.rewind()?; - - chem_file - .write_all( - serde_json::to_string_pretty(&existing_chem) - .unwrap() - .as_bytes(), - ) - .with_context(|| format!("could not write {}", chem_path.display()))?; + merge_registry_entries(existing_chem, new_chem, refresh_opts.force, dry_run_pref)?; + write_json_pretty(&chem_path, &Value::Object(existing_chem.clone()))?; // remove the temp file 
std::fs::remove_file(tmp_chem_path)?; } else { - bail!("Could not parse the main registry from \"{}\" file. Please report this on GitHub.", chem_path.display()); + bail!( + "Could not parse the main registry from \"{}\" file. Please report this on GitHub.", + chem_path.display() + ); } } else { bail!( @@ -434,42 +530,22 @@ pub fn refresh_chemistries( if let Some(old_custom_chem) = parse_resource_json_file(&custom_chem_file, None)?.as_object() { - for (k, v) in old_custom_chem.iter() { - if new_chem.contains_key(k) { - warn!("{}The main registry already contained the chemistry \"{}\"; Ignored the one from the deprecated registry.", dry_run_pref, k); - } else if let serde_json::Value::String(v) = v { - if validate_geometry(v).is_ok() { - let new_ent = json!({ - "geometry": v, - "expected_ori": "both", - "version" : CustomChemistry::default_version(), - }); - new_chem.insert(k.to_owned(), new_ent); - info!("{}Successfully inserted chemistry \"{}\" from the deprecated registry into the main registry.", dry_run_pref, k); - } else { - warn!("{}The chemistry \"{}\" in the deprecated registry is not a valid geometry string; Skipped.", dry_run_pref, k); - } - } else { - warn!("{}The chemistry \"{}\" in the deprecated registry is not a string; Skipped.", dry_run_pref, k); - } - } - - // write out the merged chemistry file - let mut chem_file = std::fs::File::create(&chem_path) - .with_context(|| format!("could not create {}", chem_path.display()))?; - chem_file.rewind()?; - - chem_file - .write_all(serde_json::to_string_pretty(&new_chem).unwrap().as_bytes()) - .with_context(|| format!("could not write {}", chem_path.display()))?; + merge_deprecated_registry_entries(new_chem, old_custom_chem, dry_run_pref); + write_json_pretty(&chem_path, &Value::Object(new_chem.clone()))?; let backup = custom_chem_file.with_extension("json.bak"); std::fs::rename(custom_chem_file, backup)?; } else { - bail!("Could not parse the deprecated registry file as a JSON object; it may be corrupted. 
Consider deleting this file from {}.", custom_chem_file.display()); + bail!( + "Could not parse the deprecated registry file as a JSON object; it may be corrupted. Consider deleting this file from {}.", + custom_chem_file.display() + ); } } else { - bail!("Could not parse the main chemistry registry file, \"{}\", as a JSON object. Please report this on GitHub.", chem_path.display()); + bail!( + "Could not parse the main chemistry registry file, \"{}\", as a JSON object. Please report this on GitHub.", + chem_path.display() + ); } } @@ -520,28 +596,29 @@ pub fn clean_chemistries( .filter_map(|de| { if let Ok(entry) = de { let path = entry.path(); - if path.is_file() { - Some(path) - } else { - None - } + if path.is_file() { Some(path) } else { None } } else { None } }) .collect::>(); - let rem_pls = &present_pls - &used_pls; + let rem_pls = removable_permit_lists(&used_pls, &present_pls); // check if the chemistry already exists and log if dry_run { if rem_pls.is_empty() { - info!("[dry_run] : No permit list files in the cache directory are currently unused; Nothing to clean."); + info!( + "[dry_run] : No permit list files in the cache directory are currently unused; Nothing to clean." 
+ ); } else { - info!("[dry_run] : The following files in the permit list directory do not match any registered chemistries and would be removed: {:#?}", present_pls); + info!( + "[dry_run] : The following files in the permit list directory do not match any registered chemistries and would be removed: {:#?}", + rem_pls + ); } } else { - for pl in rem_pls { + for pl in &rem_pls { info!("removing file from {}", pl.display()); std::fs::remove_file(pl)?; } @@ -563,26 +640,29 @@ pub fn remove_chemistry( let mut num_matched = 0; let keys = chem_hm.keys().cloned().collect::>(); - if let Ok(name_re) = regex::Regex::new(&name) { - for k in keys { - if name_re.is_match(&k) { - num_matched += 1; - if remove_opts.dry_run { - info!( - "[dry_run] : Would remove chemistry \"{}\" from the registry.", - k - ); - } else { - info!("Chemistry \"{}\" found in the registry; Removing it!", k); - chem_hm.remove(&k); + match regex::Regex::new(&name) { + Ok(name_re) => { + for k in keys { + if name_re.is_match(&k) { + num_matched += 1; + if remove_opts.dry_run { + info!( + "[dry_run] : Would remove chemistry \"{}\" from the registry.", + k + ); + } else { + info!("Chemistry \"{}\" found in the registry; Removing it!", k); + chem_hm.remove(&k); + } } } } - } else { - bail!( - "The provided chemistry name {} was neither a valid chemistry name nor a valid regex.", - name - ); + Err(_) => { + bail!( + "The provided chemistry name {} was neither a valid chemistry name nor a valid regex.", + name + ); + } } if num_matched == 0 { @@ -593,15 +673,7 @@ pub fn remove_chemistry( } else if !remove_opts.dry_run { // convert the custom chemistry hashmap to json let v = custom_chem_hm_into_json(chem_hm)?; - - // write out the new custom chemistry file - let mut custom_chem_file = std::fs::File::create(&chem_p) - .with_context(|| format!("Could not create {}", chem_p.display()))?; - custom_chem_file.rewind()?; - - custom_chem_file - .write_all(serde_json::to_string_pretty(&v).unwrap().as_bytes()) - 
.with_context(|| format!("Could not write {}", chem_p.display()))?; + write_json_pretty(&chem_p, &v)?; } Ok(()) @@ -630,19 +702,22 @@ pub fn lookup_chemistry( ); let chem_hm = get_custom_chem_hm(&chem_p)?; - if let Ok(re) = Regex::new(&name) { - println!("================="); - for (cname, cval) in chem_hm.iter() { - if re.is_match(cname) { - print!("{}", cval); - println!("================="); + match Regex::new(&name) { + Ok(re) => { + println!("================="); + for (cname, cval) in chem_hm.iter() { + if re.is_match(cname) { + print!("{}", cval); + println!("================="); + } } } - } else { - info!( - "No chemistry matching regex pattern {} was found in the registry!", - name - ); + Err(_) => { + info!( + "No chemistry matching regex pattern {} was found in the registry!", + name + ); + } } } @@ -656,13 +731,14 @@ struct FetchSet<'a> { impl<'a> FetchSet<'a> { pub fn from_re(s: &str) -> Result { - if let Ok(re) = regex::Regex::new(s) { - Ok(Self { + match regex::Regex::new(s) { + Ok(re) => Ok(Self { m: HashSet::new(), re: Some(re), - }) - } else { - bail!("Could not compile regex : [{}]", s) + }), + Err(_) => { + bail!("Could not compile regex : [{}]", s) + } } } @@ -716,43 +792,45 @@ pub fn fetch_chemistries( for (k, v) in chem_obj.iter() { // if we want to fetch this chem - if fetch_chems.contains(k) { - if let Some(serde_json::Value::String(pfile)) = v.get(LOCAL_PL_PATH_KEY) { - let fpath = plist_path.join(pfile); - - // if it doesn't exist - if !fpath.is_file() { - //check for a remote path - if let Some(serde_json::Value::String(rpath)) = v.get(REMOTE_PL_URL_KEY) { - if fetch_opts.dry_run { - info!( - "[dry_run] : Fetch would fetch missing file {} for {} from {}", - pfile, k, rpath + if fetch_chems.contains(k) + && let Some(serde_json::Value::String(pfile)) = v.get(LOCAL_PL_PATH_KEY) + { + let fpath = plist_path.join(pfile); + + // if it doesn't exist + if !fpath.is_file() { + //check for a remote path + if let 
Some(serde_json::Value::String(rpath)) = v.get(REMOTE_PL_URL_KEY) { + if fetch_opts.dry_run { + info!( + "[dry_run] : Fetch would fetch missing file {} for {} from {}", + pfile, k, rpath + ); + } else { + let hash = download_to_file_compute_hash(rpath, &fpath)?; + let expected_hash = pfile.to_string(); + let observed_hash = hash.to_string(); + if expected_hash != observed_hash { + warn!( + "Downloaded the file for chemistry {} from {}, but the observed hash {} was not equal to the expcted hash {}", + k, rpath, observed_hash, expected_hash ); - } else { - let hash = download_to_file_compute_hash(rpath, &fpath)?; - let expected_hash = pfile.to_string(); - let observed_hash = hash.to_string(); - if expected_hash != observed_hash { - warn!("Downloaded the file for chemistry {} from {}, but the observed hash {} was not equal to the expcted hash {}", - k, rpath, observed_hash, expected_hash); - } - info!("Fetched permit list file for {} to {}", k, fpath.display()); } - } else { - warn!( - "{}Requested to obtain chemistry {}, but it has no remote URL!", - dry_run_str, k - ); + info!("Fetched permit list file for {} to {}", k, fpath.display()); } } else { - info!( - "{}File for requested chemistry {} already exists ({}).", - dry_run_str, - k, - fpath.display() + warn!( + "{}Requested to obtain chemistry {}, but it has no remote URL!", + dry_run_str, k ); } + } else { + info!( + "{}File for requested chemistry {} already exists ({}).", + dry_run_str, + k, + fpath.display() + ); } } } @@ -760,3 +838,239 @@ pub fn fetch_chemistries( Ok(()) } + +#[cfg(test)] +mod tests { + use super::{ + add_chemistry, clean_chemistries, merge_deprecated_registry_entries, + merge_registry_entries, parse_chemistry_version, removable_permit_lists, remove_chemistry, + }; + use crate::simpleaf_commands::{ChemistryAddOpts, ChemistryCleanOpts, ChemistryRemoveOpts}; + use crate::utils::constants::CHEMISTRIES_PATH; + use serde_json::{Map, Value, json}; + use std::collections::HashSet; + use std::fs; + 
use std::path::PathBuf; + use tempfile::tempdir; + + /// Write a JSON value to `/chemistries.json` for registry tests. + fn write_registry(af_home: &std::path::Path, value: &Value) { + fs::write( + af_home.join(CHEMISTRIES_PATH), + serde_json::to_string_pretty(value).unwrap(), + ) + .unwrap(); + } + + /// Read `/chemistries.json` as JSON. + fn read_registry(af_home: &std::path::Path) -> Value { + serde_json::from_str(&fs::read_to_string(af_home.join(CHEMISTRIES_PATH)).unwrap()).unwrap() + } + + #[test] + fn removable_permit_list_set_only_contains_unused_files() { + let used = HashSet::from([PathBuf::from("plist/a"), PathBuf::from("plist/b")]); + let present = HashSet::from([ + PathBuf::from("plist/a"), + PathBuf::from("plist/b"), + PathBuf::from("plist/c"), + ]); + + let removable = removable_permit_lists(&used, &present); + assert_eq!(removable, HashSet::from([PathBuf::from("plist/c")])); + } + + #[test] + fn merge_registry_entries_prefers_newer_versions_unless_forced() { + let mut existing = Map::new(); + existing.insert( + "chem_a".to_string(), + json!({"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"1.0.0"}), + ); + let mut incoming = Map::new(); + incoming.insert( + "chem_a".to_string(), + json!({"geometry":"1{b[16]u[10]x:}2{r:}","expected_ori":"both","version":"0.9.0"}), + ); + incoming.insert( + "chem_b".to_string(), + json!({"geometry":"1{b[16]u[12]x:}2{r[50]x:}","expected_ori":"both","version":"0.1.0"}), + ); + + merge_registry_entries(&mut existing, &incoming, false, "").unwrap(); + assert_eq!(existing["chem_a"]["version"], json!("1.0.0")); + assert_eq!(existing["chem_b"]["version"], json!("0.1.0")); + + merge_registry_entries(&mut existing, &incoming, true, "").unwrap(); + assert_eq!(existing["chem_a"]["version"], json!("0.9.0")); + } + + #[test] + fn merge_deprecated_registry_entries_inserts_only_valid_missing_entries() { + let mut existing = Map::new(); + existing.insert( + "already".to_string(), + 
json!({"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"1.0.0"}), + ); + let deprecated = Map::from_iter([ + ( + "already".to_string(), + Value::String("1{b[10]}2{r:}".to_string()), + ), + ( + "valid_new".to_string(), + Value::String("1{b[16]u[12]x:}2{r:}".to_string()), + ), + ( + "invalid_new".to_string(), + Value::String("bad-geom".to_string()), + ), + ]); + + merge_deprecated_registry_entries(&mut existing, &deprecated, ""); + + assert!(existing.contains_key("already")); + assert!(existing.contains_key("valid_new")); + assert!(!existing.contains_key("invalid_new")); + } + + #[test] + fn add_chemistry_only_updates_when_version_increases() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "mychem": { + "geometry": "1{b[16]u[12]x:}2{r:}", + "expected_ori": "both", + "version": "1.2.0" + } + }), + ); + + add_chemistry( + tmp.path().to_path_buf(), + ChemistryAddOpts { + name: "mychem".to_string(), + geometry: Some("1{b[16]u[10]x:}2{r:}".to_string()), + expected_ori: Some("both".to_string()), + local_url: None, + remote_url: None, + version: Some("1.1.0".to_string()), + from_json: None, + }, + ) + .unwrap(); + let registry_after_older = read_registry(tmp.path()); + assert_eq!(registry_after_older["mychem"]["version"], json!("1.2.0")); + assert_eq!( + registry_after_older["mychem"]["geometry"], + json!("1{b[16]u[12]x:}2{r:}") + ); + + add_chemistry( + tmp.path().to_path_buf(), + ChemistryAddOpts { + name: "mychem".to_string(), + geometry: Some("1{b[16]u[10]x:}2{r:}".to_string()), + expected_ori: Some("both".to_string()), + local_url: None, + remote_url: None, + version: Some("1.3.0".to_string()), + from_json: None, + }, + ) + .unwrap(); + let registry_after_newer = read_registry(tmp.path()); + assert_eq!(registry_after_newer["mychem"]["version"], json!("1.3.0")); + assert_eq!( + registry_after_newer["mychem"]["geometry"], + json!("1{b[16]u[10]x:}2{r:}") + ); + } + + #[test] + fn 
remove_chemistry_respects_dry_run_and_regex_removal() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "foo_chem": {"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"0.1.0"}, + "bar_chem": {"geometry":"1{b[16]u[12]x:}2{r:}","expected_ori":"both","version":"0.1.0"} + }), + ); + + remove_chemistry( + tmp.path().to_path_buf(), + ChemistryRemoveOpts { + name: "foo_.*".to_string(), + dry_run: true, + }, + ) + .unwrap(); + let after_dry_run = read_registry(tmp.path()); + assert!(after_dry_run.get("foo_chem").is_some()); + assert!(after_dry_run.get("bar_chem").is_some()); + + remove_chemistry( + tmp.path().to_path_buf(), + ChemistryRemoveOpts { + name: "foo_.*".to_string(), + dry_run: false, + }, + ) + .unwrap(); + let after_remove = read_registry(tmp.path()); + assert!(after_remove.get("foo_chem").is_none()); + assert!(after_remove.get("bar_chem").is_some()); + } + + #[test] + fn clean_chemistries_dry_run_is_non_destructive_then_removes_unused() { + let tmp = tempdir().unwrap(); + write_registry( + tmp.path(), + &json!({ + "chem": { + "geometry":"1{b[16]u[12]x:}2{r:}", + "expected_ori":"both", + "version":"0.1.0", + "plist_name":"keep_pl" + } + }), + ); + + let plist_dir = tmp.path().join("plist"); + fs::create_dir_all(&plist_dir).unwrap(); + let keep = plist_dir.join("keep_pl"); + let remove = plist_dir.join("remove_pl"); + fs::write(&keep, "keep").unwrap(); + fs::write(&remove, "remove").unwrap(); + + clean_chemistries( + tmp.path().to_path_buf(), + ChemistryCleanOpts { dry_run: true }, + ) + .unwrap(); + assert!(keep.exists()); + assert!(remove.exists()); + + clean_chemistries( + tmp.path().to_path_buf(), + ChemistryCleanOpts { dry_run: false }, + ) + .unwrap(); + assert!(keep.exists()); + assert!(!remove.exists()); + } + + #[test] + fn parse_chemistry_version_rejects_invalid_versions() { + let err = parse_chemistry_version("not-a-version", "unit test").unwrap_err(); + assert!( + format!("{:#}", err).contains("Could not 
parse version"), + "unexpected error: {:#}", + err + ); + } +} diff --git a/src/simpleaf_commands/indexing.rs b/src/simpleaf_commands/indexing.rs index 3556db1..6f38b3a 100644 --- a/src/simpleaf_commands/indexing.rs +++ b/src/simpleaf_commands/indexing.rs @@ -1,12 +1,12 @@ +use crate::core::{context, exec, io, runtime}; use crate::utils::af_utils::create_dir_if_absent; use crate::utils::prog_utils; -use crate::utils::prog_utils::{CommandVerbosityLevel, ReqProgs}; +use crate::utils::prog_utils::ReqProgs; -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use roers; use serde::Deserialize; use serde_json::json; -use serde_json::Value; use std::collections::HashSet; use std::fs::File; use std::io::{BufWriter, Write}; @@ -16,6 +16,102 @@ use tracing::{error, info, warn}; use super::{IndexOpts, ReferenceType}; +struct ReferenceStageOutput { + ref_seq: PathBuf, + min_seq_len: Option, + t2g: Option, + gene_id_to_name: Option, + roers_duration: Option, + roers_cmd: Option, +} + +struct IndexBuildStageOutput { + index_duration: std::time::Duration, + index_cmd_string: String, +} + +fn derive_kmer_and_minimizer( + min_seq_len: Option, + default_kmer_length: u32, + default_minimizer_length: u32, +) -> anyhow::Result<(u32, u32)> { + if let Some(msl) = min_seq_len { + if msl < 10 { + bail!( + "The reference sequences are too short for indexing. Please provide sequences with a minimum length of at least 10 bases." 
+ ); + } + if (msl / 2) < default_kmer_length && default_kmer_length == 31 { + let kmer_length = msl / 2; + let minimizer_length = (kmer_length as f32 / 1.8).ceil() as u32 + 1; + info!( + "Using kmer_length = {} and minimizer_length = {} because the default values are too big for the reference sequences.", + kmer_length, minimizer_length + ); + Ok((kmer_length, minimizer_length)) + } else { + Ok((default_kmer_length, default_minimizer_length)) + } + } else { + Ok((default_kmer_length, default_minimizer_length)) + } +} + +fn write_index_log_stage( + output: &Path, + reference_stage: &ReferenceStageOutput, + index_stage: &IndexBuildStageOutput, +) -> anyhow::Result<()> { + let index_log_file = output.join("simpleaf_index_log.json"); + let index_log_info = if let Some(roers_duration) = reference_stage.roers_duration { + json!({ + "time_info" : { + "roers_time" : roers_duration, + "index_time" : index_stage.index_duration + }, + "cmd_info" : { + "roers_cmd" : reference_stage.roers_cmd, + "index_cmd" : index_stage.index_cmd_string, + } + }) + } else { + json!({ + "time_info" : { + "index_time" : index_stage.index_duration + }, + "cmd_info" : { + "index_cmd" : index_stage.index_cmd_string, + } + }) + }; + + io::write_json_pretty_atomic(&index_log_file, &index_log_info) +} + +#[cfg(test)] +mod tests { + use super::derive_kmer_and_minimizer; + + #[test] + fn derive_kmer_and_minimizer_fails_for_short_reference() { + let err = derive_kmer_and_minimizer(Some(9), 31, 19) + .expect_err("expected short sequence length to fail"); + assert!( + format!("{:#}", err).contains("minimum length of at least 10"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn derive_kmer_and_minimizer_adjusts_defaults_for_short_sequences() { + let (k, m) = + derive_kmer_and_minimizer(Some(20), 31, 19).expect("expected adjusted k/m values"); + assert_eq!(k, 10); + assert_eq!(m, 7); + } +} + fn validate_index_type_opts(opts: &IndexOpts) -> anyhow::Result<()> { let mut bail = false; if 
opts.use_piscem && opts.sparse { @@ -230,9 +326,7 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu validate_index_type_opts(&opts)?; let mut threads = opts.threads; let output = opts.output; - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - // Read the JSON contents of the file as an instance of `User`. - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; + let rp: ReqProgs = context::load_required_programs(af_home_path)?; rp.issue_recommended_version_messages(); // we are building a custom spliced+intronic reference @@ -360,7 +454,9 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu CsvReader::Feature(rdr) } else { - bail!("No reference sequence provided. It should not happen, please report this issue on GitHub."); + bail!( + "No reference sequence provided. It should not happen, please report this issue on GitHub." + ); }; // determine the format of t2g file @@ -435,40 +531,29 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu // _gene_id_to_name = Some(id_to_name_path); } - std::fs::write( - &info_file, - serde_json::to_string_pretty(&index_info).unwrap(), - ) - .with_context(|| format!("could not write {}", info_file.display()))?; - - let ref_seq = reference_sequence.with_context(|| - "Reference sequence should either be generated from --fasta with reftype spliced+intronic / spliced+unspliced or set with --ref-seq", - )?; + io::write_json_pretty(&info_file, &index_info)?; + + let ref_seq = reference_sequence.with_context( + || { + "Reference sequence should either be generated from --fasta with reftype spliced+intronic / spliced+unspliced or set with --ref-seq" + }, + )?; + let reference_stage = ReferenceStageOutput { + ref_seq: ref_seq.clone(), + min_seq_len, + t2g, + gene_id_to_name, + roers_duration, + roers_cmd: roers_aug_ref_opt, + }; - let input_files = vec![ref_seq.clone()]; + let input_files = 
vec![reference_stage.ref_seq.clone()]; prog_utils::check_files_exist(&input_files)?; - - let kmer_length: u32; - let minimizer_length: u32; - if let Some(msl) = min_seq_len { - if msl < 10 { - bail!("The reference sequences are too short for indexing. Please provide sequences with a minimum length of at least 10 bases."); - } - // only if the value is not the default value - if (msl / 2) < opts.kmer_length && opts.kmer_length == 31 { - kmer_length = msl / 2; - // https://github.com/COMBINE-lab/protocol-estuary/blob/2ecc65f1891ebfafff2a4a17460550e4dd1f4bb6/utils/simpleaf_workflow_utils.libsonnet#L232 - minimizer_length = (kmer_length as f32 / 1.8).ceil() as u32 + 1; - - info!("Using kmer_length = {} and minimizer_length = {} because the default values are too big for the reference sequences.", kmer_length, minimizer_length); - } else { - kmer_length = opts.kmer_length; - minimizer_length = opts.minimizer_length; - }; - } else { - kmer_length = opts.kmer_length; - minimizer_length = opts.minimizer_length; - } + let (kmer_length, minimizer_length) = derive_kmer_and_minimizer( + reference_stage.min_seq_len, + opts.kmer_length, + opts.minimizer_length, + )?; let output_index_dir = output.join("index"); let index_duration; @@ -477,14 +562,15 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu if opts.use_piscem { // ensure we have piscem if rp.piscem.is_none() { - bail!("The construction of a piscem index was requested, but a valid piscem executable was not available. \n\ - Please either set a path using the `set-paths` command, or ensure the `PISCEM` environment variable is set properly."); + bail!( + "The construction of a piscem index was requested, but a valid piscem executable was not available. \n\ + Please either set a path using the `set-paths` command, or ensure the `PISCEM` environment variable is set properly." 
+ ); } - let piscem_prog_info = rp - .piscem - .as_ref() - .expect("piscem program info should be properly set."); + let piscem_prog_info = rp.piscem.as_ref().context( + "The construction of a piscem index was requested, but piscem program info is missing.", + )?; let mut piscem_index_cmd = std::process::Command::new(format!("{}", piscem_prog_info.exe_path.display())); @@ -501,7 +587,7 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu .arg("-o") .arg(&output_index_stem) .arg("-s") - .arg(&ref_seq) + .arg(&reference_stage.ref_seq) .arg("--seed") .arg(opts.hash_seed.to_string()) .arg("-w") @@ -513,18 +599,15 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu piscem_index_cmd.arg("--overwrite"); } - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (capped_threads, capped_at) = runtime::cap_threads(threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, threads + ); + warn!("setting number of threads to {}", max_threads); } + threads = capped_threads; piscem_index_cmd .arg("--threads") @@ -533,26 +616,29 @@ pub fn build_ref_and_index(af_home_path: &Path, opts: IndexOpts) -> anyhow::Resu // if the user is requesting a poison k-mer table, ensure the // piscem version is at least 0.7.0 if let Some(decoy_paths) = opts.decoy_paths { - if let Ok(_piscem_ver) = prog_utils::check_version_constraints( + match prog_utils::check_version_constraints( "piscem", ">=0.7.0, <1.0.0", &piscem_prog_info.version, ) { - let path_args = decoy_paths - .into_iter() - 
.map(|x| x.to_string_lossy().into_owned()) - .collect::>() - .join(","); - piscem_index_cmd.arg("--decoy-paths").arg(path_args); - } else { - warn!( - r#" + Ok(_piscem_ver) => { + let path_args = decoy_paths + .into_iter() + .map(|x| x.to_string_lossy().into_owned()) + .collect::>() + .join(","); + piscem_index_cmd.arg("--decoy-paths").arg(path_args); + } + Err(_) => { + warn!( + r#" You requested to build a poison k-mer table with {:?}, but you must be using piscem version >= 0.7.0 to use this feature. Simpleaf is currently using version {}. Please upgrade your piscem version or, if you believe you have a sufficiently new version installed, update the executable being used by simpleaf"#, - decoy_paths, &piscem_prog_info.version - ); + decoy_paths, &piscem_prog_info.version + ); + } } } @@ -561,17 +647,12 @@ simpleaf"#, info!("piscem build cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut piscem_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke piscem index command"); + let _cres = exec::run_checked(&mut piscem_index_cmd, "piscem index command")?; index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("piscem index failed to build succesfully {:?}", cres.status); - } - // copy over the t2g file to the index let mut t2g_out_path: Option = None; - if let Some(t2g_file) = t2g { + if let Some(t2g_file) = reference_stage.t2g.clone() { let index_t2g_path = output_index_dir.join("t2g_3col.tsv"); t2g_out_path = Some(PathBuf::from("t2g_3col.tsv")); std::fs::copy(t2g_file, index_t2g_path)?; @@ -579,7 +660,7 @@ simpleaf"#, // copy over the gene_id_to_name.tsv file to the index let mut gene_id_to_name_out_path: Option = None; - if let Some(gene_id_to_name_file) = gene_id_to_name { + if let Some(gene_id_to_name_file) = reference_stage.gene_id_to_name.clone() { let index_id2name_path = output_index_dir.join("gene_id_to_name.tsv"); gene_id_to_name_out_path = 
Some(PathBuf::from("gene_id_to_name.tsv")); std::fs::copy(gene_id_to_name_file, index_id2name_path)?; @@ -596,23 +677,24 @@ simpleaf"#, "m" : minimizer_length, "overwrite" : opts.overwrite, "threads" : threads, - "ref" : ref_seq + "ref" : reference_stage.ref_seq } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; } else { // ensure we have piscem if rp.salmon.is_none() { - bail!("The construction of a salmon index was requested, but a valid salmon executable was not available. \n\ - Please either set a path using the `simpleaf set-paths` command, or ensure the `SALMON` environment variable is set properly."); + bail!( + "The construction of a salmon index was requested, but a valid salmon executable was not available. \n\ + Please either set a path using the `simpleaf set-paths` command, or ensure the `SALMON` environment variable is set properly." + ); } + let salmon_prog_info = rp.salmon.as_ref().context( + "The construction of a salmon index was requested, but salmon program info is missing.", + )?; let mut salmon_index_cmd = - std::process::Command::new(format!("{}", rp.salmon.unwrap().exe_path.display())); + std::process::Command::new(format!("{}", salmon_prog_info.exe_path.display())); salmon_index_cmd .arg("index") @@ -621,13 +703,15 @@ simpleaf"#, .arg("-i") .arg(&output_index_dir) .arg("-t") - .arg(&ref_seq); + .arg(&reference_stage.ref_seq); // overwrite doesn't do anything special for the salmon index, so mention this to // the user. 
if opts.overwrite { - info!("As the default salmon behavior is to overwrite an existing index if the same directory is provided, \n\ - the --overwrite flag will have no additional effect."); + info!( + "As the default salmon behavior is to overwrite an existing index if the same directory is provided, \n\ + the --overwrite flag will have no additional effect." + ); } // if the user requested a sparse index. @@ -640,18 +724,15 @@ simpleaf"#, salmon_index_cmd.arg("--keepDuplicates"); } - // if the user requested more threads than can be used - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (capped_threads, capped_at) = runtime::cap_threads(threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum available parallelism is {}, but {} threads were requested.", + max_threads, threads + ); + warn!("setting number of threads to {}", max_threads); } + threads = capped_threads; salmon_index_cmd .arg("--threads") @@ -662,17 +743,12 @@ simpleaf"#, info!("salmon index cmd : {}", index_cmd_string); let index_start = Instant::now(); - let cres = prog_utils::execute_command(&mut salmon_index_cmd, CommandVerbosityLevel::Quiet) - .expect("failed to invoke salmon index command"); + let _cres = exec::run_checked(&mut salmon_index_cmd, "salmon index command")?; index_duration = index_start.elapsed(); - if !cres.status.success() { - bail!("salmon index failed to build succesfully {:?}", cres.status); - } - // copy over the t2g file to the index let mut t2g_out_path: Option = None; - if let Some(t2g_file) = t2g { + if let Some(t2g_file) = reference_stage.t2g.clone() { let index_t2g_path = output_index_dir.join("t2g_3col.tsv"); t2g_out_path = 
Some(PathBuf::from("t2g_3col.tsv")); std::fs::copy(t2g_file, index_t2g_path)?; @@ -680,8 +756,8 @@ simpleaf"#, // copy over the gene_id_to_name.tsv file to the index let mut gene_id_to_name_out_path: Option = None; - info!("{:?}", gene_id_to_name); - if let Some(gene_id_to_name_file) = gene_id_to_name { + info!("{:?}", reference_stage.gene_id_to_name); + if let Some(gene_id_to_name_file) = reference_stage.gene_id_to_name.clone() { let index_id2name_path = output_index_dir.join("gene_id_to_name.tsv"); gene_id_to_name_out_path = Some(PathBuf::from("gene_id_to_name.tsv")); std::fs::copy(gene_id_to_name_file, index_id2name_path)?; @@ -699,43 +775,15 @@ simpleaf"#, "sparse" : opts.sparse, "keep_duplicates" : opts.keep_duplicates, "threads" : threads, - "ref" : ref_seq + "ref" : reference_stage.ref_seq } }); - std::fs::write( - &index_json_file, - serde_json::to_string_pretty(&index_json).unwrap(), - ) - .with_context(|| format!("could not write {}", index_json_file.display()))?; + io::write_json_pretty_atomic(&index_json_file, &index_json)?; } - - let index_log_file = output.join("simpleaf_index_log.json"); - let index_log_info = if let Some(roers_duration) = roers_duration { - // if we ran make-splici - json!({ - "time_info" : { - "roers_time" : roers_duration, - "index_time" : index_duration - }, - "cmd_info" : { - "roers_cmd" : roers_aug_ref_opt, - "index_cmd" : index_cmd_string, } - }) - } else { - // if we indexed provided sequences directly - json!({ - "time_info" : { - "index_time" : index_duration - }, - "cmd_info" : { - "index_cmd" : index_cmd_string, } - }) + let index_stage = IndexBuildStageOutput { + index_duration, + index_cmd_string, }; - - std::fs::write( - &index_log_file, - serde_json::to_string_pretty(&index_log_info).unwrap(), - ) - .with_context(|| format!("could not write {}", index_log_file.display()))?; + write_index_log_stage(&output, &reference_stage, &index_stage)?; Ok(()) } diff --git a/src/simpleaf_commands/inspect.rs 
b/src/simpleaf_commands/inspect.rs index c5e5988..067c74e 100644 --- a/src/simpleaf_commands/inspect.rs +++ b/src/simpleaf_commands/inspect.rs @@ -6,7 +6,7 @@ use crate::utils::{ prog_utils::*, }; use anyhow::Result; -use serde_json::{json, Value}; +use serde_json::{Value, json}; use std::path::PathBuf; use strum::IntoEnumIterator; use tracing::warn; diff --git a/src/simpleaf_commands/paths.rs b/src/simpleaf_commands/paths.rs index 9db27b4..1fc5be6 100644 --- a/src/simpleaf_commands/paths.rs +++ b/src/simpleaf_commands/paths.rs @@ -1,6 +1,6 @@ use crate::utils::prog_utils::*; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use serde_json::json; use std::fs; use std::path::PathBuf; @@ -30,7 +30,9 @@ pub fn set_paths(af_home_path: PathBuf, set_path_args: SetPathOpts) -> anyhow::R let have_mapper = rp.salmon.is_some() || rp.piscem.is_some(); if !have_mapper { - bail!("Suitable executable for piscem or salmon not found — at least one of these must be available."); + bail!( + "Suitable executable for piscem or salmon not found — at least one of these must be available." 
+ ); } if rp.alevin_fry.is_none() { bail!("Suitable alevin_fry executable not found."); diff --git a/src/simpleaf_commands/quant.rs b/src/simpleaf_commands/quant.rs index 7cd8896..e2e3ecd 100644 --- a/src/simpleaf_commands/quant.rs +++ b/src/simpleaf_commands/quant.rs @@ -1,14 +1,14 @@ use crate::utils::af_utils::*; +use crate::core::{context, exec, index_meta, io, runtime}; use crate::utils::prog_parsing_utils; use crate::utils::prog_utils; -use crate::utils::prog_utils::{CommandVerbosityLevel, ReqProgs}; +use crate::utils::prog_utils::ReqProgs; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use serde_json::json; -use serde_json::Value; use std::collections::HashMap; -use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::io::{BufRead, BufReader, BufWriter, Read, Write}; use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; use tracing::{error, info, warn}; @@ -17,7 +17,9 @@ use super::MapQuantOpts; use crate::utils::chem_utils::ExpectedOri; use crate::utils::constants::{CHEMISTRIES_PATH, NUM_SAMPLE_LINES}; -fn get_generic_buf_reader(ipath: &PathBuf) -> anyhow::Result { +/// Open a permit-list file with transparent compression handling and return a +/// buffered reader over the resulting stream. +fn get_generic_buf_reader(ipath: &Path) -> anyhow::Result>> { let (reader, compression) = niffler::from_path(ipath) .with_context(|| format!("Could not open requsted file {}", ipath.display()))?; match compression { @@ -42,7 +44,7 @@ impl CBListInfo { } } // we iterate the file to see if it only has cb or with affiliated info (by separator \t). 
- fn init(&mut self, pl_file: &PathBuf, output: &PathBuf) -> anyhow::Result<()> { + fn init(&mut self, pl_file: &Path, output: &Path) -> anyhow::Result<()> { // open pl_file let br = get_generic_buf_reader(pl_file) .with_context(|| "failed to successfully open permit-list file.")?; @@ -52,16 +54,20 @@ impl CBListInfo { .lines() .take(NUM_SAMPLE_LINES) // don't read the whole file in the single-coumn case .map(|l| { - l.unwrap_or_else(|_| panic!("Could not open permitlist file {}", pl_file.display())) + l.with_context(|| format!("Could not open permitlist file {}", pl_file.display())) }) + .collect::>>()? + .into_iter() .any(|l| !l.contains('\t')); // if single column, we are good. Otherwise, we need to write the first column to the final file let final_file: PathBuf; if is_single_column { - final_file = pl_file.clone(); + final_file = pl_file.to_path_buf(); } else { - info!("found multiple columns in the barcode list tsv file, use the first column as the barcodes."); + info!( + "found multiple columns in the barcode list tsv file, use the first column as the barcodes." + ); // create output dir if doesn't exist if !output.exists() { @@ -91,7 +97,7 @@ impl CBListInfo { } } - self.init_file = pl_file.clone(); + self.init_file = pl_file.to_path_buf(); self.final_file = final_file; self.is_single_column = is_single_column; Ok(()) @@ -106,7 +112,9 @@ impl CBListInfo { // if we are here but the init file doesn't exist, then we have a problem if !self.init_file.exists() { - bail!("The CBListInfo struct was not properly initialized. Please report this issue on GitHub."); + bail!( + "The CBListInfo struct was not properly initialized. Please report this issue on GitHub." 
+ ); } // if we cannot find the count matrix column files, then complain @@ -236,196 +244,106 @@ fn validate_map_and_quant_opts(opts: &MapQuantOpts) -> anyhow::Result<()> { Ok(()) } -pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result<()> { - validate_map_and_quant_opts(&opts)?; - - let mut t2g_map = opts.t2g_map.clone(); - // Read the JSON contents of the file as an instance of `User`. - let v: Value = prog_utils::inspect_af_home(af_home_path)?; - let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; - - rp.issue_recommended_version_messages(); - - let mut gene_id_to_name_opt: Option = None; - - // figure out what type of index we expect - let index_type; - - if let Some(mut index) = opts.index.clone() { - // If the user built the index using simpleaf, and they are using - // piscem, then they are not required to pass the --use-piscem flag - // to the quant step (though they *can* pass it if they wish). - // Thus, if they built the piscem index using simpleaf, there are - // 2 possibilities here: - // 1. They are passing in the directory containing the index - // 2. They are passing in the prefix stem of the index files - // The code below is to check, in both cases, if we can automatically - // detect if the index was constructed with simpleaf, so that we can - // then automatically infer other files, like the t2g files. - - // If we are in case 1., the passed in path is a directory and - // we can check for the simpleaf_index.json file directly, - // Otherwise if the path is not a directory, we check if it - // ends in piscem_idx (the suffix that simpleaf uses when - // making a piscem index). Then we test the directory we - // get after stripping off this suffix. 
- let removed_piscem_idx_suffix = if !index.is_dir() && index.ends_with("piscem_idx") { - // remove the piscem_idx part - index.pop(); - true - } else { - false - }; - - let index_json_path = index.join("simpleaf_index.json"); - match index_json_path.try_exists() { - Ok(true) => { - // we have the simpleaf_index.json file, so parse it. - let index_json_file = std::fs::File::open(&index_json_path).with_context({ - || format!("Could not open file {}", index_json_path.display()) - })?; +#[derive(Debug)] +struct QuantSetup { + rp: ReqProgs, + index_type: IndexType, + t2g_map_file: PathBuf, + gene_id_to_name_opt: Option, + chem: Chemistry, + ori: ExpectedOri, + filter_meth: CellFilterMethod, + threads: u32, +} - let index_json_reader = BufReader::new(&index_json_file); - let v: Value = serde_json::from_reader(index_json_reader)?; +#[derive(Debug)] +struct MappingStageOutput { + sc_mapper: String, + map_cmd_string: String, + map_output: PathBuf, + map_duration: Duration, +} - let index_type_str: String = serde_json::from_value(v["index_type"].clone())?; +#[derive(Debug)] +struct QuantStageOutput { + gpl_output: PathBuf, + gpl_cmd_string: String, + collate_cmd_string: String, + quant_cmd_string: String, + gpl_duration: Duration, + collate_duration: Duration, + quant_duration: Duration, +} - // here, set the index type based on what we found as the - // value for the `index_type` key. - match index_type_str.as_ref() { - "salmon" => { - index_type = IndexType::Salmon(index.clone()); - } - "piscem" => { - // here, either the user has provided us with just - // the directory containing the piscem index, or - // we have "popped" off the "piscem_idx" suffix, so - // add it (back). 
- index_type = IndexType::Piscem(index.join("piscem_idx")); - } - _ => { - bail!( - "unknown index type {} present in simpleaf_index.json", - index_type_str, - ); - } - } - // if the user didn't pass in a t2g_map, try and populate it - // automatically here - if t2g_map.is_none() { - let t2g_opt: Option = serde_json::from_value(v["t2g_file"].clone())?; - if let Some(t2g_val) = t2g_opt { - let t2g_loc = index.join(t2g_val); - info!("found local t2g file at {}, will attempt to use this since none was provided explicitly", t2g_loc.display()); - t2g_map = Some(t2g_loc); - } - } +fn resolve_quant_setup( + af_home_path: &Path, + opts: &MapQuantOpts, +) -> anyhow::Result<(QuantSetup, CBListInfo)> { + let mut t2g_map = opts.t2g_map.clone(); + let ctx = context::load_runtime_context(af_home_path)?; + let rp: ReqProgs = ctx.progs; + rp.issue_recommended_version_messages(); - // if the user used simpleaf for index construction, then we also built the - // reference and populated the gene_id_to_name.tsv file. See if we can grab - // that as well. - if index.join("gene_id_to_name.tsv").exists() { - gene_id_to_name_opt = Some(index.join("gene_id_to_name.tsv")); - } else if let Some(index_parent) = index.parent() { - // we are doing index_dir/../ref/gene_id_to_name.tsv - let gene_name_path = index_parent.join("ref").join("gene_id_to_name.tsv"); - if gene_name_path.exists() && gene_name_path.is_file() { - gene_id_to_name_opt = Some(gene_name_path); - } - } - } - Ok(false) => { - // at this point, we have inferred that simpleaf wasn't - // used to construct the index, so fall back to what the user - // requested directly. 
- // if we have previously removed the piscem_idx suffix, add it back - if removed_piscem_idx_suffix { - index.push("piscem_idx"); - } - if opts.use_piscem { - // the user passed the use-piscem flag, so treat the provided - // path as the *prefix stem* to the piscem index - index_type = IndexType::Piscem(index); - } else { - // if the user didn't pass use-piscem and there - // is no simpleaf index json file to check, then - // it's assumed they are using a salmon index. - index_type = IndexType::Salmon(index); - } - } - Err(e) => { - bail!(e); - } - } - } else { - index_type = IndexType::NoIndex; + let index_meta = index_meta::resolve_quant_index(opts.index.clone(), opts.use_piscem)?; + if t2g_map.is_none() + && let Some(t2g_loc) = index_meta.inferred_t2g.clone() + { + info!( + "found local t2g file at {}, will attempt to use this since none was provided explicitly", + t2g_loc.display() + ); + t2g_map = Some(t2g_loc); } + let index_type = index_meta.index_type; + let gene_id_to_name_opt = index_meta.inferred_gene_id_to_name; - // at this point make sure we have a t2g value let t2g_map_file = t2g_map.context( "A transcript-to-gene map (t2g) file was not provided via `--t2g-map`|`-m` and could \ not be inferred from the index. Please provide a t2g map explicitly to the quant command.", )?; - prog_utils::check_files_exist(&[t2g_map_file.clone()])?; + prog_utils::check_files_exist(std::slice::from_ref(&t2g_map_file))?; - // make sure we have an program matching the - // appropriate index type match index_type { IndexType::Piscem(_) => { if rp.piscem.is_none() { - bail!("A piscem index is being used, but no piscem executable is provided. Please set one with `simpleaf set-paths`."); + bail!( + "A piscem index is being used, but no piscem executable is provided. Please set one with `simpleaf set-paths`." + ); } } IndexType::Salmon(_) => { if rp.salmon.is_none() { - bail!("A salmon index is being used, but no piscem executable is provided. 
Please set one with `simpleaf set-paths`."); + bail!( + "A salmon index is being used, but no salmon executable is provided. Please set one with `simpleaf set-paths`." + ); } } IndexType::NoIndex => {} } - // the chemistries file let custom_chem_p = af_home_path.join(CHEMISTRIES_PATH); - let chem = Chemistry::from_str(&index_type, &custom_chem_p, &opts.chemistry)?; - - let ori: ExpectedOri; - // if the user set the orientation, then - // use that explicitly - if let Some(o) = &opts.expected_ori { - ori = ExpectedOri::from_str(o).with_context(|| { + let ori = if let Some(o) = &opts.expected_ori { + ExpectedOri::from_str(o).with_context(|| { format!( "Could not parse orientation {}. It must be one of the following: {:?}", o, ExpectedOri::all_to_str().join(", ") ) - })?; + })? } else { - ori = chem.expected_ori(); - } + chem.expected_ori() + }; let mut filter_meth_opt = None; let mut pl_info = CBListInfo::new(); - - // based on the filtering method if let Some(ref pl_file) = opts.unfiltered_pl { - // NOTE: unfiltered_pl is of type Option> so being in here - // tells us nothing about the inner option. We handle that now. - - // if the -u flag is passed and some file is provided, then the inner - // Option is Some(PathBuf) if let Some(pl_file) = pl_file { - // the user has explicily passed a file along, so try - // to use that if pl_file.is_file() { - // we read the file to see if there is additional columns separated by \t. 
- // unwrap is safe here cuz we defined it above pl_info.init(pl_file, &opts.output)?; - - let min_cells = opts.min_reads; filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList( pl_info.final_file.to_string_lossy().into_owned(), - min_cells, + opts.min_reads, )); } else { bail!( @@ -434,20 +352,13 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< ); } } else { - // here, the -u flag is provided - // but no file is provided, then the - // inner option is None and we will try to get the permit list automatically if - // using 10xv2, 10xv3, or 10xv4 - - // check the chemistry let pl_res = get_permit_if_absent(af_home_path, &chem)?; - let min_cells = opts.min_reads; match pl_res { PermitListResult::DownloadSuccessful(p) | PermitListResult::AlreadyPresent(p) => { pl_info.init(&p, &opts.output)?; filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList( pl_info.final_file.to_string_lossy().into_owned(), - min_cells, + opts.min_reads, )); } PermitListResult::UnregisteredChemistry => { @@ -472,63 +383,57 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< filter_meth_opt = Some(CellFilterMethod::ExpectCells(*num_expected)); }; } - // otherwise it must have been knee; if opts.knee { filter_meth_opt = Some(CellFilterMethod::KneeFinding); } - if filter_meth_opt.is_none() { - bail!("No valid filtering strategy was provided!"); - } - - // if the user requested more threads than can be used - let mut threads = opts.threads; - if let Ok(max_threads_usize) = std::thread::available_parallelism() { - let max_threads = max_threads_usize.get() as u32; - if threads > max_threads { - warn!( - "The maximum available parallelism is {}, but {} threads were requested.", - max_threads, threads - ); - warn!("setting number of threads to {}", max_threads); - threads = max_threads; - } + let (threads, capped_at) = runtime::cap_threads(opts.threads); + if let Some(max_threads) = capped_at { + warn!( + "The maximum 
available parallelism is {}, but {} threads were requested.", + max_threads, opts.threads + ); + warn!("setting number of threads to {}", max_threads); } - // here we must be safe to unwrap - let filter_meth = filter_meth_opt.unwrap(); - - let sc_mapper: String; - let map_cmd_string: String; - let map_output: PathBuf; - let map_duration: Duration; + let setup = QuantSetup { + rp, + index_type, + t2g_map_file, + gene_id_to_name_opt, + chem, + ori, + filter_meth: filter_meth_opt.context("No valid filtering strategy was provided!")?, + threads, + }; + Ok((setup, pl_info)) +} - // if we are mapping against an index +fn run_mapping_stage( + opts: &MapQuantOpts, + setup: &QuantSetup, +) -> anyhow::Result { if let Some(index) = opts.index.clone() { - let reads1 = opts - .reads1 - .as_ref() - .expect("since mapping against an index is requested, read1 files must be provided."); - let reads2 = opts - .reads2 - .as_ref() - .expect("since mapping against an index is requested, read2 files must be provided."); - assert_eq!( - reads1.len(), - reads2.len(), - "{} read1 files and {} read2 files were given; Cannot proceed!", - reads1.len(), - reads2.len() - ); - - match index_type { - IndexType::Piscem(ref index_base) => { - let piscem_prog_info = rp - .piscem - .as_ref() - .expect("piscem program info should be properly set."); + let reads1 = opts.reads1.as_ref().context( + "Mapping against an index was requested, but read1 files were not provided.", + )?; + let reads2 = opts.reads2.as_ref().context( + "Mapping against an index was requested, but read2 files were not provided.", + )?; + if reads1.len() != reads2.len() { + bail!( + "{} read1 files and {} read2 files were given; Cannot proceed!", + reads1.len(), + reads2.len() + ); + } - // using a piscem index + match &setup.index_type { + IndexType::Piscem(index_base) => { + let piscem_prog_info = + setup.rp.piscem.as_ref().context( + "A piscem index is being used, but piscem program info is missing.", + )?; let mut 
piscem_quant_cmd = std::process::Command::new(format!("{}", &piscem_prog_info.exe_path.display())); let index_path = format!("{}", index_base.display()); @@ -537,70 +442,53 @@ pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result< .arg("--index") .arg(index_path); - // location of output directory, number of threads - map_output = opts.output.join("af_map"); + let map_output = opts.output.join("af_map"); piscem_quant_cmd .arg("--threads") - .arg(format!("{}", threads)) + .arg(format!("{}", setup.threads)) .arg("-o") .arg(&map_output); - // if the user is requesting a mapping option that required - // piscem version >= 0.7.0, ensure we have that - if let Ok(_piscem_ver) = prog_utils::check_version_constraints( + match prog_utils::check_version_constraints( "piscem", ">=0.7.0, <1.0.0", &piscem_prog_info.version, ) { - push_advanced_piscem_options(&mut piscem_quant_cmd, &opts)?; - } else { - info!( - r#" + Ok(_piscem_ver) => { + push_advanced_piscem_options(&mut piscem_quant_cmd, opts)?; + } + Err(_) => { + info!( + r#" Simpleaf is currently using piscem version {}, but you must be using version >= 0.7.0 in order to use the mapping options specific to this, or later versions. If you wish to use these options, please upgrade your piscem version or, if you believe you have a sufficiently new version installed, update the executable being used by simpleaf"#, - &piscem_prog_info.version - ); + &piscem_prog_info.version + ); + } } - // we get the final geometry we want to pass to piscem - // check if we can parse the geometry directly, or if we are dealing with a - // "complex" geometry. 
let frag_lib_xform = add_or_transform_fragment_library( MapperType::Piscem, - chem.fragment_geometry_str(), + setup.chem.fragment_geometry_str(), reads1, reads2, &mut piscem_quant_cmd, )?; - map_cmd_string = prog_utils::get_cmd_line_string(&piscem_quant_cmd); + let map_cmd_string = prog_utils::get_cmd_line_string(&piscem_quant_cmd); info!("piscem map-sc cmd : {}", map_cmd_string); - sc_mapper = String::from("piscem"); - - let mut input_files = vec![ - index_base.with_extension("ctab"), - index_base.with_extension("refinfo"), - index_base.with_extension("sshash"), - ]; - input_files.extend_from_slice(reads1); - input_files.extend_from_slice(reads2); - - prog_utils::check_files_exist(&input_files)?; + prog_utils::check_piscem_index_files(index_base.as_path())?; + let mut read_inputs = Vec::new(); + read_inputs.extend_from_slice(reads1); + read_inputs.extend_from_slice(reads2); + prog_utils::check_files_exist(&read_inputs)?; let map_start = Instant::now(); - let cres = prog_utils::execute_command( - &mut piscem_quant_cmd, - CommandVerbosityLevel::Quiet, - ) - .expect("failed to execute piscem [mapping phase]"); - - // if we had to filter the reads through a fifo - // wait for the thread feeding the fifo to finish + exec::run_checked(&mut piscem_quant_cmd, "piscem [mapping phase]")?; match frag_lib_xform { FragmentTransformationType::TransformedIntoFifo(xform_data) => { - // wait for it to join match xform_data.join_handle.join() { Ok(join_res) => { let xform_stats = join_res?; @@ -608,7 +496,13 @@ being used by simpleaf"#, let failed = xform_stats.failed_parsing; info!( "seq_geom_xform : observed {} input fragments. 
{} ({:.2}%) of them failed to parse and were not transformed", - total, failed, if total > 0 { (failed as f64) / (total as f64) } else { 0_f64 } * 100_f64 + total, + failed, + if total > 0 { + (failed as f64) / (total as f64) + } else { + 0_f64 + } * 100_f64 ); } Err(e) => { @@ -616,25 +510,23 @@ being used by simpleaf"#, } } } - FragmentTransformationType::Identity => { - // nothing to do. - } + FragmentTransformationType::Identity => {} } - map_duration = map_start.elapsed(); - - if !cres.status.success() { - bail!("piscem mapping failed with exit status {:?}", cres.status); - } + Ok(MappingStageOutput { + sc_mapper: String::from("piscem"), + map_cmd_string, + map_output, + map_duration: map_start.elapsed(), + }) } - IndexType::Salmon(ref index_base) => { - // using a salmon index - let mut salmon_quant_cmd = std::process::Command::new(format!( - "{}", - rp.salmon.unwrap().exe_path.display() - )); - - // set the input index and library type + IndexType::Salmon(index_base) => { + let salmon_prog_info = + setup.rp.salmon.as_ref().context( + "A salmon index is being used, but salmon program info is missing.", + )?; + let mut salmon_quant_cmd = + std::process::Command::new(format!("{}", salmon_prog_info.exe_path.display())); let index_path = format!("{}", index_base.display()); salmon_quant_cmd .arg("alevin") @@ -643,55 +535,37 @@ being used by simpleaf"#, .arg("-l") .arg("A"); - // check if we can parse the geometry directly, or if we are dealing with a - // "complex" geometry. 
let frag_lib_xform = add_or_transform_fragment_library( MapperType::Salmon, - chem.fragment_geometry_str(), + setup.chem.fragment_geometry_str(), reads1, reads2, &mut salmon_quant_cmd, )?; - // location of output directory, number of threads - map_output = opts.output.join("af_map"); + let map_output = opts.output.join("af_map"); salmon_quant_cmd .arg("--threads") - .arg(format!("{}", threads)) + .arg(format!("{}", setup.threads)) .arg("-o") .arg(&map_output); - - // if the user explicitly requested to use selective-alignment - // then enable that if opts.use_selective_alignment { salmon_quant_cmd.arg("--rad"); } else { - // otherwise default to sketch mode salmon_quant_cmd.arg("--sketch"); } - map_cmd_string = prog_utils::get_cmd_line_string(&salmon_quant_cmd); + let map_cmd_string = prog_utils::get_cmd_line_string(&salmon_quant_cmd); info!("salmon alevin cmd : {}", map_cmd_string); - sc_mapper = String::from("salmon"); - let mut input_files = vec![index]; input_files.extend_from_slice(reads1); input_files.extend_from_slice(reads2); - prog_utils::check_files_exist(&input_files)?; let map_start = Instant::now(); - let cres = prog_utils::execute_command( - &mut salmon_quant_cmd, - CommandVerbosityLevel::Quiet, - ) - .expect("failed to execute salmon [mapping phase]"); - - // if we had to filter the reads through a fifo - // wait for the thread feeding the fifo to finish + exec::run_checked(&mut salmon_quant_cmd, "salmon [mapping phase]")?; match frag_lib_xform { FragmentTransformationType::TransformedIntoFifo(xform_data) => { - // wait for it to join match xform_data.join_handle.join() { Ok(join_res) => { let xform_stats = join_res?; @@ -699,7 +573,13 @@ being used by simpleaf"#, let failed = xform_stats.failed_parsing; info!( "seq_geom_xform : observed {} input fragments. 
{} ({:.2}%) of them failed to parse and were not transformed", - total, failed, if total > 0 { (failed as f64) / (total as f64) } else { 0_f64 } * 100_f64 + total, + failed, + if total > 0 { + (failed as f64) / (total as f64) + } else { + 0_f64 + } * 100_f64 ); } Err(e) => { @@ -707,35 +587,42 @@ being used by simpleaf"#, } } } - FragmentTransformationType::Identity => { - // nothing to do. - } + FragmentTransformationType::Identity => {} } - map_duration = map_start.elapsed(); - - if !cres.status.success() { - bail!("salmon mapping failed with exit status {:?}", cres.status); - } + Ok(MappingStageOutput { + sc_mapper: String::from("salmon"), + map_cmd_string, + map_output, + map_duration: map_start.elapsed(), + }) } IndexType::NoIndex => { - bail!("Cannot perform mapping an quantification without known (piscem or salmon) index!"); + bail!( + "Cannot perform mapping an quantification without known (piscem or salmon) index!" + ); } } } else { - map_cmd_string = String::from(""); - sc_mapper = String::from(""); - map_output = opts - .map_dir - .expect("map-dir must be provided, since index, read1 and read2 were not."); - map_duration = Duration::new(0, 0); + Ok(MappingStageOutput { + sc_mapper: String::new(), + map_cmd_string: String::new(), + map_output: opts + .map_dir + .clone() + .context("map-dir must be provided, since index, read1 and read2 were not.")?, + map_duration: Duration::new(0, 0), + }) } +} - let map_output_string = map_output.display().to_string(); - - // make the quant directory +fn run_quant_stage( + opts: &MapQuantOpts, + setup: &QuantSetup, + mapping: &MappingStageOutput, + pl_info: &mut CBListInfo, +) -> anyhow::Result { let gpl_output = opts.output.join("af_quant"); - std::fs::create_dir_all(&gpl_output).with_context(|| { format!( "Failed to create quantification output directory {}", @@ -743,16 +630,13 @@ being used by simpleaf"#, ) })?; - // attempt to get the relevant information from map_info to propagate forward to the - // quantification 
directory - //get_mapping_info - let mapping_log = match index_type { + let mapping_log = match &setup.index_type { IndexType::Piscem(_) => { - let piscem_map_log_path = map_output.join("map_info.json"); + let piscem_map_log_path = mapping.map_output.join("map_info.json"); prog_parsing_utils::construct_json_from_piscem_log(piscem_map_log_path)? } IndexType::Salmon(_) => { - let salmon_log_path = map_output.join("logs").join("salmon_quant.log"); + let salmon_log_path = mapping.map_output.join("logs").join("salmon_quant.log"); prog_parsing_utils::construct_json_from_salmon_log(salmon_log_path)? } IndexType::NoIndex => { @@ -765,153 +649,121 @@ being used by simpleaf"#, }) } }; - let map_info_path = gpl_output.join("simpleaf_map_info.json"); let map_info_file = std::fs::File::create(map_info_path)?; serde_json::to_writer(map_info_file, &mapping_log)?; - let alevin_fry = rp.alevin_fry.unwrap().exe_path; - // alevin-fry generate permit list + let alevin_fry = setup + .rp + .alevin_fry + .as_ref() + .context("Alevin-fry program info is missing; please run `simpleaf set-paths`.")? 
+ .exe_path + .clone(); let mut alevin_gpl_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - - let gpl_threads = threads.min(8); + let gpl_threads = setup.threads.min(8); alevin_gpl_cmd.arg("generate-permit-list"); - alevin_gpl_cmd.arg("-i").arg(&map_output); - alevin_gpl_cmd.arg("-d").arg(ori.as_str()); + alevin_gpl_cmd.arg("-i").arg(&mapping.map_output); + alevin_gpl_cmd.arg("-d").arg(setup.ori.as_str()); alevin_gpl_cmd.arg("-t").arg(format!("{}", gpl_threads)); - - // add the filter mode - filter_meth.add_to_args(&mut alevin_gpl_cmd); - + setup.filter_meth.add_to_args(&mut alevin_gpl_cmd); alevin_gpl_cmd.arg("-o").arg(&gpl_output); - - info!( - "alevin-fry generate-permit-list cmd : {}", - prog_utils::get_cmd_line_string(&alevin_gpl_cmd) - ); - let input_files = vec![map_output.clone()]; + let gpl_cmd_string = prog_utils::get_cmd_line_string(&alevin_gpl_cmd); + info!("alevin-fry generate-permit-list cmd : {}", gpl_cmd_string); + let input_files = vec![mapping.map_output.clone()]; prog_utils::check_files_exist(&input_files)?; - let gpl_start = Instant::now(); - let gpl_proc_out = - prog_utils::execute_command(&mut alevin_gpl_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [generate permit list]"); + exec::run_checked(&mut alevin_gpl_cmd, "[generate permit list]")?; let gpl_duration = gpl_start.elapsed(); - if !gpl_proc_out.status.success() { - bail!( - "alevin-fry generate-permit-list failed with exit status {:?}", - gpl_proc_out.status - ); - } - - // - // collate - // let mut alevin_collate_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - alevin_collate_cmd.arg("collate"); alevin_collate_cmd.arg("-i").arg(&gpl_output); - alevin_collate_cmd.arg("-r").arg(&map_output); - alevin_collate_cmd.arg("-t").arg(format!("{}", threads)); - - info!( - "alevin-fry collate cmd : {}", - prog_utils::get_cmd_line_string(&alevin_collate_cmd) - ); - let input_files = vec![gpl_output.clone(), map_output]; + 
alevin_collate_cmd.arg("-r").arg(&mapping.map_output); + alevin_collate_cmd + .arg("-t") + .arg(format!("{}", setup.threads)); + let collate_cmd_string = prog_utils::get_cmd_line_string(&alevin_collate_cmd); + info!("alevin-fry collate cmd : {}", collate_cmd_string); + let input_files = vec![gpl_output.clone(), mapping.map_output.clone()]; prog_utils::check_files_exist(&input_files)?; - let collate_start = Instant::now(); - let collate_proc_out = - prog_utils::execute_command(&mut alevin_collate_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [collate]"); + exec::run_checked(&mut alevin_collate_cmd, "[collate]")?; let collate_duration = collate_start.elapsed(); - if !collate_proc_out.status.success() { - bail!( - "alevin-fry collate failed with exit status {:?}", - collate_proc_out.status - ); - } - - // - // quant - // let mut alevin_quant_cmd = std::process::Command::new(format!("{}", &alevin_fry.display())); - alevin_quant_cmd .arg("quant") .arg("-i") .arg(&gpl_output) .arg("-o") .arg(&gpl_output); - alevin_quant_cmd.arg("-t").arg(format!("{}", threads)); - alevin_quant_cmd.arg("-m").arg(t2g_map_file.clone()); - alevin_quant_cmd.arg("-r").arg(opts.resolution); - + alevin_quant_cmd.arg("-t").arg(format!("{}", setup.threads)); + alevin_quant_cmd.arg("-m").arg(setup.t2g_map_file.clone()); + alevin_quant_cmd.arg("-r").arg(&opts.resolution); + let quant_cmd_string = prog_utils::get_cmd_line_string(&alevin_quant_cmd); info!("cmd : {:?}", alevin_quant_cmd); - - let input_files = vec![gpl_output.clone(), t2g_map_file]; + let input_files = vec![gpl_output.clone(), setup.t2g_map_file.clone()]; prog_utils::check_files_exist(&input_files)?; - let quant_start = Instant::now(); - let quant_proc_out = - prog_utils::execute_command(&mut alevin_quant_cmd, CommandVerbosityLevel::Quiet) - .expect("could not execute [quant]"); + exec::run_checked(&mut alevin_quant_cmd, "[quant]")?; let quant_duration = quant_start.elapsed(); - if !quant_proc_out.status.success() { - 
bail!("quant failed with exit status {:?}", quant_proc_out.status); - } - - // If we had a gene_id_to_name.tsv file handy, copy it over into the - // quantification directory. - if let Some(gene_name_path) = gene_id_to_name_opt { + if let Some(gene_name_path) = &setup.gene_id_to_name_opt { let target_path = gpl_output.join("gene_id_to_name.tsv"); - match std::fs::copy(&gene_name_path, &target_path) { + match std::fs::copy(gene_name_path, &target_path) { Ok(_) => { - info!("successfully copied the gene_name_to_id.tsv file into the quantification directory."); + info!( + "successfully copied the gene_name_to_id.tsv file into the quantification directory." + ); } Err(err) => { - warn!("could not successfully copy gene_id_to_name file from {:?} to {:?} because of {:?}", - gene_name_path, target_path, err - ); + warn!( + "could not successfully copy gene_id_to_name file from {:?} to {:?} because of {:?}", + gene_name_path, target_path, err + ); } } } - // If a permit/explit list with auxilary info was provided, - // we add the auxilary info to the barcodes.tsv file. 
let quants_mat_rows_p = gpl_output.join("alevin").join("quants_mat_rows.txt"); pl_info.update_af_quant_barcodes_tsv(&quants_mat_rows_p)?; - let mut convert_duration = None; - if opts.anndata_out { - let convert_start = Instant::now(); - let opath = gpl_output.join("alevin").join("quants.h5ad"); - af_anndata::convert_csr_to_anndata(&gpl_output, &opath)?; - convert_duration = Some(convert_start.elapsed()); - } + Ok(QuantStageOutput { + gpl_output, + gpl_cmd_string, + collate_cmd_string, + quant_cmd_string, + gpl_duration, + collate_duration, + quant_duration, + }) +} +fn write_quant_log( + opts: &MapQuantOpts, + mapping: &MappingStageOutput, + quant_stage: &QuantStageOutput, + convert_duration: Option, +) -> anyhow::Result<()> { let af_quant_info_file = opts.output.join("simpleaf_quant_log.json"); let mut af_quant_info = json!({ "time_info" : { - "map_time" : map_duration, - "gpl_time" : gpl_duration, - "collate_time" : collate_duration, - "quant_time" : quant_duration + "map_time" : mapping.map_duration, + "gpl_time" : quant_stage.gpl_duration, + "collate_time" : quant_stage.collate_duration, + "quant_time" : quant_stage.quant_duration }, "cmd_info" : { - "map_cmd" : map_cmd_string, - "gpl_cmd" : prog_utils::get_cmd_line_string(&alevin_gpl_cmd), - "collate_cmd" : prog_utils::get_cmd_line_string(&alevin_collate_cmd), - "quant_cmd" : prog_utils::get_cmd_line_string(&alevin_quant_cmd) + "map_cmd" : mapping.map_cmd_string, + "gpl_cmd" : quant_stage.gpl_cmd_string, + "collate_cmd" : quant_stage.collate_cmd_string, + "quant_cmd" : quant_stage.quant_cmd_string }, "map_info" : { - "mapper" : sc_mapper, - "map_cmd" : map_cmd_string, - "map_outdir": map_output_string + "mapper" : mapping.sc_mapper, + "map_cmd" : mapping.map_cmd_string, + "map_outdir": mapping.map_output.display().to_string() } }); @@ -919,12 +771,106 @@ being used by simpleaf"#, af_quant_info["time_info"]["conversion_time"] = json!(ctime); } - // write the relevant info about - // our run to file. 
- std::fs::write( - &af_quant_info_file, - serde_json::to_string_pretty(&af_quant_info).unwrap(), - ) - .with_context(|| format!("could not write {}", af_quant_info_file.display()))?; + io::write_json_pretty_atomic(&af_quant_info_file, &af_quant_info)?; Ok(()) } + +pub fn map_and_quant(af_home_path: &Path, opts: MapQuantOpts) -> anyhow::Result<()> { + validate_map_and_quant_opts(&opts)?; + let (setup, mut pl_info) = resolve_quant_setup(af_home_path, &opts)?; + let mapping = run_mapping_stage(&opts, &setup)?; + let quant_stage = run_quant_stage(&opts, &setup, &mapping, &mut pl_info)?; + + let mut convert_duration = None; + if opts.anndata_out { + let convert_start = Instant::now(); + let opath = quant_stage.gpl_output.join("alevin").join("quants.h5ad"); + af_anndata::convert_csr_to_anndata(&quant_stage.gpl_output, &opath)?; + convert_duration = Some(convert_start.elapsed()); + } + + write_quant_log(&opts, &mapping, &quant_stage, convert_duration) +} + +#[cfg(test)] +mod tests { + use clap::Parser; + + use crate::utils::af_utils::RnaChemistry; + use crate::{Cli, Commands}; + + use super::*; + + fn parse_quant_opts(args: &[&str]) -> MapQuantOpts { + let mut cli_args = vec!["simpleaf"]; + cli_args.extend_from_slice(args); + match Cli::parse_from(cli_args).command { + Commands::Quant(opts) => opts, + cmd => panic!("expected quant command, found {:?}", cmd), + } + } + + fn minimal_no_index_setup() -> QuantSetup { + QuantSetup { + rp: ReqProgs { + salmon: None, + piscem: None, + alevin_fry: None, + macs: None, + }, + index_type: IndexType::NoIndex, + t2g_map_file: PathBuf::from("/tmp/t2g.tsv"), + gene_id_to_name_opt: None, + chem: Chemistry::Rna(RnaChemistry::TenxV3), + ori: ExpectedOri::Forward, + filter_meth: CellFilterMethod::KneeFinding, + threads: 1, + } + } + + #[test] + fn mapping_stage_no_index_uses_map_dir() { + let opts = parse_quant_opts(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + ]); 
+ let setup = minimal_no_index_setup(); + let stage = run_mapping_stage(&opts, &setup).expect("mapping stage should succeed"); + assert_eq!(stage.map_output, PathBuf::from("/tmp/mapped")); + assert_eq!(stage.sc_mapper, ""); + } + + #[test] + fn mapping_stage_no_index_fails_without_map_dir() { + let mut opts = parse_quant_opts(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + ]); + opts.map_dir = None; + opts.index = None; + + let setup = minimal_no_index_setup(); + let err = run_mapping_stage(&opts, &setup).expect_err("expected missing map-dir to fail"); + assert!( + format!("{:#}", err).contains("map-dir must be provided"), + "unexpected error: {:#}", + err + ); + } +} diff --git a/src/simpleaf_commands/refresh.rs b/src/simpleaf_commands/refresh.rs index 24e21e7..7db8926 100644 --- a/src/simpleaf_commands/refresh.rs +++ b/src/simpleaf_commands/refresh.rs @@ -1,7 +1,7 @@ use crate::utils::prog_utils::*; use anyhow::Context; -use serde_json::{json, Value}; +use serde_json::{Value, json}; use std::path::PathBuf; pub fn refresh_prog_info(af_home_path: PathBuf) -> anyhow::Result<()> { diff --git a/src/simpleaf_commands/workflow.rs b/src/simpleaf_commands/workflow.rs index 02ec4a2..156e40c 100644 --- a/src/simpleaf_commands/workflow.rs +++ b/src/simpleaf_commands/workflow.rs @@ -1,15 +1,15 @@ -use crate::utils::jrsonnet_main::{parse_jsonnet, ParseAction}; +use crate::utils::jrsonnet_main::{ParseAction, parse_jsonnet}; use crate::utils::prog_utils; use crate::utils::prog_utils::ReqProgs; use crate::utils::workflow_utils; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use cmd_lib::run_fun; -use serde_json::json; use serde_json::Value; +use serde_json::json; use std::fs; use std::path::Path; -use tabled::{settings::Style, Table, Tabled}; +use tabled::{Table, Tabled, settings::Style}; use tracing::{info, warn}; use super::WorkflowCommands; @@ -21,6 +21,85 @@ struct WorkflowTemplate { 
version: String, } +/// Fully resolved workflow run inputs after parsing either a manifest or template. +struct ResolvedWorkflowRun { + /// Parsed executable manifest. + instantiated_manifest: serde_json::Value, + /// Source path of the provided manifest/template (for logging context). + source_path: std::path::PathBuf, + /// Resolved output path extracted from the instantiated manifest. + output_path: std::path::PathBuf, +} + +/// Resolve `workflow run` inputs into an instantiated manifest and paths. +/// +/// This stage is responsible for parse/instantiate + output-path resolution. +fn resolve_workflow_run_inputs>( + af_home_path: T, + template: Option, + manifest: Option, + output_opt: Option, + jpaths: &Option>, + ext_codes: &Option>, +) -> anyhow::Result { + if let Some(manifest) = manifest { + info!("Loading and executing manifest."); + let instantiated_manifest = workflow_utils::parse_manifest(&manifest)?; + let output_path = workflow_utils::get_output_path(&instantiated_manifest)?; + return Ok(ResolvedWorkflowRun { + instantiated_manifest, + source_path: manifest, + output_path, + }); + } + + if let Some(template) = template { + if !template.exists() || !template.is_file() { + bail!("the path of the given workflow template file doesn't exist; Cannot proceed.") + } + + info!("Processing simpleaf template to produce and execute manifest."); + let workflow_json_string = workflow_utils::instantiate_workflow_template( + af_home_path.as_ref(), + template.as_path(), + output_opt.clone(), + jpaths, + ext_codes, + )?; + + let instantiated_manifest: Value = serde_json::from_str(workflow_json_string.as_str())?; + let output_path = workflow_utils::get_output_path(&instantiated_manifest)?; + + // If both command-line and template outputs are set, warn if template wins. 
+ if let Some(requested_output_path) = output_opt + && requested_output_path != output_path + { + warn!( + r#"The output path {} was requested via the command line, but + the output path {} was resolved from the workflow template. + In this case, since the output variable is not used when instantiating + the template, the value ({}) present in the template must be used. + Please be aware that {} will not be used for output!"#, + requested_output_path.display(), + output_path.display(), + output_path.display(), + requested_output_path.display() + ); + } + + return Ok(ResolvedWorkflowRun { + instantiated_manifest, + source_path: template, + output_path, + }); + } + + bail!(concat!( + "You must have one of a manifest or template, ", + "but provided neither; this shouldn't happen" + )); +} + pub fn refresh_protocol_estuary>(af_home_path: T) -> anyhow::Result<()> { workflow_utils::get_protocol_estuary( af_home_path.as_ref(), @@ -124,7 +203,9 @@ pub fn patch_manifest_or_template>( } } _ => { - bail!("The patch function received a non-patch command. This should not happen. Please report this issue."); + bail!( + "The patch function received a non-patch command. This should not happen. Please report this issue." 
+ ); } } Ok(()) @@ -167,7 +248,9 @@ pub fn list_workflows>(af_home_path: T) -> anyhow::Result<()> { } println!("{}", Table::new(workflow_entries).with(Style::rounded())); if print_na_cap { - println!("* : could not parse uninstantiated template to attempt extracting the version, please see [shorturl.at/gouB1] for further details"); + println!( + "* : could not parse uninstantiated template to attempt extracting the version, please see [shorturl.at/gouB1] for further details" + ); } Ok(()) } @@ -225,10 +308,14 @@ pub fn get_workflow>( */ match version_str.as_ref() { "N/A*" => { - warn!("couldn't evaluate the requested template to fetch the version number, it may be a deprecated version; consider refreshing."); + warn!( + "couldn't evaluate the requested template to fetch the version number, it may be a deprecated version; consider refreshing." + ); } "missing" => { - warn!("the template that was requested to be fetched appeared to be missing a version number, but this field should be present; consider refreshing or further investigating the issue."); + warn!( + "the template that was requested to be fetched appeared to be missing a version number, but this field should be present; consider refreshing or further investigating the issue." + ); } ver => { const REQ_VER: &str = "0.1.0"; @@ -237,7 +324,10 @@ pub fn get_workflow>( info!("getting workflow {} version {}", name, ver); } Err(_) => { - warn!("the version parsed from the workflow you are attempting to get is {}, but it should be at least {}.", version_str, REQ_VER); + warn!( + "the version parsed from the workflow you are attempting to get is {}, but it should be at least {}.", + version_str, REQ_VER + ); } } } @@ -260,7 +350,10 @@ pub fn get_workflow>( ) { Ok(_) => {} Err(e) => { - bail!("Could not copy workflow files to the output folder. The error was: {}", e); + bail!( + "Could not copy workflow files to the output folder. 
The error was: {}", + e + ); } }; } @@ -384,71 +477,17 @@ pub fn run_workflow>( skip_step, ext_codes, } => { - // designate that output is optional - let output_opt = output; - - let instantiated_manifest: serde_json::Value; - let source_path: std::path::PathBuf; - let output_path: std::path::PathBuf; - if let Some(manifest) = manifest { - // If the user passed a fully-instantiated - // manifest to execute - info!("Loading and executing manifest."); - // iterate json files and parse records to commands - // convert files into json string vector - instantiated_manifest = workflow_utils::parse_manifest(&manifest)?; - output_path = workflow_utils::get_output_path(&instantiated_manifest)?; - source_path = manifest.clone(); - } else if let Some(template) = template { - // check the validity of the file - if !template.exists() || !template.is_file() { - bail!("the path of the given workflow template file doesn't exist; Cannot proceed.") - } - - info!("Processing simpleaf template to produce and execute manifest."); - - // iterate json files and parse records to commands - // convert files into json string vector - let workflow_json_string = workflow_utils::instantiate_workflow_template( - af_home_path.as_ref(), - template.as_path(), - output_opt.clone(), - &jpaths, - &ext_codes, - )?; - - // write complete workflow (i.e. the manifest) JSON to output folder - instantiated_manifest = serde_json::from_str(workflow_json_string.as_str())?; - output_path = workflow_utils::get_output_path(&instantiated_manifest)?; - - // check if the output path we read from the instantiated template matches - // the output path requested by the user (if the user passed one in). If - // they do not match, issue an obnoxious warning. - // @DongzeHe : We should also probably log this warning to the output - // log for subsequent inspection. 
- if let Some(requested_output_path) = output_opt { - if requested_output_path != output_path { - warn!( - r#"The output path {} was requested via the command line, but - the output path {} was resolved from the workflow template. - In this case, since the output variable is not used when instantiating - the template, the value ({}) present in the template must be used. - Please be aware that {} will not be used for output!"#, - requested_output_path.display(), - output_path.display(), - output_path.display(), - requested_output_path.display() - ); - } - } - - source_path = template.clone(); - } else { - bail!(concat!( - "You must have one of a manifest or template, ", - "but provided neither; this shouldn't happen" - )); - } + let resolved = resolve_workflow_run_inputs( + af_home_path.as_ref(), + template, + manifest, + output, + &jpaths, + &ext_codes, + )?; + let instantiated_manifest = resolved.instantiated_manifest; + let source_path = resolved.source_path; + let output_path = resolved.output_path; // recursively make the output directory, which at this point // has been resolved as the one used in the template or manifest diff --git a/src/utils/af_utils.rs b/src/utils/af_utils.rs index c973471..e4fae0a 100644 --- a/src/utils/af_utils.rs +++ b/src/utils/af_utils.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; // use cmd_lib::run_fun; use phf::phf_map; use seq_geom_parser::{AppendToCmdArgs, FragmentGeomDesc, PiscemGeomDesc, SalmonSeparateGeomDesc}; @@ -13,10 +13,10 @@ use strum_macros::EnumIter; use tracing::{debug, error, info, warn}; use crate::atac::commands::AtacChemistry; -use crate::utils::chem_utils::{get_single_custom_chem_from_file, CustomChemistry, ExpectedOri}; +use crate::utils::chem_utils::{CustomChemistry, ExpectedOri, get_single_custom_chem_from_file}; use crate::utils::{self, prog_utils}; -use super::chem_utils::{QueryInRegistry, LOCAL_PL_PATH_KEY, REMOTE_PL_URL_KEY}; +use 
super::chem_utils::{LOCAL_PL_PATH_KEY, QueryInRegistry, REMOTE_PL_URL_KEY}; /// The map from pre-specified chemistry types that salmon knows /// to the corresponding command line flag that salmon should be passed @@ -71,7 +71,9 @@ impl IndexType { IndexType::Salmon(_) => KNOWN_CHEM_MAP_SALMON.contains_key(chem), IndexType::Piscem(_) => KNOWN_CHEM_MAP_PISCEM.contains_key(chem), IndexType::NoIndex => { - info!("Since we are dealing with an already-mapped RAD file, the user is responsible for ensuring that a valid chemistry definition was provided during mapping"); + info!( + "Since we are dealing with an already-mapped RAD file, the user is responsible for ensuring that a valid chemistry definition was provided during mapping" + ); KNOWN_CHEM_MAP_SALMON.contains_key(chem) || KNOWN_CHEM_MAP_PISCEM.contains_key(chem) } } @@ -81,7 +83,9 @@ impl IndexType { IndexType::Salmon(_) => KNOWN_CHEM_MAP_PISCEM.contains_key(chem), IndexType::Piscem(_) => KNOWN_CHEM_MAP_SALMON.contains_key(chem), IndexType::NoIndex => { - info!("Since we are dealing with an already-mapped RAD file, the user is responsible for ensuring that a valid chemistry definition was provided during mapping"); + info!( + "Since we are dealing with an already-mapped RAD file, the user is responsible for ensuring that a valid chemistry definition was provided during mapping" + ); !self.is_known_chem(chem) } } @@ -269,21 +273,29 @@ impl Chemistry { "10xv3-5p" => match index_type { IndexType::Piscem(_) => Chemistry::Rna(RnaChemistry::TenxV35P), IndexType::NoIndex => { - info!("The 10xv3-5p chemistry flag is designed only for the piscem index. Please make sure the RAD file you are provided was mapped using piscem; otherwise the fragment orientations may not be treated correctly"); + info!( + "The 10xv3-5p chemistry flag is designed only for the piscem index. 
Please make sure the RAD file you are provided was mapped using piscem; otherwise the fragment orientations may not be treated correctly" + ); Chemistry::Rna(RnaChemistry::TenxV35P) } IndexType::Salmon(_) => { - bail!("The 10xv3-5p chemistry flag is not suppored under the salmon mapper. Instead please use the 10xv3 chemistry (which will treat samples as single-end)."); + bail!( + "The 10xv3-5p chemistry flag is not suppored under the salmon mapper. Instead please use the 10xv3 chemistry (which will treat samples as single-end)." + ); } }, "10xv2-5p" => match index_type { IndexType::Piscem(_) => Chemistry::Rna(RnaChemistry::TenxV25P), IndexType::NoIndex => { - info!("The 10xv2-5p chemistry flag is designed only for the piscem index. Please make sure the RAD file you are provided was mapped using piscem; otherwise the fragment orientations may not be treated correctly"); + info!( + "The 10xv2-5p chemistry flag is designed only for the piscem index. Please make sure the RAD file you are provided was mapped using piscem; otherwise the fragment orientations may not be treated correctly" + ); Chemistry::Rna(RnaChemistry::TenxV25P) } IndexType::Salmon(_) => { - bail!("The 10xv2-5p chemistry flag is not suppored under the salmon mapper. Instead please use the 10xv2 chemistry (which will treat samples as single-end)."); + bail!( + "The 10xv2-5p chemistry flag is not suppored under the salmon mapper. Instead please use the 10xv2 chemistry (which will treat samples as single-end)." + ); } }, s => { @@ -306,7 +318,12 @@ impl Chemistry { // not supported using this mapper (i.e. if it is a piscem-specific chem // and the mapper is salmon or vice versa). if index_type.is_unsupported_known_chem(s) { - bail!("The chemistry {} is not supported by the given mapper {}. 
Please switch to {}, provide the explicit geometry, or add this chemistry to the registry with the \"chemistry add\" command.", s, index_type.as_str(), index_type.counterpart().as_str()); + bail!( + "The chemistry {} is not supported by the given mapper {}. Please switch to {}, provide the explicit geometry, or add this chemistry to the registry with the \"chemistry add\" command.", + s, + index_type.as_str(), + index_type.counterpart().as_str() + ); } Chemistry::Custom(CustomChemistry::simple_custom(s).with_context(|| { format!( @@ -417,7 +434,11 @@ pub fn validate_geometry(geo: &str) -> Result<()> { match fg { Ok(_fg) => Ok(()), Err(e) => { - bail!("Could not parse geometry {}. Please ensure that it is a valid geometry definition wrapped by quotes. The error message was: {:?}", geo, e); + bail!( + "Could not parse geometry {}. Please ensure that it is a valid geometry definition wrapped by quotes. The error message was: {:?}", + geo, + e + ); } } } @@ -429,13 +450,19 @@ pub fn validate_geometry(geo: &str) -> Result<()> { /// will return an error (as there is no corresponding `FragmentGeomDesc` in general). pub fn extract_geometry(geo: &str) -> Result { if let Some(builtin_kwd) = is_builtin(geo) { - bail!("The provided geometry is a builtin keyword [{}] (preceeded by \"__\") and so no attempt was made to parse it", builtin_kwd); + bail!( + "The provided geometry is a builtin keyword [{}] (preceeded by \"__\") and so no attempt was made to parse it", + builtin_kwd + ); } else { let fg = FragmentGeomDesc::try_from(geo); match fg { Ok(fg) => Ok(fg), Err(e) => { - error!("Could not parse geometry {}. Please ensure that it is a valid geometry definition wrapped by quotes. The error message was: {:?}", geo, e); + error!( + "Could not parse geometry {}. Please ensure that it is a valid geometry definition wrapped by quotes. 
The error message was: {:?}", + geo, e + ); Err(e) } } @@ -524,7 +551,10 @@ pub fn get_permit_if_absent(af_home: &Path, chem: &Chemistry) -> Result { - error!("No permit list is registered for {}, so one cannot be used automatically. You should either provide a permit list directly on the command line, or re-add this chemistry to the registry with a permit list.", chem.registry_key()); + error!( + "No permit list is registered for {}, so one cannot be used automatically. You should either provide a permit list directly on the command line, or re-add this chemistry to the registry with a permit list.", + chem.registry_key() + ); bail!("No registered permit list available"); } Some(Value::String(lpath)) => { @@ -533,11 +563,17 @@ pub fn get_permit_if_absent(af_home: &Path, chem: &Chemistry) -> Result { - error!("Expected a JSON string associated with the {} key, but didn't find one; cannot proceed.", LOCAL_PL_PATH_KEY); + error!( + "Expected a JSON string associated with the {} key, but didn't find one; cannot proceed.", + LOCAL_PL_PATH_KEY + ); bail!("Wrong JSON type"); } } @@ -568,12 +604,21 @@ Please consider removing and re-adding this chemistry with a valid permit list." 
.to_str() .ok_or(anyhow!("cannot convert expected filename to proper string"))?; if hash.to_string() != expected_hash { - warn!("The permit list file for {}, obtained from the provided remote url {}, does not match the expected hash {} (the observed hash was {})", chem.registry_key(), rpath, expected_hash, hash.to_string()); + warn!( + "The permit list file for {}, obtained from the provided remote url {}, does not match the expected hash {} (the observed hash was {})", + chem.registry_key(), + rpath, + expected_hash, + hash.to_string() + ); } Ok(PermitListResult::DownloadSuccessful(expected_file_path)) } _ => { - error!("Expected a JSON string associated with the {} key, but didn't find one; cannot proceed.", REMOTE_PL_URL_KEY); + error!( + "Expected a JSON string associated with the {} key, but didn't find one; cannot proceed.", + REMOTE_PL_URL_KEY + ); bail!("Wrong JSON type"); } } @@ -620,7 +665,9 @@ pub fn add_or_transform_fragment_library( ) -> Result { let known_chem = match mapper_type { MapperType::MappedRadFile => { - bail!("Cannot add_or_transform_fragment library when dealing with an already-mapped RAD file."); + bail!( + "Cannot add_or_transform_fragment library when dealing with an already-mapped RAD file." 
+ ); } MapperType::Piscem => KNOWN_CHEM_MAP_PISCEM.contains_key(fragment_geometry_str), MapperType::Salmon => KNOWN_CHEM_MAP_SALMON.contains_key(fragment_geometry_str), diff --git a/src/utils/chem_utils.rs b/src/utils/chem_utils.rs index 5893bbb..ef6790b 100644 --- a/src/utils/chem_utils.rs +++ b/src/utils/chem_utils.rs @@ -1,6 +1,6 @@ use crate::utils::af_utils::{extract_geometry, parse_resource_json_file, validate_geometry}; use crate::utils::constants::*; -use anyhow::{anyhow, Result}; +use anyhow::{Result, anyhow}; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::collections::HashMap; @@ -162,14 +162,14 @@ impl fmt::Display for CustomChemistry { writeln!(f, "{}\t: {}", REMOTE_PL_URL_KEY, remote_pl_url)?; } - if let Some(serde_json::Value::Object(meta)) = self.meta() { - if !meta.is_empty() { - writeln!(f, "meta\t: {{")?; - for (k, v) in meta.iter() { - writeln!(f, " {}\t: {:#}", k, v)?; - } - writeln!(f, "}}")?; + if let Some(serde_json::Value::Object(meta)) = self.meta() + && !meta.is_empty() + { + writeln!(f, "meta\t: {{")?; + for (k, v) in meta.iter() { + writeln!(f, " {}\t: {:#}", k, v)?; } + writeln!(f, "}}")?; } Ok(()) } diff --git a/src/utils/jrsonnet_main.rs b/src/utils/jrsonnet_main.rs index 316738b..81d70e3 100644 --- a/src/utils/jrsonnet_main.rs +++ b/src/utils/jrsonnet_main.rs @@ -2,13 +2,12 @@ // https://github.com/CertainLach/jrsonnet/blob/master/cmds/jrsonnet/src/main.rs use crate::utils::workflow_utils::JsonPatch; -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use clap::Parser; use jrsonnet_cli::{GcOpts, ManifestOpts, MiscOpts, OutputOpts, StdOpts, TlaOpts, TraceOpts}; use jrsonnet_evaluator::{ - apply_tla, + State, apply_tla, error::{Error as JrError, ErrorKind}, - State, }; use std::path::{Path, PathBuf}; @@ -98,7 +97,9 @@ pub fn parse_jsonnet( // get main.jsonnet file path let main_jsonnet_file_path = utils_dir.join("main.jsonnet"); if !main_jsonnet_file_path.exists() { - bail!("Could not find 
main.jsonnet file protocol-asturay; Please update it by invoking `simpleaf workflow refresh`") + bail!( + "Could not find main.jsonnet file protocol-asturay; Please update it by invoking `simpleaf workflow refresh`" + ) } let main_jsonnet_file_str = main_jsonnet_file_path.to_str().with_context(|| { format!( @@ -193,7 +194,8 @@ impl From for Error { fn main_catch(opts: Opts) -> anyhow::Result { let s = State::default(); let trace = opts.trace.trace_format(); - match main_real(&s, opts) { + let eval_result = main_real(&s, opts); + match eval_result { Ok(js) => Ok(js), Err(e) => { if let Error::Evaluation(e) = e { diff --git a/src/utils/prog_utils.rs b/src/utils/prog_utils.rs index 5528471..91f1d2d 100644 --- a/src/utils/prog_utils.rs +++ b/src/utils/prog_utils.rs @@ -1,7 +1,8 @@ -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; use cmd_lib::run_fun; use semver::{Version, VersionReq}; use serde::{Deserialize, Serialize}; +use std::collections::HashSet; use std::env; use std::ffi::{OsStr, OsString}; use std::path::{Path, PathBuf}; @@ -11,6 +12,8 @@ use tracing::{debug, error, info, warn}; use ureq::ResponseExt; use which::which; +use file_requirements::{FileRequirementBuildError, FileRequirementBuilder}; + // The below functions are taken from the [`execute`](https://crates.io/crates/execute) // crate. @@ -235,9 +238,12 @@ impl ReqProgs { if desired_ver.matches(¤t_ver) { // nothing to do here } else { - warn!("It is recommended to use piscem version {}, but currently version {} is being used. \ + warn!( + "It is recommended to use piscem version {}, but currently version {} is being used. 
\ Please consider installing the latest version of piscem and setting simpleaf to use this \ - new version by running the `refresh-prog-info` command.", &desired_ver, ¤t_ver); + new version by running the `refresh-prog-info` command.", + &desired_ver, ¤t_ver + ); } } } @@ -279,7 +285,10 @@ pub fn check_version_constraints_from_output>( let x = vs.split_whitespace(); if let Some(version) = x.last() { let ver = if version.split(".").count() > 3 { - warn!("version info {} is not a valid semver (more than 3 dotted version parts; looking only at the major, minor & patch versions).", version); + warn!( + "version info {} is not a valid semver (more than 3 dotted version parts; looking only at the major, minor & patch versions).", + version + ); version .split(".") .take(3) @@ -373,20 +382,20 @@ pub fn get_required_progs_from_paths( let opt_salmon = match salmon_exe { Some(p) => Some(p), - None => { - match get_which_executable("salmon") { - Ok(p) => Some(p), - Err(e) => match &opt_piscem { - None => { - return Err(e); - } - Some(_) => { - info!("could not find salmon executable, only piscem will be usable as a mapper."); - None - } - }, - } - } + None => match get_which_executable("salmon") { + Ok(p) => Some(p), + Err(e) => match &opt_piscem { + None => { + return Err(e); + } + Some(_) => { + info!( + "could not find salmon executable, only piscem will be usable as a mapper." 
+ ); + None + } + }, + }, }; // We should only get to this point if we have at least one of piscem and salmon, sanity @@ -408,15 +417,15 @@ pub fn get_required_progs_from_paths( let opt_macs = match macs_exe { Some(p) => Some(p), - None => { - match get_which_executable("macs3") { - Ok(p) => Some(p), - Err(_e) => { - info!("Could not find macs3 executable, peak calling cannot be peformed by simpleaf"); - None - } + None => match get_which_executable("macs3") { + Ok(p) => Some(p), + Err(_e) => { + info!( + "Could not find macs3 executable, peak calling cannot be peformed by simpleaf" + ); + None } - } + }, }; if let Some(piscem) = opt_piscem { @@ -472,34 +481,67 @@ pub fn get_required_progs() -> Result { get_required_progs_from_paths(salmon_exe, piscem_exe, alevin_fry_exe, macs_exe) } +/// Check that all files in `file_vec` exist. +/// +/// This maintains backwards compatibility for existing call sites by expressing +/// the check as a conjunction of file terms in the generic file-requirement +/// engine. pub fn check_files_exist(file_vec: &[PathBuf]) -> Result<()> { - let mut all_valid = true; + let mut builder = FileRequirementBuilder::new(); + let mut seen = HashSet::new(); for fb in file_vec { - let er = fb.as_path().try_exists(); - match er { - Ok(true) => { - // do nothing - } - Ok(false) => { - error!( - "Required input file at path {} was not found.", - fb.display() - ); - all_valid = false; - } - Err(e) => { - error!("{:#?}", e); - all_valid = false; - } + if !seen.insert(fb.clone()) { + continue; } + builder + .require_file(fb) + .map_err(map_file_requirement_build_err)?; } + builder + .build() + .check() + .map_err(|e| anyhow!("Required input files were missing; cannot proceed! {}", e)) +} - if !all_valid { - return Err(anyhow!( - "Required input files were missing; cannot proceed!" - )); - } - Ok(()) +/// Check that a piscem index base contains all required files for either supported +/// on-disk layout: +/// +/// 1. 
C++ layout: includes `.sshash` +/// 2. Rust layout: includes both `.ssi` and `.ssi.mphf` +/// +/// In both cases the common files must exist (`.ctab` and `.refinfo`). +pub fn check_piscem_index_files(index_base: &Path) -> Result<()> { + let mut builder = FileRequirementBuilder::new(); + builder + .require_all(|all| { + all.require_file(index_base.with_extension("ctab"))?; + all.require_file(index_base.with_extension("refinfo"))?; + Ok(()) + }) + .map_err(map_file_requirement_build_err)?; + builder + .require_any(|any| { + any.require_file(index_base.with_extension("sshash"))?; + any.require_all(|rust_all| { + rust_all.require_file(index_base.with_extension("ssi"))?; + rust_all.require_file(index_base.with_extension("ssi.mphf"))?; + Ok(()) + })?; + Ok(()) + }) + .map_err(map_file_requirement_build_err)?; + + builder.build().check().map_err(|e| { + anyhow!( + "Required piscem index files were missing or incomplete for base {}. {}", + index_base.display(), + e + ) + }) +} + +fn map_file_requirement_build_err(err: FileRequirementBuildError) -> anyhow::Error { + anyhow!("Invalid file requirement expression: {}", err) } pub fn read_json(json_path: &Path) -> anyhow::Result { @@ -525,3 +567,56 @@ pub fn inspect_af_home(af_home_path: &Path) -> anyhow::Result )), } } + +#[cfg(test)] +mod tests { + use super::{check_files_exist, check_piscem_index_files}; + use std::fs; + use tempfile::tempdir; + + #[test] + fn check_files_exist_ignores_duplicate_entries() { + let td = tempdir().expect("failed to create tempdir"); + let file = td.path().join("a.txt"); + fs::write(&file, "").expect("failed to write test file"); + let files = vec![file.clone(), file]; + check_files_exist(&files).expect("duplicate terms should not fail legacy file checks"); + } + + #[test] + fn check_piscem_index_files_accepts_cpp_layout() { + let td = tempdir().expect("failed to create tempdir"); + let base = td.path().join("piscem_idx"); + fs::write(base.with_extension("ctab"), "").expect("failed to write 
ctab"); + fs::write(base.with_extension("refinfo"), "").expect("failed to write refinfo"); + fs::write(base.with_extension("sshash"), "").expect("failed to write sshash"); + check_piscem_index_files(base.as_path()).expect("cpp index layout should be accepted"); + } + + #[test] + fn check_piscem_index_files_accepts_rust_layout() { + let td = tempdir().expect("failed to create tempdir"); + let base = td.path().join("piscem_idx"); + fs::write(base.with_extension("ctab"), "").expect("failed to write ctab"); + fs::write(base.with_extension("refinfo"), "").expect("failed to write refinfo"); + fs::write(base.with_extension("ssi"), "").expect("failed to write ssi"); + fs::write(base.with_extension("ssi.mphf"), "").expect("failed to write ssi.mphf"); + check_piscem_index_files(base.as_path()).expect("rust index layout should be accepted"); + } + + #[test] + fn check_piscem_index_files_rejects_partial_layout() { + let td = tempdir().expect("failed to create tempdir"); + let base = td.path().join("piscem_idx"); + fs::write(base.with_extension("ctab"), "").expect("failed to write ctab"); + fs::write(base.with_extension("refinfo"), "").expect("failed to write refinfo"); + fs::write(base.with_extension("ssi"), "").expect("failed to write ssi"); + let err = + check_piscem_index_files(base.as_path()).expect_err("partial rust layout must fail"); + assert!( + format!("{:#}", err).contains("Required piscem index files were missing or incomplete"), + "unexpected error: {:#}", + err + ); + } +} diff --git a/src/utils/workflow_utils.rs b/src/utils/workflow_utils.rs index 7be22e8..248a305 100644 --- a/src/utils/workflow_utils.rs +++ b/src/utils/workflow_utils.rs @@ -2,11 +2,12 @@ // allow multiple registry, just like conda envs // find a way to pull files from github directly instead of using local copy of protocol estuary -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use chrono::{DateTime, Local}; use clap::Parser; // use cmd_lib::run_cmd; -use 
serde_json::{json, Map, Value}; +use crate::core::io::write_json_pretty_atomic; +use serde_json::{Map, Value, json}; use std::boxed::Box; use std::collections::HashMap; use std::fs; @@ -33,6 +34,42 @@ pub enum WFCommand { ExternalCommand(std::process::Command), } +/// Structured errors returned when executing workflow commands. +#[derive(Debug, thiserror::Error)] +pub enum WorkflowExecutionError { + /// A `simpleaf` command in the workflow failed. + #[error("Workflow step {step} ({program}) failed while executing `{command}`: {source}")] + SimpleafStepFailed { + step: u64, + program: String, + command: String, + #[source] + source: anyhow::Error, + }, + /// An external workflow command exited with a non-zero status. + #[error( + "Workflow step {step} ({program}) failed executing external command `{command}` with exit status {status}. stderr: {stderr}" + )] + ExternalCommandNonZero { + step: u64, + program: String, + command: String, + status: String, + stderr: String, + }, + /// An external workflow command could not be launched. 
+ #[error( + "Workflow step {step} ({program}) could not launch external command `{command}`: {source}" + )] + ExternalCommandSpawnFailed { + step: u64, + program: String, + command: String, + #[source] + source: std::io::Error, + }, +} + #[allow(dead_code)] enum ColumnTypeTag { String, @@ -66,7 +103,7 @@ impl PatchCollection { self.patches.push(p); } - pub fn iter(&self) -> std::slice::Iter { + pub fn iter(&self) -> std::slice::Iter<'_, JsonPatch> { self.patches.iter() } } @@ -232,44 +269,56 @@ be certain you intend to do this.", } else { v.insert(json!(rec)); } - }, + } ColumnTypeTag::Number => { if let Ok(n) = rec.parse::() { v.insert(json!(n)); } else if let Ok(n) = rec.parse::() { v.insert(json!(n)); } else { - bail!("could not parse {}, which is expected to be a number, as such", rec); + bail!( + "could not parse {}, which is expected to be a number, as such", + rec + ); } - }, + } ColumnTypeTag::Boolean => { if let Ok(b) = rec.parse::() { v.insert(json!(b)); } else { - bail!("could not parse {}, which is expected to be boolean, as such", rec); + bail!( + "could not parse {}, which is expected to be boolean, as such", + rec + ); } - }, + } ColumnTypeTag::Array => { let no_pref = rec.strip_prefix('[').with_context( || format!("In record {}, array type must begin with [", rec))?; let no_suffix = no_pref.strip_suffix(']').with_context( || format!("In record {}, array type must end with ]", rec))?; let inner = no_suffix.trim(); - let rdr = csv::ReaderBuilder::new(). 
- has_headers(false).from_reader(inner.as_bytes()); + let rdr = csv::ReaderBuilder::new() + .has_headers(false) + .from_reader(inner.as_bytes()); let array_elems_result = rdr.into_records().next(); - if let Some(Ok(ok_array_elems)) = array_elems_result { - let array_elems = ok_array_elems.into_iter() - .map( |s| json!(s)).collect::>(); + if let Some(Ok(ok_array_elems)) = array_elems_result + { + let array_elems = ok_array_elems + .into_iter() + .map(|s| json!(s)) + .collect::>(); v.insert(serde_json::Value::Array(array_elems)); } - }, - _ => bail!("type not supported") + } + _ => bail!("type not supported"), } } } - }, - _ => bail!("encountered a path that led to a non-object entry; this shouldn't happen") + } + _ => bail!( + "encountered a path that led to a non-object entry; this shouldn't happen" + ), }; } else { bail!("Should never query a non-existent path!"); @@ -285,6 +334,123 @@ be certain you intend to do this.", Ok(patches) } +/// Validate workflow structure before planning or execution. +/// +/// This ensures command entries contain both `step` and `program_name`, +/// verifies expected types for core fields, and rejects unsupported +/// `simpleaf` command variants with actionable messages. 
+pub fn validate_manifest_structure(workflow: &Value) -> anyhow::Result<()> { + fn validate_node(node: &Value, path: &str) -> anyhow::Result<()> { + match node { + Value::Object(obj) => { + let has_step = obj.get(SystemFields::Step.as_str()).is_some(); + let has_program = obj.get(SystemFields::ProgramName.as_str()).is_some(); + if has_step ^ has_program { + bail!( + "Malformed workflow entry at {}: command records must contain both `step` and `program_name`.", + path + ); + } + + if has_step { + let step_val = obj.get(SystemFields::Step.as_str()).with_context(|| { + format!( + "Malformed workflow entry at {}: missing `step` value.", + path + ) + })?; + let step = step_val.as_u64().with_context(|| { + format!( + "Malformed workflow entry at {}: `step` must be a positive integer, found {:?}.", + path, step_val + ) + })?; + if step == 0 { + bail!( + "Malformed workflow entry at {}: `step` must be >= 1, found 0.", + path + ); + } + + if let Some(active_val) = obj.get(SystemFields::Active.as_str()) + && active_val.as_bool().is_none() + { + bail!( + "Malformed workflow entry at {}: `active` must be a boolean, found {:?}.", + path, + active_val + ); + } + + let program_name_val = obj + .get(SystemFields::ProgramName.as_str()) + .with_context(|| { + format!( + "Malformed workflow entry at {}: missing `program_name` value.", + path + ) + })?; + let program_name = program_name_val.as_str().with_context(|| { + format!( + "Malformed workflow entry at {}: `program_name` must be a string, found {:?}.", + path, program_name_val + ) + })?; + + if program_name.starts_with("simpleaf") + && !program_name.ends_with("index") + && !program_name.ends_with("quant") + { + bail!( + "Unsupported workflow command `{}` at {}. 
Only `simpleaf index` and `simpleaf quant` are currently supported.", + program_name, + path + ); + } + + if !program_name.starts_with("simpleaf") { + let args_val = obj + .get(SystemFields::ExternalArguments.as_str()) + .with_context(|| { + format!( + "Malformed external command at {}: missing `arguments` array.", + path + ) + })?; + if args_val.as_array().is_none() { + bail!( + "Malformed external command at {}: `arguments` must be an array, found {:?}.", + path, + args_val + ); + } + } + } + + // Recurse to validate nested workflow sections. + for (k, v) in obj { + let child_path = if path == "$" { + format!("$.{}", k) + } else { + format!("{}.{}", path, k) + }; + validate_node(v, &child_path)?; + } + Ok(()) + } + Value::Array(arr) => { + for (idx, v) in arr.iter().enumerate() { + validate_node(v, &format!("{}[{}]", path, idx))?; + } + Ok(()) + } + _ => Ok(()), + } + } + + validate_node(workflow, "$") +} + pub fn get_output_path(manifest: &serde_json::Value) -> anyhow::Result { // we assume that the path we want is /meta_info/output, and it *must* exist // as a key! @@ -292,7 +458,9 @@ pub fn get_output_path(manifest: &serde_json::Value) -> anyhow::Result match output { Value::String(s) => Ok(std::path::PathBuf::from(s)), _ => { - bail!("/meta_info/output must have JSON string type, int he manifest, but it did not.") + bail!( + "/meta_info/output must have JSON string type, int he manifest, but it did not." 
+ ) } } } else { @@ -366,16 +534,11 @@ pub fn duration_to_dhms(d: chrono::Duration) -> String { execution_elapsed_time } -/// This function updates the start_at variable -/// if --resume is provided.\ -/// It finds the workflow_info.json exported by -/// simpleaf workflow from the previous run and -/// grab the "Succeed" and "Execution Terminated Step" -/// fields.\ -/// If the previous run was succeed, then we report an error -/// saying nothing to resume -/// If Execution Terminated Step is a negative number, that -/// means there was no previous execution: +/// Compute the effective `start_at` for `--resume` runs from a prior workflow log. +/// +/// This reads `"Succeed"` and `"Latest Run"."Execution Terminated Step"` from the +/// previous `simpleaf_workflow_log.json` payload and returns the step to resume from. +/// If the previous run already succeeded, resuming is rejected. pub fn update_start_at(v: &Value) -> anyhow::Result { let latest_run = v.get("Latest Run").with_context(|| { "Could not get the `Latest Run` field from the `simpleaf_workflow_log.json`; Cannot proceed" @@ -383,17 +546,17 @@ pub fn update_start_at(v: &Value) -> anyhow::Result { // Check if the previous run was succeed. If yes, then no need to resume let succeed = v .get("Succeed") - .with_context(|| { - "Could not get `Execution Terminated Step` from the log file; Cannot resume." - })? + .with_context( + || "Could not get `Execution Terminated Step` from the log file; Cannot resume.", + )? .as_bool() .with_context(|| "cannot parse `Succeed` as bool; Cannot resume.")?; let start_at = latest_run .get("Execution Terminated Step") - .with_context(|| { - "Could not get `Execution Terminated Step` from the log file; Cannot resume." - })? + .with_context( + || "Could not get `Execution Terminated Step` from the log file; Cannot resume.", + )? 
.as_u64() .with_context(|| "cannot parse `Execution Terminated Step` as str; Cannot resume.")?; @@ -404,6 +567,9 @@ pub fn update_start_at(v: &Value) -> anyhow::Result { } } +/// Load the previous workflow metadata log used by `--resume`. +/// +/// The expected file is `simpleaf_workflow_log.json` in the provided output directory. pub fn get_previous_log>(output: T) -> anyhow::Result { // the path to the expected log file let exec_log_path = output.as_ref().join("simpleaf_workflow_log.json"); @@ -426,9 +592,9 @@ pub fn get_previous_log>(output: T) -> anyhow::Result { } Ok(false) => { bail!( - "Could not find `simpleaf_workflow_log.json` in the output directory {:?}; Cannot resume.", - output.as_ref() - ) + "Could not find `simpleaf_workflow_log.json` in the output directory {:?}; Cannot resume.", + output.as_ref() + ) } Err(e) => { bail!(e) @@ -436,6 +602,10 @@ pub fn get_previous_log>(output: T) -> anyhow::Result { } } +/// Execute a planned workflow queue and update the workflow log after each successful step. +/// +/// Commands are run in queue order (already sorted by step during planning). +/// On the first command failure, this writes a failed log state and returns the error. pub fn execute_commands_in_workflow>( simpleaf_workflow: SimpleafWorkflow, af_home_path: T, @@ -454,6 +624,7 @@ pub fn execute_commands_in_workflow>( match cr.cmd { WFCommand::SimpleafCommand(cmd) => { + let command_string = pn.to_string(); let exec_result = match *cmd { Commands::Index(index_opts) => { crate::indexing::build_ref_and_index(af_home_path.as_ref(), index_opts) @@ -462,12 +633,23 @@ pub fn execute_commands_in_workflow>( Commands::Quant(quant_opts) => { crate::quant::map_and_quant(af_home_path.as_ref(), quant_opts) } - _ => todo!(), + unsupported_cmd => { + bail!( + "Unsupported simpleaf workflow command variant: {:?}. 
Only `index` and `quant` are currently supported in workflow execution.", + unsupported_cmd + ) + } }; if let Err(e) = exec_result { workflow_log.write(false)?; info!("Execution terminated at {} command for step {}", pn, step); - return Err(e); + return Err(WorkflowExecutionError::SimpleafStepFailed { + step, + program: pn.to_string(), + command: command_string, + source: e, + } + .into()); } else { info!("Successfully ran {} command for step {}", pn, step); workflow_log.update(&cr.field_trajectory_vec[..])?; @@ -490,23 +672,29 @@ pub fn execute_commands_in_workflow>( workflow_log.update(&cr.field_trajectory_vec[..])?; } else { workflow_log.write(false)?; - let cmd_stderr = std::str::from_utf8(&cres.stderr[..])?; - let msg = format!("{} command at step {} failed to exit with code 0 under the shell.\n\ - The exit status was: {}.\n\ - The stderr of the invocation was: {}.", pn, step, cres.status, cmd_stderr); - warn!(msg); - bail!(msg); + let cmd_stderr = + String::from_utf8_lossy(&cres.stderr).trim().to_string(); + let err = WorkflowExecutionError::ExternalCommandNonZero { + step, + program: pn.to_string(), + command: cmd_string, + status: cres.status.to_string(), + stderr: cmd_stderr, + }; + warn!("{:#}", err); + return Err(err.into()); } } Err(e) => { workflow_log.write(false)?; - let msg = format!( - "{} command at step {} failed to execute under the shell.\n\ - The returned error was: {:?}.\n", - pn, step, e - ); - warn!(msg); - bail!(msg); + let err = WorkflowExecutionError::ExternalCommandSpawnFailed { + step, + program: pn.to_string(), + command: cmd_string, + source: e, + }; + warn!("{:#}", err); + return Err(err.into()); } // TODO: use this in the log somewhere. } // invoke external cmd } @@ -527,6 +715,9 @@ pub fn initialize_workflow>( skip_step: Vec, resume: bool, ) -> anyhow::Result<(SimpleafWorkflow, WorkflowLog)> { + // Validate manifest shape before planning command execution. 
+ validate_manifest_structure(&workflow_json_value)?; + // Instantiate a workflow log struct let mut wl = WorkflowLog::new( output.as_ref(), @@ -621,7 +812,7 @@ impl SimpleafWorkflow { .with_context(|| { format!( "Cannot parse step field value, {:?}, as an integer", - field.get(SystemFields::Step.as_str()).unwrap() + field.get(SystemFields::Step.as_str()) ) })?; @@ -633,7 +824,7 @@ impl SimpleafWorkflow { v.as_bool().with_context(|| { format!( "Cannot parse active field value, {:?}, as a boolean", - field.get(SystemFields::Active.as_str()).unwrap() + field.get(SystemFields::Active.as_str()) ) })? } else { @@ -644,37 +835,55 @@ impl SimpleafWorkflow { let cmd_field = workflow_log.get_mut_cmd_field(&curr_field_trajectory_vec)?; cmd_field[SystemFields::Active.as_str()] = json!(active); - // The field must contains a program_name - if let Some(program_name) = field.get(SystemFields::ProgramName.as_str()) { - pn = ProgramName::from_str(program_name.as_str().with_context(|| { - "Cannot create ProgramName struct from a program name" - })?); - // if active, then push to execution queue - if active { - info!("Parsing {} command for step {}", pn, step); - // The `step` will be used for sorting the cmd_queue vector. - // all commands must have a valid `step`. - let cmd = match pn.create_cmd(field) { - Ok(v) => v, - Err(e) => { - if pn.is_external() { - bail!("Could not parse external command {} for step {}. The error message was: {}", pn, step, e); - } else { - bail!("Could not parse simpleaf command {} for step {}. The error message was: {}", pn, step, e); - } + // The command record must contain a program_name field. 
+ let program_name = field + .get(SystemFields::ProgramName.as_str()) + .with_context(|| { + format!( + "Malformed workflow command at step {}: missing `program_name` field.", + step + ) + })?; + pn = + ProgramName::from_str(program_name.as_str().with_context( + || "Cannot create ProgramName struct from a program name", + )?); + + // if active, then push to execution queue + if active { + info!("Parsing {} command for step {}", pn, step); + // The `step` will be used for sorting the cmd_queue vector. + // all commands must have a valid `step`. + let cmd = match pn.create_cmd(field) { + Ok(v) => v, + Err(e) => { + if pn.is_external() { + bail!( + "Could not parse external command {} for step {}. The error message was: {}", + pn, + step, + e + ); + } else { + bail!( + "Could not parse simpleaf command {} for step {}. The error message was: {}", + pn, + step, + e + ); } - }; - cmd_queue.push(CommandRecord { - step, - active, - program_name: pn, - cmd, - field_trajectory_vec: curr_field_trajectory_vec, - }); - } else { - info!("Skipping {} command for step {}", pn, step); - } // if active - } // if have ProgramName + } + }; + cmd_queue.push(CommandRecord { + step, + active, + program_name: pn, + cmd, + field_trajectory_vec: curr_field_trajectory_vec, + }); + } else { + info!("Skipping {} command for step {}", pn, step); + } // if active } else { // If this is not a command record, we move to the next level // recursively calling this function on the current field. @@ -729,6 +938,7 @@ pub struct WorkflowLog { // this vector records all field names in the complete workflow. // This is used for locating the step for each command field_id_to_name: Vec, + field_name_to_id: HashMap, // TODO: the trajectory vector in each CommandRecord can be // move here as a long vector, and in each CommandRecord // we just need to record the start pos and the length of its trajectory. 
@@ -772,12 +982,12 @@ impl WorkflowLog { let workflow_name = template .as_ref() .file_stem() - .unwrap_or_else(|| { - panic!( + .with_context(|| { + format!( "Cannot parse file name of file {}", template.as_ref().display() ) - }) + })? .to_string_lossy() .into_owned(); @@ -804,6 +1014,7 @@ impl WorkflowLog { value: workflow_json_value.clone(), cmd_runtime_records: Map::new(), field_id_to_name: Vec::new(), + field_name_to_id: HashMap::new(), // cmds_field_id_trajectory: Vec::new() previous_log, }) @@ -816,13 +1027,8 @@ impl WorkflowLog { }); } - /// Write log to the output path. - /// 1. an execution log file includes the whole workflow, - /// in which succeffully invoked commands have - /// a negative `step` - /// 2. an info log file records runtime, workflow name, - /// output path etc. - pub fn write(&self, succeed: bool) -> anyhow::Result<()> { + /// Build the metadata payload written to `simpleaf_workflow_log.json`. + fn build_meta_info_payload(&self, succeed: bool) -> anyhow::Result { // initiate meta_info let workflow_meta_info = if let Some(workflow_meta_info) = &self.workflow_meta_info { workflow_meta_info.to_owned() @@ -868,7 +1074,7 @@ impl WorkflowLog { let d = Local::now().signed_duration_since(self.workflow_start_time); let execution_elapsed_time = duration_to_dhms(d); - let meta_info = json!( + Ok(json!( { "Workflow Name": self.workflow_name, "Workflow Meta Info": workflow_meta_info, @@ -883,28 +1089,25 @@ impl WorkflowLog { "Command Runtime by Step": Value::from(self.cmd_runtime_records.clone()), }, "Previous Runs": previous_runs - }); + })) + } - // execution log - std::fs::write( - self.meta_info_path.as_path(), - serde_json::to_string_pretty(&meta_info) - .with_context(|| ("Cannot convert json value to string."))?, - ) - .with_context(|| { + /// Write log to the output path. + /// 1. an execution log file includes the whole workflow, + /// in which succeffully invoked commands have + /// a negative `step` + /// 2. 
an info log file records runtime, workflow name, + /// output path etc. + pub fn write(&self, succeed: bool) -> anyhow::Result<()> { + let meta_info = self.build_meta_info_payload(succeed)?; + write_json_pretty_atomic(self.meta_info_path.as_path(), &meta_info).with_context(|| { format!( "could not write workflow meta info JSON file to {}", self.meta_info_path.display() ) })?; - // execution log - std::fs::write( - self.exec_log_path.as_path(), - serde_json::to_string_pretty(&self.value) - .with_context(|| "Could not convert json value to string.")?, - ) - .with_context(|| { + write_json_pretty_atomic(self.exec_log_path.as_path(), &self.value).with_context(|| { format!( "could not write complete simpleaf workflow JSON file to {}", self.exec_log_path.display() @@ -916,11 +1119,13 @@ impl WorkflowLog { /// Get the index corresponds to the field name in the field_id_to_name vector. pub fn get_field_id(&mut self, field_name: &String) -> usize { - if let Ok(pos) = self.field_id_to_name.binary_search(field_name) { - pos + if let Some(pos) = self.field_name_to_id.get(field_name) { + *pos } else { + let pos = self.field_id_to_name.len(); self.field_id_to_name.push(field_name.to_owned()); - self.field_id_to_name.len() - 1 + self.field_name_to_id.insert(field_name.to_owned(), pos); + pos } } @@ -943,7 +1148,7 @@ impl WorkflowLog { curr_field_name = self .field_id_to_name .get(*curr_field_id) - .expect("Cannot map field ID to name."); + .with_context(|| format!("Cannot map field ID {} to name.", curr_field_id))?; // let curr_pos = field_trajectory_vec.first().expect("Cannot get the first element"); curr_field = curr_field .get_mut(curr_field_name) @@ -1047,6 +1252,13 @@ impl ProgramName { matches!(self, &ProgramName::External(_)) } + /// Return true if an argument contains shell operators and therefore + /// requires shell interpretation. 
+ fn has_shell_metachar(arg: &str) -> bool { + arg.chars() + .any(|c| ['|', '&', ';', '<', '>', '$', '`', '(', ')'].contains(&c)) + } + /// Create a valid simpleaf command object using the arguments recoreded in the field. /// step and program name will be ignored in this procedure pub fn create_simpleaf_cmd(&self, value: &Value) -> anyhow::Result { @@ -1080,7 +1292,9 @@ impl ProgramName { } } } else { - warn!("Found an invalid root layer; Ignored. All root layers must represent a valid simpleaf command."); + warn!( + "Found an invalid root layer; Ignored. All root layers must represent a valid simpleaf command." + ); }; // check if empty @@ -1123,10 +1337,18 @@ impl ProgramName { })? ); } - // make Command struct for the command - let external_cmd = shell(arg_vec.join(" ")); + let requires_shell = arg_vec.iter().any(|s| Self::has_shell_metachar(s)); - Ok(WFCommand::ExternalCommand(external_cmd)) + if requires_shell { + // Preserve shell semantics for advanced command forms (e.g. pipes/redirection). + let external_cmd = shell(arg_vec.join(" ")); + Ok(WFCommand::ExternalCommand(external_cmd)) + } else { + // Prefer direct argv invocation for robust argument handling. 
+ let mut external_cmd = std::process::Command::new(&arg_vec[0]); + external_cmd.args(&arg_vec[1..]); + Ok(WFCommand::ExternalCommand(external_cmd)) + } } pub fn create_cmd(&self, value: &Value) -> anyhow::Result { @@ -1360,19 +1582,22 @@ mod tests { use super::ProgramName; use super::WFCommand; use crate::utils::workflow_utils::SystemFields; - use serde_json::{json, Map, Value}; + use serde_json::{Map, Value, json}; // use crate::Cli; // use crate::Commands; // use crate::SimpleafCmdRecord; use crate::{ + Commands, utils::{ prog_utils::{get_cmd_line_string, shell}, - workflow_utils::{initialize_workflow, WorkflowLog}, + workflow_utils::{ + CommandRecord, SimpleafWorkflow, WorkflowLog, execute_commands_in_workflow, + initialize_workflow, validate_manifest_structure, + }, }, - Commands, }; - use core::panic; - use std::path::PathBuf; + use std::path::{Path, PathBuf}; + use tempfile::tempdir; #[test] fn test_workflow_command() { @@ -1482,69 +1707,70 @@ mod tests { ) .unwrap(); - match &wl { - WorkflowLog { - meta_info_path, + let WorkflowLog { + meta_info_path, + exec_log_path, + workflow_start_time: _, + command_runtime, + num_succ, + start_at, + workflow_name, + workflow_meta_info, + value, + cmd_runtime_records, + field_id_to_name, + field_name_to_id, + skip_step, + previous_log: _, + } = &wl; + { + // test wl + // check JSON log output json + assert_eq!( exec_log_path, - workflow_start_time: _, - command_runtime, - num_succ, - start_at, - workflow_name, - workflow_meta_info, - value, - cmd_runtime_records, - field_id_to_name, - skip_step, - previous_log: _, - } => { - // test wl - // check JSON log output json - assert_eq!( - exec_log_path, - &PathBuf::from("output_dir/workflow_execution_log.json") - ); - assert_eq!( - meta_info_path, - &PathBuf::from("output_dir/simpleaf_workflow_log.json") - ); + &PathBuf::from("output_dir/workflow_execution_log.json") + ); + assert_eq!( + meta_info_path, + &PathBuf::from("output_dir/simpleaf_workflow_log.json") + ); - 
assert_eq!(workflow_name, &String::from("fake_config")); + assert_eq!(workflow_name, &String::from("fake_config")); - assert_eq!( - workflow_meta_info, - &Some( - workflow_json_value - .get(SystemFields::MetaInfo.as_str()) - .unwrap() - .to_owned() - ) - ); + assert_eq!( + workflow_meta_info, + &Some( + workflow_json_value + .get(SystemFields::MetaInfo.as_str()) + .unwrap() + .to_owned() + ) + ); - let mut new_value = value.to_owned(); - new_value["rna"]["simpleaf_index"]["active"] = json!(true); - new_value["external-commands"]["HTO ref gunzip"]["active"] = json!(true); + let mut new_value = value.to_owned(); + new_value["rna"]["simpleaf_index"]["active"] = json!(true); + new_value["external-commands"]["HTO ref gunzip"]["active"] = json!(true); - assert_eq!(new_value, workflow_json_value); + assert_eq!(new_value, workflow_json_value); - assert_eq!(cmd_runtime_records, &Map::new()); + assert_eq!(cmd_runtime_records, &Map::new()); - assert_eq!(start_at, &2u64); - assert_eq!(skip_step, &vec![3]); - assert!( - field_id_to_name.contains(&"rna".to_string()) - && field_id_to_name.contains(&"meta_info".to_string()) - && field_id_to_name.contains(&"simpleaf_index".to_string()) - && field_id_to_name.contains(&"simpleaf_quant".to_string()) - && field_id_to_name.contains(&"external-commands".to_string()) - && field_id_to_name.contains(&"HTO ref gunzip".to_string()) - && field_id_to_name.contains(&"ADT ref gunzip".to_string()) - ); + assert_eq!(start_at, &2u64); + assert_eq!(skip_step, &vec![3]); + assert!( + field_id_to_name.contains(&"rna".to_string()) + && field_id_to_name.contains(&"meta_info".to_string()) + && field_id_to_name.contains(&"simpleaf_index".to_string()) + && field_id_to_name.contains(&"simpleaf_quant".to_string()) + && field_id_to_name.contains(&"external-commands".to_string()) + && field_id_to_name.contains(&"HTO ref gunzip".to_string()) + && field_id_to_name.contains(&"ADT ref gunzip".to_string()) + ); + assert_eq!(field_id_to_name.len(), 
field_name_to_id.len()); - assert!(command_runtime.is_none()); + assert!(command_runtime.is_none()); - assert_eq!(num_succ, &0); - } + assert_eq!(num_succ, &0); } // we started at 2, and skipped 3. So there are two commands @@ -1656,4 +1882,269 @@ mod tests { e => panic!("expected SimpleafCommand, found {:?}", e), }; } + + #[test] + fn test_get_field_id_is_stable_for_unsorted_insertions() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let id_z_1 = wl.get_field_id(&"z_field".to_string()); + let _id_a = wl.get_field_id(&"a_field".to_string()); + let id_z_2 = wl.get_field_id(&"z_field".to_string()); + + assert_eq!(id_z_1, id_z_2); + } + + #[test] + fn test_execute_commands_unsupported_simpleaf_command_returns_error() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let sw = SimpleafWorkflow { + af_home_path: tmp.path().to_path_buf(), + cmd_queue: vec![CommandRecord { + step: 1, + active: true, + program_name: ProgramName::Index, + cmd: WFCommand::SimpleafCommand(Box::new(Commands::Inspect {})), + field_trajectory_vec: vec![], + }], + }; + + let err = execute_commands_in_workflow(sw, tmp.path(), &mut wl).unwrap_err(); + assert!( + format!("{:#}", err).contains("Unsupported simpleaf workflow command variant"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn test_validate_manifest_structure_rejects_missing_program_name() { + let manifest = json!({ + "external": { + "broken": { + "step": 1, + "arguments": ["-l"] + } + } + }); + + let err = validate_manifest_structure(&manifest).unwrap_err(); + assert!( + format!("{:#}", err).contains("must contain both `step` and `program_name`"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn 
test_validate_manifest_structure_rejects_unsupported_simpleaf_command() { + let manifest = json!({ + "bad_simpleaf": { + "step": 1, + "program_name": "simpleaf inspect", + "arguments": [] + } + }); + + let err = validate_manifest_structure(&manifest).unwrap_err(); + assert!( + format!("{:#}", err) + .contains("Only `simpleaf index` and `simpleaf quant` are currently supported"), + "unexpected error: {:#}", + err + ); + } + + #[test] + fn test_initialize_workflow_resume_uses_previous_terminated_step() { + let tmp = tempdir().unwrap(); + let previous_log = json!({ + "Succeed": false, + "Latest Run": { + "Execution Terminated Step": 2 + } + }); + std::fs::write( + tmp.path().join("simpleaf_workflow_log.json"), + serde_json::to_string_pretty(&previous_log).unwrap(), + ) + .unwrap(); + + let manifest = json!({ + "external": { + "step1": { + "step": 1, + "program_name": "echo", + "arguments": ["one"] + }, + "step2": { + "step": 2, + "program_name": "echo", + "arguments": ["two"] + } + } + }); + + let (sw, wl) = initialize_workflow( + tmp.path(), + Path::new("resume_template.jsonnet"), + tmp.path(), + manifest, + 1, + vec![], + true, + ) + .unwrap(); + + assert_eq!(wl.start_at, 2); + assert_eq!(sw.cmd_queue.len(), 1); + assert_eq!(sw.cmd_queue[0].step, 2); + } + + #[test] + fn test_initialize_workflow_skip_step_excludes_skipped_command() { + let tmp = tempdir().unwrap(); + let manifest = json!({ + "external": { + "step1": { + "step": 1, + "program_name": "echo", + "arguments": ["one"] + }, + "step2": { + "step": 2, + "program_name": "echo", + "arguments": ["two"] + } + } + }); + + let (sw, wl) = initialize_workflow( + tmp.path(), + Path::new("skip_template.jsonnet"), + tmp.path(), + manifest, + 1, + vec![1], + false, + ) + .unwrap(); + + assert_eq!(wl.skip_step, vec![1]); + assert_eq!(sw.cmd_queue.len(), 1); + assert_eq!(sw.cmd_queue[0].step, 2); + } + + #[test] + fn test_external_command_builder_uses_shell_only_when_required() { + let direct = 
ProgramName::from_str("echo") + .create_external_cmd(&json!({"arguments": ["hello"]})) + .unwrap(); + let shell_required = ProgramName::from_str("echo") + .create_external_cmd(&json!({"arguments": ["hello", ">", "out.txt"]})) + .unwrap(); + + match direct { + WFCommand::ExternalCommand(cmd) => { + let cmd_line = get_cmd_line_string(&cmd); + assert!( + !cmd_line.contains(" -c "), + "expected direct argv invocation, saw: {}", + cmd_line + ); + } + _ => panic!("expected external command for direct argv case"), + } + + match shell_required { + WFCommand::ExternalCommand(cmd) => { + let cmd_line = get_cmd_line_string(&cmd); + assert!( + cmd_line.contains(" -c "), + "expected shell invocation for redirection case, saw: {}", + cmd_line + ); + } + _ => panic!("expected external command for shell case"), + } + } + + #[test] + fn test_execute_commands_external_failure_reports_status_and_stderr() { + let tmp = tempdir().unwrap(); + let mut wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + &json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + let sw = SimpleafWorkflow { + af_home_path: tmp.path().to_path_buf(), + cmd_queue: vec![CommandRecord { + step: 1, + active: true, + program_name: ProgramName::External("sh".to_string()), + cmd: WFCommand::ExternalCommand(shell("echo workflow_phase6_err 1>&2; exit 7")), + field_trajectory_vec: vec![], + }], + }; + + let err = execute_commands_in_workflow(sw, tmp.path(), &mut wl).unwrap_err(); + let rendered = format!("{:#}", err); + assert!( + rendered.contains("failed executing external command"), + "unexpected error: {}", + rendered + ); + assert!( + rendered.contains("exit status"), + "unexpected error: {}", + rendered + ); + assert!( + rendered.contains("workflow_phase6_err"), + "unexpected error: {}", + rendered + ); + } + + #[test] + fn test_workflow_log_write_creates_both_json_logs() { + let tmp = tempdir().unwrap(); + let wl = WorkflowLog::new( + tmp.path(), + Path::new("fake_template.jsonnet"), + 
&json!({}), + 1, + vec![], + false, + ) + .unwrap(); + + wl.write(true).unwrap(); + assert!(tmp.path().join("simpleaf_workflow_log.json").is_file()); + assert!(tmp.path().join("workflow_execution_log.json").is_file()); + } } diff --git a/tests/cli_help_snapshots.rs b/tests/cli_help_snapshots.rs new file mode 100644 index 0000000..6d0c5e3 --- /dev/null +++ b/tests/cli_help_snapshots.rs @@ -0,0 +1,116 @@ +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command; + +use tempfile::tempdir; + +fn snapshot_cases() -> Vec<(&'static str, Vec<&'static str>)> { + vec![ + ("simpleaf___help.txt", vec!["--help"]), + ("simpleaf_index___help.txt", vec!["index", "--help"]), + ("simpleaf_quant___help.txt", vec!["quant", "--help"]), + ("simpleaf_chemistry___help.txt", vec!["chemistry", "--help"]), + ( + "simpleaf_chemistry_add___help.txt", + vec!["chemistry", "add", "--help"], + ), + ( + "simpleaf_chemistry_remove___help.txt", + vec!["chemistry", "remove", "--help"], + ), + ( + "simpleaf_chemistry_clean___help.txt", + vec!["chemistry", "clean", "--help"], + ), + ( + "simpleaf_chemistry_lookup___help.txt", + vec!["chemistry", "lookup", "--help"], + ), + ( + "simpleaf_chemistry_refresh___help.txt", + vec!["chemistry", "refresh", "--help"], + ), + ( + "simpleaf_chemistry_fetch___help.txt", + vec!["chemistry", "fetch", "--help"], + ), + ("simpleaf_inspect___help.txt", vec!["inspect", "--help"]), + ("simpleaf_set_paths___help.txt", vec!["set-paths", "--help"]), + ( + "simpleaf_refresh_prog_info___help.txt", + vec!["refresh-prog-info", "--help"], + ), + ("simpleaf_workflow___help.txt", vec!["workflow", "--help"]), + ( + "simpleaf_workflow_run___help.txt", + vec!["workflow", "run", "--help"], + ), + ( + "simpleaf_workflow_get___help.txt", + vec!["workflow", "get", "--help"], + ), + ( + "simpleaf_workflow_patch___help.txt", + vec!["workflow", "patch", "--help"], + ), + ( + "simpleaf_workflow_list___help.txt", + vec!["workflow", "list", "--help"], + ), + ( + 
"simpleaf_workflow_refresh___help.txt", + vec!["workflow", "refresh", "--help"], + ), + ("simpleaf_atac___help.txt", vec!["atac", "--help"]), + ( + "simpleaf_atac_index___help.txt", + vec!["atac", "index", "--help"], + ), + ( + "simpleaf_atac_process___help.txt", + vec!["atac", "process", "--help"], + ), + ] +} + +fn snapshots_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("snapshots") + .join("cli-help") +} + +#[test] +fn cli_help_outputs_match_snapshots() { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("unable to create temp ALEVIN_FRY_HOME"); + + for (snapshot_file, args) in snapshot_cases() { + let output = Command::new(binary) + .args(&args) + .env("ALEVIN_FRY_HOME", af_home.path()) + .env("COLUMNS", "100") + .output() + .expect("failed to execute simpleaf"); + + assert!( + output.status.success(), + "expected success for args {:?}, got status {:?}\nstderr:\n{}", + &args, + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + + let actual = String::from_utf8(output.stdout).expect("stdout was not valid UTF-8"); + let expected_path = snapshots_dir().join(snapshot_file); + let expected = fs::read_to_string(&expected_path).unwrap_or_else(|e| { + panic!("failed reading snapshot {}: {}", expected_path.display(), e) + }); + + assert_eq!( + actual, expected, + "help output drifted for args {:?} against snapshot {}", + &args, snapshot_file + ); + } +} diff --git a/tests/cli_smoke.rs b/tests/cli_smoke.rs new file mode 100644 index 0000000..ca465b8 --- /dev/null +++ b/tests/cli_smoke.rs @@ -0,0 +1,67 @@ +use std::process::{Command, Output}; + +use tempfile::tempdir; + +fn run_simpleaf(args: &[&str]) -> Output { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("unable to create temp ALEVIN_FRY_HOME"); + let mut cmd = Command::new(binary); + cmd.args(args); + cmd.env("ALEVIN_FRY_HOME", af_home.path()); + cmd.output().expect("failed to execute simpleaf") 
+} + +fn assert_parse_error(args: &[&str]) { + let output = run_simpleaf(args); + assert!( + !output.status.success(), + "expected parse error for args {:?}", + args + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Usage:"), + "expected clap usage in stderr for {:?}, got:\n{}", + args, + stderr + ); +} + +#[test] +fn quant_requires_input_type_group() { + assert_parse_error(&[ + "quant", "-c", "10xv3", "-o", "/tmp/out", "-r", "cr-like", "--knee", + ]); +} + +#[test] +fn quant_rejects_conflicting_map_inputs() { + assert_parse_error(&[ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "--map-dir", + "/tmp/mapped", + "--index", + "/tmp/index", + "-1", + "r1.fastq", + "-2", + "r2.fastq", + ]); +} + +#[test] +fn index_requires_gtf_when_fasta_is_provided() { + assert_parse_error(&["index", "-f", "genome.fa", "-o", "/tmp/index_out"]); +} + +#[test] +fn atac_command_requires_subcommand() { + assert_parse_error(&["atac"]); +} diff --git a/tests/phase1_regressions.rs b/tests/phase1_regressions.rs new file mode 100644 index 0000000..58bce8e --- /dev/null +++ b/tests/phase1_regressions.rs @@ -0,0 +1,59 @@ +use std::fs; +use std::process::Command; + +use serde_json::json; +use tempfile::tempdir; + +#[test] +fn salmon_missing_error_mentions_salmon_executable() { + let binary = env!("CARGO_BIN_EXE_simpleaf"); + let af_home = tempdir().expect("failed to create temp af home"); + let t2g = af_home.path().join("t2g.tsv"); + fs::write(&t2g, "tx1\tgene1\n").expect("failed to write t2g"); + + let af_info = json!({ + "prog_info": { + "salmon": null, + "piscem": null, + "alevin_fry": {"exe_path": "/bin/echo", "version": "0.11.2"}, + "macs": null + } + }); + fs::write( + af_home.path().join("simpleaf_info.json"), + serde_json::to_string_pretty(&af_info).expect("failed to serialize af info"), + ) + .expect("failed to write simpleaf_info.json"); + + let output = Command::new(binary) + .env("ALEVIN_FRY_HOME", 
af_home.path()) + .args([ + "quant", + "-c", + "10xv3", + "-o", + "/tmp/out", + "-r", + "cr-like", + "--knee", + "-i", + "/tmp/fake_index", + "--no-piscem", + "-1", + "r1.fastq", + "-2", + "r2.fastq", + "-m", + &t2g.to_string_lossy(), + ]) + .output() + .expect("failed to run simpleaf"); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("no salmon executable is provided"), + "stderr did not mention salmon executable:\n{}", + stderr + ); +} diff --git a/tests/snapshots/cli-help/simpleaf___help.txt b/tests/snapshots/cli-help/simpleaf___help.txt new file mode 100644 index 0000000..d722166 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf___help.txt @@ -0,0 +1,18 @@ +A rust framework to make using alevin-fry and alevin-fry-ATAC even simpler. + +Usage: simpleaf + +Commands: + index build the (expanded) reference index + chemistry operate on or inspect the chemistry registry + inspect inspect the current configuration + quant quantify a sample + set-paths set paths to the programs that simpleaf will use + refresh-prog-info refreshes version information associated with programs used by simpleaf + atac run a sub-command dealing with atac-seq data + workflow simpleaf workflow related command set + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_atac___help.txt b/tests/snapshots/cli-help/simpleaf_atac___help.txt new file mode 100644 index 0000000..b0bb92f --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac___help.txt @@ -0,0 +1,13 @@ +run a sub-command dealing with atac-seq data + +Usage: simpleaf atac + +Commands: + index build a piscem index over the genome for scATAC-seq mapping + process process a scATAC-seq sample by performing mapping, barcode correction, and sorted + (deduplicated) BED file generation + help Print this message or the help of the given 
subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_atac_index___help.txt b/tests/snapshots/cli-help/simpleaf_atac_index___help.txt new file mode 100644 index 0000000..e2a9344 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac_index___help.txt @@ -0,0 +1,26 @@ +build a piscem index over the genome for scATAC-seq mapping + +Usage: simpleaf atac index [OPTIONS] --input --output + +Options: + -i, --input target sequences (provide target sequences directly; avoid expanded + reference construction) + -o, --output path to output directory (will be created if it doesn't exist) + -t, --threads number of threads to use when running [default: 16] + --work-dir working directory where temporary files should be placed [default: + ./workdir.noindex] + --seed seed value to use in SSHash index construction (try changing this in + the rare event index build fails) [default: 1] + --overwrite overwrite existing files if the output directory is already populated + -h, --help Print help + -V, --version Print version + +Index Configuration Options: + -k, --kmer-length + the value of k to be used to construct the index [default: 25] + -m, --minimizer-length + the value of m to be used to construct the piscem index (must be < k) [default: 17] + +Piscem Index Options: + --decoy-paths path to (optional) decoy sequence used to insert poison k-mer + information into the index (only if using piscem >= 0.7) diff --git a/tests/snapshots/cli-help/simpleaf_atac_process___help.txt b/tests/snapshots/cli-help/simpleaf_atac_process___help.txt new file mode 100644 index 0000000..6399d38 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_atac_process___help.txt @@ -0,0 +1,86 @@ +process a scATAC-seq sample by performing mapping, barcode correction, and sorted (deduplicated) BED +file generation + +Usage: simpleaf atac process [OPTIONS] --index --barcode-reads --chemistry --output + +Options: + -c, --chemistry chemistry 
[possible values: 10x-v1, 10x-v2, 10x-multi] + -t, --threads number of threads to use when running [default: 16] + --output + --call-peaks do peak calling after generating the bed file + -h, --help Print help + -V, --version Print version + +Mapping Options: + -i, --index + path to index + -1, --reads1 + comma-separated list of paths to read 1 files + -2, --reads2 + comma-separated list of paths to read 2 files + -r, --reads + path to the read files containing single-end reads + -b, --barcode-reads + path to the read files containing the cell barcodes + --barcode-length + the length of the barcode read from which to extract the barcode (usually this is the + length of the entire read, and reads shorter than this will be discarded) [default: 16] + +Permit List Generation Options: + --permit-barcode-ori + The expected orientation of the barcodes in the permit list relative to the barcodes + extracted from the reads. If this is "fw", it is expected that the sequences will match + directly, if "rc" it is expected the reverse complement of the permit-list sequence will + match the reads' barcodes. If provided, this value will be used, if not provided, simpleaf + will attempt to look up the appropriate orientation in the chemistry registry + -u, --unfiltered-pl + Use the provided file as the unfiltered permit list (i.e. whitelist). 
This argument only + needs to be provided if you are providing the permit list explicitly, overriding the + default permit list for the provided chemistry + --min-reads + minimum read count threshold for a cell to be retained/processed; only used with + --unfiltered-pl [default: 10] + +Advanced Options: + --compress + compress the output mapping bed file + --ignore-ambig-hits + skip checking of the equivalence classes of k-mers that were too ambiguous to be otherwise + considered (passing this flag can speed up mapping slightly, but may reduce specificity) + --no-poison + do not consider poison k-mers, even if the underlying index contains them. In this case, + the mapping results will be identical to those obtained as if no poison table was added to + the index + --use-chr + use chromosomes as color + --thr + threshold to be considered for pseudoalignment [default: 0.7] + --bin-size + size of virtual color intervals [default: 1000] + --bin-overlap + size for virtual color interval overlap [default: 300] + --no-tn5-shift + do not apply Tn5 shift to mapped positions + --check-kmer-orphan + Check if any mapping kmer exist for a mate which is not mapped, but there exists mapping + for the other read. If set to true and a mapping kmer exists, then the pair would not be + mapped + --max-ec-card + determines the maximum cardinality equivalence class (number of (txp, orientation status) + pairs) to examine (cannot be used with --ignore-ambig-hits) [default: 4096] + --max-hit-occ + in the first pass, consider only k-mers having <= --max-hit-occ hits [default: 256] + --max-hit-occ-recover + if all k-mers have > --max-hit-occ hits, then make a second pass and consider k-mers + having <= --max-hit-occ-recover hits [default: 1024] + --max-read-occ + reads with more than this number of mappings will not have their mappings reported + [default: 2500] + +Peak Caller Options: + --gsize The value to be passed to the `macs3` `--gsize` (genome size) option. 
+ Possible values are "hs", "mm", "ce", "dm" or an unsigned integer + [default: hs] + --qvalue The value to be passed to the `macs3` `--qvalue` (minimum FDR cutoff) + option [default: 0.1] + --extsize The value to be passed to the `macs3` `--extsize` option [default: 50] diff --git a/tests/snapshots/cli-help/simpleaf_chemistry___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry___help.txt new file mode 100644 index 0000000..b0c9a42 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry___help.txt @@ -0,0 +1,16 @@ +operate on or inspect the chemistry registry + +Usage: simpleaf chemistry + +Commands: + refresh Update the local chemistry registry according to the upstream repository + add Add a new or update an existing chemistry in the local registry + remove Remove chemistries from the local chemistry registry + clean Remove cached permit list files that do not belong to any registered chemistries + lookup Look up chemistries in the local registry and print the details + fetch Download the permit list files for registered chemistries + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt new file mode 100644 index 0000000..e2c6824 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_add___help.txt @@ -0,0 +1,28 @@ +Add a new or update an existing chemistry in the local registry + +Usage: simpleaf chemistry add [OPTIONS] --name + +Options: + -n, --name The name to give to the chemistry + -g, --geometry A quoted string representing the geometry to which the + chemistry maps + -e, --expected-ori The direction of the first (most upstream) mappable biological + sequence [possible values: fw, rc, both] + --local-url The (fully-qualified) path to a local permit list file that + will be copied into the ALEVIN_FRY_HOME directory for future + 
use + --remote-url The url of a remote file that will be downloaded (on demand) to + provide a permit list for use with this chemistry. This file + should be obtainable with the equivalent of `wget `. + The file will only be downloaded the first time it is needed + and will be locally cached in ALEVIN_FRY_HOME after that + --version A semver format version tag, e.g., `0.1.0`, indicating the + version of the chemistry definition. To update a registered + chemistry, please provide a higher version number, e.g., + `0.2.0` [default: 0.0.0] + --from-json Instead of providing the chemistry directly on the command + line, use the chemistry definition provided in the provided + JSON file. This JSON file can be local or remote, but it must + contain a valid JSON object with the provided `--name` as the + key of the chemistry you wish to add + -h, --help Print help diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt new file mode 100644 index 0000000..126a384 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_clean___help.txt @@ -0,0 +1,8 @@ +Remove cached permit list files that do not belong to any registered chemistries + +Usage: simpleaf chemistry clean [OPTIONS] + +Options: + -d, --dry-run Print the permit list file(s) that will be removed without removing them + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt new file mode 100644 index 0000000..5cff394 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_fetch___help.txt @@ -0,0 +1,11 @@ +Download the permit list files for registered chemistries + +Usage: simpleaf chemistry fetch [OPTIONS] --name + +Options: + -n, --name A comma-separated list of chemistry names to fetch (or a *single* regex pattern + for matching multiple chemistries). 
Use '.*' to fetch for all registered + chemistries + -d, --dry-run Print the permit list file(s) that will be downloaded without downloading them + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt new file mode 100644 index 0000000..9bbd2fa --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_lookup___help.txt @@ -0,0 +1,9 @@ +Look up chemistries in the local registry and print the details + +Usage: simpleaf chemistry lookup --name + +Options: + -n, --name The name of a registered chemistry, or a regex pattern for matching registered + chemistries' names + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt new file mode 100644 index 0000000..ef471ac --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_refresh___help.txt @@ -0,0 +1,9 @@ +Update the local chemistry registry according to the upstream repository + +Usage: simpleaf chemistry refresh [OPTIONS] + +Options: + -f, --force overwrite existing chemistries even if the versions aren't newer + -d, --dry-run print the chemistries that will be added or updated without modifying the local + registry + -h, --help Print help diff --git a/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt b/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt new file mode 100644 index 0000000..b704273 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_chemistry_remove___help.txt @@ -0,0 +1,10 @@ +Remove chemistries from the local chemistry registry + +Usage: simpleaf chemistry remove [OPTIONS] --name + +Options: + -n, --name A chemistry name or a regex pattern matching the names of chemistries in the + registry to remove + -d, --dry-run Print the chemistries that would be removed without removing them + -h, 
--help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_index___help.txt b/tests/snapshots/cli-help/simpleaf_index___help.txt new file mode 100644 index 0000000..29c9d44 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_index___help.txt @@ -0,0 +1,61 @@ +build the (expanded) reference index + +Usage: simpleaf index [OPTIONS] --output <--fasta |--ref-seq |--probe-csv |--feature-csv > + +Options: + -o, --output Path to output directory (will be created if it doesn't exist) + -t, --threads Number of threads to use when running [default: 16] + -k, --kmer-length The value of k to be used to construct the index [default: 31] + --gff3-format Denotes that the input annotation is a GFF3 (instead of GTF) file + --keep-duplicates Keep duplicated identical sequences when constructing the index + --overwrite Overwrite existing files if the output directory is already + populated + -h, --help Print help + -V, --version Print version + +Expanded Reference Options: + --ref-type Specify whether an expanded reference, spliced+intronic (or splici) + or spliced+unspliced (or spliceu), should be built [default: + spliced+intronic] + -f, --fasta Path to a reference genome to be used for the expanded reference + construction + -g, --gtf Path to a reference GTF/GFF3 file to be used for the expanded + reference construction + -r, --rlen The Read length used in roers to add flanking lengths to intronic + sequences + --dedup Deduplicate identical sequences in roers when building the expanded + reference + --spliced Path to a FASTA file with extra spliced sequence to add to the index + --unspliced Path to a FASTA file with extra unspliced sequence to add to the + index + +Direct Reference Options: + --feature-csv Path to a CSV file containing feature barcode sequences to use + for direct reference indexing. The file must follow the format of + 10x Feature Reference CSV. 
Currently, only three columns are + used: id, name, and sequence + --probe-csv Path to a CSV file containing probe sequences to use for direct + reference indexing. The file must follow the format of 10x Probe + Set Reference v2 CSV, containing four mandatory columns: gene_id, + probe_seq, probe_id, and included (TRUE or FALSE), and an + optional column: region (spliced or unspliced) + --ref-seq Path to a FASTA file containing reference sequences to directly + build index on, and avoid expanded reference construction + +Piscem Index Options: + -m, --minimizer-length + Minimizer length to be used to construct the piscem index (must be < k) [default: 19] + --decoy-paths + Paths to decoy sequence FASTA files used to insert poison k-mer information into the index + (only if using piscem >= 0.7) + --seed + The seed value to use in SSHash index construction (try changing this in the rare event + index build fails) [default: 1] + --work-dir + The working directory where temporary files should be placed [default: ./workdir.noindex] + --use-piscem + Use piscem instead of salmon for indexing and mapping (default) + +Alternative salmon-alevin Index Options: + -p, --sparse If this flag is passed, build the sparse rather than dense index for mapping + --no-piscem Don't use the default piscem mapper, instead, use salmon-alevin diff --git a/tests/snapshots/cli-help/simpleaf_inspect___help.txt b/tests/snapshots/cli-help/simpleaf_inspect___help.txt new file mode 100644 index 0000000..b4fd152 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_inspect___help.txt @@ -0,0 +1,7 @@ +inspect the current configuration + +Usage: simpleaf inspect + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_quant___help.txt b/tests/snapshots/cli-help/simpleaf_quant___help.txt new file mode 100644 index 0000000..da4cc31 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_quant___help.txt @@ -0,0 +1,78 @@ +quantify a sample + +Usage: 
simpleaf quant [OPTIONS] --chemistry --output --resolution <--expect-cells |--explicit-pl |--forced-cells |--knee|--unfiltered-pl []> <--index |--map-dir > + +Options: + -c, --chemistry The name of a registered chemistry or a quoted string representing a + custom geometry specification + -o, --output Path to the output directory + -t, --threads Number of threads to use when running [default: 16] + -h, --help Print help + -V, --version Print version + +Mapping Options: + -i, --index Path to a folder containing the index files + -1, --reads1 Comma-separated list of paths to read 1 files. The order must match + the read 2 files + -2, --reads2 Comma-separated list of paths to read 2 files. The order must match + the read 1 files + --no-piscem Don't use the default piscem mapper, instead, use salmon-alevin + --use-piscem Use piscem for mapping (requires that index points to the piscem + index) + -s, --use-selective-alignment Use selective-alignment for mapping (only if using salmon alevin as + the underlying mapper) + --map-dir Path to a mapped output directory containing a RAD file to skip + mapping + +Piscem Mapping Options: + --struct-constraints + If piscem >= 0.7.0, enable structural constraints + --ignore-ambig-hits + Skip checking of the equivalence classes of k-mers that were too ambiguous to be otherwise + considered (passing this flag can speed up mapping slightly, but may reduce specificity) + --no-poison + Do not consider poison k-mers, even if the underlying index contains them. 
In this case, + the mapping results will be identical to those obtained as if no poison table was added to + the index + --skipping-strategy + The skipping strategy to use for k-mer collection [default: permissive] [possible values: + permissive, strict] + --max-ec-card + Determines the maximum cardinality equivalence class (number of (txp, orientation status) + pairs) to examine (cannot be used with --ignore-ambig-hits) [default: 4096] + --max-hit-occ + In the first pass, consider only collected and matched k-mers of a read having <= + --max-hit-occ hits [default: 256] + --max-hit-occ-recover + If all collected and matched k-mers of a read have > --max-hit-occ hits, then make a + second pass and consider k-mers having <= --max-hit-occ-recover hits [default: 1024] + --max-read-occ + Threshold for discarding reads with too many mappings [default: 2500] + +Permit List Generation Options: + -k, --knee + Use knee filtering mode + -u, --unfiltered-pl [] + Use unfiltered permit list + -f, --forced-cells + Use forced number of cells + -x, --explicit-pl + Use a filtered, explicit permit list + -e, --expect-cells + Use expected number of cells + -d, --expected-ori + The expected direction/orientation of alignments in the chemistry being processed. 
If not + provided, will default to `fw` for 10xv2/10xv3, otherwise `both` [possible values: fw, rc, + both] + --min-reads + Minimum read count threshold for a cell to be retained/processed; only use with + --unfiltered-pl [default: 10] + +UMI Resolution Options: + -m, --t2g-map Path to a transcript to gene map file + -r, --resolution UMI resolution mode [possible values: cr-like, cr-like-em, + parsimony, parsimony-em, parsimony-gene, parsimony-gene-em] + +Output Options: + --anndata-out Generate an anndata (h5ad format) count matrix from the standard (matrix-market + format) output diff --git a/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt b/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt new file mode 100644 index 0000000..5125ed6 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_refresh_prog_info___help.txt @@ -0,0 +1,7 @@ +refreshes version information associated with programs used by simpleaf + +Usage: simpleaf refresh-prog-info + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_set_paths___help.txt b/tests/snapshots/cli-help/simpleaf_set_paths___help.txt new file mode 100644 index 0000000..03bc425 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_set_paths___help.txt @@ -0,0 +1,11 @@ +set paths to the programs that simpleaf will use + +Usage: simpleaf set-paths [OPTIONS] + +Options: + -s, --salmon path to salmon to use + -p, --piscem path to piscem to use + -a, --alevin-fry path to alevin-fry to use + -m, --macs path to macs3 to use + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow___help.txt b/tests/snapshots/cli-help/simpleaf_workflow___help.txt new file mode 100644 index 0000000..a769590 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow___help.txt @@ -0,0 +1,18 @@ +simpleaf workflow related command set + +Usage: simpleaf workflow + simpleaf workflow + +Commands: + list Print a summary 
of the currently available workflows + refresh Update the local copy of protocol estuary to the latest version + get Get the workflow template and related files of a registered workflow + patch Patch a workflow template or instantiated manifest with a subset of parameters to produce + a series of workflow manifests + run Parse and instantiate a workflow template and invoke the workflow commands, or run an + instantiated manifest directly + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt new file mode 100644 index 0000000..2546441 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_get___help.txt @@ -0,0 +1,9 @@ +Get the workflow template and related files of a registered workflow + +Usage: simpleaf workflow get --name --output + +Options: + -o, --output path to dump the folder containing the workflow related files + -n, --name name of the queried workflow + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt new file mode 100644 index 0000000..3d2835c --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_list___help.txt @@ -0,0 +1,7 @@ +Print a summary of the currently available workflows + +Usage: simpleaf workflow list + +Options: + -h, --help Print help + -V, --version Print version diff --git a/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt b/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt new file mode 100644 index 0000000..a007158 --- /dev/null +++ b/tests/snapshots/cli-help/simpleaf_workflow_patch___help.txt @@ -0,0 +1,22 @@ +Patch a workflow template or instantiated manifest with a subset of parameters to produce a series +of workflow manifests + +Usage: simpleaf workflow 
patch [OPTIONS] --patch <--manifest |--template