From 7cbf8ca0dec06f31c043e354bad7ce9e9469d869 Mon Sep 17 00:00:00 2001 From: Brandon Date: Tue, 9 Dec 2025 18:46:31 -0600 Subject: [PATCH 1/3] Improve fallbacks --- pipeline_config.example.yaml | 12 +- .../net/maizegenetics/commands/Orchestrate.kt | 122 +++++++++++++++--- 2 files changed, 114 insertions(+), 20 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 61b3e7d..2d793d7 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -106,11 +106,16 @@ convert_to_fasta: # Step 5: Align Mutated Assemblies # Realigns the mutated FASTA files from convert_to_fasta back to the reference -# Automatically uses ref_gff and ref_fasta from align_assemblies, and FASTA files from convert_to_fasta +# Auto-detects reference and query FASTAs from step 4 output based on filename matching # This creates a circular workflow where mutated sequences are realigned for comparison align_mutated_assemblies: + # ref_gff: "path/to/reference.gff" # Optional: Reference GFF file + # Uses align_assemblies.ref_gff if not specified + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA from step 4 output + # Auto-detects file matching original ref name if not specified + # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) + # Auto-detects non-ref files from step 4 if not specified threads: 1 # Optional: Number of threads (default: 1) - # input: "/custom/fasta/input/" # Optional: Custom FASTA input (file, directory, or text list) # output: "/custom/alignment/output/" # Optional: Custom output directory # ============================================================================ @@ -127,6 +132,9 @@ pick_crossovers: # Example: # /path/to/assembly1.fa<TAB>parent1 # /path/to/assembly2.fa<TAB>parent2 + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses mutated ref FASTA from step 5 if not specified + # Falls back to align_assemblies.ref_fasta if step 5 not run # 
output: "/custom/crossovers/output/" # Optional: Custom output directory # Step 7: Create Chain Files diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 5820f92..770414a 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -66,14 +66,17 @@ data class ConvertToFastaConfig( ) data class AlignMutatedAssembliesConfig( + val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) + val ref_fasta: String? = null, // Optional: Reference FASTA (uses matching ref from step 4 output if not specified) + val fasta_input: String? = null, // Optional: Query FASTA input (uses non-ref files from step 4 if not specified) val threads: Int? = null, - val input: String? = null, // Custom FASTA input file/directory - val output: String? = null // Custom output directory + val output: String? = null // Custom output directory ) data class PickCrossoversConfig( val assembly_list: String, - val output: String? = null // Custom output directory + val ref_fasta: String? = null, // Optional: Reference FASTA (uses mutated ref FASTA from step 5, falling back to align_assemblies.ref_fasta, if not specified) + val output: String? = null // Custom output directory ) data class CreateChainFilesConfig( @@ -253,8 +256,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val alignMutatedAssembliesMap = configMap["align_mutated_assemblies"] as? Map val alignMutatedAssemblies = alignMutatedAssembliesMap?.let { AlignMutatedAssembliesConfig( + ref_gff = it["ref_gff"] as? String, + ref_fasta = it["ref_fasta"] as? String, + fasta_input = it["fasta_input"] as? String, threads = it["threads"] as? Int, - input = it["input"] as? String, output = it["output"] as? 
String ) } @@ -265,6 +270,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val pickCrossovers = pickCrossoversMap?.let { PickCrossoversConfig( assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("pick_crossovers.assembly_list is required"), + ref_fasta = it["ref_fasta"] as? String, output = it["output"] as? String ) } @@ -381,6 +387,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var fastaOutputDir: Path? = null var refFasta: Path? = null var refGff: Path? = null + var mutatedRefFasta: Path? = null // Mutated reference FASTA from step 5 var refkeyOutputDir: Path? = null var chainOutputDir: Path? = null var coordinatesOutputDir: Path? = null @@ -682,16 +689,68 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("STEP 5: Align Mutated Assemblies") logger.info("=".repeat(80)) - // Determine input (custom or from previous step) - val fastaInput = config.align_mutated_assemblies.input?.let { Path.of(it) } ?: fastaOutputDir - if (fastaInput == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'input' in config or run convert-to-fasta first)") + // Determine ref_gff (config value or from step 1) + val step5RefGff = config.align_mutated_assemblies.ref_gff?.let { Path.of(it) } ?: refGff + if (step5RefGff == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available (specify 'ref_gff' in config or run align_assemblies first)") } - if (refFasta == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available") + + // Determine ref_fasta and fasta_input from step 4 output + // If fastaOutputDir exists, find the reference FASTA (matching original ref name) and query FASTAs + var step5RefFasta: Path? = config.align_mutated_assemblies.ref_fasta?.let { Path.of(it) } + var step5FastaInput: Path? 
= config.align_mutated_assemblies.fasta_input?.let { Path.of(it) } + + // If not explicitly specified, try to derive from step 4 output + if (step5RefFasta == null || step5FastaInput == null) { + if (fastaOutputDir != null && fastaOutputDir.exists()) { + // Get the reference FASTA filename (without path) to match against step 4 output + val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + + if (refFastaName != null) { + // Find all FASTA files in the output directory + val allFastaFiles = fastaOutputDir.toFile().listFiles { file -> + file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) + }?.map { it.toPath() } ?: emptyList() + + // Find the reference FASTA (filename contains the ref name) + val matchingRefFasta = allFastaFiles.find { path -> + path.fileName.toString().contains(refFastaName, ignoreCase = true) + } + + // Get non-reference FASTAs + val queryFastas = allFastaFiles.filter { path -> + !path.fileName.toString().contains(refFastaName, ignoreCase = true) + } + + if (step5RefFasta == null && matchingRefFasta != null) { + step5RefFasta = matchingRefFasta + logger.info("Auto-detected reference FASTA from step 4: $step5RefFasta") + } + + if (step5FastaInput == null && queryFastas.isNotEmpty()) { + // Create a text file listing the query FASTAs + val queryListFile = fastaOutputDir.resolve("query_fastas.txt") + queryListFile.writeText(queryFastas.joinToString("\n") { it.toAbsolutePath().toString() }) + step5FastaInput = queryListFile + logger.info("Auto-detected ${queryFastas.size} query FASTA files from step 4") + } + } + } + + // Fall back to original behavior if auto-detection failed + if (step5RefFasta == null) { + step5RefFasta = refFasta + } + if (step5FastaInput == null) { + step5FastaInput = fastaOutputDir + } + } + + if (step5FastaInput == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'fasta_input' in config or run 
convert-to-fasta first)") } - if (refGff == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available") + if (step5RefFasta == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available (specify 'ref_fasta' in config or run convert-to-fasta first)") } // Determine output directory (custom or default) @@ -700,9 +759,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val args = buildList { add("align-mutated-assemblies") add("--work-dir=${workDir}") - add("--ref-gff=${refGff}") - add("--ref-fasta=${refFasta}") - add("--fasta-input=${fastaInput}") + add("--ref-gff=${step5RefGff}") + add("--ref-fasta=${step5RefFasta}") + add("--fasta-input=${step5FastaInput}") if (config.align_mutated_assemblies.threads != null) { add("--threads=${config.align_mutated_assemblies.threads}") } @@ -721,11 +780,33 @@ class Orchestrate : CliktCommand(name = "orchestrate") { throw RuntimeException("align-mutated-assemblies failed with exit code $exitCode") } + // Save the mutated reference FASTA for use in step 6 + mutatedRefFasta = step5RefFasta + logger.info("Step 5 completed successfully") logger.info("") } else { if (config.align_mutated_assemblies != null) { logger.info("Skipping align-mutated-assemblies (not in run_steps)") + + // Try to recover mutated ref FASTA from step 5 config or step 4 output + if (config.align_mutated_assemblies.ref_fasta != null) { + mutatedRefFasta = Path.of(config.align_mutated_assemblies.ref_fasta) + logger.info("Using configured mutated reference FASTA: $mutatedRefFasta") + } else if (fastaOutputDir != null && fastaOutputDir.exists()) { + // Try to auto-detect from step 4 output + val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + if (refFastaName != null) { + val matchingRefFasta = fastaOutputDir.toFile().listFiles { file -> + file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) && + 
file.name.contains(refFastaName, ignoreCase = true) + }?.firstOrNull()?.toPath() + if (matchingRefFasta != null) { + mutatedRefFasta = matchingRefFasta + logger.info("Auto-detected mutated reference FASTA: $mutatedRefFasta") + } + } + } } else { logger.info("Skipping align-mutated-assemblies (not configured)") } @@ -738,8 +819,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("STEP 6: Pick Crossovers") logger.info("=".repeat(80)) - if (refFasta == null) { - throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available") + // Use pick_crossovers.ref_fasta if specified, otherwise use mutated ref FASTA from step 5, + // finally fall back to original ref FASTA from step 1 + val pickCrossoversRefFasta = config.pick_crossovers.ref_fasta?.let { Path.of(it) } + ?: mutatedRefFasta + ?: refFasta + if (pickCrossoversRefFasta == null) { + throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config, run align_mutated_assemblies, or run align_assemblies first)") } // Determine output directory (custom or default) @@ -748,7 +834,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val args = buildList { add("pick-crossovers") add("--work-dir=${workDir}") - add("--ref-fasta=${refFasta}") + add("--ref-fasta=${pickCrossoversRefFasta}") add("--assembly-list=${config.pick_crossovers.assembly_list}") if (customOutput != null) { add("--output-dir=${customOutput}") From ce058701cb7009c545d6ca5e0f183c769d709254 Mon Sep 17 00:00:00 2001 From: Brandon Date: Tue, 9 Dec 2025 19:01:21 -0600 Subject: [PATCH 2/3] Fix boolean key in yaml and DownsampleGvcf --- pipeline_config.example.yaml | 4 ++-- src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 2d793d7..87d81a2 100644 --- a/pipeline_config.example.yaml +++ 
b/pipeline_config.example.yaml @@ -90,7 +90,7 @@ downsample_gvcf: ignore_contig: "" # Optional: Comma-separated patterns to ignore rates: "0.01,0.05,0.1,0.15,0.2" # Optional: Comma-separated downsampling rates seed: 42 # Optional: Random seed for reproducibility - keep_ref: true # Optional: Keep reference blocks (default: true) + keep_ref: true # Optional: Keep reference blocks (true/false, default: true) min_ref_block_size: 20 # Optional: Minimum ref block size (default: 20) # input: "/custom/gvcf/directory/" # Optional: Custom GVCF input directory # output: "/custom/downsample/output/" # Optional: Custom output directory @@ -194,7 +194,7 @@ format_recombined_fastas: rope_bwt_chr_index: index_file_prefix: "phgIndex" # Optional: Prefix for index files (default: "phgIndex") threads: 20 # Optional: Number of threads (default: 20) - delete_fmr_index: true # Optional: Delete .fmr files after conversion (default: true) + delete_fmr_index: true # Optional: Delete .fmr files after conversion (true/false, default: true) # input: "/custom/fasta/input/" # Optional: Custom FASTA input (file, directory, or text list) # output: "/custom/phg_index/" # Optional: Custom output directory (default: work_dir/output/11_rope_bwt_index_results) # keyfile: "/path/to/keyfile.txt" # Optional: Pre-made keyfile (mutually exclusive with input) diff --git a/src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt b/src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt index fc3cb81..5728e5d 100644 --- a/src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt +++ b/src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt @@ -5,6 +5,7 @@ import com.github.ajalt.clikt.parameters.options.default import com.github.ajalt.clikt.parameters.options.flag import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required +import com.github.ajalt.clikt.parameters.types.boolean import com.github.ajalt.clikt.parameters.types.int import 
com.github.ajalt.clikt.parameters.types.path import net.maizegenetics.Constants @@ -61,7 +62,7 @@ class DownsampleGvcf : CliktCommand(name = "downsample-gvcf") { private val keepRef by option( "--keep-ref", help = "Keep reference blocks" - ).flag(default = true) + ).boolean().default(true) private val minRefBlockSize by option( "--min-ref-block-size", From 4e74cb9f98b1a8042a7e3bc9d53ae7696d6e59dc Mon Sep 17 00:00:00 2001 From: Brandon Date: Wed, 10 Dec 2025 13:21:35 -0600 Subject: [PATCH 3/3] Version bump --- build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle.kts b/build.gradle.kts index ff01eb8..774750a 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2" +version = "0.2.1" repositories { mavenCentral()