From 1546c788824ad628eea56790c8a11795879c8212 Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 11:12:54 -0600 Subject: [PATCH 01/11] Add a default assembly list option for "pick_crossover" step --- build.gradle.kts | 2 +- .../net/maizegenetics/commands/Orchestrate.kt | 63 ++++++++++++++++--- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/build.gradle.kts b/build.gradle.kts index 6f64512..5d32bf3 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.6" +version = "0.2.7" repositories { mavenCentral() diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 5c30892..ba4b0dc 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -76,7 +76,7 @@ data class AlignMutatedAssembliesConfig( ) data class PickCrossoversConfig( - val assembly_list: String, + val assembly_list: String? = null, // Optional: If not specified, auto-generates from convert_to_fasta output val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) val output: String? = null // Custom output directory ) @@ -272,16 +272,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null - // Parse pick_crossovers + // Parse pick_crossovers - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val pickCrossoversMap = configMap["pick_crossovers"] as? Map - val pickCrossovers = pickCrossoversMap?.let { + val pickCrossovers = if (configMap.containsKey("pick_crossovers")) { PickCrossoversConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("pick_crossovers.assembly_list is required"), - ref_fasta = it["ref_fasta"] as? String, - output = it["output"] as? String + assembly_list = pickCrossoversMap?.get("assembly_list") as? String, + ref_fasta = pickCrossoversMap?.get("ref_fasta") as? String, + output = pickCrossoversMap?.get("output") as? String ) - } + } else null // Parse create_chain_files - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") @@ -834,13 +834,60 @@ class Orchestrate : CliktCommand(name = "orchestrate") { throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config, run align_mutated_assemblies, or run align_assemblies first)") } + // Determine assembly list (custom or auto-generated from step 4) + val assemblyListPath: Path = if (config.pick_crossovers.assembly_list != null) { + Path.of(config.pick_crossovers.assembly_list).toAbsolutePath().normalize() + } else { + // Auto-generate assembly list from step 4 output (fastaOutputDir) + if (fastaOutputDir == null || !fastaOutputDir.exists()) { + throw RuntimeException("Cannot run pick-crossovers: no assembly_list provided and no FASTA output directory available from convert_to_fasta step") + } + + // Get all FASTA files from step 4 output + val fastaFiles = fastaOutputDir.toFile().listFiles { file -> + file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) + }?.map { it.toPath() }?.sorted() ?: emptyList() + + if (fastaFiles.isEmpty()) { + throw RuntimeException("Cannot run pick-crossovers: no FASTA files found in $fastaOutputDir") + } + + // Create assembly list file with pathname format + // Name is derived from filename minus "_subsampled" suffix and extension + val assemblyListFile = fastaOutputDir.resolve("auto_assembly_list.txt") + val lines = fastaFiles.map { fastaPath -> + val fileName = fastaPath.fileName.toString() + // Remove extension (including .gz if present) + val baseName = fileName + .replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + // Remove "_subsampled" suffix if present + .replace(Regex("_subsampled$"), "") + "${fastaPath.toAbsolutePath()}\t$baseName" + } + assemblyListFile.writeText(lines.joinToString("\n")) + logger.info("Auto-generated assembly list file: $assemblyListFile") + logger.info(" Contains ${fastaFiles.size} assemblies") + + assemblyListFile + } + + // Validate that the number of assemblies is even + val assemblyCount = assemblyListPath.readLines().filter { it.isNotBlank() }.size + if (assemblyCount % 2 != 0) { + throw RuntimeException( + "Cannot run pick-crossovers: assembly list contains $assemblyCount assemblies, " + + "but this step requires an even number of assembly files to work (assemblies are paired for crossover simulation)" + ) + } + logger.info("Assembly list contains $assemblyCount assemblies (validated: even count)") + // Determine output directory (custom or default) val customOutput = config.pick_crossovers.output?.let { Path.of(it) } val args = buildList { add("--work-dir=${workDir}") add("--ref-fasta=${pickCrossoversRefFasta}") - add("--assembly-list=${config.pick_crossovers.assembly_list}") + add("--assembly-list=${assemblyListPath}") if (customOutput != null) { add("--output-dir=${customOutput}") } From 982633512597e8e329914de4ceaea24234969908 Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 11:14:55 -0600 Subject: [PATCH 02/11] Make parameter optional; update notes --- pipeline_config.example.yaml | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 211c01d..e207bb4 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -131,16 +131,19 @@ align_mutated_assemblies: # Simulates crossover events to generate recombination breakpoints # Uses MLImpute's pick_crossovers.py to simulate synthetic recombinant genomes # Generates two populations: landrace (1250 rounds) and two-parent cross (1 round) +# IMPORTANT: Requires an EVEN number of assemblies (assemblies are paired for crossover simulation) pick_crossovers: - assembly_list: "path/to/assembly_list.txt" # Required: Tab-separated file with assembly paths and names - # Format: (one per line) - # Example: - # /path/to/assembly1.faparent1 - # /path/to/assembly2.faparent2 - # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file - # Uses mutated ref FASTA from step 5 if not specified - # Falls back to align_assemblies.ref_fasta if step 5 not run - # output: "/custom/crossovers/output/" # Optional: Custom output directory + # assembly_list: "path/to/assembly_list.txt" # Optional: Tab-separated file with assembly paths and names + # Format: (one per line) + # Example: + # /path/to/assembly1.faparent1 + # /path/to/assembly2.faparent2 + # If not specified, auto-generates from convert_to_fasta (step 4) output + # Auto-generated names: filename minus "_subsampled" suffix and extension + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses mutated ref FASTA from step 5 if not specified + # Falls back to align_assemblies.ref_fasta if step 5 not run + # output: "/custom/crossovers/output/" # Optional: Custom output directory # Step 7: Create Chain Files # Converts MAF alignment files to UCSC chain format for coordinate conversion @@ -383,9 +386,11 @@ convert_ropebwt2ps4g: # IMPORTANT NOTES: # ============================================================================ # -# Assembly List Format: +# Assembly List Format (for pick_crossovers, convert_coordinates, generate_recombined_sequences): # - Tab-separated file with two columns: path and name -# - Must have an EVEN number of assemblies (for parent pairing) +# - MUST have an EVEN number of assemblies (assemblies are paired for crossover simulation) +# - For pick_crossovers: If not provided, auto-generates from convert_to_fasta (step 4) output +# - Auto-generated names are derived from filename minus "_subsampled" suffix and extension # - Example: # /data/assemblies/B73.faB73 # /data/assemblies/Mo17.faMo17 @@ -401,7 +406,9 @@ convert_ropebwt2ps4g: # chr3 # # Recombination Pipeline Dependencies: -# - Step 6 (pick_crossovers) requires: ref_fasta from step 1 +# - Step 6 (pick_crossovers) requires: ref_fasta from step 1 or step 5 +# - If assembly_list not provided, auto-generates from step 4 FASTA outputs +# - MUST have an EVEN number of assemblies (will error if odd) # - Step 7 (create_chain_files) requires: MAF files from step 1 # - Step 8 (convert_coordinates) requires: outputs from steps 6 and 7 # - Step 9 (generate_recombined_sequences) requires: output from step 8 From 5b10ac924cb32348bba8c33ed7b259050b5126ca Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 11:32:43 -0600 Subject: [PATCH 03/11] Change MAF file input defaults --- pipeline_config.example.yaml | 14 ++++--- .../net/maizegenetics/commands/Orchestrate.kt | 42 ++++++++++++++++--- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index e207bb4..866e1a6 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -147,12 +147,14 @@ pick_crossovers: # Step 7: Create Chain Files # Converts MAF alignment files to UCSC chain format for coordinate conversion -# Automatically uses MAF files from align_assemblies (step 1) +# By default, uses MAF files from align_mutated_assemblies (step 5) +# Falls back to MAF files from align_assemblies (step 1) if step 5 not run # Uses maf-convert tool (automatically downloaded if missing) create_chain_files: - jobs: 8 # Optional: Number of parallel jobs (default: 8) - # input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list) - # output: "/custom/chain/output/" # Optional: Custom output directory + jobs: 8 # Optional: Number of parallel jobs (default: 8) + # maf_file_input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list) + # Default: step 5 MAF outputs, fallback to step 1 + # output: "/custom/chain/output/" # Optional: Custom output directory # Step 8: Convert Coordinates # Converts crossover breakpoints from reference coordinates to assembly coordinates @@ -409,7 +411,9 @@ convert_ropebwt2ps4g: # - Step 6 (pick_crossovers) requires: ref_fasta from step 1 or step 5 # - If assembly_list not provided, auto-generates from step 4 FASTA outputs # - MUST have an EVEN number of assemblies (will error if odd) -# - Step 7 (create_chain_files) requires: MAF files from step 1 +# - Step 7 (create_chain_files) requires: MAF files from step 5 (default) or step 1 (fallback) +# - Uses align_mutated_assemblies MAF outputs by default +# - Falls back to align_assemblies MAF outputs if step 5 not run # - Step 8 (convert_coordinates) requires: outputs from steps 6 and 7 # - Step 9 (generate_recombined_sequences) requires: output from step 8 # - Step 10 (format_recombined_fastas) requires: output from step 9 diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index ba4b0dc..18d9c11 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -83,8 +83,8 @@ data class PickCrossoversConfig( data class CreateChainFilesConfig( val jobs: Int? = null, - val input: String? = null, // Custom MAF input file/directory - val output: String? = null // Custom output directory + val maf_file_input: String? = null, // Custom MAF input file/directory (default: step 5 MAF outputs) + val output: String? = null // Custom output directory ) data class ConvertCoordinatesConfig( @@ -289,7 +289,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val createChainFiles = if (configMap.containsKey("create_chain_files")) { CreateChainFilesConfig( jobs = createChainFilesMap?.get("jobs") as? Int, - input = createChainFilesMap?.get("input") as? String, + maf_file_input = createChainFilesMap?.get("maf_file_input") as? String, output = createChainFilesMap?.get("output") as? String ) } else null @@ -396,6 +396,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var refFasta: Path? = null var refGff: Path? = null var mutatedRefFasta: Path? = null // Mutated reference FASTA from step 5 + var mutatedMafFilePaths: Path? = null // MAF file paths from step 5 (align_mutated_assemblies) var refkeyOutputDir: Path? = null var chainOutputDir: Path? = null var coordinatesOutputDir: Path? = null @@ -789,6 +790,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Save the mutated reference FASTA for use in step 6 mutatedRefFasta = step5RefFasta + // Save the MAF file paths for use in step 7 (create_chain_files) + val step5OutputDir = customOutput ?: workDir.resolve("output").resolve("05_mutated_align_results") + mutatedMafFilePaths = step5OutputDir.resolve("maf_file_paths.txt") + if (mutatedMafFilePaths!!.exists()) { + logger.info("Step 5 MAF file paths: $mutatedMafFilePaths") + } else { + logger.warn("Step 5 MAF file paths not found: $mutatedMafFilePaths") + mutatedMafFilePaths = null + } + logger.info("Step 5 completed successfully") logger.info("") } else { @@ -813,6 +824,15 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } } } + + // Try to recover MAF file paths from previous step 5 run + val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } + val step5OutputDir = customOutput ?: workDir.resolve("output").resolve("05_mutated_align_results") + val previousMafPaths = step5OutputDir.resolve("maf_file_paths.txt") + if (previousMafPaths.exists()) { + mutatedMafFilePaths = previousMafPaths + logger.info("Using previous step 5 MAF file paths: $mutatedMafFilePaths") + } } else { logger.info("Skipping align-mutated-assemblies (not configured)") } @@ -930,11 +950,14 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("STEP 7: Create Chain Files") logger.info("=".repeat(80)) - // Determine input (custom or from previous step) - val mafInput = config.create_chain_files.input?.let { Path.of(it) } ?: mafFilePaths + // Determine input (custom, or step 5 MAF files, or step 1 MAF files as fallback) + val mafInput = config.create_chain_files.maf_file_input?.let { Path.of(it) } + ?: mutatedMafFilePaths + ?: mafFilePaths if (mafInput == null) { - throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'input' in config or run align-assemblies first)") + throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'maf_file_input' in config, run align-mutated-assemblies, or run align-assemblies first)") } + logger.info("MAF input: $mafInput") // Determine output directory (custom or default) val customOutput = config.create_chain_files.output?.let { Path.of(it) } @@ -975,6 +998,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } else { logger.warn("Previous create-chain-files outputs not found. Downstream steps may fail.") } + + // Also recover MAF file path info if not already set + if (mutatedMafFilePaths == null && mafFilePaths == null) { + config.create_chain_files.maf_file_input?.let { + logger.info("Custom MAF input configured: $it") + } + } } else { logger.info("Skipping create-chain-files (not configured)") } From 96c55b2e3da842920e878585c15ef6e1e10bcd9a Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 11:57:34 -0600 Subject: [PATCH 04/11] Update default assembly list location --- pipeline_config.example.yaml | 39 ++++++----- .../net/maizegenetics/commands/Orchestrate.kt | 70 ++++++++++++++----- 2 files changed, 74 insertions(+), 35 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 866e1a6..218a230 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -161,27 +161,27 @@ create_chain_files: # Automatically uses refkey files from pick_crossovers and chain files from create_chain_files # Uses CrossMap for coordinate conversion via chain files convert_coordinates: - assembly_list: "path/to/assembly_list.txt" # Required: Same assembly list as pick_crossovers - # input_chain: "/custom/chain/directory/" # Optional: Custom chain directory - # input_refkey: "/custom/refkey/directory/" # Optional: Custom refkey directory - # output: "/custom/coordinates/output/" # Optional: Custom output directory + # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers + # input_chain: "/custom/chain/directory/" # Optional: Custom chain directory + # input_refkey: "/custom/refkey/directory/" # Optional: Custom refkey directory + # output: "/custom/coordinates/output/" # Optional: Custom output directory # Step 9: Generate Recombined Sequences # Creates recombined FASTA sequences by concatenating segments from parent assemblies # Automatically uses founder key files from convert_coordinates # Each founder gets its own recombined FASTA file generate_recombined_sequences: - assembly_list: "path/to/assembly_list.txt" # Required: Same assembly list as above - chromosome_list: "path/to/chromosome_list.txt" # Required: Text file with chromosome names (one per line) - # Example: - # chr1 - # chr2 - # chr3 - assembly_dir: "path/to/parent_assemblies/" # Required: Directory containing parent assembly FASTA files - # Files should be named to match assembly_list names - # Example: parent1.fa, parent2.fa - # input: "/custom/founder_key/directory/" # Optional: Custom founder key directory - # output: "/custom/recombined/output/" # Optional: Custom output directory + # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers + chromosome_list: "path/to/chromosome_list.txt" # Required: Text file with chromosome names (one per line) + # Example: + # chr1 + # chr2 + # chr3 + assembly_dir: "path/to/parent_assemblies/" # Required: Directory containing parent assembly FASTA files + # Files should be named to match assembly_list names + # Example: parent1.fa, parent2.fa + # input: "/custom/founder_key/directory/" # Optional: Custom founder key directory + # output: "/custom/recombined/output/" # Optional: Custom output directory # Step 10: Format Recombined Fastas # Reformats recombined FASTA files to have consistent line widths using seqkit @@ -391,8 +391,10 @@ convert_ropebwt2ps4g: # Assembly List Format (for pick_crossovers, convert_coordinates, generate_recombined_sequences): # - Tab-separated file with two columns: path and name # - MUST have an EVEN number of assemblies (assemblies are paired for crossover simulation) -# - For pick_crossovers: If not provided, auto-generates from convert_to_fasta (step 4) output -# - Auto-generated names are derived from filename minus "_subsampled" suffix and extension +# - The same assembly list is shared across steps 6, 8, and 9: +# - Step 6 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs +# - Steps 8 and 9: If not provided, automatically use the assembly list from step 6 +# - Auto-generated names are derived from filename minus "_subsampled" suffix and extension # - Example: # /data/assemblies/B73.faB73 # /data/assemblies/Mo17.faMo17 @@ -411,11 +413,14 @@ convert_ropebwt2ps4g: # - Step 6 (pick_crossovers) requires: ref_fasta from step 1 or step 5 # - If assembly_list not provided, auto-generates from step 4 FASTA outputs # - MUST have an EVEN number of assemblies (will error if odd) +# - Assembly list is saved and shared with steps 8 and 9 # - Step 7 (create_chain_files) requires: MAF files from step 5 (default) or step 1 (fallback) # - Uses align_mutated_assemblies MAF outputs by default # - Falls back to align_assemblies MAF outputs if step 5 not run # - Step 8 (convert_coordinates) requires: outputs from steps 6 and 7 +# - If assembly_list not provided, uses assembly list from step 6 # - Step 9 (generate_recombined_sequences) requires: output from step 8 +# - If assembly_list not provided, uses assembly list from step 6 # - Step 10 (format_recombined_fastas) requires: output from step 9 # # PS4G Creation Dependencies: diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 18d9c11..dadcb59 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -88,14 +88,14 @@ data class CreateChainFilesConfig( ) data class ConvertCoordinatesConfig( - val assembly_list: String, - val input_chain: String? = null, // Custom chain directory - val input_refkey: String? = null, // Custom refkey directory - val output: String? = null // Custom output directory + val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step + val input_chain: String? = null, // Custom chain directory + val input_refkey: String? = null, // Custom refkey directory + val output: String? = null // Custom output directory ) data class GenerateRecombinedSequencesConfig( - val assembly_list: String, + val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step val chromosome_list: String, val assembly_dir: String, val input: String? = null, // Custom founder key directory @@ -294,24 +294,24 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null - // Parse convert_coordinates + // Parse convert_coordinates - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val convertCoordinatesMap = configMap["convert_coordinates"] as? Map - val convertCoordinates = convertCoordinatesMap?.let { + val convertCoordinates = if (configMap.containsKey("convert_coordinates")) { ConvertCoordinatesConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("convert_coordinates.assembly_list is required"), - input_chain = it["input_chain"] as? String, - input_refkey = it["input_refkey"] as? String, - output = it["output"] as? String + assembly_list = convertCoordinatesMap?.get("assembly_list") as? String, + input_chain = convertCoordinatesMap?.get("input_chain") as? String, + input_refkey = convertCoordinatesMap?.get("input_refkey") as? String, + output = convertCoordinatesMap?.get("output") as? String ) - } + } else null // Parse generate_recombined_sequences @Suppress("UNCHECKED_CAST") val generateRecombinedSequencesMap = configMap["generate_recombined_sequences"] as? Map val generateRecombinedSequences = generateRecombinedSequencesMap?.let { GenerateRecombinedSequencesConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.assembly_list is required"), + assembly_list = it["assembly_list"] as? String, chromosome_list = it["chromosome_list"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.chromosome_list is required"), assembly_dir = it["assembly_dir"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.assembly_dir is required"), input = it["input"] as? String, @@ -397,6 +397,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var refGff: Path? = null var mutatedRefFasta: Path? = null // Mutated reference FASTA from step 5 var mutatedMafFilePaths: Path? = null // MAF file paths from step 5 (align_mutated_assemblies) + var assemblyListPath: Path? = null // Assembly list from step 6 (pick_crossovers) var refkeyOutputDir: Path? = null var chainOutputDir: Path? = null var coordinatesOutputDir: Path? = null @@ -855,7 +856,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } // Determine assembly list (custom or auto-generated from step 4) - val assemblyListPath: Path = if (config.pick_crossovers.assembly_list != null) { + val step6AssemblyList: Path = if (config.pick_crossovers.assembly_list != null) { Path.of(config.pick_crossovers.assembly_list).toAbsolutePath().normalize() } else { // Auto-generate assembly list from step 4 output (fastaOutputDir) @@ -892,7 +893,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } // Validate that the number of assemblies is even - val assemblyCount = assemblyListPath.readLines().filter { it.isNotBlank() }.size + val assemblyCount = step6AssemblyList.readLines().filter { it.isNotBlank() }.size if (assemblyCount % 2 != 0) { throw RuntimeException( "Cannot run pick-crossovers: assembly list contains $assemblyCount assemblies, " + @@ -901,13 +902,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } logger.info("Assembly list contains $assemblyCount assemblies (validated: even count)") + // Save assembly list path for use in steps 8 and 9 + assemblyListPath = step6AssemblyList + // Determine output directory (custom or default) val customOutput = config.pick_crossovers.output?.let { Path.of(it) } val args = buildList { add("--work-dir=${workDir}") add("--ref-fasta=${pickCrossoversRefFasta}") - add("--assembly-list=${assemblyListPath}") + add("--assembly-list=${step6AssemblyList}") if (customOutput != null) { add("--output-dir=${customOutput}") } @@ -938,6 +942,18 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } else { logger.warn("Previous pick-crossovers outputs not found. Downstream steps may fail.") } + + // Try to recover assembly list from config or auto-generated file + if (config.pick_crossovers.assembly_list != null) { + assemblyListPath = Path.of(config.pick_crossovers.assembly_list).toAbsolutePath().normalize() + logger.info("Using configured assembly list: $assemblyListPath") + } else if (fastaOutputDir != null) { + val autoGeneratedList = fastaOutputDir.resolve("auto_assembly_list.txt") + if (autoGeneratedList.exists()) { + assemblyListPath = autoGeneratedList + logger.info("Using auto-generated assembly list from previous run: $assemblyListPath") + } + } } else { logger.info("Skipping pick-crossovers (not configured)") } @@ -1026,12 +1042,21 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine refkey input (custom or from previous step) val refkeyInput = config.convert_coordinates.input_refkey?.let { Path.of(it) } ?: refkeyOutputDir + // Determine assembly list (custom or from step 6) + val step8AssemblyList = config.convert_coordinates.assembly_list?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: assemblyListPath + if (step8AssemblyList == null) { + throw RuntimeException("Cannot run convert-coordinates: no assembly_list available (specify 'assembly_list' in config or run pick-crossovers first)") + } + logger.info("Assembly list: $step8AssemblyList") + // Determine output directory (custom or default) val customOutput = config.convert_coordinates.output?.let { Path.of(it) } val args = buildList { add("--work-dir=${workDir}") - add("--assembly-list=${config.convert_coordinates.assembly_list}") + add("--assembly-list=${step8AssemblyList}") add("--chain-dir=${chainInput}") if (refkeyInput != null) { add("--refkey-dir=${refkeyInput}") @@ -1084,12 +1109,21 @@ class Orchestrate : CliktCommand(name = "orchestrate") { throw RuntimeException("Cannot run generate-recombined-sequences: no founder key input available (specify 'input' in config or run convert-coordinates first)") } + // Determine assembly list (custom or from step 6) + val step9AssemblyList = config.generate_recombined_sequences.assembly_list?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: assemblyListPath + if (step9AssemblyList == null) { + throw RuntimeException("Cannot run generate-recombined-sequences: no assembly_list available (specify 'assembly_list' in config or run pick-crossovers first)") + } + logger.info("Assembly list: $step9AssemblyList") + // Determine output directory (custom or default) val customOutput = config.generate_recombined_sequences.output?.let { Path.of(it) } val args = buildList { add("--work-dir=${workDir}") - add("--assembly-list=${config.generate_recombined_sequences.assembly_list}") + add("--assembly-list=${step9AssemblyList}") add("--chromosome-list=${config.generate_recombined_sequences.chromosome_list}") add("--assembly-dir=${config.generate_recombined_sequences.assembly_dir}") add("--founder-key-dir=${founderKeyInput}") From 3c3feb3fb4bbf3de60877e45c983302131ec625a Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 12:42:38 -0600 Subject: [PATCH 05/11] Update default parameters --- pipeline_config.example.yaml | 20 +++-- .../net/maizegenetics/commands/Orchestrate.kt | 90 ++++++++++++------- 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 218a230..7566594 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -168,20 +168,19 @@ convert_coordinates: # Step 9: Generate Recombined Sequences # Creates recombined FASTA sequences by concatenating segments from parent assemblies -# Automatically uses founder key files from convert_coordinates +# Automatically uses founder key files from convert_coordinates (step 8) # Each founder gets its own recombined FASTA file generate_recombined_sequences: # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers - chromosome_list: "path/to/chromosome_list.txt" # Required: Text file with chromosome names (one per line) - # Example: + # chromosome_list: "path/to/chromosome_list.txt" # Optional: Text file with chromosome names (one per line) + # If not specified, auto-derives IDs from FASTAs + # Example format: # chr1 # chr2 # chr3 - assembly_dir: "path/to/parent_assemblies/" # Required: Directory containing parent assembly FASTA files - # Files should be named to match assembly_list names - # Example: parent1.fa, parent2.fa - # input: "/custom/founder_key/directory/" # Optional: Custom founder key directory - # output: "/custom/recombined/output/" # Optional: Custom output directory + # assembly_dir: "path/to/parent_assemblies/" # Optional: Directory containing parent assembly FASTA files + # Defaults to this step's output directory + # Step 10: Format Recombined Fastas # Reformats recombined FASTA files to have consistent line widths using seqkit @@ -401,9 +400,10 @@ convert_ropebwt2ps4g: # /data/assemblies/W22.faW22 # /data/assemblies/PH207.faPH207 # -# Chromosome List Format: +# Chromosome List Format (for generate_recombined_sequences): # - Plain text file with one chromosome name per line # - Names should match chromosome names in assemblies +# - If not provided, auto-derives from first assembly in assembly list using BioKotlin # - Example: # chr1 # chr2 @@ -421,6 +421,8 @@ convert_ropebwt2ps4g: # - If assembly_list not provided, uses assembly list from step 6 # - Step 9 (generate_recombined_sequences) requires: output from step 8 # - If assembly_list not provided, uses assembly list from step 6 +# - If chromosome_list not provided, auto-derives from first assembly using BioKotlin +# - If assembly_dir not provided, uses step 9 output directory # - Step 10 (format_recombined_fastas) requires: output from step 9 # # PS4G Creation Dependencies: diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index dadcb59..d3354c5 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -1,5 +1,6 @@ package net.maizegenetics.commands +import biokotlin.seqIO.NucSeqIO import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.core.parse import com.github.ajalt.clikt.parameters.options.default @@ -95,11 +96,9 @@ data class ConvertCoordinatesConfig( ) data class GenerateRecombinedSequencesConfig( - val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step - val chromosome_list: String, - val assembly_dir: String, - val input: String? = null, // Custom founder key directory - val output: String? = null // Custom output directory + val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step + val chromosome_list: String? = null, // Optional: auto-derives from first assembly in assembly list + val assembly_dir: String? = null // Optional: defaults to step 9 output directory ) data class FormatRecombinedFastasConfig( @@ -306,18 +305,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null - // Parse generate_recombined_sequences + // Parse generate_recombined_sequences - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val generateRecombinedSequencesMap = configMap["generate_recombined_sequences"] as? Map - val generateRecombinedSequences = generateRecombinedSequencesMap?.let { + val generateRecombinedSequences = if (configMap.containsKey("generate_recombined_sequences")) { GenerateRecombinedSequencesConfig( - assembly_list = it["assembly_list"] as? String, - chromosome_list = it["chromosome_list"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.chromosome_list is required"), - assembly_dir = it["assembly_dir"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.assembly_dir is required"), - input = it["input"] as? String, - output = it["output"] as? String + assembly_list = generateRecombinedSequencesMap?.get("assembly_list") as? String, + chromosome_list = generateRecombinedSequencesMap?.get("chromosome_list") as? String, + assembly_dir = generateRecombinedSequencesMap?.get("assembly_dir") as? String ) - } + } else null // Parse format_recombined_fastas - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") @@ -1103,11 +1100,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("STEP 9: Generate Recombined Sequences") logger.info("=".repeat(80)) - // Determine founder key input (custom or from previous step) - val founderKeyInput = config.generate_recombined_sequences.input?.let { Path.of(it) } ?: coordinatesOutputDir - if (founderKeyInput == null) { - throw RuntimeException("Cannot run generate-recombined-sequences: no founder key input available (specify 'input' in config or run convert-coordinates first)") + // Use founder key input from previous step (convert_coordinates) + if (coordinatesOutputDir == null) { + throw RuntimeException("Cannot run generate-recombined-sequences: no founder key input available (run convert-coordinates first)") } + logger.info("Founder key directory: $coordinatesOutputDir") // Determine assembly list (custom or from step 6) val step9AssemblyList = config.generate_recombined_sequences.assembly_list?.let { @@ -1118,25 +1115,59 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } logger.info("Assembly list: $step9AssemblyList") - // Determine output directory (custom or default) - val customOutput = config.generate_recombined_sequences.output?.let { Path.of(it) } + // Determine chromosome list (custom or auto-derived from first assembly) + val step9ChromosomeList: Path = if (config.generate_recombined_sequences.chromosome_list != null) { + Path.of(config.generate_recombined_sequences.chromosome_list).toAbsolutePath().normalize() + } else { + // Auto-derive chromosome list from the first assembly in the assembly list + val firstLine = step9AssemblyList.readLines().firstOrNull { it.isNotBlank() } + ?: throw RuntimeException("Cannot run generate-recombined-sequences: assembly list is empty") + + // Assembly list format: pathname - extract the path (first column) + val firstAssemblyPath = firstLine.split("\t").firstOrNull()?.trim() + ?: throw RuntimeException("Cannot run generate-recombined-sequences: invalid assembly list format") + + logger.info("Auto-deriving chromosome list from first assembly: $firstAssemblyPath") + + // Use BioKotlin to read the FASTA and extract chromosome IDs + val seq = NucSeqIO(firstAssemblyPath).readAll() + val chromosomeIds = seq.keys.toList() + + if (chromosomeIds.isEmpty()) { + throw RuntimeException("Cannot run generate-recombined-sequences: no chromosomes found in $firstAssemblyPath") + } + + // Write chromosome list to a temporary file + val chromosomeListFile = workDir.resolve("output").resolve("09_recombined_sequences").resolve("auto_chromosome_list.txt") + chromosomeListFile.parent.createDirectories() + chromosomeListFile.writeText(chromosomeIds.joinToString("\n")) + logger.info("Auto-generated chromosome list: $chromosomeListFile") + logger.info(" Contains ${chromosomeIds.size} chromosomes: ${chromosomeIds.take(5).joinToString(", ")}${if (chromosomeIds.size > 5) ", ..." else ""}") + + chromosomeListFile + } + + // Determine output directory (step 9 default output) + val outputBase = workDir.resolve("output").resolve("09_recombined_sequences") + + // Determine assembly directory (custom or step 9 output directory) + val step9AssemblyDir = config.generate_recombined_sequences.assembly_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: outputBase + logger.info("Assembly directory: $step9AssemblyDir") val args = buildList { add("--work-dir=${workDir}") add("--assembly-list=${step9AssemblyList}") - add("--chromosome-list=${config.generate_recombined_sequences.chromosome_list}") - add("--assembly-dir=${config.generate_recombined_sequences.assembly_dir}") - add("--founder-key-dir=${founderKeyInput}") - if (customOutput != null) { - add("--output-dir=${customOutput}") - } + add("--chromosome-list=${step9ChromosomeList}") + add("--assembly-dir=${step9AssemblyDir}") + add("--founder-key-dir=${coordinatesOutputDir}") } GenerateRecombinedSequences().parse(args) restoreOrchestratorLogging(workDir) - // Get output directory (use custom or default) - val outputBase = customOutput ?: workDir.resolve("output").resolve("09_recombined_sequences") + // Get output directory recombinedFastasDir = outputBase.resolve("recombinate_fastas") logger.info("Step 9 completed successfully") @@ -1145,9 +1176,8 @@ class Orchestrate : CliktCommand(name = "orchestrate") { if (config.generate_recombined_sequences != null) { logger.info("Skipping generate-recombined-sequences (not in run_steps)") - // Check custom output location first, then default - val customOutput = config.generate_recombined_sequences.output?.let { Path.of(it) } - val outputBase = customOutput ?: workDir.resolve("output").resolve("09_recombined_sequences") + // Check default output location + val outputBase = workDir.resolve("output").resolve("09_recombined_sequences") val previousRecombinedDir = outputBase.resolve("recombinate_fastas") if (previousRecombinedDir.exists()) { recombinedFastasDir = previousRecombinedDir From 13eee72c7ba72fe2a4f0aa6058bcec0a4138fc3e Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 14:23:11 -0600 Subject: [PATCH 06/11] Fix filepath and .fa/.fasta issues --- pipeline_config.example.yaml | 4 +- .../commands/ConvertCoordinates.kt | 14 +++-- .../commands/CreateChainFiles.kt | 24 ++++++-- .../commands/GenerateRecombinedSequences.kt | 56 ++++++++++++++++--- .../net/maizegenetics/commands/Orchestrate.kt | 11 ++-- 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 7566594..cce69c9 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -139,7 +139,7 @@ pick_crossovers: # /path/to/assembly1.faparent1 # /path/to/assembly2.faparent2 # If not specified, auto-generates from convert_to_fasta (step 4) output - # Auto-generated names: filename minus "_subsampled" suffix and extension + # Auto-generated names: filename minus "_mutated" suffix and extension # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file # Uses mutated ref FASTA from step 5 if not specified # Falls back to align_assemblies.ref_fasta if step 5 not run @@ -393,7 +393,7 @@ convert_ropebwt2ps4g: # - The same assembly list is shared across steps 6, 8, and 9: # - Step 6 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs # - Steps 8 and 9: If not provided, automatically use the assembly list from step 6 -# - Auto-generated names are derived from filename minus "_subsampled" suffix and extension +# - Auto-generated names are derived from filename minus "_mutated" suffix and extension # - Example: # /data/assemblies/B73.faB73 # /data/assemblies/Mo17.faMo17 diff --git a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt index 88c5acc..cd3c131 100644 --- a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt +++ b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt @@ -112,14 +112,16 @@ class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { // Run convert_coords.py // Set PYTHONPATH so Python can find the 'python' package for imports - val pythonPath = mlimputeDir.resolve("src").toString() + // Use absolute paths since the working directory is set to outputDir + // Use sh -c to set PYTHONPATH inside pixi's environment + val pythonPath = mlimputeDir.resolve("src").toAbsolutePath().toString() + val scriptPath = pythonScript.toAbsolutePath().toString() + val assemblyListPath = assemblyList.toAbsolutePath().toString() + val chainDirPath = chainDir.toAbsolutePath().toString() + val shellCommand = "PYTHONPATH='$pythonPath' python '$scriptPath' --assembly-list '$assemblyListPath' --chain-dir '$chainDirPath'" logger.info("Running convert_coords.py") val exitCode = ProcessRunner.runCommand( - "env", "PYTHONPATH=$pythonPath", - "pixi", "run", - "python", pythonScript.toString(), - "--assembly-list", assemblyList.toString(), - "--chain-dir", chainDir.toString(), + "pixi", "run", "sh", "-c", shellCommand, workingDir = outputDir.toFile(), logger = logger ) diff --git a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt index ea24faa..f6a8620 100644 --- a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt +++ b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt @@ -128,20 +128,34 @@ class CreateChainFiles : CliktCommand(name = "create-chain-files") { logger.info("Cleaning up temporary MAF directory") tempMafDir.toFile().deleteRecursively() - // Collect generated chain files + // Collect generated chain files (all files with .chain extension) val chainFiles = outputDir.listDirectoryEntries() .filter { it.isRegularFile() && it.extension == "chain" } .sorted() - if (chainFiles.isEmpty()) { + // Rename chain files to include "_subsampled" suffix if not already present + val renamedChainFiles = chainFiles.map { chainFile -> + val fileName = chainFile.nameWithoutExtension + if (!fileName.endsWith("_subsampled")) { + val newFileName = "${fileName}_subsampled.chain" + val newPath = chainFile.parent.resolve(newFileName) + chainFile.moveTo(newPath, overwrite = true) + logger.info("Renamed ${chainFile.fileName} to $newFileName") + newPath + } else { + chainFile + } + }.sorted() + + if (renamedChainFiles.isEmpty()) { logger.warn("No chain files generated") } else { - logger.info("Generated ${chainFiles.size} chain file(s):") - chainFiles.forEach { logger.info(" $it") } + logger.info("Generated ${renamedChainFiles.size} chain file(s):") + renamedChainFiles.forEach { logger.info(" $it") } // Write chain file paths to text file FileUtils.writeFilePaths( - chainFiles, + renamedChainFiles, outputDir.resolve(CHAIN_PATHS_FILE), logger, "Chain file" diff --git a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt index 1e0e3a5..8be6c30 100644 --- a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt +++ b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt @@ -12,6 +12,7 @@ import net.maizegenetics.utils.ProcessRunner import net.maizegenetics.utils.ValidationUtils import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger +import java.nio.file.Files import java.nio.file.Path import kotlin.io.path.* import kotlin.system.exitProcess @@ -119,17 +120,58 @@ class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-seq } } + // Create symlinks for FASTA files with different extensions + // The Python script expects .fa extension, but files might have .fasta or .fna + logger.info("Checking FASTA file extensions in assembly directory") + val assemblyNames = assemblyList.readLines() + .filter { it.isNotBlank() } + .mapNotNull { line -> + val parts = line.split("\t") + if (parts.size >= 2) parts[1].trim() else null + } + + assemblyNames.forEach { name -> + val faFile = assemblyDir.resolve("$name.fa") + if (!faFile.exists()) { + // Look for alternative extensions + val alternatives = listOf("fasta", "fna") + for (ext in alternatives) { + val altFile = assemblyDir.resolve("$name.$ext") + if (altFile.exists()) { + // Create symlink: name.fa -> name.fasta (or .fna) + try { + Files.createSymbolicLink(faFile, altFile.fileName) + logger.debug("Created symlink: ${faFile.fileName} -> ${altFile.fileName}") + + // Also create symlink for the FASTA index file if it exists + val altIndexFile = assemblyDir.resolve("$name.$ext.fai") + val faIndexFile = assemblyDir.resolve("$name.fa.fai") + if (altIndexFile.exists() && !faIndexFile.exists()) { + Files.createSymbolicLink(faIndexFile, altIndexFile.fileName) + logger.debug("Created symlink: ${faIndexFile.fileName} -> ${altIndexFile.fileName}") + } + } catch (e: Exception) { + logger.warn("Failed to create symlink for $name: ${e.message}") + } + break + } + } + } + } + // Run write_fastas.py // Set PYTHONPATH so Python can find the 'python' package for imports - val pythonPath = mlimputeDir.resolve("src").toString() + // Use absolute paths since the working directory is set to outputDir + // Use sh -c to set PYTHONPATH inside pixi's environment + val pythonPath = mlimputeDir.resolve("src").toAbsolutePath().toString() + val scriptPath = pythonScript.toAbsolutePath().toString() + val assemblyListPath = assemblyList.toAbsolutePath().toString() + val chromosomeListPath = chromosomeList.toAbsolutePath().toString() + val assemblyDirPath = assemblyDir.toAbsolutePath().toString() + val shellCommand = "PYTHONPATH='$pythonPath' python '$scriptPath' --assembly-list '$assemblyListPath' --chromosome-list '$chromosomeListPath' --assembly-dir '$assemblyDirPath'" logger.info("Running write_fastas.py") val exitCode = ProcessRunner.runCommand( - "env", "PYTHONPATH=$pythonPath", - "pixi", "run", - "python", pythonScript.toString(), - "--assembly-list", assemblyList.toString(), - "--chromosome-list", chromosomeList.toString(), - "--assembly-dir", assemblyDir.toString(), + "pixi", "run", "sh", "-c", shellCommand, workingDir = outputDir.toFile(), logger = logger ) diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index d3354c5..2704e9e 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -98,7 +98,7 @@ data class ConvertCoordinatesConfig( data class GenerateRecombinedSequencesConfig( val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step val chromosome_list: String? = null, // Optional: auto-derives from first assembly in assembly list - val assembly_dir: String? = null // Optional: defaults to step 9 output directory + val assembly_dir: String? = null // Optional: defaults to step 4 FASTA output directory ) data class FormatRecombinedFastasConfig( @@ -871,15 +871,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } // Create assembly list file with pathname format - // Name is derived from filename minus "_subsampled" suffix and extension + // Name is derived from filename minus extension (keeping _mutated suffix) val assemblyListFile = fastaOutputDir.resolve("auto_assembly_list.txt") val lines = fastaFiles.map { fastaPath -> val fileName = fastaPath.fileName.toString() // Remove extension (including .gz if present) val baseName = fileName .replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") - // Remove "_subsampled" suffix if present - .replace(Regex("_subsampled$"), "") "${fastaPath.toAbsolutePath()}\t$baseName" } assemblyListFile.writeText(lines.joinToString("\n")) @@ -1150,10 +1148,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine output directory (step 9 default output) val outputBase = workDir.resolve("output").resolve("09_recombined_sequences") - // Determine assembly directory (custom or step 9 output directory) + // Determine assembly directory (custom or from step 4 FASTA output) + // The Python script needs to read parent FASTA files which are in the FASTA output directory val step9AssemblyDir = config.generate_recombined_sequences.assembly_dir?.let { Path.of(it).toAbsolutePath().normalize() - } ?: outputBase + } ?: fastaOutputDir ?: throw RuntimeException("Cannot run generate-recombined-sequences: no assembly directory available (specify 'assembly_dir' in config or run convert-to-fasta first)") logger.info("Assembly directory: $step9AssemblyDir") val args = buildList { From b7c2f94422890c7ac70a416fd2b7c5b496fc44d1 Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 15:46:27 -0600 Subject: [PATCH 07/11] Rearrange steps --- pipeline_config.example.yaml | 116 ++++---- .../commands/AlignMutatedAssemblies.kt | 6 +- .../commands/ConvertCoordinates.kt | 8 +- .../commands/CreateChainFiles.kt | 6 +- .../commands/FormatRecombinedFastas.kt | 8 +- .../commands/GenerateRecombinedSequences.kt | 8 +- .../net/maizegenetics/commands/Orchestrate.kt | 279 ++++++------------ .../maizegenetics/commands/PickCrossovers.kt | 6 +- 8 files changed, 170 insertions(+), 267 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index cce69c9..c851a83 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -11,14 +11,14 @@ # 02. maf-to-gvcf: Convert MAF alignment files to compressed GVCF format # 03. downsample-gvcf: Downsample variants at specified rates per chromosome # 04. convert-to-fasta: Generate FASTA files from downsampled variants -# 05. align-mutated-assemblies: Realign mutated FASTA files back to reference # # RECOMBINATION PIPELINE (Generate Recombinant Genomes): -# 06. pick-crossovers: Simulate crossover events in reference coordinates -# 07. create-chain-files: Convert MAF files to chain format for coordinate conversion -# 08. convert-coordinates: Convert crossover points to assembly coordinates -# 09. generate-recombined-sequences: Create recombined FASTA files from parent assemblies -# 10. format-recombined-fastas: Format recombined FASTA files with consistent line widths +# 05. pick-crossovers: Simulate crossover events in reference coordinates +# 06. create-chain-files: Convert MAF files to chain format for coordinate conversion +# 07. convert-coordinates: Convert crossover points to assembly coordinates +# 08. generate-recombined-sequences: Create recombined FASTA files from parent assemblies +# 09. format-recombined-fastas: Format recombined FASTA files with consistent line widths +# 10. align-mutated-assemblies: Realign formatted recombined FASTA files to reference # # PS4G CREATION (PHG Indexing for Genotype Imputation): # 11. rope-bwt-chr-index: Create PHGv2 ropebwt3 index from recombined FASTA files @@ -46,14 +46,14 @@ run_steps: - maf_to_gvcf # Step 02: Convert alignments to GVCF - downsample_gvcf # Step 03: Downsample variants - convert_to_fasta # Step 04: Generate mutated FASTA files - - align_mutated_assemblies # Step 05: Realign mutated sequences # Recombination Pipeline (Generate Recombinant Genomes) - - pick_crossovers # Step 06: Pick crossover points - - create_chain_files # Step 07: Create coordinate conversion chains - - convert_coordinates # Step 08: Convert coordinates to assembly space - - generate_recombined_sequences # Step 09: Generate recombined sequences - - format_recombined_fastas # Step 10: Format recombined FASTA files + - pick_crossovers # Step 05: Pick crossover points + - create_chain_files # Step 06: Create coordinate conversion chains + - convert_coordinates # Step 07: Convert coordinates to assembly space + - generate_recombined_sequences # Step 08: Generate recombined sequences + - format_recombined_fastas # Step 09: Format recombined FASTA files + - align_mutated_assemblies # Step 10: Realign formatted sequences to reference # PS4G Creation (PHG Indexing for Genotype Imputation) - rope_bwt_chr_index # Step 11: Create PHGv2 ropebwt3 index @@ -62,7 +62,7 @@ run_steps: - convert_ropebwt2ps4g # Step 14: Convert BED alignments to PS4G format # ============================================================================ -# MAIN PIPELINE CONFIGURATION (Steps 1-5) +# MAIN PIPELINE CONFIGURATION (Steps 1-4) # ============================================================================ # Step 1: Align Assemblies @@ -109,25 +109,11 @@ convert_to_fasta: # input: "/custom/gvcf/input/" # Optional: Custom GVCF input (file, directory, or text list) # output: "/custom/fasta/output/" # Optional: Custom output directory -# Step 5: Align Mutated Assemblies -# Realigns the mutated FASTA files from convert_to_fasta back to the reference -# Auto-detects reference and query FASTAs from step 4 output based on filename matching -# This creates a circular workflow where mutated sequences are realigned for comparison -align_mutated_assemblies: - # ref_gff: "path/to/reference.gff" # Optional: Reference GFF file - # Uses align_assemblies.ref_gff if not specified - # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA from step 4 output - # Auto-detects file matching original ref name if not specified - # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) - # Auto-detects non-ref files from step 4 if not specified - threads: 1 # Optional: Number of threads (default: 1) - # output: "/custom/alignment/output/" # Optional: Custom output directory - # ============================================================================ -# RECOMBINATION PIPELINE CONFIGURATION (Steps 6-10) +# RECOMBINATION PIPELINE CONFIGURATION (Steps 5-10) # ============================================================================ -# Step 6: Pick Crossovers +# Step 5: Pick Crossovers # Simulates crossover events to generate recombination breakpoints # Uses MLImpute's pick_crossovers.py to simulate synthetic recombinant genomes # Generates two populations: landrace (1250 rounds) and two-parent cross (1 round) @@ -141,22 +127,20 @@ pick_crossovers: # If not specified, auto-generates from convert_to_fasta (step 4) output # Auto-generated names: filename minus "_mutated" suffix and extension # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file - # Uses mutated ref FASTA from step 5 if not specified - # Falls back to align_assemblies.ref_fasta if step 5 not run + # Uses align_assemblies.ref_fasta if not specified # output: "/custom/crossovers/output/" # Optional: Custom output directory -# Step 7: Create Chain Files +# Step 6: Create Chain Files # Converts MAF alignment files to UCSC chain format for coordinate conversion -# By default, uses MAF files from align_mutated_assemblies (step 5) -# Falls back to MAF files from align_assemblies (step 1) if step 5 not run +# Uses MAF files from align_assemblies (step 1) # Uses maf-convert tool (automatically downloaded if missing) create_chain_files: jobs: 8 # Optional: Number of parallel jobs (default: 8) # maf_file_input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list) - # Default: step 5 MAF outputs, fallback to step 1 + # Default: step 1 MAF outputs # output: "/custom/chain/output/" # Optional: Custom output directory -# Step 8: Convert Coordinates +# Step 7: Convert Coordinates # Converts crossover breakpoints from reference coordinates to assembly coordinates # Automatically uses refkey files from pick_crossovers and chain files from create_chain_files # Uses CrossMap for coordinate conversion via chain files @@ -166,9 +150,9 @@ convert_coordinates: # input_refkey: "/custom/refkey/directory/" # Optional: Custom refkey directory # output: "/custom/coordinates/output/" # Optional: Custom output directory -# Step 9: Generate Recombined Sequences +# Step 8: Generate Recombined Sequences # Creates recombined FASTA sequences by concatenating segments from parent assemblies -# Automatically uses founder key files from convert_coordinates (step 8) +# Automatically uses founder key files from convert_coordinates (step 7) # Each founder gets its own recombined FASTA file generate_recombined_sequences: # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers @@ -182,7 +166,7 @@ generate_recombined_sequences: # Defaults to this step's output directory -# Step 10: Format Recombined Fastas +# Step 9: Format Recombined Fastas # Reformats recombined FASTA files to have consistent line widths using seqkit # Automatically uses recombined FASTA files from generate_recombined_sequences # Ensures compatibility with downstream tools @@ -192,6 +176,20 @@ format_recombined_fastas: # input: "/custom/recombined/fastas/" # Optional: Custom FASTA input (file, directory, or text list) # output: "/custom/formatted/output/" # Optional: Custom output directory +# Step 10: Align Mutated Assemblies +# Realigns the formatted recombined FASTA files back to the reference +# Uses reference GFF and FASTA from align_assemblies (step 1) +# Uses FASTA files from format_recombined_fastas (step 9) as query input +align_mutated_assemblies: + # ref_gff: "path/to/reference.gff" # Optional: Reference GFF file + # Uses align_assemblies.ref_gff if not specified + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses align_assemblies.ref_fasta if not specified + # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) + # Uses format_recombined_fastas output if not specified + threads: 1 # Optional: Number of threads (default: 1) + # output: "/custom/alignment/output/" # Optional: Custom output directory + # ============================================================================ # PS4G CREATION CONFIGURATION (Steps 11-14) # ============================================================================ @@ -258,39 +256,37 @@ convert_ropebwt2ps4g: # ./gradlew run --args="orchestrate --config pipeline_config.yaml" # Note: Environment setup runs automatically if needed on first run! # -# 2. Run only the main pipeline (variant simulation, steps 1-5): +# 2. Run only the main pipeline (variant simulation, steps 1-4): # run_steps: # - align_assemblies # - maf_to_gvcf # - downsample_gvcf # - convert_to_fasta -# - align_mutated_assemblies # # Comment out recombination steps # -# 3. Run only the recombination pipeline with PS4G creation (steps 1, 6-14): +# 3. Run only the recombination pipeline with PS4G creation (steps 1, 5-14): # run_steps: # - align_assemblies # Required for MAF files # # - maf_to_gvcf # Skip GVCF steps # # - downsample_gvcf # # - convert_to_fasta -# # - align_mutated_assemblies # - pick_crossovers # - create_chain_files # - convert_coordinates # - generate_recombined_sequences # - format_recombined_fastas +# - align_mutated_assemblies # Realign recombined sequences # - rope_bwt_chr_index # Create PHG index # - ropebwt_mem # Align FASTQ reads # - build_spline_knots # Build spline knots # - convert_ropebwt2ps4g # Convert BED to PS4G # -# 4. Run steps 1-2-4-5 (skip downsampling): +# 4. Run steps 1-2-4 (skip downsampling): # run_steps: # - align_assemblies # - maf_to_gvcf # # - downsample_gvcf # Commented out # - convert_to_fasta # Will use maf_to_gvcf outputs -# - align_mutated_assemblies # # 5. Rerun only format step (requires previous outputs): # run_steps: @@ -390,9 +386,9 @@ convert_ropebwt2ps4g: # Assembly List Format (for pick_crossovers, convert_coordinates, generate_recombined_sequences): # - Tab-separated file with two columns: path and name # - MUST have an EVEN number of assemblies (assemblies are paired for crossover simulation) -# - The same assembly list is shared across steps 6, 8, and 9: -# - Step 6 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs -# - Steps 8 and 9: If not provided, automatically use the assembly list from step 6 +# - The same assembly list is shared across steps 5, 7, and 8: +# - Step 5 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs +# - Steps 7 and 8: If not provided, automatically use the assembly list from step 5 # - Auto-generated names are derived from filename minus "_mutated" suffix and extension # - Example: # /data/assemblies/B73.faB73 @@ -410,20 +406,22 @@ convert_ropebwt2ps4g: # chr3 # # Recombination Pipeline Dependencies: -# - Step 6 (pick_crossovers) requires: ref_fasta from step 1 or step 5 +# - Step 5 (pick_crossovers) requires: ref_fasta from step 1 # - If assembly_list not provided, auto-generates from step 4 FASTA outputs # - MUST have an EVEN number of assemblies (will error if odd) -# - Assembly list is saved and shared with steps 8 and 9 -# - Step 7 (create_chain_files) requires: MAF files from step 5 (default) or step 1 (fallback) -# - Uses align_mutated_assemblies MAF outputs by default -# - Falls back to align_assemblies MAF outputs if step 5 not run -# - Step 8 (convert_coordinates) requires: outputs from steps 6 and 7 -# - If assembly_list not provided, uses assembly list from step 6 -# - Step 9 (generate_recombined_sequences) requires: output from step 8 -# - If assembly_list not provided, uses assembly list from step 6 +# - Assembly list is saved and shared with steps 7 and 8 +# - Step 6 (create_chain_files) requires: MAF files from step 1 +# - Uses align_assemblies MAF outputs +# - Step 7 (convert_coordinates) requires: outputs from steps 5 and 6 +# - If assembly_list not provided, uses assembly list from step 5 +# - Step 8 (generate_recombined_sequences) requires: output from step 7 +# - If assembly_list not provided, uses assembly list from step 5 # - If chromosome_list not provided, auto-derives from first assembly using BioKotlin -# - If assembly_dir not provided, uses step 9 output directory -# - Step 10 (format_recombined_fastas) requires: output from step 9 +# - If assembly_dir not provided, uses step 8 output directory +# - Step 9 (format_recombined_fastas) requires: output from step 8 +# - Step 10 (align_mutated_assemblies) requires: outputs from steps 1 and 9 +# - Uses ref_gff and ref_fasta from step 1 +# - Uses FASTA files from step 9 as query input # # PS4G Creation Dependencies: # - Step 11 (rope_bwt_chr_index) requires: formatted FASTA files from step 10 diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt index 5331064..c642224 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt @@ -19,8 +19,8 @@ import kotlin.system.exitProcess class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { companion object { - private const val LOG_FILE_NAME = "05_align_mutated_assemblies.log" - private const val MUTATED_ALIGNMENT_RESULTS_DIR = "05_mutated_alignment_results" + private const val LOG_FILE_NAME = "10_align_mutated_assemblies.log" + private const val MUTATED_ALIGNMENT_RESULTS_DIR = "10_mutated_alignment_results" private const val MAF_PATHS_FILE = "maf_file_paths.txt" // minimap2 parameters @@ -71,7 +71,7 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { private val outputDir by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/05_mutated_alignment_results)" + help = "Custom output directory (default: work_dir/output/10_mutated_alignment_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectFastaFiles(): List { diff --git a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt index cd3c131..14951aa 100644 --- a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt +++ b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt @@ -18,12 +18,12 @@ import kotlin.system.exitProcess class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { companion object { - private const val LOG_FILE_NAME = "08_convert_coordinates.log" - private const val COORDS_RESULTS_DIR = "08_coordinates_results" + private const val LOG_FILE_NAME = "07_convert_coordinates.log" + private const val COORDS_RESULTS_DIR = "07_coordinates_results" private const val KEY_PATHS_FILE = "key_file_paths.txt" private const val FOUNDER_KEY_PATHS_FILE = "founder_key_file_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/convert_coords.py" - private const val DEFAULT_REFKEY_DIR = "06_crossovers_results" + private const val DEFAULT_REFKEY_DIR = "05_crossovers_results" } private val logger: Logger = LogManager.getLogger(ConvertCoordinates::class.java) @@ -53,7 +53,7 @@ class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/08_coordinates_results)" + help = "Custom output directory (default: work_dir/output/07_coordinates_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { diff --git a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt index f6a8620..ff8d2c9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt +++ b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt @@ -19,8 +19,8 @@ import kotlin.system.exitProcess class CreateChainFiles : CliktCommand(name = "create-chain-files") { companion object { - private const val LOG_FILE_NAME = "07_create_chain_files.log" - private const val CHAIN_RESULTS_DIR = "07_chain_results" + private const val LOG_FILE_NAME = "06_create_chain_files.log" + private const val CHAIN_RESULTS_DIR = "06_chain_results" private const val CHAIN_PATHS_FILE = "chain_file_paths.txt" private const val BASH_SCRIPT = "src/python/cross/create_chains.sh" private const val DEFAULT_JOBS = 8 @@ -49,7 +49,7 @@ class CreateChainFiles : CliktCommand(name = "create-chain-files") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/07_chain_results)" + help = "Custom output directory (default: work_dir/output/06_chain_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectMafFiles(): List { diff --git a/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt b/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt index eeed27b..5e5cd73 100644 --- a/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt +++ b/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt @@ -18,12 +18,12 @@ import kotlin.system.exitProcess class FormatRecombinedFastas : CliktCommand(name = "format-recombined-fastas") { companion object { - private const val LOG_FILE_NAME = "10_format_recombined_fastas.log" - private const val FORMATTED_RESULTS_DIR = "10_formatted_fastas" + private const val LOG_FILE_NAME = "09_format_recombined_fastas.log" + private const val FORMATTED_RESULTS_DIR = "09_formatted_fastas" private const val FORMATTED_FASTA_PATHS_FILE = "formatted_fasta_paths.txt" private const val DEFAULT_LINE_WIDTH = 60 private const val DEFAULT_THREADS = 8 - private const val DEFAULT_INPUT_DIR = "09_recombined_sequences/recombinate_fastas" + private const val DEFAULT_INPUT_DIR = "08_recombined_sequences/recombinate_fastas" } private val logger: Logger = LogManager.getLogger(FormatRecombinedFastas::class.java) @@ -53,7 +53,7 @@ class FormatRecombinedFastas : CliktCommand(name = "format-recombined-fastas") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/10_formatted_fastas)" + help = "Custom output directory (default: work_dir/output/09_formatted_fastas)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectFastaFiles(): List { diff --git a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt index 8be6c30..d15f8bf 100644 --- a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt +++ b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt @@ -19,12 +19,12 @@ import kotlin.system.exitProcess class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-sequences") { companion object { - private const val LOG_FILE_NAME = "09_generate_recombined_sequences.log" - private const val RECOMBINED_RESULTS_DIR = "09_recombined_sequences" + private const val LOG_FILE_NAME = "08_generate_recombined_sequences.log" + private const val RECOMBINED_RESULTS_DIR = "08_recombined_sequences" private const val RECOMBINED_FASTAS_DIR = "recombinate_fastas" private const val FASTA_PATHS_FILE = "recombined_fasta_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/write_fastas.py" - private const val DEFAULT_FOUNDER_KEY_DIR = "08_coordinates_results" + private const val DEFAULT_FOUNDER_KEY_DIR = "07_coordinates_results" } private val logger: Logger = LogManager.getLogger(GenerateRecombinedSequences::class.java) @@ -60,7 +60,7 @@ class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-seq private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/09_recombined_sequences)" + help = "Custom output directory (default: work_dir/output/08_recombined_sequences)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 2704e9e..4256c2a 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -70,8 +70,8 @@ data class ConvertToFastaConfig( data class AlignMutatedAssembliesConfig( val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) - val ref_fasta: String? = null, // Optional: Reference FASTA (uses matching ref from step 4 output if not specified) - val fasta_input: String? = null, // Optional: Query FASTA input (uses non-ref files from step 4 if not specified) + val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) val threads: Int? = null, val output: String? = null // Custom output directory ) @@ -392,9 +392,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var fastaOutputDir: Path? = null var refFasta: Path? = null var refGff: Path? = null - var mutatedRefFasta: Path? = null // Mutated reference FASTA from step 5 - var mutatedMafFilePaths: Path? = null // MAF file paths from step 5 (align_mutated_assemblies) - var assemblyListPath: Path? = null // Assembly list from step 6 (pick_crossovers) + var assemblyListPath: Path? = null // Assembly list from step 5 (pick_crossovers) var refkeyOutputDir: Path? = null var chainOutputDir: Path? = null var coordinatesOutputDir: Path? = null @@ -696,160 +694,17 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 5: Align Mutated Assemblies (if configured and should run) - if (config.align_mutated_assemblies != null && shouldRunStep("align_mutated_assemblies", config)) { - logger.info("=".repeat(80)) - logger.info("STEP 5: Align Mutated Assemblies") - logger.info("=".repeat(80)) - - // Determine ref_gff (config value or from step 1) - val step5RefGff = config.align_mutated_assemblies.ref_gff?.let { Path.of(it) } ?: refGff - if (step5RefGff == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available (specify 'ref_gff' in config or run align_assemblies first)") - } - - // Determine ref_fasta and fasta_input from step 4 output - // If fastaOutputDir exists, find the reference FASTA (matching original ref name) and query FASTAs - var step5RefFasta: Path? = config.align_mutated_assemblies.ref_fasta?.let { Path.of(it) } - var step5FastaInput: Path? = config.align_mutated_assemblies.fasta_input?.let { Path.of(it) } - - // If not explicitly specified, try to derive from step 4 output - if (step5RefFasta == null || step5FastaInput == null) { - if (fastaOutputDir != null && fastaOutputDir.exists()) { - // Get the reference FASTA filename (without path) to match against step 4 output - val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") - - if (refFastaName != null) { - // Find all FASTA files in the output directory - val allFastaFiles = fastaOutputDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) - }?.map { it.toPath() } ?: emptyList() - - // Find the reference FASTA (filename contains the ref name) - val matchingRefFasta = allFastaFiles.find { path -> - path.fileName.toString().contains(refFastaName, ignoreCase = true) - } - - // Get non-reference FASTAs - val queryFastas = allFastaFiles.filter { path -> - !path.fileName.toString().contains(refFastaName, ignoreCase = true) - } - - if (step5RefFasta == null && matchingRefFasta != null) { - step5RefFasta = matchingRefFasta - logger.info("Auto-detected reference FASTA from step 4: $step5RefFasta") - } - - if (step5FastaInput == null && queryFastas.isNotEmpty()) { - // Create a text file listing the query FASTAs - val queryListFile = fastaOutputDir.resolve("query_fastas.txt") - queryListFile.writeText(queryFastas.joinToString("\n") { it.toAbsolutePath().toString() }) - step5FastaInput = queryListFile - logger.info("Auto-detected ${queryFastas.size} query FASTA files from step 4") - } - } - } - - // Fall back to original behavior if auto-detection failed - if (step5RefFasta == null) { - step5RefFasta = refFasta - } - if (step5FastaInput == null) { - step5FastaInput = fastaOutputDir - } - } - - if (step5FastaInput == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'fasta_input' in config or run convert-to-fasta first)") - } - if (step5RefFasta == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available (specify 'ref_fasta' in config or run convert-to-fasta first)") - } - - // Determine output directory (custom or default) - val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } - - val args = buildList { - add("--work-dir=${workDir}") - add("--ref-gff=${step5RefGff}") - add("--ref-fasta=${step5RefFasta}") - add("--fasta-input=${step5FastaInput}") - if (config.align_mutated_assemblies.threads != null) { - add("--threads=${config.align_mutated_assemblies.threads}") - } - if (customOutput != null) { - add("--output-dir=${customOutput}") - } - } - - AlignMutatedAssemblies().parse(args) - restoreOrchestratorLogging(workDir) - - // Save the mutated reference FASTA for use in step 6 - mutatedRefFasta = step5RefFasta - - // Save the MAF file paths for use in step 7 (create_chain_files) - val step5OutputDir = customOutput ?: workDir.resolve("output").resolve("05_mutated_align_results") - mutatedMafFilePaths = step5OutputDir.resolve("maf_file_paths.txt") - if (mutatedMafFilePaths!!.exists()) { - logger.info("Step 5 MAF file paths: $mutatedMafFilePaths") - } else { - logger.warn("Step 5 MAF file paths not found: $mutatedMafFilePaths") - mutatedMafFilePaths = null - } - - logger.info("Step 5 completed successfully") - logger.info("") - } else { - if (config.align_mutated_assemblies != null) { - logger.info("Skipping align-mutated-assemblies (not in run_steps)") - - // Try to recover mutated ref FASTA from step 5 config or step 4 output - if (config.align_mutated_assemblies.ref_fasta != null) { - mutatedRefFasta = Path.of(config.align_mutated_assemblies.ref_fasta) - logger.info("Using configured mutated reference FASTA: $mutatedRefFasta") - } else if (fastaOutputDir != null && fastaOutputDir.exists()) { - // Try to auto-detect from step 4 output - val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") - if (refFastaName != null) { - val matchingRefFasta = fastaOutputDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) && - file.name.contains(refFastaName, ignoreCase = true) - }?.firstOrNull()?.toPath() - if (matchingRefFasta != null) { - mutatedRefFasta = matchingRefFasta - logger.info("Auto-detected mutated reference FASTA: $mutatedRefFasta") - } - } - } - - // Try to recover MAF file paths from previous step 5 run - val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } - val step5OutputDir = customOutput ?: workDir.resolve("output").resolve("05_mutated_align_results") - val previousMafPaths = step5OutputDir.resolve("maf_file_paths.txt") - if (previousMafPaths.exists()) { - mutatedMafFilePaths = previousMafPaths - logger.info("Using previous step 5 MAF file paths: $mutatedMafFilePaths") - } - } else { - logger.info("Skipping align-mutated-assemblies (not configured)") - } - logger.info("") - } - - // Step 6: Pick Crossovers (if configured and should run) + // Step 5: Pick Crossovers (if configured and should run) if (config.pick_crossovers != null && shouldRunStep("pick_crossovers", config)) { logger.info("=".repeat(80)) - logger.info("STEP 6: Pick Crossovers") + logger.info("STEP 5: Pick Crossovers") logger.info("=".repeat(80)) - // Use pick_crossovers.ref_fasta if specified, otherwise use mutated ref FASTA from step 5, - // finally fall back to original ref FASTA from step 1 + // Use pick_crossovers.ref_fasta if specified, otherwise use ref FASTA from step 1 val pickCrossoversRefFasta = config.pick_crossovers.ref_fasta?.let { Path.of(it) } - ?: mutatedRefFasta ?: refFasta if (pickCrossoversRefFasta == null) { - throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config, run align_mutated_assemblies, or run align_assemblies first)") + throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config or run align_assemblies first)") } // Determine assembly list (custom or auto-generated from step 4) @@ -916,13 +771,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - refkeyOutputDir = customOutput ?: workDir.resolve("output").resolve("06_crossovers_results") + refkeyOutputDir = customOutput ?: workDir.resolve("output").resolve("05_crossovers_results") if (!refkeyOutputDir.exists()) { throw RuntimeException("Expected refkey output directory not found: $refkeyOutputDir") } - logger.info("Step 6 completed successfully") + logger.info("Step 5 completed successfully") logger.info("") } else { if (config.pick_crossovers != null) { @@ -930,7 +785,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.pick_crossovers.output?.let { Path.of(it) } - val previousRefkeyDir = customOutput ?: workDir.resolve("output").resolve("06_crossovers_results") + val previousRefkeyDir = customOutput ?: workDir.resolve("output").resolve("05_crossovers_results") if (previousRefkeyDir.exists()) { refkeyOutputDir = previousRefkeyDir logger.info("Using previous pick-crossovers outputs: $refkeyOutputDir") @@ -955,18 +810,17 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 7: Create Chain Files (if configured and should run) + // Step 6: Create Chain Files (if configured and should run) if (config.create_chain_files != null && shouldRunStep("create_chain_files", config)) { logger.info("=".repeat(80)) - logger.info("STEP 7: Create Chain Files") + logger.info("STEP 6: Create Chain Files") logger.info("=".repeat(80)) - // Determine input (custom, or step 5 MAF files, or step 1 MAF files as fallback) + // Determine input (custom or step 1 MAF files) val mafInput = config.create_chain_files.maf_file_input?.let { Path.of(it) } - ?: mutatedMafFilePaths ?: mafFilePaths if (mafInput == null) { - throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'maf_file_input' in config, run align-mutated-assemblies, or run align-assemblies first)") + throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'maf_file_input' in config or run align-assemblies first)") } logger.info("MAF input: $mafInput") @@ -988,13 +842,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - chainOutputDir = customOutput ?: workDir.resolve("output").resolve("07_chain_results") + chainOutputDir = customOutput ?: workDir.resolve("output").resolve("06_chain_results") if (!chainOutputDir.exists()) { throw RuntimeException("Expected chain output directory not found: $chainOutputDir") } - logger.info("Step 7 completed successfully") + logger.info("Step 6 completed successfully") logger.info("") } else { if (config.create_chain_files != null) { @@ -1002,30 +856,23 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.create_chain_files.output?.let { Path.of(it) } - val previousChainDir = customOutput ?: workDir.resolve("output").resolve("07_chain_results") + val previousChainDir = customOutput ?: workDir.resolve("output").resolve("06_chain_results") if (previousChainDir.exists()) { chainOutputDir = previousChainDir logger.info("Using previous create-chain-files outputs: $chainOutputDir") } else { logger.warn("Previous create-chain-files outputs not found. Downstream steps may fail.") } - - // Also recover MAF file path info if not already set - if (mutatedMafFilePaths == null && mafFilePaths == null) { - config.create_chain_files.maf_file_input?.let { - logger.info("Custom MAF input configured: $it") - } - } } else { logger.info("Skipping create-chain-files (not configured)") } logger.info("") } - // Step 8: Convert Coordinates (if configured and should run) + // Step 7: Convert Coordinates (if configured and should run) if (config.convert_coordinates != null && shouldRunStep("convert_coordinates", config)) { logger.info("=".repeat(80)) - logger.info("STEP 8: Convert Coordinates") + logger.info("STEP 7: Convert Coordinates") logger.info("=".repeat(80)) // Determine chain input (custom or from previous step) @@ -1065,13 +912,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - coordinatesOutputDir = customOutput ?: workDir.resolve("output").resolve("08_coordinates_results") + coordinatesOutputDir = customOutput ?: workDir.resolve("output").resolve("07_coordinates_results") if (!coordinatesOutputDir.exists()) { throw RuntimeException("Expected coordinates output directory not found: $coordinatesOutputDir") } - logger.info("Step 8 completed successfully") + logger.info("Step 7 completed successfully") logger.info("") } else { if (config.convert_coordinates != null) { @@ -1079,7 +926,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.convert_coordinates.output?.let { Path.of(it) } - val previousCoordsDir = customOutput ?: workDir.resolve("output").resolve("08_coordinates_results") + val previousCoordsDir = customOutput ?: workDir.resolve("output").resolve("07_coordinates_results") if (previousCoordsDir.exists()) { coordinatesOutputDir = previousCoordsDir logger.info("Using previous convert-coordinates outputs: $coordinatesOutputDir") @@ -1092,10 +939,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 9: Generate Recombined Sequences (if configured and should run) + // Step 8: Generate Recombined Sequences (if configured and should run) if (config.generate_recombined_sequences != null && shouldRunStep("generate_recombined_sequences", config)) { logger.info("=".repeat(80)) - logger.info("STEP 9: Generate Recombined Sequences") + logger.info("STEP 8: Generate Recombined Sequences") logger.info("=".repeat(80)) // Use founder key input from previous step (convert_coordinates) @@ -1136,7 +983,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } // Write chromosome list to a temporary file - val chromosomeListFile = workDir.resolve("output").resolve("09_recombined_sequences").resolve("auto_chromosome_list.txt") + val chromosomeListFile = workDir.resolve("output").resolve("08_recombined_sequences").resolve("auto_chromosome_list.txt") chromosomeListFile.parent.createDirectories() chromosomeListFile.writeText(chromosomeIds.joinToString("\n")) logger.info("Auto-generated chromosome list: $chromosomeListFile") @@ -1145,8 +992,8 @@ class Orchestrate : CliktCommand(name = "orchestrate") { chromosomeListFile } - // Determine output directory (step 9 default output) - val outputBase = workDir.resolve("output").resolve("09_recombined_sequences") + // Determine output directory (step 8 default output) + val outputBase = workDir.resolve("output").resolve("08_recombined_sequences") // Determine assembly directory (custom or from step 4 FASTA output) // The Python script needs to read parent FASTA files which are in the FASTA output directory @@ -1169,14 +1016,14 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Get output directory recombinedFastasDir = outputBase.resolve("recombinate_fastas") - logger.info("Step 9 completed successfully") + logger.info("Step 8 completed successfully") logger.info("") } else { if (config.generate_recombined_sequences != null) { logger.info("Skipping generate-recombined-sequences (not in run_steps)") // Check default output location - val outputBase = workDir.resolve("output").resolve("09_recombined_sequences") + val outputBase = workDir.resolve("output").resolve("08_recombined_sequences") val previousRecombinedDir = outputBase.resolve("recombinate_fastas") if (previousRecombinedDir.exists()) { recombinedFastasDir = previousRecombinedDir @@ -1190,10 +1037,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 10: Format Recombined Fastas (if configured and should run) + // Step 9: Format Recombined Fastas (if configured and should run) if (config.format_recombined_fastas != null && shouldRunStep("format_recombined_fastas", config)) { logger.info("=".repeat(80)) - logger.info("STEP 10: Format Recombined Fastas") + logger.info("STEP 9: Format Recombined Fastas") logger.info("=".repeat(80)) // Determine input (custom or from previous step) @@ -1223,9 +1070,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - formattedFastasDir = customOutput ?: workDir.resolve("output").resolve("10_formatted_fastas") + formattedFastasDir = customOutput ?: workDir.resolve("output").resolve("09_formatted_fastas") - logger.info("Step 10 completed successfully") + logger.info("Step 9 completed successfully") logger.info("") } else { if (config.format_recombined_fastas != null) { @@ -1233,7 +1080,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.format_recombined_fastas.output?.let { Path.of(it) } - val previousFormattedDir = customOutput ?: workDir.resolve("output").resolve("10_formatted_fastas") + val previousFormattedDir = customOutput ?: workDir.resolve("output").resolve("09_formatted_fastas") if (previousFormattedDir.exists()) { formattedFastasDir = previousFormattedDir logger.info("Using previous format-recombined-fastas outputs: $formattedFastasDir") @@ -1246,6 +1093,64 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } + // Step 10: Align Mutated Assemblies (if configured and should run) + if (config.align_mutated_assemblies != null && shouldRunStep("align_mutated_assemblies", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 10: Align Mutated Assemblies") + logger.info("=".repeat(80)) + + // Determine ref_gff (config value or from step 1) + val step10RefGff = config.align_mutated_assemblies.ref_gff?.let { Path.of(it) } ?: refGff + if (step10RefGff == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available (specify 'ref_gff' in config or run align_assemblies first)") + } + + // Determine ref_fasta (config value or from step 1) + val step10RefFasta = config.align_mutated_assemblies.ref_fasta?.let { Path.of(it) } ?: refFasta + if (step10RefFasta == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available (specify 'ref_fasta' in config or run align_assemblies first)") + } + + // Determine fasta_input (config value or from format_recombined_fastas output) + val step10FastaInput = config.align_mutated_assemblies.fasta_input?.let { Path.of(it) } ?: formattedFastasDir + if (step10FastaInput == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'fasta_input' in config or run format-recombined-fastas first)") + } + + logger.info("Reference GFF: $step10RefGff") + logger.info("Reference FASTA: $step10RefFasta") + logger.info("FASTA input: $step10FastaInput") + + // Determine output directory (custom or default) + val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } + + val args = buildList { + add("--work-dir=${workDir}") + add("--ref-gff=${step10RefGff}") + add("--ref-fasta=${step10RefFasta}") + add("--fasta-input=${step10FastaInput}") + if (config.align_mutated_assemblies.threads != null) { + add("--threads=${config.align_mutated_assemblies.threads}") + } + if (customOutput != null) { + add("--output-dir=${customOutput}") + } + } + + AlignMutatedAssemblies().parse(args) + restoreOrchestratorLogging(workDir) + + logger.info("Step 10 completed successfully") + logger.info("") + } else { + if (config.align_mutated_assemblies != null) { + logger.info("Skipping align-mutated-assemblies (not in run_steps)") + } else { + logger.info("Skipping align-mutated-assemblies (not configured)") + } + logger.info("") + } + // Pipeline completed successfully logger.info("=".repeat(80)) logger.info("PIPELINE COMPLETED SUCCESSFULLY!") diff --git a/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt b/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt index 974606a..6063bf9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt +++ b/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt @@ -18,8 +18,8 @@ import kotlin.system.exitProcess class PickCrossovers : CliktCommand(name = "pick-crossovers") { companion object { - private const val LOG_FILE_NAME = "06_pick_crossovers.log" - private const val CROSSOVERS_RESULTS_DIR = "06_crossovers_results" + private const val LOG_FILE_NAME = "05_pick_crossovers.log" + private const val CROSSOVERS_RESULTS_DIR = "05_crossovers_results" private const val REFKEY_PATHS_FILE = "refkey_file_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/pick_crossovers.py" } @@ -46,7 +46,7 @@ class PickCrossovers : CliktCommand(name = "pick-crossovers") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/06_crossovers_results)" + help = "Custom output directory (default: work_dir/output/05_crossovers_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { From 7a47920dfa549eed270a73d58e4475e2658b444a Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 16:41:41 -0600 Subject: [PATCH 08/11] Add MAF conversion; improve default inputs --- pipeline_config.example.yaml | 124 ++++---- .../net/maizegenetics/commands/Orchestrate.kt | 273 +++++++++++++++++- 2 files changed, 344 insertions(+), 53 deletions(-) diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index c851a83..6a00c6d 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -6,7 +6,7 @@ # ============================================================================ # This pipeline consists of two main workflows: # -# MAIN PIPELINE (Variant Simulation): +# VARIANT PIPELINE (Generate variants): # 01. align-assemblies: Align query assemblies to reference using AnchorWave # 02. maf-to-gvcf: Convert MAF alignment files to compressed GVCF format # 03. downsample-gvcf: Downsample variants at specified rates per chromosome @@ -18,13 +18,14 @@ # 07. convert-coordinates: Convert crossover points to assembly coordinates # 08. generate-recombined-sequences: Create recombined FASTA files from parent assemblies # 09. format-recombined-fastas: Format recombined FASTA files with consistent line widths -# 10. align-mutated-assemblies: Realign formatted recombined FASTA files to reference # # PS4G CREATION (PHG Indexing for Genotype Imputation): -# 11. rope-bwt-chr-index: Create PHGv2 ropebwt3 index from recombined FASTA files -# 12. ropebwt-mem: Align FASTQ reads to ropebwt3 index and generate BED files -# 13. build-spline-knots: Build spline knots from hVCF or gVCF files for imputation -# 14. convert-ropebwt2ps4g: Convert RopeBWT3 BED alignments to PS4G format +# 10. align-mutated-assemblies: Realign formatted recombined FASTA files to reference +# 11. mutated-maf-to-gvcf: Convert mutated MAF files to compressed GVCF format +# 12. rope-bwt-chr-index: Create PHGv2 ropebwt3 index from recombined FASTA files +# 13. ropebwt-mem: Align FASTQ reads to ropebwt3 index and generate BED files +# 14. build-spline-knots: Build spline knots from hVCF or gVCF files for imputation +# 15. convert-ropebwt2ps4g: Convert RopeBWT3 BED alignments to PS4G format # # NOTE (1): Environment setup is automatic! The orchestrate command will automatically # detect if the working directory and required tools (MLImpute, biokotlin-tools, PHGv2) @@ -53,13 +54,14 @@ run_steps: - convert_coordinates # Step 07: Convert coordinates to assembly space - generate_recombined_sequences # Step 08: Generate recombined sequences - format_recombined_fastas # Step 09: Format recombined FASTA files - - align_mutated_assemblies # Step 10: Realign formatted sequences to reference # PS4G Creation (PHG Indexing for Genotype Imputation) - - rope_bwt_chr_index # Step 11: Create PHGv2 ropebwt3 index - - ropebwt_mem # Step 12: Align FASTQ reads to index - - build_spline_knots # Step 13: Build spline knots from VCF files - - convert_ropebwt2ps4g # Step 14: Convert BED alignments to PS4G format + - align_mutated_assemblies # Step 10: Realign formatted sequences to reference + - mutated_maf_to_gvcf # Step 11: Convert mutated MAF files to GVCF + - rope_bwt_chr_index # Step 12: Create PHGv2 ropebwt3 index + - ropebwt_mem # Step 13: Align FASTQ reads to index + - build_spline_knots # Step 14: Build spline knots from VCF files + - convert_ropebwt2ps4g # Step 15: Convert BED alignments to PS4G format # ============================================================================ # MAIN PIPELINE CONFIGURATION (Steps 1-4) @@ -110,7 +112,7 @@ convert_to_fasta: # output: "/custom/fasta/output/" # Optional: Custom output directory # ============================================================================ -# RECOMBINATION PIPELINE CONFIGURATION (Steps 5-10) +# RECOMBINATION PIPELINE CONFIGURATION (Steps 5-9) # ============================================================================ # Step 5: Pick Crossovers @@ -164,8 +166,6 @@ generate_recombined_sequences: # chr3 # assembly_dir: "path/to/parent_assemblies/" # Optional: Directory containing parent assembly FASTA files # Defaults to this step's output directory - - # Step 9: Format Recombined Fastas # Reformats recombined FASTA files to have consistent line widths using seqkit # Automatically uses recombined FASTA files from generate_recombined_sequences @@ -176,6 +176,10 @@ format_recombined_fastas: # input: "/custom/recombined/fastas/" # Optional: Custom FASTA input (file, directory, or text list) # output: "/custom/formatted/output/" # Optional: Custom output directory +# ============================================================================ +# PS4G CREATION CONFIGURATION (Steps 10-15) +# ============================================================================ + # Step 10: Align Mutated Assemblies # Realigns the formatted recombined FASTA files back to the reference # Uses reference GFF and FASTA from align_assemblies (step 1) @@ -190,26 +194,35 @@ align_mutated_assemblies: threads: 1 # Optional: Number of threads (default: 1) # output: "/custom/alignment/output/" # Optional: Custom output directory -# ============================================================================ -# PS4G CREATION CONFIGURATION (Steps 11-14) -# ============================================================================ +# Step 11: Mutated MAF to GVCF Conversion +# Converts MAF files from align_mutated_assemblies to compressed GVCF format +# Automatically uses ref_fasta from align_assemblies and MAF output from align_mutated_assemblies +mutated_maf_to_gvcf: + # reference_file: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses align_assemblies.ref_fasta if not specified + # maf_file: "/custom/maf/files.txt" # Optional: MAF file, directory, or text list + # Uses step 10 MAF outputs if not specified + # output_file: "sample.g.vcf.gz" # Optional: Output GVCF filename (auto-generated if not specified) + # sample_name: "optional_sample_name" # Optional: Override sample name in GVCF + # output_dir: "/custom/gvcf/output/" # Optional: Custom output directory -# Step 11: PHG Rope-BWT-Chr Index +# Step 12: PHG Rope-BWT-Chr Index # Creates a PHGv2 ropebwt3 index from recombined FASTA files for genotype imputation -# Automatically uses formatted FASTA files from format_recombined_fastas +# By default, auto-generates a keyfile from format_recombined_fastas (step 9) output: +# - Sample names are derived from FASTA filenames (without extension) +# - Underscores in sample names are automatically converted to hyphens (logged as warning) # Can also accept a pre-made keyfile with custom sample names rope_bwt_chr_index: index_file_prefix: "phgIndex" # Optional: Prefix for index files (default: "phgIndex") threads: 20 # Optional: Number of threads (default: 20) delete_fmr_index: true # Optional: Delete .fmr files after conversion (true/false, default: true) - # input: "/custom/fasta/input/" # Optional: Custom FASTA input (file, directory, or text list) - # output: "/custom/phg_index/" # Optional: Custom output directory (default: work_dir/output/11_rope_bwt_index_results) - # keyfile: "/path/to/keyfile.txt" # Optional: Pre-made keyfile (mutually exclusive with input) + # output: "/custom/phg_index/" # Optional: Custom output directory (default: work_dir/output/12_rope_bwt_index_results) + # keyfile: "/path/to/keyfile.txt" # Optional: Pre-made keyfile (overrides auto-generation) # Keyfile format: Tab-delimited with 'Fasta' and 'SampleName' columns # WARNING: Sample names should NOT contain underscores # PHG uses underscores internally (e.g., samplename_contig) -# Step 12: Ropebwt3 Mem Alignment +# Step 13: Ropebwt3 Mem Alignment # Aligns FASTQ reads to the ropebwt3 index and generates BED alignment files # Automatically uses index file from rope_bwt_chr_index and calculates -l parameter # Processes multiple FASTQ files iteratively, generating one BED file per sample @@ -218,35 +231,35 @@ ropebwt_mem: # Supports: .fq, .fastq, .fq.gz, .fastq.gz threads: 40 # Optional: Number of threads (default: 1) p_value: 168 # Optional: The -p parameter (default: 168) - # index_file: "/custom/index.fmd" # Optional: Custom index file (auto-detected from step 11) - # l_value: 100 # Optional: The -l parameter (auto-calculated as 2 × FASTA count from step 11) - # output: "/custom/bed/output/" # Optional: Custom output directory (default: work_dir/output/12_rope_bwt_mem_results) + # index_file: "/custom/index.fmd" # Optional: Custom index file (auto-detected from step 12) + # l_value: 100 # Optional: The -l parameter (auto-calculated as 2 × FASTA count from step 12) + # output: "/custom/bed/output/" # Optional: Custom output directory (default: work_dir/output/13_rope_bwt_mem_results) -# Step 13: Build Spline Knots +# Step 14: Build Spline Knots # Builds spline knots from hVCF or gVCF files for PHGv2 machine learning imputation # This is an independent step that does not depend on previous pipeline outputs # Used to create spline representations for downstream imputation tasks build_spline_knots: vcf_dir: "path/to/vcf_files/" # Required: Directory containing hVCF or gVCF files - vcf_type: "hvcf" # Optional: Type of VCFs (hvcf or gvcf, default: hvcf) + vcf_type: "gvcf" # Optional: Type of VCFs (hvcf or gvcf, default: gvcf) min_indel_length: 10 # Optional: Min indel length for gVCF (default: 10) num_bps_per_knot: 50000 # Optional: Max base pairs per knot (default: 50000) random_seed: 12345 # Optional: Random seed for reproducibility (default: 12345) # contig_list: "chr1,chr2,chr3" # Optional: Comma-separated chromosome list (default: all) - # output: "/custom/spline/output/" # Optional: Custom output directory (default: work_dir/output/13_spline_knots_results) + # output: "/custom/spline/output/" # Optional: Custom output directory (default: work_dir/output/14_spline_knots_results) -# Step 14: Convert RopeBWT to PS4G +# Step 15: Convert RopeBWT to PS4G # Converts RopeBWT3 BED alignment files to PS4G format for gamete support tracking -# Automatically uses BED files from ropebwt_mem (step 12) and spline knots from build_spline_knots (step 13) +# Automatically uses BED files from ropebwt_mem (step 13) and spline knots from build_spline_knots (step 14) # Processes each BED file iteratively, generating one PS4G file per sample convert_ropebwt2ps4g: min_mem_length: 135 # Optional: Minimum MEM length threshold in bp (default: 135) max_num_hits: 16 # Optional: Maximum haplotype hits per alignment (default: 16) # bed_input: "/custom/bed/files/" # Optional: Custom BED input (file, directory, or text list) - # Auto-detected from step 12 if not specified - # spline_knot_dir: "/custom/spline/" # Optional: Custom spline knot directory # Auto-detected from step 13 if not specified - # output: "/custom/ps4g/output/" # Optional: Custom output directory (default: work_dir/output/14_convert_ropebwt2ps4g_results) + # spline_knot_dir: "/custom/spline/" # Optional: Custom spline knot directory + # Auto-detected from step 14 if not specified + # output: "/custom/ps4g/output/" # Optional: Custom output directory (default: work_dir/output/15_convert_ropebwt2ps4g_results) # ============================================================================ # Usage Examples: @@ -264,7 +277,7 @@ convert_ropebwt2ps4g: # - convert_to_fasta # # Comment out recombination steps # -# 3. Run only the recombination pipeline with PS4G creation (steps 1, 5-14): +# 3. Run only the recombination pipeline with PS4G creation (steps 1, 5-15): # run_steps: # - align_assemblies # Required for MAF files # # - maf_to_gvcf # Skip GVCF steps @@ -276,6 +289,7 @@ convert_ropebwt2ps4g: # - generate_recombined_sequences # - format_recombined_fastas # - align_mutated_assemblies # Realign recombined sequences +# - mutated_maf_to_gvcf # Convert mutated MAF to GVCF # - rope_bwt_chr_index # Create PHG index # - ropebwt_mem # Align FASTQ reads # - build_spline_knots # Build spline knots @@ -310,23 +324,25 @@ convert_ropebwt2ps4g: # input: "/data/my_gvcf_directory/" # output: "/results/my_fasta_output/" # -# 8. Run only PS4G index creation with custom FASTA files: +# 8. Run PS4G index creation with a pre-made keyfile: # run_steps: # - rope_bwt_chr_index # rope_bwt_chr_index: -# input: "/path/to/my/fasta/files/" -# output_dir: "my_phg_index" +# keyfile: "/path/to/my/keyfile.txt" +# output: "my_phg_index" # index_file_prefix: "myCustomIndex" # threads: 40 # -# 9. Run PS4G index creation with a pre-made keyfile: +# 9. Run PS4G index creation with custom settings (using auto-generated keyfile): # run_steps: # - rope_bwt_chr_index # rope_bwt_chr_index: -# keyfile: "/path/to/my/keyfile.txt" -# output_dir: "my_phg_index" +# # No keyfile = auto-generates from step 9 FASTA files +# output: "my_phg_index" +# index_file_prefix: "customIndex" +# threads: 40 # -# 10. Run only ropebwt-mem alignment (requires step 11 output): +# 10. Run only ropebwt-mem alignment (requires step 12 output): # run_steps: # - ropebwt_mem # ropebwt_mem: @@ -361,11 +377,11 @@ convert_ropebwt2ps4g: # contig_list: "chr1,chr2,chr3,chr4,chr5" # output: "/custom/spline/output/" # -# 14. Run convert-ropebwt2ps4g with auto-detection (requires steps 12-13): +# 14. Run convert-ropebwt2ps4g with auto-detection (requires steps 13-14): # run_steps: # - convert_ropebwt2ps4g # convert_ropebwt2ps4g: -# # Auto-detects BED files from step 12 and spline knots from step 13 +# # Auto-detects BED files from step 13 and spline knots from step 14 # min_mem_length: 135 # max_num_hits: 16 # @@ -422,18 +438,24 @@ convert_ropebwt2ps4g: # - Step 10 (align_mutated_assemblies) requires: outputs from steps 1 and 9 # - Uses ref_gff and ref_fasta from step 1 # - Uses FASTA files from step 9 as query input +# - Step 11 (mutated_maf_to_gvcf) requires: outputs from steps 1 and 10 +# - Uses ref_fasta from step 1 +# - Uses MAF files from step 10 as input # # PS4G Creation Dependencies: -# - Step 11 (rope_bwt_chr_index) requires: formatted FASTA files from step 10 -# - Can also accept custom FASTA files or a pre-made keyfile +# - Step 11 (mutated_maf_to_gvcf) requires: MAF files from step 10 and ref_fasta from step 1 +# - Step 12 (rope_bwt_chr_index) auto-generates keyfile from step 9 FASTA files: +# - Sample names derived from filenames (without extension) +# - Underscores are auto-converted to hyphens (logged as warning) +# - Can also accept a pre-made keyfile (overrides auto-generation) # - Keyfile must be tab-delimited with 'Fasta' and 'SampleName' columns # - Sample names should NOT contain underscores (PHG internal requirement) -# - Step 12 (ropebwt_mem) requires: index file (.fmd) from step 11 -# - Step 12 auto-calculates -l parameter from keyfile (2 × FASTA count) +# - Step 13 (ropebwt_mem) requires: index file (.fmd) from step 12 +# - Step 13 auto-calculates -l parameter from keyfile (2 × FASTA count) # - FASTQ files can be compressed (.fq.gz, .fastq.gz) or uncompressed (.fq, .fastq) -# - Step 13 (build_spline_knots) is independent: requires only VCF files -# - Step 14 (convert_ropebwt2ps4g) requires: BED files from step 12 and spline knots from step 13 -# - Step 14 auto-detects inputs if not specified +# - Step 14 (build_spline_knots) is independent: requires only VCF files +# - Step 15 (convert_ropebwt2ps4g) requires: BED files from step 13 and spline knots from step 14 +# - Step 15 auto-detects inputs if not specified # # Custom Input/Output Paths: # - Each step supports optional 'input' and 'output' parameters diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 4256c2a..d48ea7a 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -31,7 +31,9 @@ data class PipelineConfig( val create_chain_files: CreateChainFilesConfig? = null, val convert_coordinates: ConvertCoordinatesConfig? = null, val generate_recombined_sequences: GenerateRecombinedSequencesConfig? = null, - val format_recombined_fastas: FormatRecombinedFastasConfig? = null + val format_recombined_fastas: FormatRecombinedFastasConfig? = null, + val mutated_maf_to_gvcf: MutatedMafToGvcfConfig? = null, + val rope_bwt_chr_index: RopeBwtChrIndexConfig? = null ) data class AlignAssembliesConfig( @@ -108,6 +110,22 @@ data class FormatRecombinedFastasConfig( val output: String? = null // Custom output directory ) +data class MutatedMafToGvcfConfig( + val reference_file: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val maf_file: String? = null, // Optional: MAF file/directory/list (uses align_mutated_assemblies output if not specified) + val output_file: String? = null, // Optional: Output GVCF file name + val sample_name: String? = null, // Optional: Sample name for GVCF + val output_dir: String? = null // Optional: Custom GVCF output directory +) + +data class RopeBwtChrIndexConfig( + val keyfile: String? = null, // Optional: Pre-made keyfile (if not specified, auto-generates from format_recombined_fastas output) + val index_file_prefix: String? = null, // Optional: Prefix for index files (default: "phgIndex") + val threads: Int? = null, // Optional: Number of threads (default: 20) + val delete_fmr_index: Boolean? = null, // Optional: Delete .fmr files after conversion (default: true) + val output: String? = null // Optional: Custom output directory +) + class Orchestrate : CliktCommand(name = "orchestrate") { companion object { private const val LOG_FILE_NAME = "00_orchestrate.log" @@ -328,6 +346,32 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null + // Parse mutated_maf_to_gvcf - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val mutatedMafToGvcfMap = configMap["mutated_maf_to_gvcf"] as? Map + val mutatedMafToGvcf = if (configMap.containsKey("mutated_maf_to_gvcf")) { + MutatedMafToGvcfConfig( + reference_file = mutatedMafToGvcfMap?.get("reference_file") as? String, + maf_file = mutatedMafToGvcfMap?.get("maf_file") as? String, + output_file = mutatedMafToGvcfMap?.get("output_file") as? String, + sample_name = mutatedMafToGvcfMap?.get("sample_name") as? String, + output_dir = mutatedMafToGvcfMap?.get("output_dir") as? String + ) + } else null + + // Parse rope_bwt_chr_index - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val ropeBwtChrIndexMap = configMap["rope_bwt_chr_index"] as? Map + val ropeBwtChrIndex = if (configMap.containsKey("rope_bwt_chr_index")) { + RopeBwtChrIndexConfig( + keyfile = ropeBwtChrIndexMap?.get("keyfile") as? String, + index_file_prefix = ropeBwtChrIndexMap?.get("index_file_prefix") as? String, + threads = ropeBwtChrIndexMap?.get("threads") as? Int, + delete_fmr_index = ropeBwtChrIndexMap?.get("delete_fmr_index") as? Boolean, + output = ropeBwtChrIndexMap?.get("output") as? String + ) + } else null + return PipelineConfig( work_dir = workDir, run_steps = runSteps, @@ -340,7 +384,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { create_chain_files = createChainFiles, convert_coordinates = convertCoordinates, generate_recombined_sequences = generateRecombinedSequences, - format_recombined_fastas = formatRecombinedFastas + format_recombined_fastas = formatRecombinedFastas, + mutated_maf_to_gvcf = mutatedMafToGvcf, + rope_bwt_chr_index = ropeBwtChrIndex ) } catch (e: Exception) { logger.error("Failed to parse configuration file: ${e.message}", e) @@ -398,6 +444,8 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var coordinatesOutputDir: Path? = null var recombinedFastasDir: Path? = null var formattedFastasDir: Path? = null + var mutatedMafFilePaths: Path? = null // MAF file paths from step 10 (align_mutated_assemblies) + var ropeBwtIndexDir: Path? = null // RopeBWT index output directory from step 12 try { // Step 1: Align Assemblies (if configured and should run) @@ -1140,17 +1188,238 @@ class Orchestrate : CliktCommand(name = "orchestrate") { AlignMutatedAssemblies().parse(args) restoreOrchestratorLogging(workDir) + // Get output path (use custom or default) + val outputBase = customOutput ?: workDir.resolve("output").resolve("10_mutated_alignment_results") + mutatedMafFilePaths = outputBase.toAbsolutePath().normalize().resolve("maf_file_paths.txt") + + if (!mutatedMafFilePaths.exists()) { + throw RuntimeException("Expected MAF paths file not found: $mutatedMafFilePaths") + } + logger.info("Step 10 completed successfully") logger.info("") } else { if (config.align_mutated_assemblies != null) { logger.info("Skipping align-mutated-assemblies (not in run_steps)") + + // Check custom output location first, then default + val customOutput = config.align_mutated_assemblies.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val outputBase = (customOutput ?: workDir.resolve("output").resolve("10_mutated_alignment_results")) + .toAbsolutePath().normalize() + val previousMafPaths = outputBase.resolve("maf_file_paths.txt") + + if (previousMafPaths.exists()) { + mutatedMafFilePaths = previousMafPaths + logger.info("Using previous align-mutated-assemblies outputs: $mutatedMafFilePaths") + } else { + logger.warn("Previous align-mutated-assemblies outputs not found. Downstream steps may fail.") + } } else { logger.info("Skipping align-mutated-assemblies (not configured)") } logger.info("") } + // Step 11: Mutated MAF to GVCF (if configured and should run) + if (config.mutated_maf_to_gvcf != null && shouldRunStep("mutated_maf_to_gvcf", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 11: Mutated MAF to GVCF Conversion") + logger.info("=".repeat(80)) + + // Determine reference file (custom or from step 1) - resolve to absolute path + val step11RefFasta = config.mutated_maf_to_gvcf.reference_file?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: refFasta + if (step11RefFasta == null) { + throw RuntimeException("Cannot run mutated-maf-to-gvcf: reference FASTA not available (specify 'reference_file' in config or run align-assemblies first)") + } + + // Determine MAF input (custom or from step 10) - resolve to absolute path + val mafInput = config.mutated_maf_to_gvcf.maf_file?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: mutatedMafFilePaths + if (mafInput == null) { + throw RuntimeException("Cannot run mutated-maf-to-gvcf: no MAF input available (specify 'maf_file' in config or run align-mutated-assemblies first)") + } + + // Determine output directory (custom or default) - resolve to absolute path + val customOutputDir = config.mutated_maf_to_gvcf.output_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } + + // Determine output file if specified - resolve to absolute path + val outputFile = config.mutated_maf_to_gvcf.output_file?.let { + Path.of(it).toAbsolutePath().normalize() + } + + logger.info("Reference FASTA: $step11RefFasta") + logger.info("MAF input: $mafInput") + + val args = buildList { + add("--work-dir=$workDir") + add("--reference-file=$step11RefFasta") + add("--maf-file=$mafInput") + if (outputFile != null) { + add("--output-file=$outputFile") + } + if (config.mutated_maf_to_gvcf.sample_name != null) { + add("--sample-name=${config.mutated_maf_to_gvcf.sample_name}") + } + if (customOutputDir != null) { + add("--output-dir=$customOutputDir") + } + } + + MafToGvcf().parse(args) + restoreOrchestratorLogging(workDir) + + // Get output directory (use custom or default) + val mutatedGvcfOutputDir = (customOutputDir ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) + .toAbsolutePath().normalize() + + if (!mutatedGvcfOutputDir.exists()) { + throw RuntimeException("Expected mutated GVCF output directory not found: $mutatedGvcfOutputDir") + } + + logger.info("Step 11 completed successfully") + logger.info("") + } else { + if (config.mutated_maf_to_gvcf != null) { + logger.info("Skipping mutated-maf-to-gvcf (not in run_steps)") + } else { + logger.info("Skipping mutated-maf-to-gvcf (not configured)") + } + logger.info("") + } + + // Step 12: RopeBWT Chr Index (if configured and should run) + if (config.rope_bwt_chr_index != null && shouldRunStep("rope_bwt_chr_index", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 12: RopeBWT Chr Index") + logger.info("=".repeat(80)) + + // Determine output directory (custom or default) - resolve to absolute path + val customOutput = config.rope_bwt_chr_index.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val outputBase = customOutput ?: workDir.resolve("output").resolve("12_rope_bwt_index_results") + outputBase.createDirectories() + + // Determine keyfile - either provided or auto-generated from format_recombined_fastas output + val actualKeyfile: Path = if (config.rope_bwt_chr_index.keyfile != null) { + // Use provided keyfile + val keyfilePath = Path.of(config.rope_bwt_chr_index.keyfile).toAbsolutePath().normalize() + logger.info("Using provided keyfile: $keyfilePath") + keyfilePath + } else { + // Auto-generate keyfile from format_recombined_fastas output + if (formattedFastasDir == null || !formattedFastasDir.exists()) { + throw RuntimeException("Cannot run rope-bwt-chr-index: no FASTA input available (specify 'keyfile' in config or run format-recombined-fastas first)") + } + + logger.info("Auto-generating keyfile from formatted FASTA files in: $formattedFastasDir") + + // Collect FASTA files + val fastaFiles = formattedFastasDir.toFile().listFiles { file -> + file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) + }?.map { it.toPath() }?.sorted() ?: emptyList() + + if (fastaFiles.isEmpty()) { + throw RuntimeException("Cannot run rope-bwt-chr-index: no FASTA files found in $formattedFastasDir") + } + + logger.info("Found ${fastaFiles.size} FASTA files") + + // Generate keyfile with sample names derived from filenames + val keyfilePath = outputBase.resolve("phg_keyfile.txt") + val keyfileLines = mutableListOf("Fasta\tSampleName") + val renamedSamples = mutableListOf>() // original -> fixed + + fastaFiles.forEach { fastaFile -> + var sampleName = fastaFile.fileName.toString() + .replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + + // Replace underscores with hyphens and warn + if (sampleName.contains("_")) { + val originalName = sampleName + sampleName = sampleName.replace("_", "-") + renamedSamples.add(Pair(originalName, sampleName)) + } + + keyfileLines.add("${fastaFile.toAbsolutePath()}\t$sampleName") + } + + // Write keyfile + keyfilePath.writeText(keyfileLines.joinToString("\n")) + logger.info("Generated keyfile: $keyfilePath") + + // Warn about renamed samples + if (renamedSamples.isNotEmpty()) { + logger.warn("WARNING: The following sample names contained underscores and were converted to hyphens:") + renamedSamples.forEach { (original, fixed) -> + logger.warn(" '$original' -> '$fixed'") + } + logger.warn("PHG uses underscores internally for contig renaming (format: samplename_contig)") + } + + keyfilePath + } + + logger.info("Keyfile: $actualKeyfile") + + val args = buildList { + add("--work-dir=$workDir") + add("--keyfile=$actualKeyfile") + if (config.rope_bwt_chr_index.index_file_prefix != null) { + add("--index-file-prefix=${config.rope_bwt_chr_index.index_file_prefix}") + } + if (config.rope_bwt_chr_index.threads != null) { + add("--threads=${config.rope_bwt_chr_index.threads}") + } + if (config.rope_bwt_chr_index.delete_fmr_index != null) { + add("--delete-fmr-index=${config.rope_bwt_chr_index.delete_fmr_index}") + } + if (customOutput != null) { + add("--output-dir=$customOutput") + } + } + + RopeBwtChrIndex().parse(args) + restoreOrchestratorLogging(workDir) + + // Track output directory + ropeBwtIndexDir = outputBase + + if (!ropeBwtIndexDir.exists()) { + throw RuntimeException("Expected RopeBWT index output directory not found: $ropeBwtIndexDir") + } + + logger.info("Step 12 completed successfully") + logger.info("") + } else { + if (config.rope_bwt_chr_index != null) { + logger.info("Skipping rope-bwt-chr-index (not in run_steps)") + + // Check custom output location first, then default + val customOutput = config.rope_bwt_chr_index.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val previousIndexDir = (customOutput ?: workDir.resolve("output").resolve("12_rope_bwt_index_results")) + .toAbsolutePath().normalize() + if (previousIndexDir.exists()) { + ropeBwtIndexDir = previousIndexDir + logger.info("Using previous rope-bwt-chr-index outputs: $ropeBwtIndexDir") + } else { + logger.warn("Previous rope-bwt-chr-index outputs not found. Downstream steps may fail.") + } + } else { + logger.info("Skipping rope-bwt-chr-index (not configured)") + } + logger.info("") + } + // Pipeline completed successfully logger.info("=".repeat(80)) logger.info("PIPELINE COMPLETED SUCCESSFULLY!") From ed29ead40c33333e7b2cf0f1da090b4b8ee7f49d Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 16:49:08 -0600 Subject: [PATCH 09/11] Fix output --- .../net/maizegenetics/commands/Orchestrate.kt | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index d48ea7a..de5f954 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -1245,9 +1245,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } // Determine output directory (custom or default) - resolve to absolute path - val customOutputDir = config.mutated_maf_to_gvcf.output_dir?.let { + // Always use step 11 output directory by default (not MafToGvcf's default) + val mutatedGvcfOutputDir = (config.mutated_maf_to_gvcf.output_dir?.let { Path.of(it).toAbsolutePath().normalize() - } + } ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) + .toAbsolutePath().normalize() // Determine output file if specified - resolve to absolute path val outputFile = config.mutated_maf_to_gvcf.output_file?.let { @@ -1256,29 +1258,24 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("Reference FASTA: $step11RefFasta") logger.info("MAF input: $mafInput") + logger.info("Output directory: $mutatedGvcfOutputDir") val args = buildList { add("--work-dir=$workDir") add("--reference-file=$step11RefFasta") add("--maf-file=$mafInput") + add("--output-dir=$mutatedGvcfOutputDir") // Always pass output dir to ensure step 11 location if (outputFile != null) { add("--output-file=$outputFile") } if (config.mutated_maf_to_gvcf.sample_name != null) { add("--sample-name=${config.mutated_maf_to_gvcf.sample_name}") } - if (customOutputDir != null) { - add("--output-dir=$customOutputDir") - } } MafToGvcf().parse(args) restoreOrchestratorLogging(workDir) - // Get output directory (use custom or default) - val mutatedGvcfOutputDir = (customOutputDir ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) - .toAbsolutePath().normalize() - if (!mutatedGvcfOutputDir.exists()) { throw RuntimeException("Expected mutated GVCF output directory not found: $mutatedGvcfOutputDir") } From 473d9540d8c311c2f2eeecdbe554c37e41c822d3 Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 18 Dec 2025 17:43:04 -0600 Subject: [PATCH 10/11] Fix ropebwt keyfile issue --- .../net/maizegenetics/commands/Orchestrate.kt | 9 ++-- .../maizegenetics/commands/RopeBwtChrIndex.kt | 43 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index de5f954..83f81ca 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -1329,9 +1329,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("Found ${fastaFiles.size} FASTA files") - // Generate keyfile with sample names derived from filenames + // Generate keyfile with sample names derived from filenames (no header) val keyfilePath = outputBase.resolve("phg_keyfile.txt") - val keyfileLines = mutableListOf("Fasta\tSampleName") + val keyfileLines = mutableListOf() val renamedSamples = mutableListOf>() // original -> fixed fastaFiles.forEach { fastaFile -> @@ -1375,8 +1375,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { if (config.rope_bwt_chr_index.threads != null) { add("--threads=${config.rope_bwt_chr_index.threads}") } - if (config.rope_bwt_chr_index.delete_fmr_index != null) { - add("--delete-fmr-index=${config.rope_bwt_chr_index.delete_fmr_index}") + // delete-fmr-index is a presence flag (include only when true) + if (config.rope_bwt_chr_index.delete_fmr_index == true) { + add("--delete-fmr-index") } if (customOutput != null) { add("--output-dir=$customOutput") diff --git a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt index 479c6e6..be81f3f 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt @@ -2,8 +2,8 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.options.flag import com.github.ajalt.clikt.parameters.options.option -import com.github.ajalt.clikt.parameters.types.boolean import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path import net.maizegenetics.Constants @@ -20,11 +20,10 @@ import kotlin.system.exitProcess class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { companion object { private const val LOG_FILE_NAME = "11_rope_bwt_chr_index.log" - private const val ROPE_BWT_RESULTS_DIR = "11_rope_bwt_index_results" + private const val ROPE_BWT_RESULTS_DIR = "12_rope_bwt_index_results" private const val KEYFILE_NAME = "phg_keyfile.txt" private const val DEFAULT_INDEX_PREFIX = "phgIndex" private const val DEFAULT_THREADS = 20 - private const val DEFAULT_DELETE_FMR = true } private val logger: Logger = LogManager.getLogger(RopeBwtChrIndex::class.java) @@ -42,12 +41,12 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { private val keyfile by option( "--keyfile", "-k", - help = "Pre-made keyfile (tab-delimited with 'Fasta' and 'SampleName' columns). Mutually exclusive with --fasta-input." + help = "Pre-made keyfile (tab-delimited: fasta_pathsample_name, no header). Mutually exclusive with --fasta-input." ).path(mustExist = true, canBeFile = true, canBeDir = false) private val outputDirOption by option( "--output-dir", "-o", - help = "Output directory for index files (default: work_dir/output/11_rope_bwt_index_results)" + help = "Output directory for index files (default: work_dir/output/12_rope_bwt_index_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private val indexFilePrefix by option( @@ -63,9 +62,8 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { private val deleteFmrIndex by option( "--delete-fmr-index", - help = "Delete .fmr index files after converting to .fmd" - ).boolean() - .default(DEFAULT_DELETE_FMR) + help = "Delete .fmr index files after converting to .fmd (presence flag: include to enable)" + ).flag() private fun collectFastaFiles(): List { return FileUtils.collectFiles( @@ -80,7 +78,7 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { logger.info("Generating keyfile from FASTA files") val keyfilePath = outputDir.resolve(KEYFILE_NAME) - val keyfileLines = mutableListOf("Fasta\tSampleName") +val keyfileLines = mutableListOf() val problemNames = mutableListOf() fastaFiles.forEach { fastaFile -> @@ -119,19 +117,12 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { exitProcess(1) } - val header = lines[0].split("\t") - if (header.size != 2 || header[0] != "Fasta" || header[1] != "SampleName") { - logger.error("Invalid keyfile header. Expected: 'Fasta\\tSampleName'") - logger.error("Got: ${lines[0]}") - exitProcess(1) - } - val problemNames = mutableListOf() - for (i in 1 until lines.size) { + for (i in lines.indices) { val parts = lines[i].split("\t") if (parts.size != 2) { - logger.error("Invalid keyfile line $i: ${lines[i]}") - logger.error("Expected 2 tab-separated columns") + logger.error("Invalid keyfile line ${i + 1}: ${lines[i]}") + logger.error("Expected 2 tab-separated columns (fasta_pathsample_name)") exitProcess(1) } @@ -202,14 +193,22 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { // Run PHG rope-bwt-chr-index command logger.info("Running PHG rope-bwt-chr-index...") - val exitCode = ProcessRunner.runCommand( + + // Build command arguments - delete-fmr-index is a presence flag (no value) + val commandArgs = mutableListOf( phgBinary.toString(), "rope-bwt-chr-index", "--keyfile", actualKeyfile.toString(), "--output-dir", outputDir.toString(), "--index-file-prefix", indexFilePrefix, - "--threads", threads.toString(), - "--delete-fmr-index", deleteFmrIndex.toString(), + "--threads", threads.toString() + ) + if (deleteFmrIndex) { + commandArgs.add("--delete-fmr-index") + } + + val exitCode = ProcessRunner.runCommand( + *commandArgs.toTypedArray(), workingDir = workDir.toFile(), logger = logger ) From 14cf3f2681e015d53523e6d9d3a2c030ec1fc174 Mon Sep 17 00:00:00 2001 From: Brandon Date: Mon, 22 Dec 2025 16:17:08 -0600 Subject: [PATCH 11/11] Remove on-the-fly regex --- .../kotlin/net/maizegenetics/commands/Orchestrate.kt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 83f81ca..f1306f0 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -129,6 +129,9 @@ data class RopeBwtChrIndexConfig( class Orchestrate : CliktCommand(name = "orchestrate") { companion object { private const val LOG_FILE_NAME = "00_orchestrate.log" + // Regex patterns reused across multiple operations + private val FASTA_FILE_PATTERN = Regex(".*\\.(fa|fasta|fna)(\\.gz)?$") + private val FASTA_EXTENSION_PATTERN = Regex("\\.(fa|fasta|fna)(\\.gz)?$") } private val logger: Logger = LogManager.getLogger(Orchestrate::class.java) @@ -766,7 +769,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Get all FASTA files from step 4 output val fastaFiles = fastaOutputDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) + file.isFile && file.name.matches(FASTA_FILE_PATTERN) }?.map { it.toPath() }?.sorted() ?: emptyList() if (fastaFiles.isEmpty()) { @@ -779,8 +782,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val lines = fastaFiles.map { fastaPath -> val fileName = fastaPath.fileName.toString() // Remove extension (including .gz if present) - val baseName = fileName - .replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + val baseName = fileName.replace(FASTA_EXTENSION_PATTERN, "") "${fastaPath.toAbsolutePath()}\t$baseName" } assemblyListFile.writeText(lines.joinToString("\n")) @@ -1320,7 +1322,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Collect FASTA files val fastaFiles = formattedFastasDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) + file.isFile && file.name.matches(FASTA_FILE_PATTERN) }?.map { it.toPath() }?.sorted() ?: emptyList() if (fastaFiles.isEmpty()) { @@ -1336,7 +1338,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { fastaFiles.forEach { fastaFile -> var sampleName = fastaFile.fileName.toString() - .replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") + .replace(FASTA_EXTENSION_PATTERN, "") // Replace underscores with hyphens and warn if (sampleName.contains("_")) {