diff --git a/build.gradle.kts b/build.gradle.kts index 6f64512..5d32bf3 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.6" +version = "0.2.7" repositories { mavenCentral() diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 211c01d..6a00c6d 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -6,25 +6,26 @@ # ============================================================================ # This pipeline consists of two main workflows: # -# MAIN PIPELINE (Variant Simulation): +# VARIANT PIPELINE (Generate variants): # 01. align-assemblies: Align query assemblies to reference using AnchorWave # 02. maf-to-gvcf: Convert MAF alignment files to compressed GVCF format # 03. downsample-gvcf: Downsample variants at specified rates per chromosome # 04. convert-to-fasta: Generate FASTA files from downsampled variants -# 05. align-mutated-assemblies: Realign mutated FASTA files back to reference # # RECOMBINATION PIPELINE (Generate Recombinant Genomes): -# 06. pick-crossovers: Simulate crossover events in reference coordinates -# 07. create-chain-files: Convert MAF files to chain format for coordinate conversion -# 08. convert-coordinates: Convert crossover points to assembly coordinates -# 09. generate-recombined-sequences: Create recombined FASTA files from parent assemblies -# 10. format-recombined-fastas: Format recombined FASTA files with consistent line widths +# 05. pick-crossovers: Simulate crossover events in reference coordinates +# 06. create-chain-files: Convert MAF files to chain format for coordinate conversion +# 07. convert-coordinates: Convert crossover points to assembly coordinates +# 08. generate-recombined-sequences: Create recombined FASTA files from parent assemblies +# 09. format-recombined-fastas: Format recombined FASTA files with consistent line widths # # PS4G CREATION (PHG Indexing for Genotype Imputation): -# 11. rope-bwt-chr-index: Create PHGv2 ropebwt3 index from recombined FASTA files -# 12. ropebwt-mem: Align FASTQ reads to ropebwt3 index and generate BED files -# 13. build-spline-knots: Build spline knots from hVCF or gVCF files for imputation -# 14. convert-ropebwt2ps4g: Convert RopeBWT3 BED alignments to PS4G format +# 10. align-mutated-assemblies: Realign formatted recombined FASTA files to reference +# 11. mutated-maf-to-gvcf: Convert mutated MAF files to compressed GVCF format +# 12. rope-bwt-chr-index: Create PHGv2 ropebwt3 index from recombined FASTA files +# 13. ropebwt-mem: Align FASTQ reads to ropebwt3 index and generate BED files +# 14. build-spline-knots: Build spline knots from hVCF or gVCF files for imputation +# 15. convert-ropebwt2ps4g: Convert RopeBWT3 BED alignments to PS4G format # # NOTE (1): Environment setup is automatic! The orchestrate command will automatically # detect if the working directory and required tools (MLImpute, biokotlin-tools, PHGv2) @@ -46,23 +47,24 @@ run_steps: - maf_to_gvcf # Step 02: Convert alignments to GVCF - downsample_gvcf # Step 03: Downsample variants - convert_to_fasta # Step 04: Generate mutated FASTA files - - align_mutated_assemblies # Step 05: Realign mutated sequences # Recombination Pipeline (Generate Recombinant Genomes) - - pick_crossovers # Step 06: Pick crossover points - - create_chain_files # Step 07: Create coordinate conversion chains - - convert_coordinates # Step 08: Convert coordinates to assembly space - - generate_recombined_sequences # Step 09: Generate recombined sequences - - format_recombined_fastas # Step 10: Format recombined FASTA files + - pick_crossovers # Step 05: Pick crossover points + - create_chain_files # Step 06: Create coordinate conversion chains + - convert_coordinates # Step 07: Convert coordinates to assembly space + - generate_recombined_sequences # Step 08: Generate recombined sequences + - format_recombined_fastas # Step 09: Format recombined FASTA files # PS4G Creation (PHG Indexing for Genotype Imputation) - - rope_bwt_chr_index # Step 11: Create PHGv2 ropebwt3 index - - ropebwt_mem # Step 12: Align FASTQ reads to index - - build_spline_knots # Step 13: Build spline knots from VCF files - - convert_ropebwt2ps4g # Step 14: Convert BED alignments to PS4G format + - align_mutated_assemblies # Step 10: Realign formatted sequences to reference + - mutated_maf_to_gvcf # Step 11: Convert mutated MAF files to GVCF + - rope_bwt_chr_index # Step 12: Create PHGv2 ropebwt3 index + - ropebwt_mem # Step 13: Align FASTQ reads to index + - build_spline_knots # Step 14: Build spline knots from VCF files + - convert_ropebwt2ps4g # Step 15: Convert BED alignments to PS4G format # ============================================================================ -# MAIN PIPELINE CONFIGURATION (Steps 1-5) +# MAIN PIPELINE CONFIGURATION (Steps 1-4) # ============================================================================ # Step 1: Align Assemblies @@ -109,76 +111,62 @@ convert_to_fasta: # input: "/custom/gvcf/input/" # Optional: Custom GVCF input (file, directory, or text list) # output: "/custom/fasta/output/" # Optional: Custom output directory -# Step 5: Align Mutated Assemblies -# Realigns the mutated FASTA files from convert_to_fasta back to the reference -# Auto-detects reference and query FASTAs from step 4 output based on filename matching -# This creates a circular workflow where mutated sequences are realigned for comparison -align_mutated_assemblies: - # ref_gff: "path/to/reference.gff" # Optional: Reference GFF file - # Uses align_assemblies.ref_gff if not specified - # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA from step 4 output - # Auto-detects file matching original ref name if not specified - # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) - # Auto-detects non-ref files from step 4 if not specified - threads: 1 # Optional: Number of threads (default: 1) - # output: "/custom/alignment/output/" # Optional: Custom output directory - # ============================================================================ -# RECOMBINATION PIPELINE CONFIGURATION (Steps 6-10) +# RECOMBINATION PIPELINE CONFIGURATION (Steps 5-9) # ============================================================================ -# Step 6: Pick Crossovers +# Step 5: Pick Crossovers # Simulates crossover events to generate recombination breakpoints # Uses MLImpute's pick_crossovers.py to simulate synthetic recombinant genomes # Generates two populations: landrace (1250 rounds) and two-parent cross (1 round) +# IMPORTANT: Requires an EVEN number of assemblies (assemblies are paired for crossover simulation) pick_crossovers: - assembly_list: "path/to/assembly_list.txt" # Required: Tab-separated file with assembly paths and names - # Format: (one per line) - # Example: - # /path/to/assembly1.faparent1 - # /path/to/assembly2.faparent2 - # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file - # Uses mutated ref FASTA from step 5 if not specified - # Falls back to align_assemblies.ref_fasta if step 5 not run - # output: "/custom/crossovers/output/" # Optional: Custom output directory + # assembly_list: "path/to/assembly_list.txt" # Optional: Tab-separated file with assembly paths and names + # Format: (one per line) + # Example: + # /path/to/assembly1.faparent1 + # /path/to/assembly2.faparent2 + # If not specified, auto-generates from convert_to_fasta (step 4) output + # Auto-generated names: filename minus "_mutated" suffix and extension + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses align_assemblies.ref_fasta if not specified + # output: "/custom/crossovers/output/" # Optional: Custom output directory -# Step 7: Create Chain Files +# Step 6: Create Chain Files # Converts MAF alignment files to UCSC chain format for coordinate conversion -# Automatically uses MAF files from align_assemblies (step 1) +# Uses MAF files from align_assemblies (step 1) # Uses maf-convert tool (automatically downloaded if missing) create_chain_files: - jobs: 8 # Optional: Number of parallel jobs (default: 8) - # input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list) - # output: "/custom/chain/output/" # Optional: Custom output directory + jobs: 8 # Optional: Number of parallel jobs (default: 8) + # maf_file_input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list) + # Default: step 1 MAF outputs + # output: "/custom/chain/output/" # Optional: Custom output directory -# Step 8: Convert Coordinates +# Step 7: Convert Coordinates # Converts crossover breakpoints from reference coordinates to assembly coordinates # Automatically uses refkey files from pick_crossovers and chain files from create_chain_files # Uses CrossMap for coordinate conversion via chain files convert_coordinates: - assembly_list: "path/to/assembly_list.txt" # Required: Same assembly list as pick_crossovers - # input_chain: "/custom/chain/directory/" # Optional: Custom chain directory - # input_refkey: "/custom/refkey/directory/" # Optional: Custom refkey directory - # output: "/custom/coordinates/output/" # Optional: Custom output directory + # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers + # input_chain: "/custom/chain/directory/" # Optional: Custom chain directory + # input_refkey: "/custom/refkey/directory/" # Optional: Custom refkey directory + # output: "/custom/coordinates/output/" # Optional: Custom output directory -# Step 9: Generate Recombined Sequences +# Step 8: Generate Recombined Sequences # Creates recombined FASTA sequences by concatenating segments from parent assemblies -# Automatically uses founder key files from convert_coordinates +# Automatically uses founder key files from convert_coordinates (step 7) # Each founder gets its own recombined FASTA file generate_recombined_sequences: - assembly_list: "path/to/assembly_list.txt" # Required: Same assembly list as above - chromosome_list: "path/to/chromosome_list.txt" # Required: Text file with chromosome names (one per line) - # Example: - # chr1 - # chr2 - # chr3 - assembly_dir: "path/to/parent_assemblies/" # Required: Directory containing parent assembly FASTA files - # Files should be named to match assembly_list names - # Example: parent1.fa, parent2.fa - # input: "/custom/founder_key/directory/" # Optional: Custom founder key directory - # output: "/custom/recombined/output/" # Optional: Custom output directory - -# Step 10: Format Recombined Fastas + # assembly_list: "path/to/assembly_list.txt" # Optional: defaults to assembly list from pick_crossovers + # chromosome_list: "path/to/chromosome_list.txt" # Optional: Text file with chromosome names (one per line) + # If not specified, auto-derives IDs from FASTAs + # Example format: + # chr1 + # chr2 + # chr3 + # assembly_dir: "path/to/parent_assemblies/" # Optional: Directory containing parent assembly FASTA files + # Defaults to this step's output directory +# Step 9: Format Recombined Fastas # Reformats recombined FASTA files to have consistent line widths using seqkit # Automatically uses recombined FASTA files from generate_recombined_sequences # Ensures compatibility with downstream tools @@ -189,25 +177,52 @@ format_recombined_fastas: # output: "/custom/formatted/output/" # Optional: Custom output directory # ============================================================================ -# PS4G CREATION CONFIGURATION (Steps 11-14) +# PS4G CREATION CONFIGURATION (Steps 10-15) # ============================================================================ -# Step 11: PHG Rope-BWT-Chr Index +# Step 10: Align Mutated Assemblies +# Realigns the formatted recombined FASTA files back to the reference +# Uses reference GFF and FASTA from align_assemblies (step 1) +# Uses FASTA files from format_recombined_fastas (step 9) as query input +align_mutated_assemblies: + # ref_gff: "path/to/reference.gff" # Optional: Reference GFF file + # Uses align_assemblies.ref_gff if not specified + # ref_fasta: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses align_assemblies.ref_fasta if not specified + # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) + # Uses format_recombined_fastas output if not specified + threads: 1 # Optional: Number of threads (default: 1) + # output: "/custom/alignment/output/" # Optional: Custom output directory + +# Step 11: Mutated MAF to GVCF Conversion +# Converts MAF files from align_mutated_assemblies to compressed GVCF format +# Automatically uses ref_fasta from align_assemblies and MAF output from align_mutated_assemblies +mutated_maf_to_gvcf: + # reference_file: "path/to/reference.fa" # Optional: Reference FASTA file + # Uses align_assemblies.ref_fasta if not specified + # maf_file: "/custom/maf/files.txt" # Optional: MAF file, directory, or text list + # Uses step 10 MAF outputs if not specified + # output_file: "sample.g.vcf.gz" # Optional: Output GVCF filename (auto-generated if not specified) + # sample_name: "optional_sample_name" # Optional: Override sample name in GVCF + # output_dir: "/custom/gvcf/output/" # Optional: Custom output directory + +# Step 12: PHG Rope-BWT-Chr Index # Creates a PHGv2 ropebwt3 index from recombined FASTA files for genotype imputation -# Automatically uses formatted FASTA files from format_recombined_fastas +# By default, auto-generates a keyfile from format_recombined_fastas (step 9) output: +# - Sample names are derived from FASTA filenames (without extension) +# - Underscores in sample names are automatically converted to hyphens (logged as warning) # Can also accept a pre-made keyfile with custom sample names rope_bwt_chr_index: index_file_prefix: "phgIndex" # Optional: Prefix for index files (default: "phgIndex") threads: 20 # Optional: Number of threads (default: 20) delete_fmr_index: true # Optional: Delete .fmr files after conversion (true/false, default: true) - # input: "/custom/fasta/input/" # Optional: Custom FASTA input (file, directory, or text list) - # output: "/custom/phg_index/" # Optional: Custom output directory (default: work_dir/output/11_rope_bwt_index_results) - # keyfile: "/path/to/keyfile.txt" # Optional: Pre-made keyfile (mutually exclusive with input) + # output: "/custom/phg_index/" # Optional: Custom output directory (default: work_dir/output/12_rope_bwt_index_results) + # keyfile: "/path/to/keyfile.txt" # Optional: Pre-made keyfile (overrides auto-generation) # Keyfile format: Tab-delimited with 'Fasta' and 'SampleName' columns # WARNING: Sample names should NOT contain underscores # PHG uses underscores internally (e.g., samplename_contig) -# Step 12: Ropebwt3 Mem Alignment +# Step 13: Ropebwt3 Mem Alignment # Aligns FASTQ reads to the ropebwt3 index and generates BED alignment files # Automatically uses index file from rope_bwt_chr_index and calculates -l parameter # Processes multiple FASTQ files iteratively, generating one BED file per sample @@ -216,35 +231,35 @@ ropebwt_mem: # Supports: .fq, .fastq, .fq.gz, .fastq.gz threads: 40 # Optional: Number of threads (default: 1) p_value: 168 # Optional: The -p parameter (default: 168) - # index_file: "/custom/index.fmd" # Optional: Custom index file (auto-detected from step 11) - # l_value: 100 # Optional: The -l parameter (auto-calculated as 2 × FASTA count from step 11) - # output: "/custom/bed/output/" # Optional: Custom output directory (default: work_dir/output/12_rope_bwt_mem_results) + # index_file: "/custom/index.fmd" # Optional: Custom index file (auto-detected from step 12) + # l_value: 100 # Optional: The -l parameter (auto-calculated as 2 × FASTA count from step 12) + # output: "/custom/bed/output/" # Optional: Custom output directory (default: work_dir/output/13_rope_bwt_mem_results) -# Step 13: Build Spline Knots +# Step 14: Build Spline Knots # Builds spline knots from hVCF or gVCF files for PHGv2 machine learning imputation # This is an independent step that does not depend on previous pipeline outputs # Used to create spline representations for downstream imputation tasks build_spline_knots: vcf_dir: "path/to/vcf_files/" # Required: Directory containing hVCF or gVCF files - vcf_type: "hvcf" # Optional: Type of VCFs (hvcf or gvcf, default: hvcf) + vcf_type: "gvcf" # Optional: Type of VCFs (hvcf or gvcf, default: gvcf) min_indel_length: 10 # Optional: Min indel length for gVCF (default: 10) num_bps_per_knot: 50000 # Optional: Max base pairs per knot (default: 50000) random_seed: 12345 # Optional: Random seed for reproducibility (default: 12345) # contig_list: "chr1,chr2,chr3" # Optional: Comma-separated chromosome list (default: all) - # output: "/custom/spline/output/" # Optional: Custom output directory (default: work_dir/output/13_spline_knots_results) + # output: "/custom/spline/output/" # Optional: Custom output directory (default: work_dir/output/14_spline_knots_results) -# Step 14: Convert RopeBWT to PS4G +# Step 15: Convert RopeBWT to PS4G # Converts RopeBWT3 BED alignment files to PS4G format for gamete support tracking -# Automatically uses BED files from ropebwt_mem (step 12) and spline knots from build_spline_knots (step 13) +# Automatically uses BED files from ropebwt_mem (step 13) and spline knots from build_spline_knots (step 14) # Processes each BED file iteratively, generating one PS4G file per sample convert_ropebwt2ps4g: min_mem_length: 135 # Optional: Minimum MEM length threshold in bp (default: 135) max_num_hits: 16 # Optional: Maximum haplotype hits per alignment (default: 16) # bed_input: "/custom/bed/files/" # Optional: Custom BED input (file, directory, or text list) - # Auto-detected from step 12 if not specified - # spline_knot_dir: "/custom/spline/" # Optional: Custom spline knot directory # Auto-detected from step 13 if not specified - # output: "/custom/ps4g/output/" # Optional: Custom output directory (default: work_dir/output/14_convert_ropebwt2ps4g_results) + # spline_knot_dir: "/custom/spline/" # Optional: Custom spline knot directory + # Auto-detected from step 14 if not specified + # output: "/custom/ps4g/output/" # Optional: Custom output directory (default: work_dir/output/15_convert_ropebwt2ps4g_results) # ============================================================================ # Usage Examples: @@ -254,39 +269,38 @@ convert_ropebwt2ps4g: # ./gradlew run --args="orchestrate --config pipeline_config.yaml" # Note: Environment setup runs automatically if needed on first run! # -# 2. Run only the main pipeline (variant simulation, steps 1-5): +# 2. Run only the main pipeline (variant simulation, steps 1-4): # run_steps: # - align_assemblies # - maf_to_gvcf # - downsample_gvcf # - convert_to_fasta -# - align_mutated_assemblies # # Comment out recombination steps # -# 3. Run only the recombination pipeline with PS4G creation (steps 1, 6-14): +# 3. Run only the recombination pipeline with PS4G creation (steps 1, 5-15): # run_steps: # - align_assemblies # Required for MAF files # # - maf_to_gvcf # Skip GVCF steps # # - downsample_gvcf # # - convert_to_fasta -# # - align_mutated_assemblies # - pick_crossovers # - create_chain_files # - convert_coordinates # - generate_recombined_sequences # - format_recombined_fastas +# - align_mutated_assemblies # Realign recombined sequences +# - mutated_maf_to_gvcf # Convert mutated MAF to GVCF # - rope_bwt_chr_index # Create PHG index # - ropebwt_mem # Align FASTQ reads # - build_spline_knots # Build spline knots # - convert_ropebwt2ps4g # Convert BED to PS4G # -# 4. Run steps 1-2-4-5 (skip downsampling): +# 4. Run steps 1-2-4 (skip downsampling): # run_steps: # - align_assemblies # - maf_to_gvcf # # - downsample_gvcf # Commented out # - convert_to_fasta # Will use maf_to_gvcf outputs -# - align_mutated_assemblies # # 5. Rerun only format step (requires previous outputs): # run_steps: @@ -310,23 +324,25 @@ convert_ropebwt2ps4g: # input: "/data/my_gvcf_directory/" # output: "/results/my_fasta_output/" # -# 8. Run only PS4G index creation with custom FASTA files: +# 8. Run PS4G index creation with a pre-made keyfile: # run_steps: # - rope_bwt_chr_index # rope_bwt_chr_index: -# input: "/path/to/my/fasta/files/" -# output_dir: "my_phg_index" +# keyfile: "/path/to/my/keyfile.txt" +# output: "my_phg_index" # index_file_prefix: "myCustomIndex" # threads: 40 # -# 9. Run PS4G index creation with a pre-made keyfile: +# 9. Run PS4G index creation with custom settings (using auto-generated keyfile): # run_steps: # - rope_bwt_chr_index # rope_bwt_chr_index: -# keyfile: "/path/to/my/keyfile.txt" -# output_dir: "my_phg_index" +# # No keyfile = auto-generates from step 9 FASTA files +# output: "my_phg_index" +# index_file_prefix: "customIndex" +# threads: 40 # -# 10. Run only ropebwt-mem alignment (requires step 11 output): +# 10. Run only ropebwt-mem alignment (requires step 12 output): # run_steps: # - ropebwt_mem # ropebwt_mem: @@ -361,11 +377,11 @@ convert_ropebwt2ps4g: # contig_list: "chr1,chr2,chr3,chr4,chr5" # output: "/custom/spline/output/" # -# 14. Run convert-ropebwt2ps4g with auto-detection (requires steps 12-13): +# 14. Run convert-ropebwt2ps4g with auto-detection (requires steps 13-14): # run_steps: # - convert_ropebwt2ps4g # convert_ropebwt2ps4g: -# # Auto-detects BED files from step 12 and spline knots from step 13 +# # Auto-detects BED files from step 13 and spline knots from step 14 # min_mem_length: 135 # max_num_hits: 16 # @@ -383,41 +399,63 @@ convert_ropebwt2ps4g: # IMPORTANT NOTES: # ============================================================================ # -# Assembly List Format: +# Assembly List Format (for pick_crossovers, convert_coordinates, generate_recombined_sequences): # - Tab-separated file with two columns: path and name -# - Must have an EVEN number of assemblies (for parent pairing) +# - MUST have an EVEN number of assemblies (assemblies are paired for crossover simulation) +# - The same assembly list is shared across steps 5, 7, and 8: +# - Step 5 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs +# - Steps 7 and 8: If not provided, automatically use the assembly list from step 5 +# - Auto-generated names are derived from filename minus "_mutated" suffix and extension # - Example: # /data/assemblies/B73.faB73 # /data/assemblies/Mo17.faMo17 # /data/assemblies/W22.faW22 # /data/assemblies/PH207.faPH207 # -# Chromosome List Format: +# Chromosome List Format (for generate_recombined_sequences): # - Plain text file with one chromosome name per line # - Names should match chromosome names in assemblies +# - If not provided, auto-derives from first assembly in assembly list using BioKotlin # - Example: # chr1 # chr2 # chr3 # # Recombination Pipeline Dependencies: -# - Step 6 (pick_crossovers) requires: ref_fasta from step 1 -# - Step 7 (create_chain_files) requires: MAF files from step 1 -# - Step 8 (convert_coordinates) requires: outputs from steps 6 and 7 -# - Step 9 (generate_recombined_sequences) requires: output from step 8 -# - Step 10 (format_recombined_fastas) requires: output from step 9 +# - Step 5 (pick_crossovers) requires: ref_fasta from step 1 +# - If assembly_list not provided, auto-generates from step 4 FASTA outputs +# - MUST have an EVEN number of assemblies (will error if odd) +# - Assembly list is saved and shared with steps 7 and 8 +# - Step 6 (create_chain_files) requires: MAF files from step 1 +# - Uses align_assemblies MAF outputs +# - Step 7 (convert_coordinates) requires: outputs from steps 5 and 6 +# - If assembly_list not provided, uses assembly list from step 5 +# - Step 8 (generate_recombined_sequences) requires: output from step 7 +# - If assembly_list not provided, uses assembly list from step 5 +# - If chromosome_list not provided, auto-derives from first assembly using BioKotlin +# - If assembly_dir not provided, uses step 8 output directory +# - Step 9 (format_recombined_fastas) requires: output from step 8 +# - Step 10 (align_mutated_assemblies) requires: outputs from steps 1 and 9 +# - Uses ref_gff and ref_fasta from step 1 +# - Uses FASTA files from step 9 as query input +# - Step 11 (mutated_maf_to_gvcf) requires: outputs from steps 1 and 10 +# - Uses ref_fasta from step 1 +# - Uses MAF files from step 10 as input # # PS4G Creation Dependencies: -# - Step 11 (rope_bwt_chr_index) requires: formatted FASTA files from step 10 -# - Can also accept custom FASTA files or a pre-made keyfile +# - Step 11 (mutated_maf_to_gvcf) requires: MAF files from step 10 and ref_fasta from step 1 +# - Step 12 (rope_bwt_chr_index) auto-generates keyfile from step 9 FASTA files: +# - Sample names derived from filenames (without extension) +# - Underscores are auto-converted to hyphens (logged as warning) +# - Can also accept a pre-made keyfile (overrides auto-generation) # - Keyfile must be tab-delimited with 'Fasta' and 'SampleName' columns # - Sample names should NOT contain underscores (PHG internal requirement) -# - Step 12 (ropebwt_mem) requires: index file (.fmd) from step 11 -# - Step 12 auto-calculates -l parameter from keyfile (2 × FASTA count) +# - Step 13 (ropebwt_mem) requires: index file (.fmd) from step 12 +# - Step 13 auto-calculates -l parameter from keyfile (2 × FASTA count) # - FASTQ files can be compressed (.fq.gz, .fastq.gz) or uncompressed (.fq, .fastq) -# - Step 13 (build_spline_knots) is independent: requires only VCF files -# - Step 14 (convert_ropebwt2ps4g) requires: BED files from step 12 and spline knots from step 13 -# - Step 14 auto-detects inputs if not specified +# - Step 14 (build_spline_knots) is independent: requires only VCF files +# - Step 15 (convert_ropebwt2ps4g) requires: BED files from step 13 and spline knots from step 14 +# - Step 15 auto-detects inputs if not specified # # Custom Input/Output Paths: # - Each step supports optional 'input' and 'output' parameters diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt index 5331064..c642224 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt @@ -19,8 +19,8 @@ import kotlin.system.exitProcess class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { companion object { - private const val LOG_FILE_NAME = "05_align_mutated_assemblies.log" - private const val MUTATED_ALIGNMENT_RESULTS_DIR = "05_mutated_alignment_results" + private const val LOG_FILE_NAME = "10_align_mutated_assemblies.log" + private const val MUTATED_ALIGNMENT_RESULTS_DIR = "10_mutated_alignment_results" private const val MAF_PATHS_FILE = "maf_file_paths.txt" // minimap2 parameters @@ -71,7 +71,7 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { private val outputDir by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/05_mutated_alignment_results)" + help = "Custom output directory (default: work_dir/output/10_mutated_alignment_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectFastaFiles(): List { diff --git a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt index 88c5acc..14951aa 100644 --- a/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt +++ b/src/main/kotlin/net/maizegenetics/commands/ConvertCoordinates.kt @@ -18,12 +18,12 @@ import kotlin.system.exitProcess class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { companion object { - private const val LOG_FILE_NAME = "08_convert_coordinates.log" - private const val COORDS_RESULTS_DIR = "08_coordinates_results" + private const val LOG_FILE_NAME = "07_convert_coordinates.log" + private const val COORDS_RESULTS_DIR = "07_coordinates_results" private const val KEY_PATHS_FILE = "key_file_paths.txt" private const val FOUNDER_KEY_PATHS_FILE = "founder_key_file_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/convert_coords.py" - private const val DEFAULT_REFKEY_DIR = "06_crossovers_results" + private const val DEFAULT_REFKEY_DIR = "05_crossovers_results" } private val logger: Logger = LogManager.getLogger(ConvertCoordinates::class.java) @@ -53,7 +53,7 @@ class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/08_coordinates_results)" + help = "Custom output directory (default: work_dir/output/07_coordinates_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { @@ -112,14 +112,16 @@ class ConvertCoordinates : CliktCommand(name = "convert-coordinates") { // Run convert_coords.py // Set PYTHONPATH so Python can find the 'python' package for imports - val pythonPath = mlimputeDir.resolve("src").toString() + // Use absolute paths since the working directory is set to outputDir + // Use sh -c to set PYTHONPATH inside pixi's environment + val pythonPath = mlimputeDir.resolve("src").toAbsolutePath().toString() + val scriptPath = pythonScript.toAbsolutePath().toString() + val assemblyListPath = assemblyList.toAbsolutePath().toString() + val chainDirPath = chainDir.toAbsolutePath().toString() + val shellCommand = "PYTHONPATH='$pythonPath' python '$scriptPath' --assembly-list '$assemblyListPath' --chain-dir '$chainDirPath'" logger.info("Running convert_coords.py") val exitCode = ProcessRunner.runCommand( - "env", "PYTHONPATH=$pythonPath", - "pixi", "run", - "python", pythonScript.toString(), - "--assembly-list", assemblyList.toString(), - "--chain-dir", chainDir.toString(), + "pixi", "run", "sh", "-c", shellCommand, workingDir = outputDir.toFile(), logger = logger ) diff --git a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt index ea24faa..ff8d2c9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt +++ b/src/main/kotlin/net/maizegenetics/commands/CreateChainFiles.kt @@ -19,8 +19,8 @@ import kotlin.system.exitProcess class CreateChainFiles : CliktCommand(name = "create-chain-files") { companion object { - private const val LOG_FILE_NAME = "07_create_chain_files.log" - private const val CHAIN_RESULTS_DIR = "07_chain_results" + private const val LOG_FILE_NAME = "06_create_chain_files.log" + private const val CHAIN_RESULTS_DIR = "06_chain_results" private const val CHAIN_PATHS_FILE = "chain_file_paths.txt" private const val BASH_SCRIPT = "src/python/cross/create_chains.sh" private const val DEFAULT_JOBS = 8 @@ -49,7 +49,7 @@ class CreateChainFiles : CliktCommand(name = "create-chain-files") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/07_chain_results)" + help = "Custom output directory (default: work_dir/output/06_chain_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectMafFiles(): List { @@ -128,20 +128,34 @@ class CreateChainFiles : CliktCommand(name = "create-chain-files") { logger.info("Cleaning up temporary MAF directory") tempMafDir.toFile().deleteRecursively() - // Collect generated chain files + // Collect generated chain files (all files with .chain extension) val chainFiles = outputDir.listDirectoryEntries() .filter { it.isRegularFile() && it.extension == "chain" } .sorted() - if (chainFiles.isEmpty()) { + // Rename chain files to include "_subsampled" suffix if not already present + val renamedChainFiles = chainFiles.map { chainFile -> + val fileName = chainFile.nameWithoutExtension + if (!fileName.endsWith("_subsampled")) { + val newFileName = "${fileName}_subsampled.chain" + val newPath = chainFile.parent.resolve(newFileName) + chainFile.moveTo(newPath, overwrite = true) + logger.info("Renamed ${chainFile.fileName} to $newFileName") + newPath + } else { + chainFile + } + }.sorted() + + if (renamedChainFiles.isEmpty()) { logger.warn("No chain files generated") } else { - logger.info("Generated ${chainFiles.size} chain file(s):") - chainFiles.forEach { logger.info(" $it") } + logger.info("Generated ${renamedChainFiles.size} chain file(s):") + renamedChainFiles.forEach { logger.info(" $it") } // Write chain file paths to text file FileUtils.writeFilePaths( - chainFiles, + renamedChainFiles, outputDir.resolve(CHAIN_PATHS_FILE), logger, "Chain file" diff --git a/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt b/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt index eeed27b..5e5cd73 100644 --- a/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt +++ b/src/main/kotlin/net/maizegenetics/commands/FormatRecombinedFastas.kt @@ -18,12 +18,12 @@ import kotlin.system.exitProcess class FormatRecombinedFastas : CliktCommand(name = "format-recombined-fastas") { companion object { - private const val LOG_FILE_NAME = "10_format_recombined_fastas.log" - private const val FORMATTED_RESULTS_DIR = "10_formatted_fastas" + private const val LOG_FILE_NAME = "09_format_recombined_fastas.log" + private const val FORMATTED_RESULTS_DIR = "09_formatted_fastas" private const val FORMATTED_FASTA_PATHS_FILE = "formatted_fasta_paths.txt" private const val DEFAULT_LINE_WIDTH = 60 private const val DEFAULT_THREADS = 8 - private const val DEFAULT_INPUT_DIR = "09_recombined_sequences/recombinate_fastas" + private const val DEFAULT_INPUT_DIR = "08_recombined_sequences/recombinate_fastas" } private val logger: Logger = LogManager.getLogger(FormatRecombinedFastas::class.java) @@ -53,7 +53,7 @@ class FormatRecombinedFastas : CliktCommand(name = "format-recombined-fastas") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/10_formatted_fastas)" + help = "Custom output directory (default: work_dir/output/09_formatted_fastas)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectFastaFiles(): List { diff --git a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt index 1e0e3a5..d15f8bf 100644 --- a/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt +++ b/src/main/kotlin/net/maizegenetics/commands/GenerateRecombinedSequences.kt @@ -12,18 +12,19 @@ import net.maizegenetics.utils.ProcessRunner import net.maizegenetics.utils.ValidationUtils import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger +import java.nio.file.Files import java.nio.file.Path import kotlin.io.path.* import kotlin.system.exitProcess class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-sequences") { companion object { - private const val LOG_FILE_NAME = "09_generate_recombined_sequences.log" - private const val RECOMBINED_RESULTS_DIR = "09_recombined_sequences" + private const val LOG_FILE_NAME = "08_generate_recombined_sequences.log" + private const val RECOMBINED_RESULTS_DIR = "08_recombined_sequences" private const val RECOMBINED_FASTAS_DIR = "recombinate_fastas" private const val FASTA_PATHS_FILE = "recombined_fasta_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/write_fastas.py" - private const val DEFAULT_FOUNDER_KEY_DIR = "08_coordinates_results" + private const val DEFAULT_FOUNDER_KEY_DIR = "07_coordinates_results" } private val logger: Logger = LogManager.getLogger(GenerateRecombinedSequences::class.java) @@ -59,7 +60,7 @@ class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-seq private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/09_recombined_sequences)" + help = "Custom output directory (default: work_dir/output/08_recombined_sequences)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { @@ -119,17 +120,58 @@ class GenerateRecombinedSequences : CliktCommand(name = "generate-recombined-seq } } + // Create symlinks for FASTA files with different extensions + // The Python script expects .fa extension, but files might have .fasta or .fna + logger.info("Checking FASTA file extensions in assembly directory") + val assemblyNames = assemblyList.readLines() + .filter { it.isNotBlank() } + .mapNotNull { line -> + val parts = line.split("\t") + if (parts.size >= 2) parts[1].trim() else null + } + + assemblyNames.forEach { name -> + val faFile = assemblyDir.resolve("$name.fa") + if (!faFile.exists()) { + // Look for alternative extensions + val alternatives = listOf("fasta", "fna") + for (ext in alternatives) { + val altFile = assemblyDir.resolve("$name.$ext") + if (altFile.exists()) { + // Create symlink: name.fa -> name.fasta (or .fna) + try { + Files.createSymbolicLink(faFile, altFile.fileName) + logger.debug("Created symlink: ${faFile.fileName} -> ${altFile.fileName}") + + // Also create symlink for the FASTA index file if it exists + val altIndexFile = assemblyDir.resolve("$name.$ext.fai") + val faIndexFile = assemblyDir.resolve("$name.fa.fai") + if (altIndexFile.exists() && !faIndexFile.exists()) { + Files.createSymbolicLink(faIndexFile, altIndexFile.fileName) + logger.debug("Created symlink: ${faIndexFile.fileName} -> ${altIndexFile.fileName}") + } + } catch (e: Exception) { + logger.warn("Failed to create symlink for $name: ${e.message}") + } + break + } + } + } + } + // Run write_fastas.py // Set PYTHONPATH so Python can find the 'python' package for imports - val pythonPath = mlimputeDir.resolve("src").toString() + // Use absolute paths since the working directory is set to outputDir + // Use sh -c to set PYTHONPATH inside pixi's environment + val pythonPath = mlimputeDir.resolve("src").toAbsolutePath().toString() + val scriptPath = pythonScript.toAbsolutePath().toString() + val assemblyListPath = assemblyList.toAbsolutePath().toString() + val chromosomeListPath = chromosomeList.toAbsolutePath().toString() + val assemblyDirPath = assemblyDir.toAbsolutePath().toString() + val shellCommand = "PYTHONPATH='$pythonPath' python '$scriptPath' --assembly-list '$assemblyListPath' --chromosome-list '$chromosomeListPath' --assembly-dir '$assemblyDirPath'" logger.info("Running write_fastas.py") val exitCode = ProcessRunner.runCommand( - "env", "PYTHONPATH=$pythonPath", - "pixi", "run", - "python", pythonScript.toString(), - "--assembly-list", assemblyList.toString(), - "--chromosome-list", chromosomeList.toString(), - "--assembly-dir", assemblyDir.toString(), + "pixi", "run", "sh", "-c", shellCommand, workingDir = outputDir.toFile(), logger = logger ) diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index 5c30892..f1306f0 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -1,5 +1,6 @@ package net.maizegenetics.commands +import biokotlin.seqIO.NucSeqIO import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.core.parse import com.github.ajalt.clikt.parameters.options.default @@ -30,7 +31,9 @@ data class PipelineConfig( val create_chain_files: CreateChainFilesConfig? = null, val convert_coordinates: ConvertCoordinatesConfig? = null, val generate_recombined_sequences: GenerateRecombinedSequencesConfig? = null, - val format_recombined_fastas: FormatRecombinedFastasConfig? = null + val format_recombined_fastas: FormatRecombinedFastasConfig? = null, + val mutated_maf_to_gvcf: MutatedMafToGvcfConfig? = null, + val rope_bwt_chr_index: RopeBwtChrIndexConfig? = null ) data class AlignAssembliesConfig( @@ -69,37 +72,35 @@ data class ConvertToFastaConfig( data class AlignMutatedAssembliesConfig( val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) - val ref_fasta: String? = null, // Optional: Reference FASTA (uses matching ref from step 4 output if not specified) - val fasta_input: String? = null, // Optional: Query FASTA input (uses non-ref files from step 4 if not specified) + val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) val threads: Int? = null, val output: String? = null // Custom output directory ) data class PickCrossoversConfig( - val assembly_list: String, + val assembly_list: String? = null, // Optional: If not specified, auto-generates from convert_to_fasta output val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) val output: String? = null // Custom output directory ) data class CreateChainFilesConfig( val jobs: Int? = null, - val input: String? = null, // Custom MAF input file/directory - val output: String? = null // Custom output directory + val maf_file_input: String? = null, // Custom MAF input file/directory (default: step 5 MAF outputs) + val output: String? = null // Custom output directory ) data class ConvertCoordinatesConfig( - val assembly_list: String, - val input_chain: String? = null, // Custom chain directory - val input_refkey: String? = null, // Custom refkey directory - val output: String? = null // Custom output directory + val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step + val input_chain: String? = null, // Custom chain directory + val input_refkey: String? = null, // Custom refkey directory + val output: String? = null // Custom output directory ) data class GenerateRecombinedSequencesConfig( - val assembly_list: String, - val chromosome_list: String, - val assembly_dir: String, - val input: String? = null, // Custom founder key directory - val output: String? = null // Custom output directory + val assembly_list: String? = null, // Optional: defaults to assembly list from pick_crossovers step + val chromosome_list: String? = null, // Optional: auto-derives from first assembly in assembly list + val assembly_dir: String? = null // Optional: defaults to step 4 FASTA output directory ) data class FormatRecombinedFastasConfig( @@ -109,9 +110,28 @@ data class FormatRecombinedFastasConfig( val output: String? = null // Custom output directory ) +data class MutatedMafToGvcfConfig( + val reference_file: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val maf_file: String? = null, // Optional: MAF file/directory/list (uses align_mutated_assemblies output if not specified) + val output_file: String? = null, // Optional: Output GVCF file name + val sample_name: String? = null, // Optional: Sample name for GVCF + val output_dir: String? = null // Optional: Custom GVCF output directory +) + +data class RopeBwtChrIndexConfig( + val keyfile: String? = null, // Optional: Pre-made keyfile (if not specified, auto-generates from format_recombined_fastas output) + val index_file_prefix: String? = null, // Optional: Prefix for index files (default: "phgIndex") + val threads: Int? = null, // Optional: Number of threads (default: 20) + val delete_fmr_index: Boolean? = null, // Optional: Delete .fmr files after conversion (default: true) + val output: String? = null // Optional: Custom output directory +) + class Orchestrate : CliktCommand(name = "orchestrate") { companion object { private const val LOG_FILE_NAME = "00_orchestrate.log" + // Regex patterns reused across multiple operations + private val FASTA_FILE_PATTERN = Regex(".*\\.(fa|fasta|fna)(\\.gz)?$") + private val FASTA_EXTENSION_PATTERN = Regex("\\.(fa|fasta|fna)(\\.gz)?$") } private val logger: Logger = LogManager.getLogger(Orchestrate::class.java) @@ -272,16 +292,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null - // Parse pick_crossovers + // Parse pick_crossovers - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val pickCrossoversMap = configMap["pick_crossovers"] as? Map - val pickCrossovers = pickCrossoversMap?.let { + val pickCrossovers = if (configMap.containsKey("pick_crossovers")) { PickCrossoversConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("pick_crossovers.assembly_list is required"), - ref_fasta = it["ref_fasta"] as? String, - output = it["output"] as? String + assembly_list = pickCrossoversMap?.get("assembly_list") as? String, + ref_fasta = pickCrossoversMap?.get("ref_fasta") as? String, + output = pickCrossoversMap?.get("output") as? String ) - } + } else null // Parse create_chain_files - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") @@ -289,35 +309,33 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val createChainFiles = if (configMap.containsKey("create_chain_files")) { CreateChainFilesConfig( jobs = createChainFilesMap?.get("jobs") as? Int, - input = createChainFilesMap?.get("input") as? String, + maf_file_input = createChainFilesMap?.get("maf_file_input") as? String, output = createChainFilesMap?.get("output") as? String ) } else null - // Parse convert_coordinates + // Parse convert_coordinates - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val convertCoordinatesMap = configMap["convert_coordinates"] as? Map - val convertCoordinates = convertCoordinatesMap?.let { + val convertCoordinates = if (configMap.containsKey("convert_coordinates")) { ConvertCoordinatesConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("convert_coordinates.assembly_list is required"), - input_chain = it["input_chain"] as? String, - input_refkey = it["input_refkey"] as? String, - output = it["output"] as? String + assembly_list = convertCoordinatesMap?.get("assembly_list") as? String, + input_chain = convertCoordinatesMap?.get("input_chain") as? String, + input_refkey = convertCoordinatesMap?.get("input_refkey") as? String, + output = convertCoordinatesMap?.get("output") as? String ) - } + } else null - // Parse generate_recombined_sequences + // Parse generate_recombined_sequences - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") val generateRecombinedSequencesMap = configMap["generate_recombined_sequences"] as? Map - val generateRecombinedSequences = generateRecombinedSequencesMap?.let { + val generateRecombinedSequences = if (configMap.containsKey("generate_recombined_sequences")) { GenerateRecombinedSequencesConfig( - assembly_list = it["assembly_list"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.assembly_list is required"), - chromosome_list = it["chromosome_list"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.chromosome_list is required"), - assembly_dir = it["assembly_dir"] as? String ?: throw IllegalArgumentException("generate_recombined_sequences.assembly_dir is required"), - input = it["input"] as? String, - output = it["output"] as? String + assembly_list = generateRecombinedSequencesMap?.get("assembly_list") as? String, + chromosome_list = generateRecombinedSequencesMap?.get("chromosome_list") as? String, + assembly_dir = generateRecombinedSequencesMap?.get("assembly_dir") as? String ) - } + } else null // Parse format_recombined_fastas - check if key exists (even with empty/null value means "run with defaults") @Suppress("UNCHECKED_CAST") @@ -331,6 +349,32 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null + // Parse mutated_maf_to_gvcf - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val mutatedMafToGvcfMap = configMap["mutated_maf_to_gvcf"] as? Map + val mutatedMafToGvcf = if (configMap.containsKey("mutated_maf_to_gvcf")) { + MutatedMafToGvcfConfig( + reference_file = mutatedMafToGvcfMap?.get("reference_file") as? String, + maf_file = mutatedMafToGvcfMap?.get("maf_file") as? String, + output_file = mutatedMafToGvcfMap?.get("output_file") as? String, + sample_name = mutatedMafToGvcfMap?.get("sample_name") as? String, + output_dir = mutatedMafToGvcfMap?.get("output_dir") as? String + ) + } else null + + // Parse rope_bwt_chr_index - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val ropeBwtChrIndexMap = configMap["rope_bwt_chr_index"] as? Map + val ropeBwtChrIndex = if (configMap.containsKey("rope_bwt_chr_index")) { + RopeBwtChrIndexConfig( + keyfile = ropeBwtChrIndexMap?.get("keyfile") as? String, + index_file_prefix = ropeBwtChrIndexMap?.get("index_file_prefix") as? String, + threads = ropeBwtChrIndexMap?.get("threads") as? Int, + delete_fmr_index = ropeBwtChrIndexMap?.get("delete_fmr_index") as? Boolean, + output = ropeBwtChrIndexMap?.get("output") as? String + ) + } else null + return PipelineConfig( work_dir = workDir, run_steps = runSteps, @@ -343,7 +387,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { create_chain_files = createChainFiles, convert_coordinates = convertCoordinates, generate_recombined_sequences = generateRecombinedSequences, - format_recombined_fastas = formatRecombinedFastas + format_recombined_fastas = formatRecombinedFastas, + mutated_maf_to_gvcf = mutatedMafToGvcf, + rope_bwt_chr_index = ropeBwtChrIndex ) } catch (e: Exception) { logger.error("Failed to parse configuration file: ${e.message}", e) @@ -395,12 +441,14 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var fastaOutputDir: Path? = null var refFasta: Path? = null var refGff: Path? = null - var mutatedRefFasta: Path? = null // Mutated reference FASTA from step 5 + var assemblyListPath: Path? = null // Assembly list from step 5 (pick_crossovers) var refkeyOutputDir: Path? = null var chainOutputDir: Path? = null var coordinatesOutputDir: Path? = null var recombinedFastasDir: Path? = null var formattedFastasDir: Path? = null + var mutatedMafFilePaths: Path? = null // MAF file paths from step 10 (align_mutated_assemblies) + var ropeBwtIndexDir: Path? = null // RopeBWT index output directory from step 12 try { // Step 1: Align Assemblies (if configured and should run) @@ -697,142 +745,65 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 5: Align Mutated Assemblies (if configured and should run) - if (config.align_mutated_assemblies != null && shouldRunStep("align_mutated_assemblies", config)) { + // Step 5: Pick Crossovers (if configured and should run) + if (config.pick_crossovers != null && shouldRunStep("pick_crossovers", config)) { logger.info("=".repeat(80)) - logger.info("STEP 5: Align Mutated Assemblies") + logger.info("STEP 5: Pick Crossovers") logger.info("=".repeat(80)) - // Determine ref_gff (config value or from step 1) - val step5RefGff = config.align_mutated_assemblies.ref_gff?.let { Path.of(it) } ?: refGff - if (step5RefGff == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available (specify 'ref_gff' in config or run align_assemblies first)") + // Use pick_crossovers.ref_fasta if specified, otherwise use ref FASTA from step 1 + val pickCrossoversRefFasta = config.pick_crossovers.ref_fasta?.let { Path.of(it) } + ?: refFasta + if (pickCrossoversRefFasta == null) { + throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config or run align_assemblies first)") } - // Determine ref_fasta and fasta_input from step 4 output - // If fastaOutputDir exists, find the reference FASTA (matching original ref name) and query FASTAs - var step5RefFasta: Path? = config.align_mutated_assemblies.ref_fasta?.let { Path.of(it) } - var step5FastaInput: Path? = config.align_mutated_assemblies.fasta_input?.let { Path.of(it) } - - // If not explicitly specified, try to derive from step 4 output - if (step5RefFasta == null || step5FastaInput == null) { - if (fastaOutputDir != null && fastaOutputDir.exists()) { - // Get the reference FASTA filename (without path) to match against step 4 output - val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") - - if (refFastaName != null) { - // Find all FASTA files in the output directory - val allFastaFiles = fastaOutputDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) - }?.map { it.toPath() } ?: emptyList() - - // Find the reference FASTA (filename contains the ref name) - val matchingRefFasta = allFastaFiles.find { path -> - path.fileName.toString().contains(refFastaName, ignoreCase = true) - } - - // Get non-reference FASTAs - val queryFastas = allFastaFiles.filter { path -> - !path.fileName.toString().contains(refFastaName, ignoreCase = true) - } - - if (step5RefFasta == null && matchingRefFasta != null) { - step5RefFasta = matchingRefFasta - logger.info("Auto-detected reference FASTA from step 4: $step5RefFasta") - } - - if (step5FastaInput == null && queryFastas.isNotEmpty()) { - // Create a text file listing the query FASTAs - val queryListFile = fastaOutputDir.resolve("query_fastas.txt") - queryListFile.writeText(queryFastas.joinToString("\n") { it.toAbsolutePath().toString() }) - step5FastaInput = queryListFile - logger.info("Auto-detected ${queryFastas.size} query FASTA files from step 4") - } - } + // Determine assembly list (custom or auto-generated from step 4) + val step6AssemblyList: Path = if (config.pick_crossovers.assembly_list != null) { + Path.of(config.pick_crossovers.assembly_list).toAbsolutePath().normalize() + } else { + // Auto-generate assembly list from step 4 output (fastaOutputDir) + if (fastaOutputDir == null || !fastaOutputDir.exists()) { + throw RuntimeException("Cannot run pick-crossovers: no assembly_list provided and no FASTA output directory available from convert_to_fasta step") } - - // Fall back to original behavior if auto-detection failed - if (step5RefFasta == null) { - step5RefFasta = refFasta + + // Get all FASTA files from step 4 output + val fastaFiles = fastaOutputDir.toFile().listFiles { file -> + file.isFile && file.name.matches(FASTA_FILE_PATTERN) + }?.map { it.toPath() }?.sorted() ?: emptyList() + + if (fastaFiles.isEmpty()) { + throw RuntimeException("Cannot run pick-crossovers: no FASTA files found in $fastaOutputDir") } - if (step5FastaInput == null) { - step5FastaInput = fastaOutputDir + + // Create assembly list file with pathname format + // Name is derived from filename minus extension (keeping _mutated suffix) + val assemblyListFile = fastaOutputDir.resolve("auto_assembly_list.txt") + val lines = fastaFiles.map { fastaPath -> + val fileName = fastaPath.fileName.toString() + // Remove extension (including .gz if present) + val baseName = fileName.replace(FASTA_EXTENSION_PATTERN, "") + "${fastaPath.toAbsolutePath()}\t$baseName" } + assemblyListFile.writeText(lines.joinToString("\n")) + logger.info("Auto-generated assembly list file: $assemblyListFile") + logger.info(" Contains ${fastaFiles.size} assemblies") + + assemblyListFile } - if (step5FastaInput == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'fasta_input' in config or run convert-to-fasta first)") - } - if (step5RefFasta == null) { - throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available (specify 'ref_fasta' in config or run convert-to-fasta first)") - } - - // Determine output directory (custom or default) - val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } - - val args = buildList { - add("--work-dir=${workDir}") - add("--ref-gff=${step5RefGff}") - add("--ref-fasta=${step5RefFasta}") - add("--fasta-input=${step5FastaInput}") - if (config.align_mutated_assemblies.threads != null) { - add("--threads=${config.align_mutated_assemblies.threads}") - } - if (customOutput != null) { - add("--output-dir=${customOutput}") - } + // Validate that the number of assemblies is even + val assemblyCount = step6AssemblyList.readLines().filter { it.isNotBlank() }.size + if (assemblyCount % 2 != 0) { + throw RuntimeException( + "Cannot run pick-crossovers: assembly list contains $assemblyCount assemblies, " + + "but this step requires an even number of assembly files to work (assemblies are paired for crossover simulation)" + ) } + logger.info("Assembly list contains $assemblyCount assemblies (validated: even count)") - AlignMutatedAssemblies().parse(args) - restoreOrchestratorLogging(workDir) - - // Save the mutated reference FASTA for use in step 6 - mutatedRefFasta = step5RefFasta - - logger.info("Step 5 completed successfully") - logger.info("") - } else { - if (config.align_mutated_assemblies != null) { - logger.info("Skipping align-mutated-assemblies (not in run_steps)") - - // Try to recover mutated ref FASTA from step 5 config or step 4 output - if (config.align_mutated_assemblies.ref_fasta != null) { - mutatedRefFasta = Path.of(config.align_mutated_assemblies.ref_fasta) - logger.info("Using configured mutated reference FASTA: $mutatedRefFasta") - } else if (fastaOutputDir != null && fastaOutputDir.exists()) { - // Try to auto-detect from step 4 output - val refFastaName = refFasta?.fileName?.toString()?.replace(Regex("\\.(fa|fasta|fna)(\\.gz)?$"), "") - if (refFastaName != null) { - val matchingRefFasta = fastaOutputDir.toFile().listFiles { file -> - file.isFile && file.name.matches(Regex(".*\\.(fa|fasta|fna)(\\.gz)?$")) && - file.name.contains(refFastaName, ignoreCase = true) - }?.firstOrNull()?.toPath() - if (matchingRefFasta != null) { - mutatedRefFasta = matchingRefFasta - logger.info("Auto-detected mutated reference FASTA: $mutatedRefFasta") - } - } - } - } else { - logger.info("Skipping align-mutated-assemblies (not configured)") - } - logger.info("") - } - - // Step 6: Pick Crossovers (if configured and should run) - if (config.pick_crossovers != null && shouldRunStep("pick_crossovers", config)) { - logger.info("=".repeat(80)) - logger.info("STEP 6: Pick Crossovers") - logger.info("=".repeat(80)) - - // Use pick_crossovers.ref_fasta if specified, otherwise use mutated ref FASTA from step 5, - // finally fall back to original ref FASTA from step 1 - val pickCrossoversRefFasta = config.pick_crossovers.ref_fasta?.let { Path.of(it) } - ?: mutatedRefFasta - ?: refFasta - if (pickCrossoversRefFasta == null) { - throw RuntimeException("Cannot run pick-crossovers: reference FASTA not available (specify 'ref_fasta' in pick_crossovers config, run align_mutated_assemblies, or run align_assemblies first)") - } + // Save assembly list path for use in steps 8 and 9 + assemblyListPath = step6AssemblyList // Determine output directory (custom or default) val customOutput = config.pick_crossovers.output?.let { Path.of(it) } @@ -840,7 +811,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { val args = buildList { add("--work-dir=${workDir}") add("--ref-fasta=${pickCrossoversRefFasta}") - add("--assembly-list=${config.pick_crossovers.assembly_list}") + add("--assembly-list=${step6AssemblyList}") if (customOutput != null) { add("--output-dir=${customOutput}") } @@ -850,13 +821,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - refkeyOutputDir = customOutput ?: workDir.resolve("output").resolve("06_crossovers_results") + refkeyOutputDir = customOutput ?: workDir.resolve("output").resolve("05_crossovers_results") if (!refkeyOutputDir.exists()) { throw RuntimeException("Expected refkey output directory not found: $refkeyOutputDir") } - logger.info("Step 6 completed successfully") + logger.info("Step 5 completed successfully") logger.info("") } else { if (config.pick_crossovers != null) { @@ -864,30 +835,44 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.pick_crossovers.output?.let { Path.of(it) } - val previousRefkeyDir = customOutput ?: workDir.resolve("output").resolve("06_crossovers_results") + val previousRefkeyDir = customOutput ?: workDir.resolve("output").resolve("05_crossovers_results") if (previousRefkeyDir.exists()) { refkeyOutputDir = previousRefkeyDir logger.info("Using previous pick-crossovers outputs: $refkeyOutputDir") } else { logger.warn("Previous pick-crossovers outputs not found. Downstream steps may fail.") } + + // Try to recover assembly list from config or auto-generated file + if (config.pick_crossovers.assembly_list != null) { + assemblyListPath = Path.of(config.pick_crossovers.assembly_list).toAbsolutePath().normalize() + logger.info("Using configured assembly list: $assemblyListPath") + } else if (fastaOutputDir != null) { + val autoGeneratedList = fastaOutputDir.resolve("auto_assembly_list.txt") + if (autoGeneratedList.exists()) { + assemblyListPath = autoGeneratedList + logger.info("Using auto-generated assembly list from previous run: $assemblyListPath") + } + } } else { logger.info("Skipping pick-crossovers (not configured)") } logger.info("") } - // Step 7: Create Chain Files (if configured and should run) + // Step 6: Create Chain Files (if configured and should run) if (config.create_chain_files != null && shouldRunStep("create_chain_files", config)) { logger.info("=".repeat(80)) - logger.info("STEP 7: Create Chain Files") + logger.info("STEP 6: Create Chain Files") logger.info("=".repeat(80)) - // Determine input (custom or from previous step) - val mafInput = config.create_chain_files.input?.let { Path.of(it) } ?: mafFilePaths + // Determine input (custom or step 1 MAF files) + val mafInput = config.create_chain_files.maf_file_input?.let { Path.of(it) } + ?: mafFilePaths if (mafInput == null) { - throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'input' in config or run align-assemblies first)") + throw RuntimeException("Cannot run create-chain-files: no MAF input available (specify 'maf_file_input' in config or run align-assemblies first)") } + logger.info("MAF input: $mafInput") // Determine output directory (custom or default) val customOutput = config.create_chain_files.output?.let { Path.of(it) } @@ -907,13 +892,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - chainOutputDir = customOutput ?: workDir.resolve("output").resolve("07_chain_results") + chainOutputDir = customOutput ?: workDir.resolve("output").resolve("06_chain_results") if (!chainOutputDir.exists()) { throw RuntimeException("Expected chain output directory not found: $chainOutputDir") } - logger.info("Step 7 completed successfully") + logger.info("Step 6 completed successfully") logger.info("") } else { if (config.create_chain_files != null) { @@ -921,7 +906,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.create_chain_files.output?.let { Path.of(it) } - val previousChainDir = customOutput ?: workDir.resolve("output").resolve("07_chain_results") + val previousChainDir = customOutput ?: workDir.resolve("output").resolve("06_chain_results") if (previousChainDir.exists()) { chainOutputDir = previousChainDir logger.info("Using previous create-chain-files outputs: $chainOutputDir") @@ -934,10 +919,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 8: Convert Coordinates (if configured and should run) + // Step 7: Convert Coordinates (if configured and should run) if (config.convert_coordinates != null && shouldRunStep("convert_coordinates", config)) { logger.info("=".repeat(80)) - logger.info("STEP 8: Convert Coordinates") + logger.info("STEP 7: Convert Coordinates") logger.info("=".repeat(80)) // Determine chain input (custom or from previous step) @@ -949,12 +934,21 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine refkey input (custom or from previous step) val refkeyInput = config.convert_coordinates.input_refkey?.let { Path.of(it) } ?: refkeyOutputDir + // Determine assembly list (custom or from step 6) + val step8AssemblyList = config.convert_coordinates.assembly_list?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: assemblyListPath + if (step8AssemblyList == null) { + throw RuntimeException("Cannot run convert-coordinates: no assembly_list available (specify 'assembly_list' in config or run pick-crossovers first)") + } + logger.info("Assembly list: $step8AssemblyList") + // Determine output directory (custom or default) val customOutput = config.convert_coordinates.output?.let { Path.of(it) } val args = buildList { add("--work-dir=${workDir}") - add("--assembly-list=${config.convert_coordinates.assembly_list}") + add("--assembly-list=${step8AssemblyList}") add("--chain-dir=${chainInput}") if (refkeyInput != null) { add("--refkey-dir=${refkeyInput}") @@ -968,13 +962,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - coordinatesOutputDir = customOutput ?: workDir.resolve("output").resolve("08_coordinates_results") + coordinatesOutputDir = customOutput ?: workDir.resolve("output").resolve("07_coordinates_results") if (!coordinatesOutputDir.exists()) { throw RuntimeException("Expected coordinates output directory not found: $coordinatesOutputDir") } - logger.info("Step 8 completed successfully") + logger.info("Step 7 completed successfully") logger.info("") } else { if (config.convert_coordinates != null) { @@ -982,7 +976,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.convert_coordinates.output?.let { Path.of(it) } - val previousCoordsDir = customOutput ?: workDir.resolve("output").resolve("08_coordinates_results") + val previousCoordsDir = customOutput ?: workDir.resolve("output").resolve("07_coordinates_results") if (previousCoordsDir.exists()) { coordinatesOutputDir = previousCoordsDir logger.info("Using previous convert-coordinates outputs: $coordinatesOutputDir") @@ -995,48 +989,91 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 9: Generate Recombined Sequences (if configured and should run) + // Step 8: Generate Recombined Sequences (if configured and should run) if (config.generate_recombined_sequences != null && shouldRunStep("generate_recombined_sequences", config)) { logger.info("=".repeat(80)) - logger.info("STEP 9: Generate Recombined Sequences") + logger.info("STEP 8: Generate Recombined Sequences") logger.info("=".repeat(80)) - // Determine founder key input (custom or from previous step) - val founderKeyInput = config.generate_recombined_sequences.input?.let { Path.of(it) } ?: coordinatesOutputDir - if (founderKeyInput == null) { - throw RuntimeException("Cannot run generate-recombined-sequences: no founder key input available (specify 'input' in config or run convert-coordinates first)") + // Use founder key input from previous step (convert_coordinates) + if (coordinatesOutputDir == null) { + throw RuntimeException("Cannot run generate-recombined-sequences: no founder key input available (run convert-coordinates first)") } + logger.info("Founder key directory: $coordinatesOutputDir") - // Determine output directory (custom or default) - val customOutput = config.generate_recombined_sequences.output?.let { Path.of(it) } + // Determine assembly list (custom or from step 6) + val step9AssemblyList = config.generate_recombined_sequences.assembly_list?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: assemblyListPath + if (step9AssemblyList == null) { + throw RuntimeException("Cannot run generate-recombined-sequences: no assembly_list available (specify 'assembly_list' in config or run pick-crossovers first)") + } + logger.info("Assembly list: $step9AssemblyList") + + // Determine chromosome list (custom or auto-derived from first assembly) + val step9ChromosomeList: Path = if (config.generate_recombined_sequences.chromosome_list != null) { + Path.of(config.generate_recombined_sequences.chromosome_list).toAbsolutePath().normalize() + } else { + // Auto-derive chromosome list from the first assembly in the assembly list + val firstLine = step9AssemblyList.readLines().firstOrNull { it.isNotBlank() } + ?: throw RuntimeException("Cannot run generate-recombined-sequences: assembly list is empty") + + // Assembly list format: pathname - extract the path (first column) + val firstAssemblyPath = firstLine.split("\t").firstOrNull()?.trim() + ?: throw RuntimeException("Cannot run generate-recombined-sequences: invalid assembly list format") + + logger.info("Auto-deriving chromosome list from first assembly: $firstAssemblyPath") + + // Use BioKotlin to read the FASTA and extract chromosome IDs + val seq = NucSeqIO(firstAssemblyPath).readAll() + val chromosomeIds = seq.keys.toList() + + if (chromosomeIds.isEmpty()) { + throw RuntimeException("Cannot run generate-recombined-sequences: no chromosomes found in $firstAssemblyPath") + } + + // Write chromosome list to a temporary file + val chromosomeListFile = workDir.resolve("output").resolve("08_recombined_sequences").resolve("auto_chromosome_list.txt") + chromosomeListFile.parent.createDirectories() + chromosomeListFile.writeText(chromosomeIds.joinToString("\n")) + logger.info("Auto-generated chromosome list: $chromosomeListFile") + logger.info(" Contains ${chromosomeIds.size} chromosomes: ${chromosomeIds.take(5).joinToString(", ")}${if (chromosomeIds.size > 5) ", ..." else ""}") + + chromosomeListFile + } + + // Determine output directory (step 8 default output) + val outputBase = workDir.resolve("output").resolve("08_recombined_sequences") + + // Determine assembly directory (custom or from step 4 FASTA output) + // The Python script needs to read parent FASTA files which are in the FASTA output directory + val step9AssemblyDir = config.generate_recombined_sequences.assembly_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: fastaOutputDir ?: throw RuntimeException("Cannot run generate-recombined-sequences: no assembly directory available (specify 'assembly_dir' in config or run convert-to-fasta first)") + logger.info("Assembly directory: $step9AssemblyDir") val args = buildList { add("--work-dir=${workDir}") - add("--assembly-list=${config.generate_recombined_sequences.assembly_list}") - add("--chromosome-list=${config.generate_recombined_sequences.chromosome_list}") - add("--assembly-dir=${config.generate_recombined_sequences.assembly_dir}") - add("--founder-key-dir=${founderKeyInput}") - if (customOutput != null) { - add("--output-dir=${customOutput}") - } + add("--assembly-list=${step9AssemblyList}") + add("--chromosome-list=${step9ChromosomeList}") + add("--assembly-dir=${step9AssemblyDir}") + add("--founder-key-dir=${coordinatesOutputDir}") } GenerateRecombinedSequences().parse(args) restoreOrchestratorLogging(workDir) - // Get output directory (use custom or default) - val outputBase = customOutput ?: workDir.resolve("output").resolve("09_recombined_sequences") + // Get output directory recombinedFastasDir = outputBase.resolve("recombinate_fastas") - logger.info("Step 9 completed successfully") + logger.info("Step 8 completed successfully") logger.info("") } else { if (config.generate_recombined_sequences != null) { logger.info("Skipping generate-recombined-sequences (not in run_steps)") - // Check custom output location first, then default - val customOutput = config.generate_recombined_sequences.output?.let { Path.of(it) } - val outputBase = customOutput ?: workDir.resolve("output").resolve("09_recombined_sequences") + // Check default output location + val outputBase = workDir.resolve("output").resolve("08_recombined_sequences") val previousRecombinedDir = outputBase.resolve("recombinate_fastas") if (previousRecombinedDir.exists()) { recombinedFastasDir = previousRecombinedDir @@ -1050,10 +1087,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } - // Step 10: Format Recombined Fastas (if configured and should run) + // Step 9: Format Recombined Fastas (if configured and should run) if (config.format_recombined_fastas != null && shouldRunStep("format_recombined_fastas", config)) { logger.info("=".repeat(80)) - logger.info("STEP 10: Format Recombined Fastas") + logger.info("STEP 9: Format Recombined Fastas") logger.info("=".repeat(80)) // Determine input (custom or from previous step) @@ -1083,9 +1120,9 @@ class Orchestrate : CliktCommand(name = "orchestrate") { restoreOrchestratorLogging(workDir) // Get output directory (use custom or default) - formattedFastasDir = customOutput ?: workDir.resolve("output").resolve("10_formatted_fastas") + formattedFastasDir = customOutput ?: workDir.resolve("output").resolve("09_formatted_fastas") - logger.info("Step 10 completed successfully") + logger.info("Step 9 completed successfully") logger.info("") } else { if (config.format_recombined_fastas != null) { @@ -1093,7 +1130,7 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Check custom output location first, then default val customOutput = config.format_recombined_fastas.output?.let { Path.of(it) } - val previousFormattedDir = customOutput ?: workDir.resolve("output").resolve("10_formatted_fastas") + val previousFormattedDir = customOutput ?: workDir.resolve("output").resolve("09_formatted_fastas") if (previousFormattedDir.exists()) { formattedFastasDir = previousFormattedDir logger.info("Using previous format-recombined-fastas outputs: $formattedFastasDir") @@ -1106,6 +1143,283 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } + // Step 10: Align Mutated Assemblies (if configured and should run) + if (config.align_mutated_assemblies != null && shouldRunStep("align_mutated_assemblies", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 10: Align Mutated Assemblies") + logger.info("=".repeat(80)) + + // Determine ref_gff (config value or from step 1) + val step10RefGff = config.align_mutated_assemblies.ref_gff?.let { Path.of(it) } ?: refGff + if (step10RefGff == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference GFF not available (specify 'ref_gff' in config or run align_assemblies first)") + } + + // Determine ref_fasta (config value or from step 1) + val step10RefFasta = config.align_mutated_assemblies.ref_fasta?.let { Path.of(it) } ?: refFasta + if (step10RefFasta == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: reference FASTA not available (specify 'ref_fasta' in config or run align_assemblies first)") + } + + // Determine fasta_input (config value or from format_recombined_fastas output) + val step10FastaInput = config.align_mutated_assemblies.fasta_input?.let { Path.of(it) } ?: formattedFastasDir + if (step10FastaInput == null) { + throw RuntimeException("Cannot run align-mutated-assemblies: no FASTA input available (specify 'fasta_input' in config or run format-recombined-fastas first)") + } + + logger.info("Reference GFF: $step10RefGff") + logger.info("Reference FASTA: $step10RefFasta") + logger.info("FASTA input: $step10FastaInput") + + // Determine output directory (custom or default) + val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } + + val args = buildList { + add("--work-dir=${workDir}") + add("--ref-gff=${step10RefGff}") + add("--ref-fasta=${step10RefFasta}") + add("--fasta-input=${step10FastaInput}") + if (config.align_mutated_assemblies.threads != null) { + add("--threads=${config.align_mutated_assemblies.threads}") + } + if (customOutput != null) { + add("--output-dir=${customOutput}") + } + } + + AlignMutatedAssemblies().parse(args) + restoreOrchestratorLogging(workDir) + + // Get output path (use custom or default) + val outputBase = customOutput ?: workDir.resolve("output").resolve("10_mutated_alignment_results") + mutatedMafFilePaths = outputBase.toAbsolutePath().normalize().resolve("maf_file_paths.txt") + + if (!mutatedMafFilePaths.exists()) { + throw RuntimeException("Expected MAF paths file not found: $mutatedMafFilePaths") + } + + logger.info("Step 10 completed successfully") + logger.info("") + } else { + if (config.align_mutated_assemblies != null) { + logger.info("Skipping align-mutated-assemblies (not in run_steps)") + + // Check custom output location first, then default + val customOutput = config.align_mutated_assemblies.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val outputBase = (customOutput ?: workDir.resolve("output").resolve("10_mutated_alignment_results")) + .toAbsolutePath().normalize() + val previousMafPaths = outputBase.resolve("maf_file_paths.txt") + + if (previousMafPaths.exists()) { + mutatedMafFilePaths = previousMafPaths + logger.info("Using previous align-mutated-assemblies outputs: $mutatedMafFilePaths") + } else { + logger.warn("Previous align-mutated-assemblies outputs not found. Downstream steps may fail.") + } + } else { + logger.info("Skipping align-mutated-assemblies (not configured)") + } + logger.info("") + } + + // Step 11: Mutated MAF to GVCF (if configured and should run) + if (config.mutated_maf_to_gvcf != null && shouldRunStep("mutated_maf_to_gvcf", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 11: Mutated MAF to GVCF Conversion") + logger.info("=".repeat(80)) + + // Determine reference file (custom or from step 1) - resolve to absolute path + val step11RefFasta = config.mutated_maf_to_gvcf.reference_file?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: refFasta + if (step11RefFasta == null) { + throw RuntimeException("Cannot run mutated-maf-to-gvcf: reference FASTA not available (specify 'reference_file' in config or run align-assemblies first)") + } + + // Determine MAF input (custom or from step 10) - resolve to absolute path + val mafInput = config.mutated_maf_to_gvcf.maf_file?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: mutatedMafFilePaths + if (mafInput == null) { + throw RuntimeException("Cannot run mutated-maf-to-gvcf: no MAF input available (specify 'maf_file' in config or run align-mutated-assemblies first)") + } + + // Determine output directory (custom or default) - resolve to absolute path + // Always use step 11 output directory by default (not MafToGvcf's default) + val mutatedGvcfOutputDir = (config.mutated_maf_to_gvcf.output_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) + .toAbsolutePath().normalize() + + // Determine output file if specified - resolve to absolute path + val outputFile = config.mutated_maf_to_gvcf.output_file?.let { + Path.of(it).toAbsolutePath().normalize() + } + + logger.info("Reference FASTA: $step11RefFasta") + logger.info("MAF input: $mafInput") + logger.info("Output directory: $mutatedGvcfOutputDir") + + val args = buildList { + add("--work-dir=$workDir") + add("--reference-file=$step11RefFasta") + add("--maf-file=$mafInput") + add("--output-dir=$mutatedGvcfOutputDir") // Always pass output dir to ensure step 11 location + if (outputFile != null) { + add("--output-file=$outputFile") + } + if (config.mutated_maf_to_gvcf.sample_name != null) { + add("--sample-name=${config.mutated_maf_to_gvcf.sample_name}") + } + } + + MafToGvcf().parse(args) + restoreOrchestratorLogging(workDir) + + if (!mutatedGvcfOutputDir.exists()) { + throw RuntimeException("Expected mutated GVCF output directory not found: $mutatedGvcfOutputDir") + } + + logger.info("Step 11 completed successfully") + logger.info("") + } else { + if (config.mutated_maf_to_gvcf != null) { + logger.info("Skipping mutated-maf-to-gvcf (not in run_steps)") + } else { + logger.info("Skipping mutated-maf-to-gvcf (not configured)") + } + logger.info("") + } + + // Step 12: RopeBWT Chr Index (if configured and should run) + if (config.rope_bwt_chr_index != null && shouldRunStep("rope_bwt_chr_index", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 12: RopeBWT Chr Index") + logger.info("=".repeat(80)) + + // Determine output directory (custom or default) - resolve to absolute path + val customOutput = config.rope_bwt_chr_index.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val outputBase = customOutput ?: workDir.resolve("output").resolve("12_rope_bwt_index_results") + outputBase.createDirectories() + + // Determine keyfile - either provided or auto-generated from format_recombined_fastas output + val actualKeyfile: Path = if (config.rope_bwt_chr_index.keyfile != null) { + // Use provided keyfile + val keyfilePath = Path.of(config.rope_bwt_chr_index.keyfile).toAbsolutePath().normalize() + logger.info("Using provided keyfile: $keyfilePath") + keyfilePath + } else { + // Auto-generate keyfile from format_recombined_fastas output + if (formattedFastasDir == null || !formattedFastasDir.exists()) { + throw RuntimeException("Cannot run rope-bwt-chr-index: no FASTA input available (specify 'keyfile' in config or run format-recombined-fastas first)") + } + + logger.info("Auto-generating keyfile from formatted FASTA files in: $formattedFastasDir") + + // Collect FASTA files + val fastaFiles = formattedFastasDir.toFile().listFiles { file -> + file.isFile && file.name.matches(FASTA_FILE_PATTERN) + }?.map { it.toPath() }?.sorted() ?: emptyList() + + if (fastaFiles.isEmpty()) { + throw RuntimeException("Cannot run rope-bwt-chr-index: no FASTA files found in $formattedFastasDir") + } + + logger.info("Found ${fastaFiles.size} FASTA files") + + // Generate keyfile with sample names derived from filenames (no header) + val keyfilePath = outputBase.resolve("phg_keyfile.txt") + val keyfileLines = mutableListOf() + val renamedSamples = mutableListOf>() // original -> fixed + + fastaFiles.forEach { fastaFile -> + var sampleName = fastaFile.fileName.toString() + .replace(FASTA_EXTENSION_PATTERN, "") + + // Replace underscores with hyphens and warn + if (sampleName.contains("_")) { + val originalName = sampleName + sampleName = sampleName.replace("_", "-") + renamedSamples.add(Pair(originalName, sampleName)) + } + + keyfileLines.add("${fastaFile.toAbsolutePath()}\t$sampleName") + } + + // Write keyfile + keyfilePath.writeText(keyfileLines.joinToString("\n")) + logger.info("Generated keyfile: $keyfilePath") + + // Warn about renamed samples + if (renamedSamples.isNotEmpty()) { + logger.warn("WARNING: The following sample names contained underscores and were converted to hyphens:") + renamedSamples.forEach { (original, fixed) -> + logger.warn(" '$original' -> '$fixed'") + } + logger.warn("PHG uses underscores internally for contig renaming (format: samplename_contig)") + } + + keyfilePath + } + + logger.info("Keyfile: $actualKeyfile") + + val args = buildList { + add("--work-dir=$workDir") + add("--keyfile=$actualKeyfile") + if (config.rope_bwt_chr_index.index_file_prefix != null) { + add("--index-file-prefix=${config.rope_bwt_chr_index.index_file_prefix}") + } + if (config.rope_bwt_chr_index.threads != null) { + add("--threads=${config.rope_bwt_chr_index.threads}") + } + // delete-fmr-index is a presence flag (include only when true) + if (config.rope_bwt_chr_index.delete_fmr_index == true) { + add("--delete-fmr-index") + } + if (customOutput != null) { + add("--output-dir=$customOutput") + } + } + + RopeBwtChrIndex().parse(args) + restoreOrchestratorLogging(workDir) + + // Track output directory + ropeBwtIndexDir = outputBase + + if (!ropeBwtIndexDir.exists()) { + throw RuntimeException("Expected RopeBWT index output directory not found: $ropeBwtIndexDir") + } + + logger.info("Step 12 completed successfully") + logger.info("") + } else { + if (config.rope_bwt_chr_index != null) { + logger.info("Skipping rope-bwt-chr-index (not in run_steps)") + + // Check custom output location first, then default + val customOutput = config.rope_bwt_chr_index.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val previousIndexDir = (customOutput ?: workDir.resolve("output").resolve("12_rope_bwt_index_results")) + .toAbsolutePath().normalize() + if (previousIndexDir.exists()) { + ropeBwtIndexDir = previousIndexDir + logger.info("Using previous rope-bwt-chr-index outputs: $ropeBwtIndexDir") + } else { + logger.warn("Previous rope-bwt-chr-index outputs not found. Downstream steps may fail.") + } + } else { + logger.info("Skipping rope-bwt-chr-index (not configured)") + } + logger.info("") + } + // Pipeline completed successfully logger.info("=".repeat(80)) logger.info("PIPELINE COMPLETED SUCCESSFULLY!") diff --git a/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt b/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt index 974606a..6063bf9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt +++ b/src/main/kotlin/net/maizegenetics/commands/PickCrossovers.kt @@ -18,8 +18,8 @@ import kotlin.system.exitProcess class PickCrossovers : CliktCommand(name = "pick-crossovers") { companion object { - private const val LOG_FILE_NAME = "06_pick_crossovers.log" - private const val CROSSOVERS_RESULTS_DIR = "06_crossovers_results" + private const val LOG_FILE_NAME = "05_pick_crossovers.log" + private const val CROSSOVERS_RESULTS_DIR = "05_crossovers_results" private const val REFKEY_PATHS_FILE = "refkey_file_paths.txt" private const val PYTHON_SCRIPT = "src/python/cross/pick_crossovers.py" } @@ -46,7 +46,7 @@ class PickCrossovers : CliktCommand(name = "pick-crossovers") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/06_crossovers_results)" + help = "Custom output directory (default: work_dir/output/05_crossovers_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) override fun run() { diff --git a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt index 479c6e6..be81f3f 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt @@ -2,8 +2,8 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.options.flag import com.github.ajalt.clikt.parameters.options.option -import com.github.ajalt.clikt.parameters.types.boolean import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path import net.maizegenetics.Constants @@ -20,11 +20,10 @@ import kotlin.system.exitProcess class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { companion object { private const val LOG_FILE_NAME = "11_rope_bwt_chr_index.log" - private const val ROPE_BWT_RESULTS_DIR = "11_rope_bwt_index_results" + private const val ROPE_BWT_RESULTS_DIR = "12_rope_bwt_index_results" private const val KEYFILE_NAME = "phg_keyfile.txt" private const val DEFAULT_INDEX_PREFIX = "phgIndex" private const val DEFAULT_THREADS = 20 - private const val DEFAULT_DELETE_FMR = true } private val logger: Logger = LogManager.getLogger(RopeBwtChrIndex::class.java) @@ -42,12 +41,12 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { private val keyfile by option( "--keyfile", "-k", - help = "Pre-made keyfile (tab-delimited with 'Fasta' and 'SampleName' columns). Mutually exclusive with --fasta-input." + help = "Pre-made keyfile (tab-delimited: fasta_pathsample_name, no header). Mutually exclusive with --fasta-input." ).path(mustExist = true, canBeFile = true, canBeDir = false) private val outputDirOption by option( "--output-dir", "-o", - help = "Output directory for index files (default: work_dir/output/11_rope_bwt_index_results)" + help = "Output directory for index files (default: work_dir/output/12_rope_bwt_index_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private val indexFilePrefix by option( @@ -63,9 +62,8 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { private val deleteFmrIndex by option( "--delete-fmr-index", - help = "Delete .fmr index files after converting to .fmd" - ).boolean() - .default(DEFAULT_DELETE_FMR) + help = "Delete .fmr index files after converting to .fmd (presence flag: include to enable)" + ).flag() private fun collectFastaFiles(): List { return FileUtils.collectFiles( @@ -80,7 +78,7 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { logger.info("Generating keyfile from FASTA files") val keyfilePath = outputDir.resolve(KEYFILE_NAME) - val keyfileLines = mutableListOf("Fasta\tSampleName") +val keyfileLines = mutableListOf() val problemNames = mutableListOf() fastaFiles.forEach { fastaFile -> @@ -119,19 +117,12 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { exitProcess(1) } - val header = lines[0].split("\t") - if (header.size != 2 || header[0] != "Fasta" || header[1] != "SampleName") { - logger.error("Invalid keyfile header. Expected: 'Fasta\\tSampleName'") - logger.error("Got: ${lines[0]}") - exitProcess(1) - } - val problemNames = mutableListOf() - for (i in 1 until lines.size) { + for (i in lines.indices) { val parts = lines[i].split("\t") if (parts.size != 2) { - logger.error("Invalid keyfile line $i: ${lines[i]}") - logger.error("Expected 2 tab-separated columns") + logger.error("Invalid keyfile line ${i + 1}: ${lines[i]}") + logger.error("Expected 2 tab-separated columns (fasta_pathsample_name)") exitProcess(1) } @@ -202,14 +193,22 @@ class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { // Run PHG rope-bwt-chr-index command logger.info("Running PHG rope-bwt-chr-index...") - val exitCode = ProcessRunner.runCommand( + + // Build command arguments - delete-fmr-index is a presence flag (no value) + val commandArgs = mutableListOf( phgBinary.toString(), "rope-bwt-chr-index", "--keyfile", actualKeyfile.toString(), "--output-dir", outputDir.toString(), "--index-file-prefix", indexFilePrefix, - "--threads", threads.toString(), - "--delete-fmr-index", deleteFmrIndex.toString(), + "--threads", threads.toString() + ) + if (deleteFmrIndex) { + commandArgs.add("--delete-fmr-index") + } + + val exitCode = ProcessRunner.runCommand( + *commandArgs.toTypedArray(), workingDir = workDir.toFile(), logger = logger )