# seq_sim/pipeline_config.example.yaml (maize-genetics/seq_sim)
# Example Pipeline Configuration for seq_sim orchestrate command
# This file demonstrates all available configuration options

# ============================================================================
# seqSim v0.2 PIPELINE OVERVIEW
# ============================================================================
# This pipeline consists of three main workflows:
#
# VARIANT PIPELINE (Generate variants):
# 01. align-assemblies:              Align query assemblies to reference using AnchorWave
# 02. maf-to-gvcf:                   Convert MAF alignment files to compressed GVCF format
# 03. downsample-gvcf:               Downsample variants at specified rates per chromosome
# 04. convert-to-fasta:              Generate FASTA files from downsampled variants
#
# RECOMBINATION PIPELINE (Generate Recombinant Genomes):
# 05. pick-crossovers:               Simulate crossover events in reference coordinates
# 06. create-chain-files:            Convert MAF files to chain format for coordinate conversion
# 07. convert-coordinates:           Convert crossover points to assembly coordinates
# 08. generate-recombined-sequences: Create recombined FASTA files from parent assemblies
# 09. format-recombined-fastas:      Format recombined FASTA files with consistent line widths
#
# PS4G CREATION (PHG Indexing for Genotype Imputation):
# 10. align-mutated-assemblies:      Realign formatted recombined FASTA files to reference
# 11. mutated-maf-to-gvcf:           Convert mutated MAF files to compressed GVCF format
# 12. rope-bwt-chr-index:            Create PHGv2 ropebwt3 index from recombined FASTA files
# 13. ropebwt-mem:                   Align FASTQ reads to ropebwt3 index and generate BED files
# 14. build-spline-knots:            Build spline knots from hVCF or gVCF files for imputation
# 15. convert-ropebwt2ps4g:          Convert RopeBWT3 BED alignments to PS4G format
#
# NOTE (1): Environment setup is automatic! The orchestrate command will automatically
#       detect if the working directory and required tools (MLImpute, biokotlin-tools, PHGv2)
#       are missing and run setup-environment for you. No manual setup required!
#
# NOTE (2): Please change only field values (and comment or uncomment lines where
#       indicated); leave the key names as they are.
# ============================================================================

# Optional: Working directory for the pipeline (defaults to "seq_sim_work")
# The orchestrate command will create this and set up tools automatically if needed
work_dir: 'seq_sim_work'  # Pipeline working directory (same value as the documented default)

# Optional: Specify which pipeline steps to execute
# If omitted, all configured steps will be executed
# To skip a step, comment it out with '#' at the beginning of the line
run_steps:
  # --- Variant simulation (steps 01-04) ---
  - align_assemblies               # 01 - align the original assemblies
  - maf_to_gvcf                    # 02 - turn the alignments into GVCF
  - downsample_gvcf                # 03 - downsample the variants
  - convert_to_fasta               # 04 - write mutated FASTA files

  # --- Recombination: generate recombinant genomes (steps 05-09) ---
  - pick_crossovers                # 05 - choose crossover points
  - create_chain_files             # 06 - build chain files for coordinate conversion
  - convert_coordinates            # 07 - map coordinates into assembly space
  - generate_recombined_sequences  # 08 - build the recombined sequences
  - format_recombined_fastas       # 09 - reformat the recombined FASTA files

  # --- PS4G creation: PHG indexing for genotype imputation (steps 10-15) ---
  - align_mutated_assemblies       # 10 - realign the formatted sequences to the reference
  - mutated_maf_to_gvcf            # 11 - turn the mutated MAF files into GVCF
  - rope_bwt_chr_index             # 12 - build the PHGv2 ropebwt3 index
  - ropebwt_mem                    # 13 - align FASTQ reads against the index
  - build_spline_knots             # 14 - build spline knots from VCF files
  - convert_ropebwt2ps4g           # 15 - convert BED alignments to PS4G

# ============================================================================
# MAIN PIPELINE CONFIGURATION (Steps 1-4)
# ============================================================================

# Step 1: Align Assemblies
# Aligns query assemblies to a reference using AnchorWave and minimap2
# Required to start the pipeline (unless using previous outputs)
align_assemblies:
  ref_gff: 'path/to/reference.gff'       # Required - reference GFF annotation file
  ref_fasta: 'path/to/reference.fa'      # Required - reference FASTA file
  query_fasta: 'path/to/queries.txt'     # Required - a single query file, a directory, or a text list
  threads: 1                             # Optional - thread count (default: 1)
  # output: '/custom/output/path/'       # Optional - custom output directory

# Step 2: MAF to GVCF Conversion
# Converts MAF files from align_assemblies to compressed GVCF format
# Automatically uses ref_fasta from align_assemblies and MAF output paths
maf_to_gvcf:
  # Every option in this section is optional; uncomment a line to override its default.
  # NOTE(review): with all options commented out this key parses as YAML null, not an
  #               empty mapping - confirm the orchestrate loader treats that as
  #               "run this step with defaults".
  # reference_file: "path/to/reference.fa"  # Optional: Reference FASTA file
                                            #           Uses align_assemblies.ref_fasta if not specified
  # maf_file: "/custom/maf/files.txt"       # Optional: MAF file, directory, or text list
                                            #           Uses step 1 MAF outputs if not specified
  # output_file: "sample.g.vcf.gz"          # Optional: Output GVCF filename (auto-generated if not specified)
  # sample_name: "optional_sample_name"     # Optional: Override sample name in GVCF
  # output_dir: "/custom/gvcf/output/"      # Optional: Custom output directory

# Step 3: Downsample GVCF
# Downsamples GVCF files at specified rates per chromosome
# Automatically uses GVCF output directory from maf_to_gvcf
downsample_gvcf:
  ignore_contig: '__NO_MATCH__'          # Optional field, but currently needed: comma-separated patterns to ignore
  rates: '0.01,0.05,0.1,0.15,0.2'        # Optional - comma-separated downsampling rates
  seed: 42                               # Optional - random seed, for reproducibility
  keep_ref: true                         # Optional - keep reference blocks (true/false, default: true)
  min_ref_block_size: 20                 # Optional - minimum ref block size (default: 20)
  # input: '/custom/gvcf/directory/'     # Optional - custom GVCF input directory
  # output: '/custom/downsample/output/' # Optional - custom output directory

# Step 4: Convert to FASTA
# Converts downsampled GVCF files to FASTA format
# Automatically uses ref_fasta from align_assemblies and GVCF output from downsample_gvcf
convert_to_fasta:
  missing_records_as: 'asRef'            # Optional - how to treat missing records (asN, asRef, asNone)
  missing_genotype_as: 'asN'             # Optional - how to treat missing genotypes (asN, asRef, asNone)
  ignore_contig: '__NO_MATCH__'          # Optional field, but currently needed: comma-separated patterns; matching contigs are ignored
  # input: '/custom/gvcf/input/'         # Optional - custom GVCF input (file, directory, or text list)
  # output: '/custom/fasta/output/'      # Optional - custom output directory

# ============================================================================
# RECOMBINATION PIPELINE CONFIGURATION (Steps 5-9)
# ============================================================================

# Step 5: Pick Crossovers
# Simulates crossover events to generate recombination breakpoints
# Uses MLImpute's pick_crossovers.py to simulate synthetic recombinant genomes
# Generates two populations: landrace (1250 rounds) and two-parent cross (1 round)
# IMPORTANT: Requires an EVEN number of assemblies (assemblies are paired for crossover simulation)
pick_crossovers:
  # Every option in this section is optional; uncomment a line to override its default.
  # NOTE(review): with all options commented out this key parses as YAML null, not an
  #               empty mapping - confirm the orchestrate loader treats that as
  #               "run this step with defaults".
  # assembly_list: "path/to/assembly_list.txt"  # Optional: Tab-separated file with assembly paths and names
                                                # Format: <path><TAB><name> (one per line)
                                                # (<TAB> stands for a literal tab character)
                                                # Example:
                                                # /path/to/assembly1.fa<TAB>parent1
                                                # /path/to/assembly2.fa<TAB>parent2
                                                # If not specified, auto-generates from convert_to_fasta (step 4) output
                                                # Auto-generated names: filename minus "_mutated" suffix and extension
  # ref_fasta: "path/to/reference.fa"           # Optional: Reference FASTA file
                                                #           Uses align_assemblies.ref_fasta if not specified
  # output: "/custom/crossovers/output/"        # Optional: Custom output directory

# Step 6: Create Chain Files
# Converts MAF alignment files to UCSC chain format for coordinate conversion
# Uses MAF files from align_assemblies (step 1)
# Uses maf-convert tool (automatically downloaded if missing)
create_chain_files:
  jobs: 8                                   # Optional - number of parallel jobs (default: 8)
  # maf_file_input: '/custom/maf/input/'    # Optional - custom MAF input (file, directory, or text list)
                                            #            Default: step 1 MAF outputs
  # output: '/custom/chain/output/'         # Optional - custom output directory

# Step 7: Convert Coordinates
# Converts crossover breakpoints from reference coordinates to assembly coordinates
# Automatically uses refkey files from pick_crossovers and chain files from create_chain_files
# Uses CrossMap for coordinate conversion via chain files
convert_coordinates:
  # Every option in this section is optional; uncomment a line to override its default.
  # NOTE(review): with all options commented out this key parses as YAML null, not an
  #               empty mapping - confirm the orchestrate loader treats that as
  #               "run this step with defaults".
  # assembly_list: "path/to/assembly_list.txt"   # Optional: defaults to assembly list from pick_crossovers
  # input_chain: "/custom/chain/directory/"      # Optional: Custom chain directory
  # input_refkey: "/custom/refkey/directory/"    # Optional: Custom refkey directory
  # output: "/custom/coordinates/output/"        # Optional: Custom output directory

# Step 8: Generate Recombined Sequences
# Creates recombined FASTA sequences by concatenating segments from parent assemblies
# Automatically uses founder key files from convert_coordinates (step 7)
# Each founder gets its own recombined FASTA file
generate_recombined_sequences:
  # Every option in this section is optional; uncomment a line to override its default.
  # NOTE(review): with all options commented out this key parses as YAML null, not an
  #               empty mapping - confirm the orchestrate loader treats that as
  #               "run this step with defaults".
  # assembly_list: "path/to/assembly_list.txt"      # Optional: defaults to assembly list from pick_crossovers
  # chromosome_list: "path/to/chromosome_list.txt"  # Optional: Text file with chromosome names (one per line)
                                                    #           If not specified, auto-derives IDs from FASTAs
                                                    #           Example format:
                                                    #           chr1
                                                    #           chr2
                                                    #           chr3
  # assembly_dir: "path/to/parent_assemblies/"      # Optional: Directory containing parent assembly FASTA files
                                                    #           Defaults to this step's output directory
# Step 9: Format Recombined Fastas
# Reformats recombined FASTA files to have consistent line widths using seqkit
# Automatically uses recombined FASTA files from generate_recombined_sequences
# Ensures compatibility with downstream tools
format_recombined_fastas:
  line_width: 60                         # Optional - characters per line (default: 60)
  threads: 8                             # Optional - thread count (default: 8)
  # input: '/custom/recombined/fastas/'  # Optional - custom FASTA input (file, directory, or text list)
  # output: '/custom/formatted/output/'  # Optional - custom output directory

# ============================================================================
# PS4G CREATION CONFIGURATION (Steps 10-15)
# ============================================================================

# Step 10: Align Mutated Assemblies
# Realigns the formatted recombined FASTA files back to the reference
# Uses reference GFF and FASTA from align_assemblies (step 1)
# Uses FASTA files from format_recombined_fastas (step 9) as query input
align_mutated_assemblies:
  # ref_gff: 'path/to/reference.gff'     # Optional - reference GFF file
                                         #            Falls back to align_assemblies.ref_gff when not set
  # ref_fasta: 'path/to/reference.fa'    # Optional - reference FASTA file
                                         #            Falls back to align_assemblies.ref_fasta when not set
  # fasta_input: '/custom/fasta/input/'  # Optional - query FASTA input (file, directory, or text list)
                                         #            Falls back to format_recombined_fastas output when not set
  threads: 1                             # Optional - thread count (default: 1)
  # output: '/custom/alignment/output/'  # Optional - custom output directory

# Step 11: Mutated MAF to GVCF Conversion
# Converts MAF files from align_mutated_assemblies to compressed GVCF format
# Automatically uses ref_fasta from align_assemblies and MAF output from align_mutated_assemblies
mutated_maf_to_gvcf:
  # Every option in this section is optional; uncomment a line to override its default.
  # NOTE(review): with all options commented out this key parses as YAML null, not an
  #               empty mapping - confirm the orchestrate loader treats that as
  #               "run this step with defaults".
  # reference_file: "path/to/reference.fa"  # Optional: Reference FASTA file
                                            #           Uses align_assemblies.ref_fasta if not specified
  # maf_file: "/custom/maf/files.txt"       # Optional: MAF file, directory, or text list
                                            #           Uses step 10 MAF outputs if not specified
  # output_file: "sample.g.vcf.gz"          # Optional: Output GVCF filename (auto-generated if not specified)
  # sample_name: "optional_sample_name"     # Optional: Override sample name in GVCF
  # output_dir: "/custom/gvcf/output/"      # Optional: Custom output directory

# Step 12: PHG Rope-BWT-Chr Index
# Creates a PHGv2 ropebwt3 index from recombined FASTA files for genotype imputation
# By default, auto-generates a keyfile from format_recombined_fastas (step 9) output:
#   - Sample names are derived from FASTA filenames (without extension)
#   - Underscores in sample names are automatically converted to hyphens (logged as warning)
# Can also accept a pre-made keyfile with custom sample names
rope_bwt_chr_index:
  index_file_prefix: 'phgIndex'          # Optional - prefix for the index files (default: "phgIndex")
  threads: 20                            # Optional - thread count (default: 20)
  delete_fmr_index: true                 # Optional - delete .fmr files after conversion (true/false, default: true)
  # output: '/custom/phg_index/'         # Optional - custom output directory (default: work_dir/output/12_rope_bwt_index_results)
  # keyfile: '/path/to/keyfile.txt'      # Optional - pre-made keyfile (overrides auto-generation)
                                         #            Keyfile format: tab-delimited with 'Fasta' and 'SampleName' columns
                                         #            WARNING: sample names should NOT contain underscores
                                         #            PHG uses underscores internally (e.g., samplename_contig)

# Step 13: Ropebwt3 Mem Alignment
# Aligns FASTQ reads to the ropebwt3 index and generates BED alignment files
# Automatically uses index file from rope_bwt_chr_index and calculates -l parameter
# Processes multiple FASTQ files iteratively, generating one BED file per sample
ropebwt_mem:
  fastq_input: 'path/to/fastq_files/'    # Required - FASTQ file, directory, or text list
                                         # Accepted extensions: .fq, .fastq, .fq.gz, .fastq.gz
  threads: 40                            # Optional - thread count (default: 1)
  p_value: 168                           # Optional - the -p parameter (default: 168)
  # index_file: '/custom/index.fmd'      # Optional - custom index file (auto-detected from step 12)
  # l_value: 100                         # Optional - the -l parameter (auto-calculated as 2 × FASTA count from step 12)
  # output: '/custom/bed/output/'        # Optional - custom output directory (default: work_dir/output/13_rope_bwt_mem_results)

# Step 14: Build Spline Knots
# Builds spline knots from hVCF or gVCF files for PHGv2 machine learning imputation
# This is an independent step that does not depend on previous pipeline outputs
# Used to create spline representations for downstream imputation tasks
build_spline_knots:
  vcf_dir: 'path/to/vcf_files/'          # Required - directory holding the hVCF or gVCF files
  vcf_type: 'gvcf'                       # Optional - VCF type (hvcf or gvcf, default: gvcf)
  min_indel_length: 10                   # Optional - minimum indel length for gVCF (default: 10)
  num_bps_per_knot: 50000                # Optional - maximum base pairs per knot (default: 50000)
  random_seed: 12345                     # Optional - random seed, for reproducibility (default: 12345)
  # contig_list: 'chr1,chr2,chr3'        # Optional - comma-separated chromosome list (default: all)
  # output: '/custom/spline/output/'     # Optional - custom output directory (default: work_dir/output/14_spline_knots_results)

# Step 15: Convert RopeBWT to PS4G
# Converts RopeBWT3 BED alignment files to PS4G format for gamete support tracking
# Automatically uses BED files from ropebwt_mem (step 13) and spline knots from build_spline_knots (step 14)
# Processes each BED file iteratively, generating one PS4G file per sample
convert_ropebwt2ps4g:
  min_mem_length: 135                    # Optional - minimum MEM length threshold, in bp (default: 135)
  max_num_hits: 16                       # Optional - maximum haplotype hits per alignment (default: 16)
  # bed_input: '/custom/bed/files/'      # Optional - custom BED input (file, directory, or text list)
                                         #            Auto-detected from step 13 when not set
  # spline_knot_dir: '/custom/spline/'   # Optional - custom spline knot directory
                                         #            Auto-detected from step 14 when not set
  # output: '/custom/ps4g/output/'       # Optional - custom output directory (default: work_dir/output/15_convert_ropebwt2ps4g_results)

# ============================================================================
# Usage Examples:
# ============================================================================
#
# 1. Run the full pipeline (all configured steps):
#    ./gradlew run --args="orchestrate --config pipeline_config.yaml"
#    Note: Environment setup runs automatically if needed on first run!
#
# 2. Run only the main pipeline (variant simulation, steps 1-4):
#    run_steps:
#      - align_assemblies
#      - maf_to_gvcf
#      - downsample_gvcf
#      - convert_to_fasta
#      # Comment out recombination steps
#
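# 3. Run only the recombination pipeline with PS4G creation (steps 1, 5-15):
#    Note: pick_crossovers auto-generates its assembly list from step 4 output,
#    so provide pick_crossovers.assembly_list when step 4 is skipped.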
#    run_steps:
#      - align_assemblies          # Required for MAF files
#      # - maf_to_gvcf             # Skip GVCF steps
#      # - downsample_gvcf
#      # - convert_to_fasta
#      - pick_crossovers
#      - create_chain_files
#      - convert_coordinates
#      - generate_recombined_sequences
#      - format_recombined_fastas
#      - align_mutated_assemblies  # Realign recombined sequences
#      - mutated_maf_to_gvcf       # Convert mutated MAF to GVCF
#      - rope_bwt_chr_index        # Create PHG index
#      - ropebwt_mem               # Align FASTQ reads
#      - build_spline_knots        # Build spline knots
#      - convert_ropebwt2ps4g      # Convert BED to PS4G
#
# 4. Run steps 1, 2, and 4 (skip downsampling):
#    run_steps:
#      - align_assemblies
#      - maf_to_gvcf
#      # - downsample_gvcf          # Commented out
#      - convert_to_fasta           # Will use maf_to_gvcf outputs
#
# 5. Rerun only format step (requires previous outputs):
#    run_steps:
#      # Comment out all previous steps
#      - format_recombined_fastas   # Only this will run
#
# 6. Start at step 3 with your own GVCF files (custom input):
#    run_steps:
#      - downsample_gvcf
#      - convert_to_fasta
#    downsample_gvcf:
#      input: "/path/to/my/existing/gvcf_files/"  # Use your own GVCF files
#      rates: "0.1,0.2"
#    convert_to_fasta:
#      # Will automatically use output from downsample_gvcf
#
# 7. Run step 4 with custom input AND output:
#    run_steps:
#      - convert_to_fasta
#    convert_to_fasta:
#      input: "/data/my_gvcf_directory/"
#      output: "/results/my_fasta_output/"
#
# 8. Run PS4G index creation with a pre-made keyfile:
#    run_steps:
#      - rope_bwt_chr_index
#    rope_bwt_chr_index:
#      keyfile: "/path/to/my/keyfile.txt"
#      output: "my_phg_index"
#      index_file_prefix: "myCustomIndex"
#      threads: 40
#
# 9. Run PS4G index creation with custom settings (using auto-generated keyfile):
#    run_steps:
#      - rope_bwt_chr_index
#    rope_bwt_chr_index:
#      # No keyfile = auto-generates from step 9 FASTA files
#      output: "my_phg_index"
#      index_file_prefix: "customIndex"
#      threads: 40
#
# 10. Run only ropebwt-mem alignment (requires step 12 output):
#    run_steps:
#      - ropebwt_mem
#    ropebwt_mem:
#      fastq_input: "/path/to/fastq/samples/"
#      threads: 40
#
# 11. Run ropebwt-mem with custom parameters:
#    run_steps:
#      - ropebwt_mem
#    ropebwt_mem:
#      fastq_input: "samples.txt"
#      index_file: "/custom/path/to/index.fmd"
#      l_value: 100
#      p_value: 200
#      threads: 40
#      output: "/custom/bed/output/"
#
# 12. Run build-spline-knots as an independent step:
#    run_steps:
#      - build_spline_knots
#    build_spline_knots:
#      vcf_dir: "/path/to/vcf_files/"
#      vcf_type: "gvcf"
#      num_bps_per_knot: 100000
#
# 13. Run build-spline-knots with specific chromosomes:
#    run_steps:
#      - build_spline_knots
#    build_spline_knots:
#      vcf_dir: "/path/to/vcf_files/"
#      vcf_type: "hvcf"
#      contig_list: "chr1,chr2,chr3,chr4,chr5"
#      output: "/custom/spline/output/"
#
# 14. Run convert-ropebwt2ps4g with auto-detection (requires steps 13-14):
#    run_steps:
#      - convert_ropebwt2ps4g
#    convert_ropebwt2ps4g:
#      # Auto-detects BED files from step 13 and spline knots from step 14
#      min_mem_length: 135
#      max_num_hits: 16
#
# 15. Run convert-ropebwt2ps4g with custom inputs:
#    run_steps:
#      - convert_ropebwt2ps4g
#    convert_ropebwt2ps4g:
#      bed_input: "/path/to/bed/files/"
#      spline_knot_dir: "/path/to/spline/knots/"
#      min_mem_length: 148
#      max_num_hits: 50
#      output: "/custom/ps4g/output/"
#
# ============================================================================
# IMPORTANT NOTES:
# ============================================================================
#
# Assembly List Format (for pick_crossovers, convert_coordinates, generate_recombined_sequences):
# - Tab-separated file with two columns: path and name
# - MUST have an EVEN number of assemblies (assemblies are paired for crossover simulation)
# - The same assembly list is shared across steps 5, 7, and 8:
#   - Step 5 (pick_crossovers): If not provided, auto-generates from step 4 FASTA outputs
#   - Steps 7 and 8: If not provided, automatically use the assembly list from step 5
# - Auto-generated names are derived from filename minus "_mutated" suffix and extension
# - Example:
#   /data/assemblies/B73.fa<TAB>B73
#   /data/assemblies/Mo17.fa<TAB>Mo17
#   /data/assemblies/W22.fa<TAB>W22
#   /data/assemblies/PH207.fa<TAB>PH207
#
# Chromosome List Format (for generate_recombined_sequences):
# - Plain text file with one chromosome name per line
# - Names should match chromosome names in assemblies
# - If not provided, auto-derives from first assembly in assembly list using BioKotlin
# - Example:
#   chr1
#   chr2
#   chr3
#
# Recombination Pipeline Dependencies:
# - Step 5 (pick_crossovers) requires: ref_fasta from step 1
#   - If assembly_list not provided, auto-generates from step 4 FASTA outputs
#   - MUST have an EVEN number of assemblies (will error if odd)
#   - Assembly list is saved and shared with steps 7 and 8
# - Step 6 (create_chain_files) requires: MAF files from step 1
#   - Uses align_assemblies MAF outputs
# - Step 7 (convert_coordinates) requires: outputs from steps 5 and 6
#   - If assembly_list not provided, uses assembly list from step 5
# - Step 8 (generate_recombined_sequences) requires: output from step 7
#   - If assembly_list not provided, uses assembly list from step 5
#   - If chromosome_list not provided, auto-derives from first assembly using BioKotlin
#   - If assembly_dir not provided, uses step 8 output directory
# - Step 9 (format_recombined_fastas) requires: output from step 8
# - Step 10 (align_mutated_assemblies) requires: outputs from steps 1 and 9
#   - Uses ref_gff and ref_fasta from step 1
#   - Uses FASTA files from step 9 as query input
# - Step 11 (mutated_maf_to_gvcf) requires: outputs from steps 1 and 10
#   - Uses ref_fasta from step 1
#   - Uses MAF files from step 10 as input
#
# PS4G Creation Dependencies:
# - Step 11 (mutated_maf_to_gvcf) requires: MAF files from step 10 and ref_fasta from step 1
# - Step 12 (rope_bwt_chr_index) auto-generates keyfile from step 9 FASTA files:
#   - Sample names derived from filenames (without extension)
#   - Underscores are auto-converted to hyphens (logged as warning)
# - Step 12 can also accept a pre-made keyfile (overrides auto-generation):
#   - Keyfile must be tab-delimited with 'Fasta' and 'SampleName' columns
#   - Sample names should NOT contain underscores (PHG internal requirement)
# - Step 13 (ropebwt_mem) requires: index file (.fmd) from step 12
# - Step 13 auto-calculates -l parameter from keyfile (2 × FASTA count)
# - FASTQ files can be compressed (.fq.gz, .fastq.gz) or uncompressed (.fq, .fastq)
# - Step 14 (build_spline_knots) is independent: requires only VCF files
# - Step 15 (convert_ropebwt2ps4g) requires: BED files from step 13 and spline knots from step 14
# - Step 15 auto-detects inputs if not specified
#
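# Custom Input/Output Paths:
# - Most steps accept an optional custom input path and an optional custom output path
# - The key names vary by step (e.g. 'input'/'output', 'maf_file'/'output_dir',
#   'fasta_input', 'bed_input'); see each step's section above for the exact keys
# - Use the input key to point to your own files when starting mid-pipeline
# - Use the output key to direct outputs to custom locations
# - If no custom input is specified, the step uses the output of the step it depends on
#   (usually the previous step)
# - If no custom output is specified, the step uses the default location
# - Custom outputs are automatically used by the next step in the chain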
#
# ============================================================================