Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ plugins {
}

group = "net.maizegenetics"
version = "0.2.5"
version = "0.2.6"

repositories {
mavenCentral()
Expand Down
19 changes: 12 additions & 7 deletions pipeline_config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,19 @@ align_assemblies:
# Converts MAF files from align_assemblies to compressed GVCF format
# Automatically uses ref_fasta from align_assemblies and MAF output paths
maf_to_gvcf:
sample_name: "optional_sample_name" # Optional: Override sample name in GVCF
# input: "/custom/maf/files.txt" # Optional: Custom MAF input (file, directory, or text list)
# output: "/custom/gvcf/output/" # Optional: Custom output directory
# reference_file: "path/to/reference.fa" # Optional: Reference FASTA file
# Uses align_assemblies.ref_fasta if not specified
# maf_file: "/custom/maf/files.txt" # Optional: MAF file, directory, or text list
# Uses step 1 MAF outputs if not specified
# output_file: "sample.g.vcf.gz" # Optional: Output GVCF filename (auto-generated if not specified)
# sample_name: "optional_sample_name" # Optional: Override sample name in GVCF
# output_dir: "/custom/gvcf/output/" # Optional: Custom output directory

# Step 3: Downsample GVCF
# Downsamples GVCF files at specified rates per chromosome
# Automatically uses GVCF output directory from maf_to_gvcf
downsample_gvcf:
ignore_contig: "" # Optional: Comma-separated patterns to ignore
ignore_contig: "__NO_MATCH__" # Comma-separated patterns to ignore; nominally optional, but currently needs to be set
rates: "0.01,0.05,0.1,0.15,0.2" # Optional: Comma-separated downsampling rates
seed: 42 # Optional: Random seed for reproducibility
keep_ref: true # Optional: Keep reference blocks (true/false, default: true)
Expand All @@ -101,6 +105,7 @@ downsample_gvcf:
convert_to_fasta:
missing_records_as: "asRef" # Optional: Missing records (asN, asRef, asNone)
missing_genotype_as: "asN" # Optional: Missing genotypes (asN, asRef, asNone)
ignore_contig: "__NO_MATCH__" # Comma-separated patterns; matching contigs are skipped. Nominally optional, but currently needs to be set
# input: "/custom/gvcf/input/" # Optional: Custom GVCF input (file, directory, or text list)
# output: "/custom/fasta/output/" # Optional: Custom output directory

Expand Down Expand Up @@ -142,9 +147,9 @@ pick_crossovers:
# Automatically uses MAF files from align_assemblies (step 1)
# Uses maf-convert tool (automatically downloaded if missing)
create_chain_files:
jobs: 8 # Optional: Number of parallel jobs (default: 8)
# input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list)
# output: "/custom/chain/output/" # Optional: Custom output directory
jobs: 8 # Optional: Number of parallel jobs (default: 8)
# input: "/custom/maf/input/" # Optional: Custom MAF input (file, directory, or text list)
# output: "/custom/chain/output/" # Optional: Custom output directory

# Step 8: Convert Coordinates
# Converts crossover breakpoints from reference coordinates to assembly coordinates
Expand Down
15 changes: 13 additions & 2 deletions src/main/kotlin/net/maizegenetics/commands/ConvertToFasta.kt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ class ConvertToFasta : CliktCommand(name = "convert-to-fasta") {
).choice("asN", "asRef", "asNone", ignoreCase = true)
.default("asN")

private val ignoreContig by option(
"--ignore-contig",
help = "Comma-separated list of string patterns to ignore (contigs containing these patterns will be skipped)"
).default("")

private val outputDirOption by option(
"--output-dir", "-o",
help = "Custom output directory (default: work_dir/output/04_fasta_results)"
Expand Down Expand Up @@ -153,6 +158,9 @@ class ConvertToFasta : CliktCommand(name = "convert-to-fasta") {
logger.info("Reference FASTA: $refFasta")
logger.info("Missing records as: $missingRecordsAs")
logger.info("Missing genotype as: $missingGenotypeAs")
if (ignoreContig.isNotEmpty()) {
logger.info("Ignore contig patterns: $ignoreContig")
}

// Collect GVCF files
val gvcfFiles = collectGvcfFiles()
Expand Down Expand Up @@ -211,8 +219,8 @@ class ConvertToFasta : CliktCommand(name = "convert-to-fasta") {
"FASTA file"
)

// Clean up temp directory
if (tempDir.exists() && preparedFiles.any { it.parent == tempDir }) {
// Clean up temp directory - always clean up if it exists
if (tempDir.exists()) {
logger.info("Cleaning up temporary uncompressed files")
try {
tempDir.deleteRecursively()
Expand Down Expand Up @@ -246,6 +254,9 @@ class ConvertToFasta : CliktCommand(name = "convert-to-fasta") {
add("--fasta-file=${refFasta.toAbsolutePath()}")
add("--missing-records-as=$missingRecordsAs")
add("--missing-genotype-as=$missingGenotypeAs")
if (ignoreContig.isNotEmpty()) {
add("--ignore-contig=$ignoreContig")
}
}

// Run MLImpute ConvertToFasta command
Expand Down
3 changes: 2 additions & 1 deletion src/main/kotlin/net/maizegenetics/commands/DownsampleGvcf.kt
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ class DownsampleGvcf : CliktCommand(name = "downsample-gvcf") {
logger.info("Output directory: $outputDir")

// Clean up temp directory if not keeping uncompressed files
if (!keepUncompressed && tempDir.exists() && mlimputeInputDir == tempDir) {
// Unless uncompressed files are kept, remove the temp directory whenever it exists
if (!keepUncompressed && tempDir.exists()) {
logger.info("Cleaning up temporary uncompressed files")
try {
tempDir.deleteRecursively()
Expand Down
28 changes: 21 additions & 7 deletions src/main/kotlin/net/maizegenetics/commands/MafToGvcf.kt
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,29 @@ class MafToGvcf : CliktCommand(name = "maf-to-gvcf") {
logger.info("Sample name: $finalSampleName")

// Determine output file name
val outputFileName = if (outputFile != null && isSingleMaf) {
outputFile!!.fileName
// Note: biokotlin-tools automatically compresses and adds .gz extension
val outputFileName: Path
val expectedOutputPath: Path

if (outputFile != null && isSingleMaf) {
val userFileName = outputFile!!.fileName.toString()
// If user specified .gz extension, strip it since biokotlin-tools adds it
if (userFileName.endsWith(".gz")) {
outputFileName = Path.of(userFileName.removeSuffix(".gz"))
expectedOutputPath = outputDir.resolve(userFileName)
} else {
outputFileName = outputFile!!.fileName
expectedOutputPath = outputDir.resolve("${userFileName}.gz")
}
} else {
// Auto-generate output filename based on MAF filename (compressed)
Path.of("${mafBaseName}.g.vcf.gz")
// Auto-generate output filename based on MAF filename
// biokotlin-tools will add .gz when compressing
outputFileName = Path.of("${mafBaseName}.g.vcf")
expectedOutputPath = outputDir.resolve("${mafBaseName}.g.vcf.gz")
}

val fullOutputPath = outputDir.resolve(outputFileName)
logger.info("Output file: $fullOutputPath")
logger.info("Output file: $expectedOutputPath")

// Run biokotlin-tools maf-to-gvcf-converter through pixi to use Java 21
logger.info("Running biokotlin-tools maf-to-gvcf-converter")
Expand All @@ -173,7 +187,7 @@ class MafToGvcf : CliktCommand(name = "maf-to-gvcf") {

logger.info("Conversion completed for: ${mafFile.name}")

// Return the GVCF file path
return fullOutputPath
// Return the GVCF file path (with .gz extension added by biokotlin-tools)
return expectedOutputPath
}
}
Loading