From f1cc579bc1b421477f81e78a520b127361d6c71d Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Tue, 23 Dec 2025 15:50:33 -0500 Subject: [PATCH 01/14] Refactor and initial functions for recombining gvcf files based on picked crossover points. --- .../commands/MutateAssemblies.kt | 66 +----- .../maizegenetics/commands/RecombineGvcfs.kt | 207 ++++++++++++++++++ .../utils/VariantContextUtils.kt | 76 +++++++ src/test/kotlin/RecombineGvcfsTest.kt | 2 + .../commands/MutateAssembliesTest.kt | 13 +- 5 files changed, 297 insertions(+), 67 deletions(-) create mode 100644 src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt create mode 100644 src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt create mode 100644 src/test/kotlin/RecombineGvcfsTest.kt diff --git a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt index 2230077..a672b79 100644 --- a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt @@ -20,38 +20,16 @@ import htsjdk.variant.vcf.VCFHeaderLine import htsjdk.variant.vcf.VCFHeaderLineCount import htsjdk.variant.vcf.VCFHeaderLineType import htsjdk.variant.vcf.VCFInfoHeaderLine +import net.maizegenetics.net.maizegenetics.utils.Position +import net.maizegenetics.net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils import java.io.File import java.util.HashSet import kotlin.io.path.createDirectories import kotlin.io.path.exists -data class Position(val contig: String, val position: Int) : Comparable { - override fun compareTo(other: Position): Int { - if (this.contig == other.contig) { - return this.position.compareTo(other.position) - } - - val thisContig = this.contig.replace("chr", "", ignoreCase = true).trim() - val otherContig = other.contig.replace("chr", "", ignoreCase = true).trim() - - return try { - thisContig.toInt() - otherContig.toInt() - } catch (e: NumberFormatException) { - // If we can't convert contigs to an int, then compare the strings - contig.compareTo(other.contig) - } - - } - - override fun toString(): String { - return "$contig:$position" - } -} -data class SimpleVariant(val refStart: Position, val refEnd: Position, - val refAllele: String, val altAllele: String, - val isAddedMutation: Boolean = false) @OptIn(kotlin.io.path.ExperimentalPathApi::class) class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { @@ -156,7 +134,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { return } - if (isIndel(currentSimpleVariant) && isIndel(overlappingVariant)) { + if (VariantContextUtils.isIndel(currentSimpleVariant) && VariantContextUtils.isIndel(overlappingVariant)) { //skip as it will be tricky/slow to handle return //TODO figure out a way to handle this well } @@ -260,7 +238,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) .build().use { writer -> - writer.writeHeader(createGenericHeader(listOf(sampleName), emptySet())) + writer.writeHeader(VariantContextUtils.createGenericHeader(listOf(sampleName), emptySet())) val sortedVariants = baseVariantMap.asMapOfRanges().toList().sortedBy { it.first.lowerEndpoint() } @@ -294,40 +272,6 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { } } - fun createGenericHeader(taxaNames: List, altLines:Set): VCFHeader { - val headerLines = createGenericHeaderLineSet() as MutableSet - headerLines.addAll(altLines) - return VCFHeader(headerLines, taxaNames) - } - - fun createGenericHeaderLineSet(): Set { - val headerLines: MutableSet = HashSet() - headerLines.add(VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")) - headerLines.add( - VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)") - ) - headerLines.add(VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")) - headerLines.add(VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")) - headerLines.add( - VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") - ) - headerLines.add(VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth")) - headerLines.add(VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data")) - headerLines.add(VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency")) - headerLines.add(VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")) - // These last 4 are needed for assembly g/hvcfs, but not for reference. I am keeping them in as header - // lines but they will only be added to the data lines if they are present in the VariantContext. - headerLines.add(VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome")) - headerLines.add(VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position")) - headerLines.add(VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position")) - headerLines.add(VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand")) - return headerLines - } - fun isIndel(variant: SimpleVariant?): Boolean { - if(variant == null) return false - return !((variant.refAllele.length == 1 && variant.altAllele.length == 1) || //SNP - (variant.refAllele.length == 1 && variant.altAllele == "")) //RefBlock - } } \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt new file mode 100644 index 0000000..25c0690 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -0,0 +1,207 @@ +package net.maizegenetics.net.maizegenetics.commands + +import com.github.ajalt.clikt.core.CliktCommand +import com.github.ajalt.clikt.parameters.options.option +import com.github.ajalt.clikt.parameters.options.required +import com.github.ajalt.clikt.parameters.types.path +import com.google.common.collect.Range +import com.google.common.collect.RangeMap +import com.google.common.collect.TreeRangeMap +import htsjdk.variant.variantcontext.writer.Options +import htsjdk.variant.variantcontext.writer.VariantContextWriter +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder +import htsjdk.variant.vcf.VCFFileReader +import htsjdk.variant.vcf.VCFReader +import net.maizegenetics.net.maizegenetics.utils.Position +import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils +import java.io.File +import java.nio.file.Path + +data class RecombinationRange(val chrom: String, val start: Int, val end: Int, val targetSampleName: String) + +class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { + + private val inputBedDir by option(help = "Input Bed dir") + .path(canBeFile = false, canBeDir = true) + .required() + + private val inputGvcfDir by option(help = "Input GVCF dir") + .path(canBeFile = false, canBeDir = true) + .required() + + private val outputDir by option(help = "Output dir") + .path(canBeFile = false, canBeDir = true) + .required() + + + override fun run() { + // Implementation goes here + recombineGvcfs(inputBedDir, inputGvcfDir, outputDir) + } + + fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: java.nio.file.Path, outputDir: java.nio.file.Path) { + // Placeholder for the actual recombination logic + println("Recombining GVCFs from $inputGvcfDir using BED files from $inputBedDir into $outputDir") + + //Build BedFile Map + val (recombinationMap, sampleNames) = buildRecombinationMap(inputBedDir) + //Build Output writers for each sample name + val outputWriters = buildOutputWriterMap(sampleNames, outputDir) + //Process GVCFs and write out recombined files + processGvcfsAndWrite(recombinationMap, inputGvcfDir, outputWriters) + //Close the GVCF writers + outputWriters.values.forEach { it.close() } + + //Sort the gvcfs + } + + fun buildRecombinationMap(inputBedDir: Path): Pair>, List> { + //loop through each file in the inputBedDir + val recombinationMap = mutableMapOf>() + val targetNames = mutableSetOf() + inputBedDir.toFile().listFiles()?.forEach { bedFile -> + val bedFileSampleName = bedFile.name.substringBeforeLast("_") + bedFile.forEachLine { line -> + val parts = line.split("\t") + if (parts.size >= 4) { + val chrom = parts[0] + val start = parts[1].toInt() + 1 //BED is 0 based, VCF is 1 based + val end = parts[2].toInt() + val targetSampleName = parts[3] + + val range = Range.closed(Position(chrom, start), Position(chrom, end)) + recombinationMap.computeIfAbsent(bedFileSampleName) { TreeRangeMap.create() }.put(range, targetSampleName) + targetNames.add(targetSampleName) + } + } + } + return Pair(recombinationMap, targetNames.toList()) + } + +// fun buildRecombinationMap(inputBedDir: Path): Pair>, List> { +// //loop through each file in the inputBedDir +// val recombinationMap = mutableMapOf>() +// val targetNames = mutableSetOf() +// inputBedDir.toFile().listFiles()?.forEach { bedFile -> +// val bedFileSampleName = bedFile.name.substringBeforeLast("_") +// bedFile.forEachLine { line -> +// val parts = line.split("\t") +// if (parts.size >= 4) { +// val chrom = parts[0] +// val start = parts[1].toInt() +// val end = parts[2].toInt() +// val targetSampleName = parts[3] +// +// val range = RecombinationRange(chrom, start, end, targetSampleName) +// recombinationMap.computeIfAbsent(bedFileSampleName) { mutableListOf() }.add(range) +// targetNames.add(targetSampleName) +// } +// } +// } +// return Pair(recombinationMap, targetNames.toList()) +// } + + + + fun buildOutputWriterMap(sampleNames: List, outputDir: Path): Map { + return sampleNames.associateWith { sampleName -> + val outputFile = outputDir.resolve("${sampleName}_recombined.gvcf") + val writer = VariantContextWriterBuilder() + .unsetOption(Options.INDEX_ON_THE_FLY) + .setOutputFile(outputFile.toFile()) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) + .build() + + writer.writeHeader(VariantContextUtils.createGenericHeader(listOf(sampleName), emptySet())) + writer + } + } + + fun processGvcfsAndWrite( + recombinationMap: Map>, + inputGvcfDir: Path, + outputWriters: Map + ) { + inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> + val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") + val ranges = recombinationMap[sampleName] ?: return@forEach + + VCFFileReader(gvcfFile, false).use { gvcfReader -> + processSingleGVCFFile(gvcfReader, ranges, outputWriters) + } + } + } + +// fun processGvcfsAndWrite( +// recombinationMap: Map>, +// inputGvcfDir: Path, +// outputWriters: Map +// ) { +// inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> +// val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") +// val ranges = recombinationMap[sampleName] ?: return@forEach +// +// VCFFileReader(gvcfFile, false).use { gvcfReader -> +// processSingleGVCFFile(gvcfReader, ranges, outputWriters) +// } +// } +// } + + fun processSingleGVCFFile( + gvcfReader: VCFReader, + ranges: RangeMap, + outputWriters: Map + ) { + //Need to loop through each range and each gvcf record. + //We need to see if the variant falls within the range. If so, write it to the appropriate output writer. + //There are edge cases where the variant can span multiple ranges(Due to RefBlock) which is valid just need to resize the variant and write out. + //We do need to make sure that if an indel spans a range boundary that we correctly handle it by assigning it to the left most range but not the right + + //This might actually be very simple, If we convert the List into a Range Map, we only need to check the start position. + //Then if the variant is an RefBlock we need to resize it to start at the end of the range and move to the next range. Continue doing this + val iterator = gvcfReader.iterator() + while (iterator.hasNext()) { + val vc = iterator.next() + val startPos = Position(vc.contig, vc.start) + val endPos = Position(vc.contig, vc.end) + + val targetSampleName = ranges.get(startPos) ?: continue + + val outputWriter = outputWriters[targetSampleName] ?: continue + + if((vc.reference.length() == 1) && (vc.alternateAlleles.first().displayString == "NON_REF" || vc.reference.length() == vc.alternateAlleles[0].length())) { + //RefBlock or SNP polymorphism + outputWriter.add(vc) + } else { + //Indel case, need to check if it spans multiple ranges + TODO("Handle Indel spanning multiple ranges") + var currentStartPos = startPos + var currentEndPos = endPos + var currentVc = vc + + while (true) { + val rangeTargetSampleName = ranges.get(currentStartPos) ?: break + val range = ranges.asMapOfRanges().entries.find { it.value == rangeTargetSampleName }?.key ?: break + + if (range.contains(currentEndPos)) { + //Fully contained within the range + outputWriter.add(currentVc) + break + } else { + //Partially contained, need to resize and write out + val resizedEndPos = Position(range.upperEndpoint().contig, range.upperEndpoint().position) +// val resizedVc = VariantContextUtils.resizeVariantContext(currentVc, currentStartPos.position, resizedEndPos.position) + +// outputWriter.add(resizedVc) + + //Move to the next range + currentStartPos = Position(currentVc.contig, resizedEndPos.position + 1) +// currentVc = VariantContextUtils.resizeVariantContext(currentVc, currentStartPos.position, currentEndPos.position) + } + } + } + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt new file mode 100644 index 0000000..5179b33 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt @@ -0,0 +1,76 @@ +package net.maizegenetics.net.maizegenetics.utils + +import htsjdk.variant.vcf.VCFFormatHeaderLine +import htsjdk.variant.vcf.VCFHeader +import htsjdk.variant.vcf.VCFHeaderLine +import htsjdk.variant.vcf.VCFHeaderLineCount +import htsjdk.variant.vcf.VCFHeaderLineType +import htsjdk.variant.vcf.VCFInfoHeaderLine +import java.util.HashSet + +data class Position(val contig: String, val position: Int) : Comparable { + override fun compareTo(other: Position): Int { + + if (this.contig == other.contig) { + return this.position.compareTo(other.position) + } + + val thisContig = this.contig.replace("chr", "", ignoreCase = true).trim() + val otherContig = other.contig.replace("chr", "", ignoreCase = true).trim() + + return try { + thisContig.toInt() - otherContig.toInt() + } catch (e: NumberFormatException) { + // If we can't convert contigs to an int, then compare the strings + contig.compareTo(other.contig) + } + + } + + override fun toString(): String { + return "$contig:$position" + } +} +data class SimpleVariant(val refStart: Position, val refEnd: Position, + val refAllele: String, val altAllele: String, + val isAddedMutation: Boolean = false) + +class VariantContextUtils { + companion object { + fun createGenericHeader(taxaNames: List, altLines:Set): VCFHeader { + val headerLines = createGenericHeaderLineSet() as MutableSet + headerLines.addAll(altLines) + return VCFHeader(headerLines, taxaNames) + } + + fun createGenericHeaderLineSet(): Set { + val headerLines: MutableSet = HashSet() + headerLines.add(VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")) + headerLines.add( + VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)") + ) + headerLines.add(VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")) + headerLines.add(VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")) + headerLines.add( + VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") + ) + headerLines.add(VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth")) + headerLines.add(VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data")) + headerLines.add(VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency")) + headerLines.add(VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")) + // These last 4 are needed for assembly g/hvcfs, but not for reference. I am keeping them in as header + // lines but they will only be added to the data lines if they are present in the VariantContext. + headerLines.add(VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome")) + headerLines.add(VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position")) + headerLines.add(VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position")) + headerLines.add(VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand")) + return headerLines + } + + fun isIndel(variant: SimpleVariant?): Boolean { + if(variant == null) return false + return !((variant.refAllele.length == 1 && variant.altAllele.length == 1) || //SNP + (variant.refAllele.length == 1 && variant.altAllele == "")) //RefBlock + } + } +} \ No newline at end of file diff --git a/src/test/kotlin/RecombineGvcfsTest.kt b/src/test/kotlin/RecombineGvcfsTest.kt new file mode 100644 index 0000000..3473505 --- /dev/null +++ b/src/test/kotlin/RecombineGvcfsTest.kt @@ -0,0 +1,2 @@ +class RecombineGvcfsTest { +} \ No newline at end of file diff --git a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt index 40862c4..0160ef4 100644 --- a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt @@ -7,8 +7,9 @@ import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.vcf.VCFFileReader import net.maizegenetics.net.maizegenetics.commands.MutateAssemblies -import net.maizegenetics.net.maizegenetics.commands.Position -import net.maizegenetics.net.maizegenetics.commands.SimpleVariant +import net.maizegenetics.net.maizegenetics.utils.Position +import net.maizegenetics.net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils import java.io.File import kotlin.test.Test import kotlin.test.assertEquals @@ -306,16 +307,16 @@ class MutateAssembliesTest { val mutateAssemblies = MutateAssemblies() val refBlockVariant = SimpleVariant(Position("chr1", 100), Position("chr1", 150), "A", "") - assert(!mutateAssemblies.isIndel(refBlockVariant)) + assert(!VariantContextUtils.isIndel(refBlockVariant)) val snpVariant = SimpleVariant(Position("chr1", 150), Position("chr1", 150), "C", "G") - assert(!mutateAssemblies.isIndel(snpVariant)) + assert(!VariantContextUtils.isIndel(snpVariant)) val insertionVariant = SimpleVariant(Position("chr1", 200), Position("chr1", 200), "A", "AGG") - assert(mutateAssemblies.isIndel(insertionVariant)) + assert(VariantContextUtils.isIndel(insertionVariant)) val deletionVariant = SimpleVariant(Position("chr1", 201), Position("chr1", 205), "GGGGG", "G") - assert(mutateAssemblies.isIndel(deletionVariant)) + assert(VariantContextUtils.isIndel(deletionVariant)) } private fun createSimpleBaseVariantMap(): RangeMap { From b6754d51c0805e59069cb083f50fa30fce3f502c Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Wed, 7 Jan 2026 15:50:58 -0500 Subject: [PATCH 02/14] Adding in more functionality for the recombine gvcf code --- .../maizegenetics/commands/RecombineGvcfs.kt | 169 +++++++++++------- src/test/kotlin/RecombineGvcfsTest.kt | 41 +++++ 2 files changed, 148 insertions(+), 62 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index 25c0690..d49129b 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -1,5 +1,8 @@ package net.maizegenetics.net.maizegenetics.commands +import biokotlin.seq.NucSeq +import biokotlin.seq.NucSeqRecord +import biokotlin.seqIO.NucSeqIO import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required @@ -7,6 +10,10 @@ import com.github.ajalt.clikt.parameters.types.path import com.google.common.collect.Range import com.google.common.collect.RangeMap import com.google.common.collect.TreeRangeMap +import htsjdk.variant.variantcontext.Allele +import htsjdk.variant.variantcontext.GenotypeBuilder +import htsjdk.variant.variantcontext.VariantContext +import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.Options import htsjdk.variant.variantcontext.writer.VariantContextWriter import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder @@ -33,13 +40,20 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { .path(canBeFile = false, canBeDir = true) .required() + private val refFile by option(help = "Ref file") + .path(canBeFile = true, canBeDir = false) + .required() + override fun run() { // Implementation goes here - recombineGvcfs(inputBedDir, inputGvcfDir, outputDir) + recombineGvcfs(inputBedDir, inputGvcfDir, outputDir,refFile) } - fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: java.nio.file.Path, outputDir: java.nio.file.Path) { + fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, outputDir: Path, refFile: Path) { + println("Loading in the reference Genome from $refFile") + val refSeq = NucSeqIO(refFile.toFile().path).readAll() + // Placeholder for the actual recombination logic println("Recombining GVCFs from $inputGvcfDir using BED files from $inputBedDir into $outputDir") @@ -48,7 +62,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //Build Output writers for each sample name val outputWriters = buildOutputWriterMap(sampleNames, outputDir) //Process GVCFs and write out recombined files - processGvcfsAndWrite(recombinationMap, inputGvcfDir, outputWriters) + processGvcfsAndWrite(recombinationMap, inputGvcfDir, outputWriters, refSeq) //Close the GVCF writers outputWriters.values.forEach { it.close() } @@ -77,31 +91,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } return Pair(recombinationMap, targetNames.toList()) } - -// fun buildRecombinationMap(inputBedDir: Path): Pair>, List> { -// //loop through each file in the inputBedDir -// val recombinationMap = mutableMapOf>() -// val targetNames = mutableSetOf() -// inputBedDir.toFile().listFiles()?.forEach { bedFile -> -// val bedFileSampleName = bedFile.name.substringBeforeLast("_") -// bedFile.forEachLine { line -> -// val parts = line.split("\t") -// if (parts.size >= 4) { -// val chrom = parts[0] -// val start = parts[1].toInt() -// val end = parts[2].toInt() -// val targetSampleName = parts[3] -// -// val range = RecombinationRange(chrom, start, end, targetSampleName) -// recombinationMap.computeIfAbsent(bedFileSampleName) { mutableListOf() }.add(range) -// targetNames.add(targetSampleName) -// } -// } -// } -// return Pair(recombinationMap, targetNames.toList()) -// } - - + fun buildOutputWriterMap(sampleNames: List, outputDir: Path): Map { return sampleNames.associateWith { sampleName -> @@ -121,14 +111,15 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { fun processGvcfsAndWrite( recombinationMap: Map>, inputGvcfDir: Path, - outputWriters: Map + outputWriters: Map, + refSeq :Map ) { inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") val ranges = recombinationMap[sampleName] ?: return@forEach VCFFileReader(gvcfFile, false).use { gvcfReader -> - processSingleGVCFFile(gvcfReader, ranges, outputWriters) + processSingleGVCFFile(gvcfReader, ranges, outputWriters, refSeq) } } } @@ -151,7 +142,8 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { fun processSingleGVCFFile( gvcfReader: VCFReader, ranges: RangeMap, - outputWriters: Map + outputWriters: Map, + refSeq :Map ) { //Need to loop through each range and each gvcf record. //We need to see if the variant falls within the range. If so, write it to the appropriate output writer. @@ -166,42 +158,95 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { val startPos = Position(vc.contig, vc.start) val endPos = Position(vc.contig, vc.end) - val targetSampleName = ranges.get(startPos) ?: continue + var targetSampleName = ranges.get(startPos) ?: continue - val outputWriter = outputWriters[targetSampleName] ?: continue + var outputWriter = outputWriters[targetSampleName] ?: continue - if((vc.reference.length() == 1) && (vc.alternateAlleles.first().displayString == "NON_REF" || vc.reference.length() == vc.alternateAlleles[0].length())) { - //RefBlock or SNP polymorphism - outputWriter.add(vc) + if((vc.reference.length() == 1) && vc.reference.length() == vc.alternateAlleles[0].length()) { + // SNP polymorphism + //can write out directly as it is only 1 position and will not overlap but will need to change the genotype name + val newVc = changeSampleName(vc, targetSampleName) + outputWriter.add(newVc) + } + else if(vc.reference.length() == 1 && vc.alternateAlleles.first().displayString == "NON_REF"){ + //This is a RefBlock + //We need to 'walk through' the refBlock by splitting it up into multiple refBlocks and write out to the correct output writer + //Get the current start position + processRefBlockOverlap(startPos, endPos, ranges, outputWriters, refSeq, vc) + } + } + } + + fun processRefBlockOverlap( + startPos: Position, + endPos: Position, + ranges: RangeMap, + outputWriters: Map, + refSeq: Map, + vc: VariantContext + ){ + var currentStartPos = startPos + while (currentStartPos <= endPos) { + //Get the target sample name for the current start position + val rangesEntry = ranges.getEntry(currentStartPos) ?: break + val range = rangesEntry.key + val targetSampleName = rangesEntry.value + + val outputWriter = outputWriters[targetSampleName] ?: break + + val refAllele = refSeq[vc.contig]!!.get(currentStartPos.position - 1).char + + //Check to see if the refBlock is fully contained within the range + if (range.contains(endPos)) { + //Fully contained within the range, write out and break + val newVc = buildRefBlock( + vc.contig, + currentStartPos.position, + endPos.position, + "$refAllele", + targetSampleName + ) + outputWriter.add(newVc) + break } else { - //Indel case, need to check if it spans multiple ranges - TODO("Handle Indel spanning multiple ranges") - var currentStartPos = startPos - var currentEndPos = endPos - var currentVc = vc - - while (true) { - val rangeTargetSampleName = ranges.get(currentStartPos) ?: break - val range = ranges.asMapOfRanges().entries.find { it.value == rangeTargetSampleName }?.key ?: break - - if (range.contains(currentEndPos)) { - //Fully contained within the range - outputWriter.add(currentVc) - break - } else { - //Partially contained, need to resize and write out - val resizedEndPos = Position(range.upperEndpoint().contig, range.upperEndpoint().position) -// val resizedVc = VariantContextUtils.resizeVariantContext(currentVc, currentStartPos.position, resizedEndPos.position) - -// outputWriter.add(resizedVc) - - //Move to the next range - currentStartPos = Position(currentVc.contig, resizedEndPos.position + 1) -// currentVc = VariantContextUtils.resizeVariantContext(currentVc, currentStartPos.position, currentEndPos.position) - } - } + //Partially contained, need to resize and write out + val resizedEndPos = Position(range.upperEndpoint().contig, range.upperEndpoint().position) + val newVc = buildRefBlock( + vc.contig, + currentStartPos.position, + range.upperEndpoint().position, + "$refAllele", + targetSampleName + ) + outputWriter.add(newVc) + //move up the start + currentStartPos = Position(vc.contig, resizedEndPos.position + 1) } } } + fun changeSampleName(vc: VariantContext, newSampleName: String): VariantContext { + val builder = VariantContextBuilder(vc) + val genotypes = vc.genotypes.map { genotype -> + GenotypeBuilder(genotype) + .name(newSampleName) + .make() + } + builder.genotypes(genotypes) + return builder.make() + } + + fun buildRefBlock(chrom: String, start: Int, end: Int, refAllele: String, sampleName: String): VariantContext { + return VariantContextBuilder() + .chr(chrom) + .start(start.toLong()) + .stop(end.toLong()) + .alleles(listOf(refAllele, "")) + .genotypes( + listOf( + GenotypeBuilder(sampleName).alleles(listOf(Allele.create(refAllele,true))).make() + ) + ).make() + } + } \ No newline at end of file diff --git a/src/test/kotlin/RecombineGvcfsTest.kt b/src/test/kotlin/RecombineGvcfsTest.kt index 3473505..725bfd2 100644 --- a/src/test/kotlin/RecombineGvcfsTest.kt +++ b/src/test/kotlin/RecombineGvcfsTest.kt @@ -1,2 +1,43 @@ +import htsjdk.variant.variantcontext.Allele +import htsjdk.variant.variantcontext.GenotypeBuilder +import htsjdk.variant.variantcontext.VariantContextBuilder +import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs +import kotlin.test.DefaultAsserter.assertEquals +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFalse + class RecombineGvcfsTest { + + @Test + fun testChangeSampleName() { + + val recombineGvcfs = RecombineGvcfs() + + val originalVariantContext = VariantContextBuilder() + .chr("1") + .start(100) + .stop(100) + .id("rs123") + .alleles(listOf(Allele.REF_A, Allele.ALT_C)) + .genotypes( + listOf( + GenotypeBuilder("Sample1").alleles(listOf(Allele.REF_A, Allele.REF_A)).make() + ) + ) + .make() + + val newSampleName = "NewSample" + val renamed = recombineGvcfs.changeSampleName(originalVariantContext, newSampleName) + + assertEquals("SampleName was not updated", newSampleName, renamed.genotypes.sampleNames.first()) + assertEquals("Alleles were not preserved", originalVariantContext.genotypes.first().alleles, renamed.genotypes.first().alleles) + //check that the rest of the variant matches and that Sample1 does not exist + assertEquals("Contigs do not match",originalVariantContext.contig, renamed.contig) + assertEquals("Start positions do not match",originalVariantContext.start, renamed.start) + assertEquals("Stop positions do not match",originalVariantContext.end, renamed.end) + assertEquals("IDs do not match",originalVariantContext.id, renamed.id) + assertEquals("Alleles do not match",originalVariantContext.alleles, renamed.alleles) + assertFalse( renamed.genotypes.sampleNames.contains("Sample1"), "Old sample name still exists") + } } \ No newline at end of file From 2ee47a2dff199c5e942957b07523a47ebed99fc2 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 9 Jan 2026 14:17:53 -0500 Subject: [PATCH 03/14] Adding in additional functionality for recombine gvcfs --- .../maizegenetics/commands/RecombineGvcfs.kt | 176 +++++++++++++++++- .../commands}/RecombineGvcfsTest.kt | 21 ++- 2 files changed, 186 insertions(+), 11 deletions(-) rename src/test/kotlin/{ => net/maizegenetics/commands}/RecombineGvcfsTest.kt (60%) diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index d49129b..f7984d0 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -3,6 +3,7 @@ package net.maizegenetics.net.maizegenetics.commands import biokotlin.seq.NucSeq import biokotlin.seq.NucSeqRecord import biokotlin.seqIO.NucSeqIO +import biokotlin.util.bufferedReader import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required @@ -20,9 +21,11 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder import htsjdk.variant.vcf.VCFFileReader import htsjdk.variant.vcf.VCFReader import net.maizegenetics.net.maizegenetics.utils.Position +import net.maizegenetics.net.maizegenetics.utils.SimpleVariant import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils import java.io.File import java.nio.file.Path +import kotlin.io.path.bufferedReader data class RecombinationRange(val chrom: String, val start: Int, val end: Int, val targetSampleName: String) @@ -44,13 +47,17 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { .path(canBeFile = true, canBeDir = false) .required() + private val outputBedDir by option(help = "Output Bed dir") + .path(canBeFile = false, canBeDir = true) + .required() + override fun run() { // Implementation goes here - recombineGvcfs(inputBedDir, inputGvcfDir, outputDir,refFile) + recombineGvcfs(inputBedDir, inputGvcfDir, refFile, outputDir, outputBedDir) } - fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, outputDir: Path, refFile: Path) { + fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, refFile: Path, outputDir: Path, outputBedDir: Path) { println("Loading in the reference Genome from $refFile") val refSeq = NucSeqIO(refFile.toFile().path).readAll() @@ -59,6 +66,12 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //Build BedFile Map val (recombinationMap, sampleNames) = buildRecombinationMap(inputBedDir) + + println("Resizing recombination maps for large indels from the GVCF files") + resizeRecombinationMapsForIndels(recombinationMap, inputGvcfDir) + println("Writing out the new BED files to $outputBedDir") + writeResizedBedFiles(recombinationMap, outputBedDir) + //Build Output writers for each sample name val outputWriters = buildOutputWriterMap(sampleNames, outputDir) //Process GVCFs and write out recombined files @@ -91,7 +104,158 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } return Pair(recombinationMap, targetNames.toList()) } - + + + fun resizeRecombinationMapsForIndels(recombinationMap: Map>, gvcfDir: Path) { + println("Collecting indels that require resizing of recombination ranges") + + + //loop through the gvcf files and process them to find indels that will require resizing + //Do this quickly by just parsing the file like normal and not use htsjdk + val indelsForResizing = findAllOverlappingIndels(recombinationMap, gvcfDir) + + if(indelsForResizing.isEmpty()) { + println("No overlapping indels found that require resizing of recombination ranges") + return + } + + //Need to flip the region map so we can follow the target sample names when we have an overlapping indel + val flippedRecombinationMap = flipRecombinationMap(recombinationMap) + + //Now we can loop through the indels and resize the original ranges + resizeMaps(indelsForResizing, recombinationMap, flippedRecombinationMap) + + } + + fun findAllOverlappingIndels(recombinationMap: Map>, gvcfDir: Path): List> { + return gvcfDir.toFile().listFiles()?.flatMap { gvcfFile -> + val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") + val ranges = + recombinationMap[sampleName] ?: return@flatMap emptyList>() + + findOverlappingIndelsInGvcf(sampleName, gvcfFile, ranges) + }?: emptyList() + } + + fun findOverlappingIndelsInGvcf(sampleName: String, gvcfFile: File, ranges: RangeMap): List> { + val reader = bufferedReader(gvcfFile.absolutePath) + + var currentLine = reader.readLine() + val overlappingIndels = mutableListOf>() + while(currentLine != null) { + if(currentLine.startsWith("#")) { + currentLine = reader.readLine() + continue + } + val parts = currentLine.split("\t") + val chrom = parts[0] + val pos = parts[1].toInt() + val ref = parts[3] + val alt = parts[4] + + val startPos = Position(chrom, pos) + val endPos = Position(chrom, pos + ref.length - 1) + + //check if its an indel first + if(ref.length != alt.length && alt != "") { + //It's an indel + //Now check to see if it overlaps more than one range. This can be done by checking the ranges coming out of the start and end positions + val startRange = ranges.getEntry(startPos) + val endRange = ranges.getEntry(endPos) + if(startRange != null && endRange != null && startRange != endRange) { + //It overlaps more than one range + val simpleVariant = SimpleVariant(startPos, endPos, ref, alt) + println("Found overlapping indel at $chrom:$pos $ref->$alt") + overlappingIndels.add(Triple(sampleName, startRange.value, simpleVariant)) + } + } + currentLine = reader.readLine() + } + return overlappingIndels + } + + fun flipRecombinationMap(recombinationMap: Map>): Map> { + val flippedMap = mutableMapOf>() + + for((sampleName, rangeMap) in recombinationMap) { + for(entry in rangeMap.asMapOfRanges().entries) { + val range = entry.key + val targetSampleName = entry.value + + flippedMap.computeIfAbsent(targetSampleName) { TreeRangeMap.create() }.put(range, sampleName) + } + } + + return flippedMap + } + + fun resizeMaps( + indelsForResizing: List>, + recombinationMap: Map>, + flippedRecombinationMap: Map> + ) { + + TODO("Implement resizing of recombination maps for overlapping indels") +// for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { +// val sourceRangeMap = recombinationMap[sourceSampleName] ?: continue +// val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue +// +// //Find the ranges that the indel overlaps in the source map +// val startRangeEntry = sourceRangeMap.getEntry(indel.start) +// val endRangeEntry = sourceRangeMap.getEntry(indel.end) +// +// if(startRangeEntry == null || endRangeEntry == null) continue +// +// val startRange = startRangeEntry.key +// val endRange = endRangeEntry.key +// +// //Now we need to resize the ranges between startRange and endRange +// val rangesToResize = sourceRangeMap.asMapOfRanges().keys.filter { range -> +// range.isConnected(Range.closed(indel.start, indel.end)) +// } +// +// for(range in rangesToResize) { +// sourceRangeMap.remove(range) +// } +// +// //Re-add the resized ranges +// var currentStartPos = startRange.lowerEndpoint().position +// for(range in rangesToResize) { +// val newEndPos = if(range == endRange) { +// endRange.upperEndpoint().position +// } else { +// range.upperEndpoint().position - 1 +// } +// +// val newRange = Range.closed( +// Position(range.lowerEndpoint().contig, currentStartPos), +// Position(range.lowerEndpoint().contig, newEndPos) +// ) +// sourceRangeMap.put(newRange, sourceRangeMap.get(range)!!) +// +// currentStartPos = newEndPos + 1 +// } +// } + } + + fun writeResizedBedFiles(recombinationMap: Map>, outputBedDir: Path) { + println("Writing resized BED files") + for((sampleName, rangeMap) in recombinationMap) { + val outputBedFile = File(outputBedDir.toFile(), "${sampleName}_resized.bed") + outputBedFile.bufferedWriter().use { writer -> + for(entry in rangeMap.asMapOfRanges().entries) { + val range = entry.key + val targetSampleName = entry.value + val chrom = range.lowerEndpoint().contig + val start = range.lowerEndpoint().position - 1 //Convert back to 0 based for BED + val end = range.upperEndpoint().position + + writer.write("$chrom\t$start\t$end\t$targetSampleName\n") + } + } + } + } + fun buildOutputWriterMap(sampleNames: List, outputDir: Path): Map { return sampleNames.associateWith { sampleName -> @@ -174,6 +338,12 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //Get the current start position processRefBlockOverlap(startPos, endPos, ranges, outputWriters, refSeq, vc) } + else { + //Its an indel or complex polymorphism + //We will need to resize some variants when we do the sort though as we could potentially have an overlapping indel + val newVc = changeSampleName(vc, targetSampleName) + outputWriter.add(newVc) + } } } diff --git a/src/test/kotlin/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt similarity index 60% rename from src/test/kotlin/RecombineGvcfsTest.kt rename to src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 725bfd2..dd4c9c6 100644 --- a/src/test/kotlin/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -1,10 +1,11 @@ +package net.maizegenetics.commands + import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.GenotypeBuilder import htsjdk.variant.variantcontext.VariantContextBuilder import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs import kotlin.test.DefaultAsserter.assertEquals import kotlin.test.Test -import kotlin.test.assertEquals import kotlin.test.assertFalse class RecombineGvcfsTest { @@ -31,13 +32,17 @@ class RecombineGvcfsTest { val renamed = recombineGvcfs.changeSampleName(originalVariantContext, newSampleName) assertEquals("SampleName was not updated", newSampleName, renamed.genotypes.sampleNames.first()) - assertEquals("Alleles were not preserved", originalVariantContext.genotypes.first().alleles, renamed.genotypes.first().alleles) + assertEquals( + "Alleles were not preserved", + originalVariantContext.genotypes.first().alleles, + renamed.genotypes.first().alleles + ) //check that the rest of the variant matches and that Sample1 does not exist - assertEquals("Contigs do not match",originalVariantContext.contig, renamed.contig) - assertEquals("Start positions do not match",originalVariantContext.start, renamed.start) - assertEquals("Stop positions do not match",originalVariantContext.end, renamed.end) - assertEquals("IDs do not match",originalVariantContext.id, renamed.id) - assertEquals("Alleles do not match",originalVariantContext.alleles, renamed.alleles) - assertFalse( renamed.genotypes.sampleNames.contains("Sample1"), "Old sample name still exists") + assertEquals("Contigs do not match", originalVariantContext.contig, renamed.contig) + assertEquals("Start positions do not match", originalVariantContext.start, renamed.start) + assertEquals("Stop positions do not match", originalVariantContext.end, renamed.end) + assertEquals("IDs do not match", originalVariantContext.id, renamed.id) + assertEquals("Alleles do not match", originalVariantContext.alleles, renamed.alleles) + assertFalse(renamed.genotypes.sampleNames.contains("Sample1"), "Old sample name still exists") } } \ No newline at end of file From 7a0f49d0926b6a2c54b41af289e700fef73e4a33 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 9 Jan 2026 14:29:35 -0500 Subject: [PATCH 04/14] Fixing imports --- .../commands/MutateAssemblies.kt | 13 +++------ .../maizegenetics/commands/RecombineGvcfs.kt | 7 +++-- .../utils/VariantContextUtils.kt | 2 +- .../commands/RecombineGvcfsTest.kt | 27 +++++++++++++++++++ 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt index a672b79..50a9790 100644 --- a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt @@ -14,17 +14,10 @@ import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.Options import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder import htsjdk.variant.vcf.VCFFileReader -import htsjdk.variant.vcf.VCFFormatHeaderLine -import htsjdk.variant.vcf.VCFHeader -import htsjdk.variant.vcf.VCFHeaderLine -import htsjdk.variant.vcf.VCFHeaderLineCount -import htsjdk.variant.vcf.VCFHeaderLineType -import htsjdk.variant.vcf.VCFInfoHeaderLine -import net.maizegenetics.net.maizegenetics.utils.Position -import net.maizegenetics.net.maizegenetics.utils.SimpleVariant -import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils import java.io.File -import java.util.HashSet import kotlin.io.path.createDirectories import kotlin.io.path.exists diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index f7984d0..a6b1387 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -20,12 +20,11 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder import htsjdk.variant.vcf.VCFFileReader import htsjdk.variant.vcf.VCFReader -import net.maizegenetics.net.maizegenetics.utils.Position -import net.maizegenetics.net.maizegenetics.utils.SimpleVariant -import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils import java.io.File import java.nio.file.Path -import kotlin.io.path.bufferedReader data class RecombinationRange(val chrom: String, val start: Int, val end: Int, val targetSampleName: String) diff --git a/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt index 5179b33..4a9df2e 100644 --- a/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt +++ b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt @@ -1,4 +1,4 @@ -package net.maizegenetics.net.maizegenetics.utils +package net.maizegenetics.utils import htsjdk.variant.vcf.VCFFormatHeaderLine import htsjdk.variant.vcf.VCFHeader diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index dd4c9c6..40b5913 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -1,15 +1,42 @@ package net.maizegenetics.commands +import com.google.common.collect.Range +import com.google.common.collect.RangeMap +import com.google.common.collect.TreeRangeMap import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.GenotypeBuilder import htsjdk.variant.variantcontext.VariantContextBuilder import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs +import net.maizegenetics.utils.Position import kotlin.test.DefaultAsserter.assertEquals import kotlin.test.Test import kotlin.test.assertFalse class RecombineGvcfsTest { + @Test + fun testFlipRecombinationMap() { + //This needs to take a recombination map Map> and flip it to Map> + + val recombinationMap = mutableMapOf>() + //Build the recombination map here + val sample1RangeMap = TreeRangeMap.create() + sample1RangeMap.put(Range.closed(Position("chr1", 100), Position("chr1", 200)), "TargetSampleA") + sample1RangeMap.put(Range.closed(Position("chr1", 201), Position("chr1", 300)), "TargetSampleB") + recombinationMap["Sample1"] = sample1RangeMap + val sample2RangeMap = TreeRangeMap.create() + sample2RangeMap.put(Range.closed(Position("chr1", 100), Position("chr1", 200)), "TargetSampleB") + sample2RangeMap.put(Range.closed(Position("chr1", 201), Position("chr1", 300)), "TargetSampleA") + recombinationMap["Sample2"] = sample2RangeMap + + val recombineGvcfs = RecombineGvcfs() + val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) + //Now check that the flipped map is correct + + + } + + @Test fun testChangeSampleName() { From 925a0bc932f1e1242bc7cbfe38f8dbcc8d3ee33b Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 9 Jan 2026 15:21:23 -0500 Subject: [PATCH 05/14] Adding in some unit tests --- .../commands/MutateAssembliesTest.kt | 6 +- .../commands/RecombineGvcfsTest.kt | 112 +++++++++++++++++- 2 files changed, 109 insertions(+), 9 deletions(-) diff --git a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt index 0160ef4..7af992e 100644 --- a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt @@ -7,9 +7,9 @@ import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.vcf.VCFFileReader import net.maizegenetics.net.maizegenetics.commands.MutateAssemblies -import net.maizegenetics.net.maizegenetics.utils.Position -import net.maizegenetics.net.maizegenetics.utils.SimpleVariant -import net.maizegenetics.net.maizegenetics.utils.VariantContextUtils +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils import java.io.File import kotlin.test.Test import kotlin.test.assertEquals diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 40b5913..1642b9d 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -8,16 +8,103 @@ import htsjdk.variant.variantcontext.GenotypeBuilder import htsjdk.variant.variantcontext.VariantContextBuilder import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs import net.maizegenetics.utils.Position +import java.io.File +import kotlin.io.path.Path import kotlin.test.DefaultAsserter.assertEquals import kotlin.test.Test import kotlin.test.assertFalse class RecombineGvcfsTest { + val homeDir = System.getProperty("user.home").replace('\\', '/') + + val outputDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/" + @Test fun testFlipRecombinationMap() { //This needs to take a recombination map Map> and flip it to Map> + val recombinationMap = buildSimpleRecombinationMap() + + val recombineGvcfs = RecombineGvcfs() + val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) + //Now check that the flipped map is correct + //Check TargetSampleA + val targetSampleARangeMap = flippedMap["TargetSampleA"] + assertEquals( + "TargetSampleA should have 2 ranges", + 2, + targetSampleARangeMap?.asMapOfRanges()?.size + ) + val targetSampleARanges = targetSampleARangeMap?.asMapOfRanges() + //Check that the ranges are correct + val range1 = targetSampleARanges?.keys?.find { it.contains(Position("chr1", 150)) } + assertEquals( + "TargetSampleA should have range from Sample1", + Range.closed(Position("chr1", 100), Position("chr1", 200)), + range1 + ) + val range2 = targetSampleARanges?.keys?.find { it.contains(Position("chr1", 250)) } + assertEquals( + "TargetSampleA should have range from Sample2", + Range.closed(Position("chr1", 201), Position("chr1", 300)), + range2 + ) + //Check TargetSampleB + val targetSampleBRangeMap = flippedMap["TargetSampleB"] + assertEquals( + "TargetSampleB should have 2 ranges", + 2, + targetSampleBRangeMap?.asMapOfRanges()?.size + ) + val targetSampleBRanges = targetSampleBRangeMap?.asMapOfRanges() + //Check that the ranges are correct + val range3 = targetSampleBRanges?.keys?.find { it.contains(Position("chr1", 150)) } + assertEquals( + "TargetSampleB should have range from Sample2", + Range.closed(Position("chr1", 100), Position("chr1", 200)), + range3 + ) + val range4 = targetSampleBRanges?.keys?.find { it.contains(Position("chr1", 250)) } + assertEquals( + "TargetSampleB should have range from Sample1", + Range.closed(Position("chr1", 201), Position("chr1", 300)), + range4 + ) + + + } + + + @Test + fun testWriteResizedBedFiles() { + val recombinationMap = buildSimpleRecombinationMap() + + val recombineGvcfs = RecombineGvcfs() + + File(outputDir).mkdirs() + + recombineGvcfs.writeResizedBedFiles(recombinationMap, Path(outputDir)) + + //Check that the files were created and have the correct content + val targetSampleABedFile = File("$outputDir/Sample1_resized.bed") + assertEquals("TargetSampleA.bed file was not created", true, targetSampleABedFile.isFile) + val targetSampleAContent = targetSampleABedFile.readText().trim() + val expectedTargetSampleAContent = "chr1\t99\t200\tTargetSampleA\nchr1\t200\t300\tTargetSampleB" + assertEquals("TargetSampleA.bed content is incorrect", expectedTargetSampleAContent, targetSampleAContent) + val targetSampleBBedFile = File("$outputDir/Sample2_resized.bed") + assertEquals("TargetSampleB.bed file was not created", true, targetSampleBBedFile.isFile) + val targetSampleBContent = targetSampleBBedFile.readText().trim() + val expectedTargetSampleBContent = "chr1\t99\t200\tTargetSampleB\nchr1\t200\t300\tTargetSampleA" + assertEquals("TargetSampleB.bed content is incorrect", expectedTargetSampleBContent, targetSampleBContent) + + //Delete the output directory + File(outputDir).deleteRecursively() + + + } + + private fun buildSimpleRecombinationMap(): Map> { val recombinationMap = mutableMapOf>() //Build the recombination map here val sample1RangeMap = TreeRangeMap.create() @@ -28,12 +115,7 @@ class RecombineGvcfsTest { sample2RangeMap.put(Range.closed(Position("chr1", 100), Position("chr1", 200)), "TargetSampleB") sample2RangeMap.put(Range.closed(Position("chr1", 201), Position("chr1", 300)), "TargetSampleA") recombinationMap["Sample2"] = sample2RangeMap - - val recombineGvcfs = RecombineGvcfs() - val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) - //Now check that the flipped map is correct - - + return recombinationMap } @@ -72,4 +154,22 @@ class RecombineGvcfsTest { assertEquals("Alleles do not match", originalVariantContext.alleles, renamed.alleles) assertFalse(renamed.genotypes.sampleNames.contains("Sample1"), "Old sample name still exists") } + + @Test + fun testBuildRefBlock() { + //Build a sample refBlock + val recombineGvcfs = RecombineGvcfs() + val refBlock = recombineGvcfs.buildRefBlock("chr1", 100, 200, "A","SampleA") + assertEquals("Contig does not match","chr1", refBlock.contig) + assertEquals("Start position does not match",100, refBlock.start) + assertEquals("End position does not match", 200, refBlock.end) + assertEquals("RefBlock should have 2 alleles", 2, refBlock.alleles.size) + assertEquals("Ref allele does not match", Allele.REF_A, refBlock.alleles[0]) + assertEquals("Alt allele does not match",Allele.create("",false), refBlock.alleles[1]) + assertEquals("There should be one sample in the refBlock", 1, refBlock.genotypes.sampleNames.size) + assertEquals("Sample name does not match","SampleA", refBlock.genotypes.sampleNames.first()) + val genotype = refBlock.genotypes.first() + assertEquals("Genotype should have 1 alleles", 1, genotype.alleles.size) + assertEquals("Genotype ref allele does not match", Allele.REF_A, genotype.alleles[0]) + } } \ No newline at end of file From b4f71b148b0116a7f8b2f3e5b159d35ef0278587 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Mon, 12 Jan 2026 15:35:20 -0500 Subject: [PATCH 06/14] Unit tests and Associated files --- data/RecombineGvcfs/bed/sampleA.bed | 3 + data/RecombineGvcfs/bed/sampleB.bed | 3 + data/RecombineGvcfs/bed/sampleC.bed | 3 + data/RecombineGvcfs/gvcf/sampleA.gvcf | 16 ++ data/RecombineGvcfs/gvcf/sampleB.gvcf | 22 ++ data/RecombineGvcfs/gvcf/sampleC.gvcf | 21 ++ data/RecombineGvcfs/ref.fa | 1 + .../maizegenetics/commands/RecombineGvcfs.kt | 141 +++++++------ .../commands/RecombineGvcfsTest.kt | 191 ++++++++++++++++++ 9 files changed, 336 insertions(+), 65 deletions(-) create mode 100644 data/RecombineGvcfs/bed/sampleA.bed create mode 100644 data/RecombineGvcfs/bed/sampleB.bed create mode 100644 data/RecombineGvcfs/bed/sampleC.bed create mode 100644 data/RecombineGvcfs/gvcf/sampleA.gvcf create mode 100644 data/RecombineGvcfs/gvcf/sampleB.gvcf create mode 100644 data/RecombineGvcfs/gvcf/sampleC.gvcf create mode 100644 data/RecombineGvcfs/ref.fa diff --git a/data/RecombineGvcfs/bed/sampleA.bed b/data/RecombineGvcfs/bed/sampleA.bed new file mode 100644 index 0000000..727c4ec --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleA.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleX +chr1 10 20 sampleY +chr1 20 30 sampleZ \ No newline at end of file diff --git a/data/RecombineGvcfs/bed/sampleB.bed b/data/RecombineGvcfs/bed/sampleB.bed new file mode 100644 index 0000000..d18fd5a --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleB.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleY +chr1 10 20 sampleZ +chr1 20 30 sampleX \ No newline at end of file diff --git a/data/RecombineGvcfs/bed/sampleC.bed b/data/RecombineGvcfs/bed/sampleC.bed new file mode 100644 index 0000000..219ca9e --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleC.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleZ +chr1 10 20 sampleX +chr1 20 30 sampleY \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleA.gvcf b/data/RecombineGvcfs/gvcf/sampleA.gvcf new file mode 100644 index 0000000..14dfdac --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleA.gvcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +chr1 1 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleB.gvcf b/data/RecombineGvcfs/gvcf/sampleB.gvcf new file mode 100644 index 0000000..2b371bc --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleB.gvcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +chr1 1 . A . . END=4 GT 0 +chr1 5 . A T . . GT 1 +chr1 6 . A . . END=14 GT 0 +chr1 15 . A T . . GT 1 +chr1 16 . A . . GT 0 +chr1 25 . A T . . GT 1 +chr1 26 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleC.gvcf b/data/RecombineGvcfs/gvcf/sampleC.gvcf new file mode 100644 index 0000000..a86dabd --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleC.gvcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +chr1 1 . A . . END=8 GT 0 +chr1 9 . AAA A . . GT 1 +chr1 12 . A . . END=16 GT 0 +chr1 17 . AAA A . . GT 1 +chr1 20 . A T . . GT 1 +chr1 221 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/ref.fa b/data/RecombineGvcfs/ref.fa new file mode 100644 index 0000000..08b6f77 --- /dev/null +++ b/data/RecombineGvcfs/ref.fa @@ -0,0 +1 @@ +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index a6b1387..128bc99 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -64,10 +64,10 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { println("Recombining GVCFs from $inputGvcfDir using BED files from $inputBedDir into $outputDir") //Build BedFile Map - val (recombinationMap, sampleNames) = buildRecombinationMap(inputBedDir) + val (recombinationMapBeforeResize, sampleNames) = buildRecombinationMap(inputBedDir) println("Resizing recombination maps for large indels from the GVCF files") - resizeRecombinationMapsForIndels(recombinationMap, inputGvcfDir) + val recombinationMap = resizeRecombinationMapsForIndels(recombinationMapBeforeResize, inputGvcfDir) println("Writing out the new BED files to $outputBedDir") writeResizedBedFiles(recombinationMap, outputBedDir) @@ -86,7 +86,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { val recombinationMap = mutableMapOf>() val targetNames = mutableSetOf() inputBedDir.toFile().listFiles()?.forEach { bedFile -> - val bedFileSampleName = bedFile.name.substringBeforeLast("_") + val bedFileSampleName = bedFile.name.substringBeforeLast("_").replace(".bed","") bedFile.forEachLine { line -> val parts = line.split("\t") if (parts.size >= 4) { @@ -105,7 +105,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } - fun resizeRecombinationMapsForIndels(recombinationMap: Map>, gvcfDir: Path) { + fun resizeRecombinationMapsForIndels(recombinationMap: Map>, gvcfDir: Path): Map> { println("Collecting indels that require resizing of recombination ranges") @@ -115,20 +115,24 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { if(indelsForResizing.isEmpty()) { println("No overlapping indels found that require resizing of recombination ranges") - return + return recombinationMap } //Need to flip the region map so we can follow the target sample names when we have an overlapping indel val flippedRecombinationMap = flipRecombinationMap(recombinationMap) //Now we can loop through the indels and resize the original ranges - resizeMaps(indelsForResizing, recombinationMap, flippedRecombinationMap) + resizeMaps(indelsForResizing, flippedRecombinationMap) + + //reflip the maps + return flipRecombinationMap(flippedRecombinationMap) } fun findAllOverlappingIndels(recombinationMap: Map>, gvcfDir: Path): List> { return gvcfDir.toFile().listFiles()?.flatMap { gvcfFile -> - val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") + val sampleName = gvcfFile.name.replace(".g.vcf","").replace(".gvcf","").replace(".gz","") + val ranges = recombinationMap[sampleName] ?: return@flatMap emptyList>() @@ -170,6 +174,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } currentLine = reader.readLine() } + reader.close() return overlappingIndels } @@ -188,53 +193,73 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { return flippedMap } + + fun resizeMaps( indelsForResizing: List>, - recombinationMap: Map>, flippedRecombinationMap: Map> ) { - TODO("Implement resizing of recombination maps for overlapping indels") -// for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { -// val sourceRangeMap = recombinationMap[sourceSampleName] ?: continue -// val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue -// -// //Find the ranges that the indel overlaps in the source map -// val startRangeEntry = sourceRangeMap.getEntry(indel.start) -// val endRangeEntry = sourceRangeMap.getEntry(indel.end) -// -// if(startRangeEntry == null || endRangeEntry == null) continue -// -// val startRange = startRangeEntry.key -// val endRange = endRangeEntry.key -// -// //Now we need to resize the ranges between startRange and endRange -// val rangesToResize = sourceRangeMap.asMapOfRanges().keys.filter { range -> -// range.isConnected(Range.closed(indel.start, indel.end)) -// } -// -// for(range in rangesToResize) { -// sourceRangeMap.remove(range) -// } -// -// //Re-add the resized ranges -// var currentStartPos = startRange.lowerEndpoint().position -// for(range in rangesToResize) { -// val newEndPos = if(range == endRange) { -// endRange.upperEndpoint().position -// } else { -// range.upperEndpoint().position - 1 -// } -// -// val newRange = Range.closed( -// Position(range.lowerEndpoint().contig, currentStartPos), -// Position(range.lowerEndpoint().contig, newEndPos) -// ) -// sourceRangeMap.put(newRange, sourceRangeMap.get(range)!!) -// -// currentStartPos = newEndPos + 1 -// } -// } + //work completely with flippedRecombinationMap then we need to flip back at the end + + //Walk through the indelsForResizing list + for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { + //Get the source and target range maps + + val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue + + //this list will hold all of the ranges and their corresponding source sample names that need to be added back in after resizing + val newRanges = mutableListOf, String>>() + val toDeleteRanges = mutableListOf>() + + //Now that we have this we need to first resize the sourceRangeMap's entry for the indels position + val startRangeEntry = targetRangeMap.getEntry(indel.refStart) + + //Double check to make sure that we need to resize aka check that the indel's end is outside of the start range + if(startRangeEntry == null || startRangeEntry.key.contains(indel.refEnd)) continue + + //for the original entry we can just resize it. + //add existing range to delete list + toDeleteRanges.add(startRangeEntry.key) + //create new resized range + val resizedRange = Range.closed( + startRangeEntry.key.lowerEndpoint(), + Position( + startRangeEntry.key.upperEndpoint().contig, + indel.refEnd.position - 1)) + + newRanges.add(Pair(resizedRange, startRangeEntry.value)) + + //Then we need to follow the target Sample's range map and resize/remove any that are overlapping + var currentPos = Position(startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position) + while(currentPos <= indel.refEnd) { + val currentRangeEntry = targetRangeMap.getEntry(currentPos) ?: break + //add existing range to delete list + toDeleteRanges.add(currentRangeEntry.key) + //Check to see if this is the last overlapping range + if (currentRangeEntry.key.contains(indel.refEnd)) { + //Resize this range + val resizedLastRange = Range.closed( + Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position), + currentRangeEntry.key.upperEndpoint() + ) + newRanges.add(Pair(resizedLastRange, currentRangeEntry.value)) + break + } else { + //This range is fully contained within the indel, so we skip adding it back in + } + } + + //Now we can remove the old ranges from the targetRangeMap + for(range in toDeleteRanges) { + targetRangeMap.remove(range) + } + //Now we can add back in the new resized ranges + for((range, sourceSample) in newRanges) { + targetRangeMap.put(range, sourceSample) + } + + } } fun writeResizedBedFiles(recombinationMap: Map>, outputBedDir: Path) { @@ -287,21 +312,6 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } } -// fun processGvcfsAndWrite( -// recombinationMap: Map>, -// inputGvcfDir: Path, -// outputWriters: Map -// ) { -// inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> -// val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") -// val ranges = recombinationMap[sampleName] ?: return@forEach -// -// VCFFileReader(gvcfFile, false).use { gvcfReader -> -// processSingleGVCFFile(gvcfReader, ranges, outputWriters) -// } -// } -// } - fun processSingleGVCFFile( gvcfReader: VCFReader, ranges: RangeMap, @@ -339,7 +349,8 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } else { //Its an indel or complex polymorphism - //We will need to resize some variants when we do the sort though as we could potentially have an overlapping indel + //Assign it to the left most range only + //This is ok to do as we have already resized the recombination ranges to account for overlapping indels val newVc = changeSampleName(vc, targetSampleName) outputWriter.add(newVc) } diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 1642b9d..c4a5210 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -11,6 +11,7 @@ import net.maizegenetics.utils.Position import java.io.File import kotlin.io.path.Path import kotlin.test.DefaultAsserter.assertEquals +import kotlin.test.DefaultAsserter.assertTrue import kotlin.test.Test import kotlin.test.assertFalse @@ -20,6 +21,187 @@ class RecombineGvcfsTest { val outputDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/" + @Test + fun buildRecombinationMap() { + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + //Check that the map has the correct number of samples + assertEquals( + "Recombination map should have 3 samples", + 3, + recombinationMap.size + ) + + //Check to make sure that the recombination maps are correct + checkRecombinationMapContents(recombinationMap, "sampleA", listOf("sampleX", "sampleY", "sampleZ")) + checkRecombinationMapContents(recombinationMap, "sampleB", listOf("sampleY", "sampleZ", "sampleX")) + checkRecombinationMapContents(recombinationMap, "sampleC", listOf("sampleZ", "sampleX", "sampleY")) + + + //targetNameList should also have 3 samples + assertEquals( + "Target name list should have 3 samples", + 3, + targetNameList.size + ) + + //Should have sampleX, sampleY, sampleZ + assertTrue( + "Target name list does not contain sampleX", + targetNameList.contains("sampleX") + ) + assertTrue( + "Target name list does not contain sampleY", + targetNameList.contains("sampleY") + ) + assertTrue( + "Target name list does not contain sampleZ", + targetNameList.contains("sampleZ") + ) + } + + private fun checkRecombinationMapContents(recombinationMap: Map>, sampleName: String, expectedTargetSamples: List) { + val sampleMap = recombinationMap[sampleName] + require(sampleMap != null) { "Range map for $sampleName is null" } + assertEquals( + "${sampleName} should have 3 ranges", + 3, + sampleMap.asMapOfRanges().size + ) + //From 1 -10 should be sampleX, from 10 to 19 should be sampleY, from 20 to 29 should be sampleZ + val range1 = sampleMap.getEntry(Position("chr1", 5)) + assertEquals( + "${sampleName} first range not correct", + Range.closed(Position("chr1", 1), Position("chr1", 10)), + range1?.key + ) + assertEquals( + "${sampleName} first range target not correct", + expectedTargetSamples[0], + range1?.value + ) + val range2 = sampleMap.getEntry(Position("chr1", 15)) + assertEquals( + "${sampleName} second range not correct", + Range.closed(Position("chr1", 11), Position("chr1", 20)), + range2?.key + ) + assertEquals( + "${sampleName} second range target not correct", + expectedTargetSamples[1], + range2?.value + ) + val range3 = sampleMap.getEntry(Position("chr1", 25)) + assertEquals( + "${sampleName} third range not correct", + Range.closed(Position("chr1", 21), Position("chr1", 30)), + range3?.key + ) + assertEquals( + "${sampleName} third range target not correct", + expectedTargetSamples[2], + range3?.value + ) + } + + + @Test + fun testFindOverlappingIndelsInGvcf() { + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + + //Test SampleB and Sample C. SampleB does not have an overlapping indel so the list should be empty + val sampleBIndels = recombineGvcfs.findOverlappingIndelsInGvcf("sampleB", File("$gvcfFile/sampleB.gvcf"),recombinationMap["sampleB"]!!) + + assertEquals( + "SampleB should have 0 overlapping indels", + 0, + sampleBIndels.size + ) + + //SampleC should have one overlapping indel at chr1:9-11 hitting position 10 in the recombination map + val sampleCIndels = recombineGvcfs.findOverlappingIndelsInGvcf("sampleC", File("$gvcfFile/sampleC.gvcf"),recombinationMap["sampleC"]!!) + assertEquals( + "SampleC should have 1 overlapping indel", + 1, + sampleCIndels.size + ) + val indel = sampleCIndels[0] + + assertEquals( + "Indel contig does not match", + "chr1", + indel.third.refStart.contig + ) + assertEquals( + "Indel start does not match", + 9, + indel.third.refStart.position + ) + assertEquals( + "Indel end does not match", + 11, + indel.third.refEnd.position + ) + //check source and target sample names + assertEquals( + "Indel source sample name does not match", + "sampleC", + indel.first + ) + assertEquals( + "Indel target sample name does not match", + "sampleZ", + indel.second + ) + } + + @Test + fun testFindAllOverlappingIndels() { + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfDir = "./data/RecombineGvcfs/gvcf/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + val allIndels = recombineGvcfs.findAllOverlappingIndels(recombinationMap, Path(gvcfDir)) + //There should be only one indel from sampleC + assertEquals( + "There should be 1 overlapping indel in total", + 1, + allIndels.size + ) + val indel = allIndels[0] + assertEquals( + "Indel source sample name does not match", + "sampleC", + indel.first + ) + assertEquals( + "Indel target sample name does not match", + "sampleZ", + indel.second + ) + assertEquals( + "Indel contig does not match", + "chr1", + indel.third.refStart.contig + ) + assertEquals( + "Indel start does not match", + 9, + indel.third.refStart.position + ) + assertEquals( + "Indel end does not match", + 11, + indel.third.refEnd.position + ) + + } + @Test fun testFlipRecombinationMap() { //This needs to take a recombination map Map> and flip it to Map> @@ -71,11 +253,20 @@ class RecombineGvcfsTest { range4 ) + //Flip it back and see if the original map is recovered + val reflippedMap = recombineGvcfs.flipRecombinationMap(flippedMap) + assertEquals( + "Reflipped map should be equal to original map", + recombinationMap, + reflippedMap + ) } + + @Test fun testWriteResizedBedFiles() { val recombinationMap = buildSimpleRecombinationMap() From 3118b482db40dc1581bf35f85423f89a9ae161f0 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Wed, 14 Jan 2026 16:14:47 -0500 Subject: [PATCH 07/14] Adding in more unit tests and fixing the sample names in the gvcfs --- data/RecombineGvcfs/gvcf/sampleB.gvcf | 2 +- data/RecombineGvcfs/gvcf/sampleC.gvcf | 2 +- .../maizegenetics/commands/RecombineGvcfs.kt | 22 +-- .../commands/RecombineGvcfsTest.kt | 163 +++++++++++++++++- 4 files changed, 168 insertions(+), 21 deletions(-) diff --git a/data/RecombineGvcfs/gvcf/sampleB.gvcf b/data/RecombineGvcfs/gvcf/sampleB.gvcf index 2b371bc..cca6660 100644 --- a/data/RecombineGvcfs/gvcf/sampleB.gvcf +++ b/data/RecombineGvcfs/gvcf/sampleB.gvcf @@ -12,7 +12,7 @@ ##INFO= ##INFO= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleB chr1 1 . A . . END=4 GT 0 chr1 5 . A T . . GT 1 chr1 6 . A . . END=14 GT 0 diff --git a/data/RecombineGvcfs/gvcf/sampleC.gvcf b/data/RecombineGvcfs/gvcf/sampleC.gvcf index a86dabd..3c2ce7c 100644 --- a/data/RecombineGvcfs/gvcf/sampleC.gvcf +++ b/data/RecombineGvcfs/gvcf/sampleC.gvcf @@ -12,7 +12,7 @@ ##INFO= ##INFO= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleC chr1 1 . A . . END=8 GT 0 chr1 9 . AAA A . . GT 1 chr1 12 . A . . END=16 GT 0 diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index 128bc99..8a1c785 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -26,8 +26,6 @@ import net.maizegenetics.utils.VariantContextUtils import java.io.File import java.nio.file.Path -data class RecombinationRange(val chrom: String, val start: Int, val end: Int, val targetSampleName: String) - class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { private val inputBedDir by option(help = "Input Bed dir") @@ -122,10 +120,10 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { val flippedRecombinationMap = flipRecombinationMap(recombinationMap) //Now we can loop through the indels and resize the original ranges - resizeMaps(indelsForResizing, flippedRecombinationMap) + val resizedMap = resizeMaps(indelsForResizing, flippedRecombinationMap) //reflip the maps - return flipRecombinationMap(flippedRecombinationMap) + return flipRecombinationMap(resizedMap) } @@ -198,15 +196,16 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { fun resizeMaps( indelsForResizing: List>, flippedRecombinationMap: Map> - ) { + ): Map> { //work completely with flippedRecombinationMap then we need to flip back at the end - + val resizedMap = flippedRecombinationMap.toMutableMap() //Walk through the indelsForResizing list for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { //Get the source and target range maps - val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue +// val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue + val targetRangeMap = resizedMap[targetSampleName] ?: continue //this list will hold all of the ranges and their corresponding source sample names that need to be added back in after resizing val newRanges = mutableListOf, String>>() @@ -226,7 +225,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { startRangeEntry.key.lowerEndpoint(), Position( startRangeEntry.key.upperEndpoint().contig, - indel.refEnd.position - 1)) + indel.refEnd.position)) newRanges.add(Pair(resizedRange, startRangeEntry.value)) @@ -237,10 +236,10 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //add existing range to delete list toDeleteRanges.add(currentRangeEntry.key) //Check to see if this is the last overlapping range - if (currentRangeEntry.key.contains(indel.refEnd)) { + if (currentRangeEntry.key.contains(indel.refEnd) && currentRangeEntry.key.upperEndpoint()!= indel.refEnd) { //Resize this range val resizedLastRange = Range.closed( - Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position), + Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position+1), //Need to shift the position up by 1 currentRangeEntry.key.upperEndpoint() ) newRanges.add(Pair(resizedLastRange, currentRangeEntry.value)) @@ -258,8 +257,9 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { for((range, sourceSample) in newRanges) { targetRangeMap.put(range, sourceSample) } - + resizedMap[targetSampleName] = targetRangeMap } + return resizedMap } fun writeResizedBedFiles(recombinationMap: Map>, outputBedDir: Path) { diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index c4a5210..84caa6a 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -6,10 +6,15 @@ import com.google.common.collect.TreeRangeMap import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.GenotypeBuilder import htsjdk.variant.variantcontext.VariantContextBuilder +import htsjdk.variant.variantcontext.writer.VariantContextWriter +import htsjdk.variant.vcf.VCFFileReader import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant import java.io.File import kotlin.io.path.Path +import kotlin.test.AfterTest +import kotlin.test.BeforeTest import kotlin.test.DefaultAsserter.assertEquals import kotlin.test.DefaultAsserter.assertTrue import kotlin.test.Test @@ -21,8 +26,20 @@ class RecombineGvcfsTest { val outputDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/" + @BeforeTest + fun setupTest() { + // Runs before each test in common code + File(outputDir).mkdirs() + } + + @AfterTest + fun teardownTest() { + // Runs after each test in common code + File(outputDir).deleteRecursively() + } + @Test - fun buildRecombinationMap() { + fun testBuildRecombinationMap() { val bedDir = "./data/RecombineGvcfs/bed/" val recombineGvcfs = RecombineGvcfs() val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) @@ -106,6 +123,18 @@ class RecombineGvcfsTest { } + @Test + fun testResizeRecombinationMapsForIndels() { + //Each of the units are tested in other functions. Here we just need to test that the overall resizing works as expected + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + val resizedMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path(gvcfFile)) + + TODO("Implement test for resizeRecombinationMapsForIndels" ) + } + @Test fun testFindOverlappingIndelsInGvcf() { val bedDir = "./data/RecombineGvcfs/bed/" @@ -264,6 +293,81 @@ class RecombineGvcfsTest { } + @Test + fun testResizeMaps() { + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + + //Need to flip the map + val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) + val indelsForResizingEmpty = emptyList>() + + val nonResizedMap = recombineGvcfs.resizeMaps(indelsForResizingEmpty, flippedMap) + //Check that the map is the same as the flipped map + assertEquals( + "No-resize map should be equal to flipped map", + flippedMap, + nonResizedMap + ) + + //Make an overlapping indel SampleC is the donor for TargetSampleZ at chr1:9-11 + //chr1 9 . AAA A . . GT 1 + val indelToResize = Triple( + "sampleC", + "sampleZ", + SimpleVariant( + Position("chr1",9), + Position("chr1",11), + refAllele = "AAA", + altAllele = "A" + ) + ) + + val indelsForResizing = listOf(indelToResize) + val zIndel1ResizedMap = recombineGvcfs.resizeMaps(indelsForResizing, flippedMap) + //Now check that the map has been resized correctly + + val targetSampleZRangeMap = zIndel1ResizedMap["sampleZ"] + assertEquals( + "sampleZ should have 3 ranges", + 3, + targetSampleZRangeMap?.asMapOfRanges()?.size + ) + val firstRegion = targetSampleZRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "First region should be from 1-11", + Range.closed(Position("chr1",1), Position("chr1",11)), + firstRegion?.key + ) + assertEquals( + "First region should be from sampleC", + "sampleC", + firstRegion?.value + ) + val secondRegion = targetSampleZRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "Second region should be from 12-20", + Range.closed(Position("chr1",12), Position("chr1",20)), + secondRegion?.key + ) + assertEquals( + "Second region should be from sampleB", + "sampleB", + secondRegion?.value + ) + val thirdRegion = targetSampleZRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "Third region should be from 21-30", + Range.closed(Position("chr1",21), Position("chr1",30)), + thirdRegion?.key + ) + assertEquals( + "Third region should be from sampleA", + "sampleA", + thirdRegion?.value + ) + } @@ -273,8 +377,6 @@ class RecombineGvcfsTest { val recombineGvcfs = RecombineGvcfs() - File(outputDir).mkdirs() - recombineGvcfs.writeResizedBedFiles(recombinationMap, Path(outputDir)) //Check that the files were created and have the correct content @@ -288,11 +390,6 @@ class RecombineGvcfsTest { val targetSampleBContent = targetSampleBBedFile.readText().trim() val expectedTargetSampleBContent = "chr1\t99\t200\tTargetSampleB\nchr1\t200\t300\tTargetSampleA" assertEquals("TargetSampleB.bed content is incorrect", expectedTargetSampleBContent, targetSampleBContent) - - //Delete the output directory - File(outputDir).deleteRecursively() - - } private fun buildSimpleRecombinationMap(): Map> { @@ -310,6 +407,56 @@ class RecombineGvcfsTest { } + @Test + fun testBuildOutputWriterMap() { + val sampleNames = listOf("Sample1", "Sample2", "Sample3") + val recombineGvcfs = RecombineGvcfs() + val writerMap = recombineGvcfs.buildOutputWriterMap(sampleNames, Path(outputDir)) + //Check that the map has the correct number of writers + assertEquals( + "Writer map should have 3 writers", + 3, + writerMap.size + ) + //Check that the writers are correctly named + sampleNames.forEach { sampleName -> + checkSampleNameToOutputFile(sampleName, writerMap) + } + } + + private fun checkSampleNameToOutputFile( + sampleName: String, + writerMap: Map + ) { + assertTrue( + "Writer map should contain writer for $sampleName", + writerMap.containsKey(sampleName) + ) + //close out the writers + writerMap[sampleName]?.close() + //Open up the file and check that the sample name is right + val outputFile = File("$outputDir/${sampleName}_recombined.gvcf") + assertEquals( + "Output file for $sampleName was not created", + true, + outputFile.isFile + ) + + val vcfReader = VCFFileReader(outputFile,false) + val headerSampleNames = vcfReader.fileHeader.sampleNamesInOrder + assertEquals( + "Output VCF for $sampleName should have 1 sample", + 1, + headerSampleNames.size + ) + assertEquals( + "Output VCF for $sampleName has incorrect sample name", + sampleName, + headerSampleNames.first() + ) + vcfReader.close() + } + @Test fun testChangeSampleName() { From f1869774450424901f1fcb6f6b0326bd111287df Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 15 Jan 2026 16:13:32 -0500 Subject: [PATCH 08/14] Fixing resizing algorithm to correctly prevent overlaps --- .../maizegenetics/commands/RecombineGvcfs.kt | 56 ++++++++--- .../commands/RecombineGvcfsTest.kt | 97 ++++++++++++++++++- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index 8a1c785..423648b 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -120,7 +120,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { val flippedRecombinationMap = flipRecombinationMap(recombinationMap) //Now we can loop through the indels and resize the original ranges - val resizedMap = resizeMaps(indelsForResizing, flippedRecombinationMap) + val resizedMap = resizeMaps(indelsForResizing, recombinationMap,flippedRecombinationMap) //reflip the maps return flipRecombinationMap(resizedMap) @@ -195,6 +195,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { fun resizeMaps( indelsForResizing: List>, + originalRecombinationMap: Map>, flippedRecombinationMap: Map> ): Map> { @@ -204,12 +205,12 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { //Get the source and target range maps -// val targetRangeMap = flippedRecombinationMap[targetSampleName] ?: continue val targetRangeMap = resizedMap[targetSampleName] ?: continue + val sourceRangeMap = originalRecombinationMap[sourceSampleName] ?: continue //this list will hold all of the ranges and their corresponding source sample names that need to be added back in after resizing - val newRanges = mutableListOf, String>>() - val toDeleteRanges = mutableListOf>() + val newRanges = mutableSetOf, String>>() //Triple of (targetSampleName, range, sourceSampleName) + val toDeleteRanges = mutableSetOf>>() //Pair of (targetSampleName, range) //Now that we have this we need to first resize the sourceRangeMap's entry for the indels position val startRangeEntry = targetRangeMap.getEntry(indel.refStart) @@ -219,7 +220,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //for the original entry we can just resize it. //add existing range to delete list - toDeleteRanges.add(startRangeEntry.key) + toDeleteRanges.add(Pair(targetSampleName,startRangeEntry.key)) //create new resized range val resizedRange = Range.closed( startRangeEntry.key.lowerEndpoint(), @@ -227,14 +228,15 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position)) - newRanges.add(Pair(resizedRange, startRangeEntry.value)) + newRanges.add(Triple(targetSampleName,resizedRange, startRangeEntry.value)) + //This makes sure that we don't have any overlapping sites in the output gvcf //Then we need to follow the target Sample's range map and resize/remove any that are overlapping var currentPos = Position(startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position) while(currentPos <= indel.refEnd) { val currentRangeEntry = targetRangeMap.getEntry(currentPos) ?: break //add existing range to delete list - toDeleteRanges.add(currentRangeEntry.key) + toDeleteRanges.add(Pair(targetSampleName, currentRangeEntry.key)) //Check to see if this is the last overlapping range if (currentRangeEntry.key.contains(indel.refEnd) && currentRangeEntry.key.upperEndpoint()!= indel.refEnd) { //Resize this range @@ -242,7 +244,32 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position+1), //Need to shift the position up by 1 currentRangeEntry.key.upperEndpoint() ) - newRanges.add(Pair(resizedLastRange, currentRangeEntry.value)) + newRanges.add(Triple(targetSampleName,resizedLastRange, currentRangeEntry.value)) + break + } else { + //This range is fully contained within the indel, so we skip adding it back in + } + } + + //We also need to adjust the source sample's range map to account for the resized regions otherwise we will + // have overlapping regions in the BED file after its reflipped + //Can use a similar process as above but instead we need to walk through the original recombination map and get the next ranges target then find that range in the target map and add to the delete and resize the add + //resetting our current pos to the indel start + currentPos = Position(startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position) + while(currentPos <= indel.refEnd) { + val currentSourceRangeEntry = sourceRangeMap.getEntry(currentPos) ?: break + //Find the corresponding range in the target map + val currentRangeEntry = resizedMap[currentSourceRangeEntry.value]?.getEntry(currentSourceRangeEntry.key.lowerEndpoint()) ?: break + //add existing range to delete list + toDeleteRanges.add(Pair(currentSourceRangeEntry.value,currentRangeEntry.key)) + //Check to see if this is the last overlapping range + if (currentRangeEntry.key.contains(indel.refEnd) && currentRangeEntry.key.upperEndpoint()!= indel.refEnd) { + //Resize this range + val resizedLastRange = Range.closed( + Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position+1), //Need to shift the position up by 1 + currentRangeEntry.key.upperEndpoint() + ) + newRanges.add(Triple(currentSourceRangeEntry.value,resizedLastRange, currentRangeEntry.value)) break } else { //This range is fully contained within the indel, so we skip adding it back in @@ -250,14 +277,17 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } //Now we can remove the old ranges from the targetRangeMap - for(range in toDeleteRanges) { - targetRangeMap.remove(range) + for((target,range) in toDeleteRanges) { +// targetRangeMap.remove(range) + resizedMap[target]!!.remove(range) } //Now we can add back in the new resized ranges - for((range, sourceSample) in newRanges) { - targetRangeMap.put(range, sourceSample) + for((target, range, sourceSample) in newRanges) { +// targetRangeMap.put(range, sourceSample) + resizedMap[target]!!.put(range, sourceSample) +// targetRangeMap.put(range, sourceSample) } - resizedMap[targetSampleName] = targetRangeMap +// resizedMap[targetSampleName] = targetRangeMap } return resizedMap } diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 84caa6a..45caf9a 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -132,7 +132,98 @@ class RecombineGvcfsTest { val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) val resizedMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path(gvcfFile)) - TODO("Implement test for resizeRecombinationMapsForIndels" ) + //resized map should be the same except for the first range of sampleC which should be resized to 1-11 for target sampleZ + //And the second range of sampleB which should be resized to 12-20 for target sampleZ + //Check the parts that have not changed first + //SampleA should be unchanged + val sampleARangeMap = resizedMap["sampleA"] + assertEquals( + "SampleA should have 3 ranges", + 3, + sampleARangeMap?.asMapOfRanges()?.size + ) + assertEquals("SampleA matches original", recombinationMap["sampleA"], sampleARangeMap) + + //SamplesB and C should have one resized region each + val sampleBRangeMap = resizedMap["sampleB"] + assertEquals( + "SampleB should have 3 ranges", + 3, + sampleBRangeMap?.asMapOfRanges()?.size + ) + val sampleBFirstRange = sampleBRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "SampleB first range should be unchanged", + Range.closed(Position("chr1",1), Position("chr1",10)), + sampleBFirstRange?.key + ) + assertEquals( + "SampleB first range target should be unchanged", + "sampleY", + sampleBFirstRange?.value + ) + val sampleBSecondRange = sampleBRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "SampleB second range should be resized to 12-20", + Range.closed(Position("chr1",12), Position("chr1",20)), + sampleBSecondRange?.key + ) + assertEquals( + "SampleB second range target should be sampleZ", + "sampleZ", + sampleBSecondRange?.value + ) + val sampleBThirdRange = sampleBRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "SampleB third range should be unchanged", + Range.closed(Position("chr1",21), Position("chr1",30)), + sampleBThirdRange?.key + ) + assertEquals( + "SampleB third range target should be unchanged", + "sampleX", + sampleBThirdRange?.value + ) + + val sampleCRangeMap = resizedMap["sampleC"] + assertEquals( + "SampleC should have 3 ranges", + 3, + sampleCRangeMap?.asMapOfRanges()?.size + ) + val sampleCFirstRange = sampleCRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "SampleC first range should be resized to 1-11", + Range.closed(Position("chr1",1), Position("chr1",11)), + sampleCFirstRange?.key + ) + assertEquals( + "SampleC first range target should be sampleZ", + "sampleZ", + sampleCFirstRange?.value + ) + val sampleCSecondRange = sampleCRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "SampleC second range has changed", + Range.closed(Position("chr1",12), Position("chr1",20)), + sampleCSecondRange?.key + ) + assertEquals( + "SampleC second range target should be unchanged", + "sampleX", + sampleCSecondRange?.value + ) + val sampleCThirdRange = sampleCRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "SampleC third range should be unchanged", + Range.closed(Position("chr1",21), Position("chr1",30)), + sampleCThirdRange?.key + ) + assertEquals( + "SampleC third range target should be unchanged", + "sampleY", + sampleCThirdRange?.value + ) } @Test @@ -303,7 +394,7 @@ class RecombineGvcfsTest { val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) val indelsForResizingEmpty = emptyList>() - val nonResizedMap = recombineGvcfs.resizeMaps(indelsForResizingEmpty, flippedMap) + val nonResizedMap = recombineGvcfs.resizeMaps(indelsForResizingEmpty, recombinationMap,flippedMap) //Check that the map is the same as the flipped map assertEquals( "No-resize map should be equal to flipped map", @@ -325,7 +416,7 @@ class RecombineGvcfsTest { ) val indelsForResizing = listOf(indelToResize) - val zIndel1ResizedMap = recombineGvcfs.resizeMaps(indelsForResizing, flippedMap) + val zIndel1ResizedMap = recombineGvcfs.resizeMaps(indelsForResizing, recombinationMap,flippedMap) //Now check that the map has been resized correctly val targetSampleZRangeMap = zIndel1ResizedMap["sampleZ"] From 07a01ecfd76b28cc5270496e10cc4e6f750387dc Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 16 Jan 2026 09:40:09 -0500 Subject: [PATCH 09/14] Fixing End RefBlock issue --- .../maizegenetics/commands/RecombineGvcfs.kt | 1 + .../commands/RecombineGvcfsTest.kt | 61 +++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index 423648b..cea8849 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -451,6 +451,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { .chr(chrom) .start(start.toLong()) .stop(end.toLong()) + .attribute("END", end) .alleles(listOf(refAllele, "")) .genotypes( listOf( diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 45caf9a..a74b22b 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -1,5 +1,6 @@ package net.maizegenetics.commands +import biokotlin.seq.NucSeq import com.google.common.collect.Range import com.google.common.collect.RangeMap import com.google.common.collect.TreeRangeMap @@ -548,6 +549,66 @@ class RecombineGvcfsTest { vcfReader.close() } + + @Test + fun testProcessRefBlockOverlap() { + val recombineGvcfs = RecombineGvcfs() + val outputWriters = recombineGvcfs.buildOutputWriterMap(listOf("sampleX", "sampleY", "sampleZ"), Path(outputDir)) + + //Build a rangeMap -> Target map + val rangeMap = TreeRangeMap.create() + rangeMap.put(Range.closed(Position("chr1", 1), Position("chr1", 10)), "sampleX") + rangeMap.put(Range.closed(Position("chr1",11), Position("chr1", 20)), "sampleY") + rangeMap.put(Range.closed(Position("chr1",21), Position("chr1", 30)), "sampleZ") + + //Build a refBlock from 5-25 + val refBlock = recombineGvcfs.buildRefBlock("chr1", 5, 25, "A", "sampleA") + + val refString = "A".repeat(30) //Length should be 30 + val refSeq = mapOf(Pair("chr1",NucSeq(refString))) + + recombineGvcfs.processRefBlockOverlap(Position("chr1",5),Position("chr1",25),rangeMap,outputWriters, refSeq , refBlock ) + + //Close out the writers + outputWriters.values.forEach { it.close() } + + //Now check that the output files have the correct refBlocks + //We will have 3 output files each with one variant in them. + //sampleX should have a refBlock from 5-10 + val sampleXFile = File("$outputDir/sampleX_recombined.gvcf") + assertEquals("sampleX output file was not created", true, sampleXFile.isFile) + val sampleXReader = VCFFileReader(sampleXFile,false) + val sampleXVariants = sampleXReader.iterator().toList() + assertEquals("sampleX should have 1 variant", 1, sampleXVariants.size) + val sampleXVariant = sampleXVariants[0] + assertEquals("sampleX variant contig does not match", "chr1", sampleXVariant.contig) + assertEquals("sampleX variant start does not match", 5, sampleXVariant.start) + assertEquals("sampleX variant end does not match", 10, sampleXVariant.end) + sampleXReader.close() + //sampleY should have a refBlock from 11-20 + val sampleYFile = File("$outputDir/sampleY_recombined.gvcf") + assertEquals("sampleY output file was not created", true, sampleYFile.isFile) + val sampleYReader = VCFFileReader(sampleYFile,false) + val sampleYVariants = sampleYReader.iterator().toList() + assertEquals("sampleY should have 1 variant", 1, sampleYVariants.size) + val sampleYVariant = sampleYVariants[0] + assertEquals("sampleY variant contig does not match", "chr1", sampleYVariant.contig) + assertEquals("sampleY variant start does not match", 11, sampleYVariant.start) + assertEquals("sampleY variant end does not match", 20, sampleYVariant.end) + sampleYReader.close() + //sampleZ should have a refBlock from 21-25 + val sampleZFile = File("$outputDir/sampleZ_recombined.gvcf") + assertEquals("sampleZ output file was not created", true, sampleZFile.isFile) + val sampleZReader = VCFFileReader(sampleZFile,false) + val sampleZVariants = sampleZReader.iterator().toList() + assertEquals("sampleZ should have 1 variant", 1, sampleZVariants.size) + val sampleZVariant = sampleZVariants[0] + assertEquals("sampleZ variant contig does not match", "chr1", sampleZVariant.contig) + assertEquals("sampleZ variant start does not match", 21, sampleZVariant.start) + assertEquals("sampleZ variant end does not match", 25, sampleZVariant.end) + sampleZReader.close() + } + @Test fun testChangeSampleName() { From 7f5b76d185355c2ecd887add603e40c8ca1046f4 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 16 Jan 2026 11:44:24 -0500 Subject: [PATCH 10/14] Fixing test gvcfs --- data/RecombineGvcfs/gvcf/sampleB.gvcf | 6 +- data/RecombineGvcfs/gvcf/sampleC.gvcf | 8 +- .../maizegenetics/commands/RecombineGvcfs.kt | 2 +- .../commands/RecombineGvcfsTest.kt | 86 +++++++++++++++++++ 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/data/RecombineGvcfs/gvcf/sampleB.gvcf b/data/RecombineGvcfs/gvcf/sampleB.gvcf index cca6660..b409971 100644 --- a/data/RecombineGvcfs/gvcf/sampleB.gvcf +++ b/data/RecombineGvcfs/gvcf/sampleB.gvcf @@ -14,9 +14,9 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleB chr1 1 . A . . END=4 GT 0 -chr1 5 . A T . . GT 1 +chr1 5 . A T . . . GT 1 chr1 6 . A . . END=14 GT 0 -chr1 15 . A T . . GT 1 +chr1 15 . A T . . . GT 1 chr1 16 . A . . GT 0 -chr1 25 . A T . . GT 1 +chr1 25 . A T . . . GT 1 chr1 26 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleC.gvcf b/data/RecombineGvcfs/gvcf/sampleC.gvcf index 3c2ce7c..a276ee1 100644 --- a/data/RecombineGvcfs/gvcf/sampleC.gvcf +++ b/data/RecombineGvcfs/gvcf/sampleC.gvcf @@ -14,8 +14,8 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleC chr1 1 . A . . END=8 GT 0 -chr1 9 . AAA A . . GT 1 +chr1 9 . AAA A . . . GT 1 chr1 12 . A . . END=16 GT 0 -chr1 17 . AAA A . . GT 1 -chr1 20 . A T . . GT 1 -chr1 221 . A . . END=30 GT 0 \ No newline at end of file +chr1 17 . AAA A . . . GT 1 +chr1 20 . A T . . . GT 1 +chr1 21 . A . . END=30 GT 0 \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index cea8849..bf9c48e 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -371,7 +371,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { val newVc = changeSampleName(vc, targetSampleName) outputWriter.add(newVc) } - else if(vc.reference.length() == 1 && vc.alternateAlleles.first().displayString == "NON_REF"){ + else if(vc.reference.length() == 1 && vc.alternateAlleles.first().displayString == ""){ //This is a RefBlock //We need to 'walk through' the refBlock by splitting it up into multiple refBlocks and write out to the correct output writer //Get the current start position diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index a74b22b..df0959d 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -1,6 +1,7 @@ package net.maizegenetics.commands import biokotlin.seq.NucSeq +import biokotlin.seq.NucSeqRecord import com.google.common.collect.Range import com.google.common.collect.RangeMap import com.google.common.collect.TreeRangeMap @@ -550,6 +551,91 @@ class RecombineGvcfsTest { } + @Test + fun testProcessSingleGVCFFile() { + //gvcfReader: VCFReader, + // ranges: RangeMap, + // outputWriters: Map, + // refSeq :Map + + //Lets test SampleC as it has an overlapping index, a standard indel and a SNP so we should hit all the edge cases we handle + val gvcfFile = "./data/RecombineGvcfs/gvcf/sampleC.gvcf" + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + //Need to resize the recombination map first + val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path("./data/RecombineGvcfs/gvcf/")) + val sampleCRanges = resizedRecombinationMap["sampleC"]!! + val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputDir)) + val gvcfReader = VCFFileReader(File(gvcfFile), false) + val refString = "A".repeat(30) //Length should be 30 + val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) + recombineGvcfs.processSingleGVCFFile( + gvcfReader, + sampleCRanges, + outputWriters, + refSeq + ) + //Close out the writers + outputWriters.values.forEach { it.close() } + //Now check that the output files have the correct variants + //We will have 3 output files + //targetSampleZ whould have 2 variants: refBlock from 1-8 and Indel at 9 - 11 + val targetSampleZFile = File("$outputDir/sampleZ_recombined.gvcf") + assertEquals("sampleZ output file was not created", true, targetSampleZFile.isFile) + val targetSampleZReader = VCFFileReader(targetSampleZFile,false) + val targetSampleZVariants = targetSampleZReader.iterator().toList() + assertEquals("sampleZ should have 2 variants", 2, targetSampleZVariants.size) + val firstVariantZ = targetSampleZVariants[0] + assertEquals("sampleZ first variant contig does not match", "chr1", firstVariantZ.contig) + assertEquals("sampleZ first variant start does not match", 1, firstVariantZ.start) + assertEquals("sampleZ first variant end does not match", 8, firstVariantZ.end) + val secondVariantZ = targetSampleZVariants[1] + assertEquals("sampleZ second variant contig does not match", "chr1", secondVariantZ.contig) + assertEquals("sampleZ second variant start does not match", 9, secondVariantZ.start) + assertEquals("sampleZ second variant end does not match", 11, secondVariantZ.end) + //check ref and alt alleles for the indel + assertEquals("sampleZ second variant ref allele does not match", "AAA", secondVariantZ.reference.baseString) + assertEquals("sampleZ second variant alt allele does not match", "A", secondVariantZ.alternateAlleles[0].baseString) + targetSampleZReader.close() + //targetSampleX should have 3 variant: Deletion at 17-19 and SNP at 20 and a refBlock from 12-16 + val targetSampleXFile = File("$outputDir/sampleX_recombined.gvcf") + assertEquals("sampleX output file was not created", true, targetSampleXFile.isFile) + val targetSampleXReader = VCFFileReader(targetSampleXFile,false) + val targetSampleXVariants = targetSampleXReader.iterator().toList() + assertEquals("sampleX should have 3 variants", 3, targetSampleXVariants.size) + val firstVariantX = targetSampleXVariants[0] + assertEquals("sampleX first variant contig does not match", "chr1", firstVariantX.contig) + assertEquals("sampleX first variant start does not match", 12, firstVariantX.start) + assertEquals("sampleX first variant end does not match", 16, firstVariantX.end) + val secondVariantX = targetSampleXVariants[1] + assertEquals("sampleX second variant contig does not match", "chr1", secondVariantX.contig) + assertEquals("sampleX second variant start does not match", 17, secondVariantX.start) + assertEquals("sampleX second variant end does not match", 19, secondVariantX.end) + //check ref and alt alleles for the deletion + assertEquals("sampleX second variant ref allele does not match", "AAA", secondVariantX.reference.baseString) + assertEquals("sampleX second variant alt allele does not match", "A", secondVariantX.alternateAlleles[0].baseString) + val thirdVariantX = targetSampleXVariants[2] + assertEquals("sampleX third variant contig does not match", "chr1", thirdVariantX.contig) + assertEquals("sampleX third variant start does not match", 20, thirdVariantX.start) + assertEquals("sampleX third variant end does not match", 20, thirdVariantX.end) + //check ref and alt alleles for the SNP + assertEquals("sampleX third variant ref allele does not match", "A", thirdVariantX.reference.baseString) + assertEquals("sampleX third variant alt allele does not match", "T", thirdVariantX.alternateAlleles[0].baseString) + targetSampleXReader.close() + //targetSampleY should have 1 variant: refBlock from 21-30 + val targetSampleYFile = File("$outputDir/sampleY_recombined.gvcf") + assertEquals("sampleY output file was not created", true, targetSampleYFile.isFile) + val targetSampleYReader = VCFFileReader(targetSampleYFile,false) + val targetSampleYVariants = targetSampleYReader.iterator().toList() + assertEquals("sampleY should have 1 variant", 1, targetSampleYVariants.size) + val firstVariantY = targetSampleYVariants[0] + assertEquals("sampleY variant contig does not match", "chr1", firstVariantY.contig) + assertEquals("sampleY variant start does not match", 21, firstVariantY.start) + assertEquals("sampleY variant end does not match", 30, firstVariantY.end) + targetSampleYReader.close() + } + @Test fun testProcessRefBlockOverlap() { val recombineGvcfs = RecombineGvcfs() From 7eeaefc8c1e16de6cb50cb8be47bdb4d6bc50e27 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 16 Jan 2026 15:22:16 -0500 Subject: [PATCH 11/14] Fixing bugs related to gaps within range. --- data/RecombineGvcfs/gvcf/sampleB.gvcf | 2 +- .../maizegenetics/commands/RecombineGvcfs.kt | 31 ++-- .../commands/RecombineGvcfsTest.kt | 154 +++++++++++++++++- 3 files changed, 171 insertions(+), 16 deletions(-) diff --git a/data/RecombineGvcfs/gvcf/sampleB.gvcf b/data/RecombineGvcfs/gvcf/sampleB.gvcf index b409971..24a3ca5 100644 --- a/data/RecombineGvcfs/gvcf/sampleB.gvcf +++ b/data/RecombineGvcfs/gvcf/sampleB.gvcf @@ -17,6 +17,6 @@ chr1 1 . A . . END=4 GT 0 chr1 5 . A T . . . GT 1 chr1 6 . A . . END=14 GT 0 chr1 15 . A T . . . GT 1 -chr1 16 . A . . GT 0 +chr1 16 . A . . END=24 GT 0 chr1 25 . A T . . . GT 1 chr1 26 . A . . END=30 GT 0 \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index bf9c48e..0c09d2f 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -278,16 +278,12 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { //Now we can remove the old ranges from the targetRangeMap for((target,range) in toDeleteRanges) { -// targetRangeMap.remove(range) resizedMap[target]!!.remove(range) } //Now we can add back in the new resized ranges for((target, range, sourceSample) in newRanges) { -// targetRangeMap.put(range, sourceSample) resizedMap[target]!!.put(range, sourceSample) -// targetRangeMap.put(range, sourceSample) } -// resizedMap[targetSampleName] = targetRangeMap } return resizedMap } @@ -332,8 +328,15 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { outputWriters: Map, refSeq :Map ) { + val pattern = Regex("""^(.+?)\.g(?:\.?vcf|vcs)(?:\.gz)?$""") + inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> - val sampleName = gvcfFile.name.substringBeforeLast(".g.vcf") + val match = pattern.matchEntire(gvcfFile.name) + if (match == null) { + println("Skipping file ${gvcfFile.name} as it does not match expected GVCF naming pattern.") + return@forEach + } + val sampleName = match.groupValues[1] val ranges = recombinationMap[sampleName] ?: return@forEach VCFFileReader(gvcfFile, false).use { gvcfReader -> @@ -395,12 +398,19 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { refSeq: Map, vc: VariantContext ){ + val subRanges = ranges.subRangeMap(Range.closed(startPos, endPos)) var currentStartPos = startPos - while (currentStartPos <= endPos) { - //Get the target sample name for the current start position - val rangesEntry = ranges.getEntry(currentStartPos) ?: break - val range = rangesEntry.key - val targetSampleName = rangesEntry.value + var currentRangeIdx = 0 + val rangeEntries = subRanges.asMapOfRanges().entries.toList() + while (currentStartPos <= endPos && currentRangeIdx < rangeEntries.size) { + val rangeEntry = rangeEntries[currentRangeIdx] + val range = rangeEntry.key + val targetSampleName = rangeEntry.value + if(currentStartPos , - // outputWriters: Map, - // refSeq :Map - //Lets test SampleC as it has an overlapping index, a standard indel and a SNP so we should hit all the edge cases we handle val gvcfFile = "./data/RecombineGvcfs/gvcf/sampleC.gvcf" +// val gvcfFile = "./data/RecombineGvcfs/gvcf/sampleB.gvcf" val bedDir = "./data/RecombineGvcfs/bed/" val recombineGvcfs = RecombineGvcfs() val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) //Need to resize the recombination map first val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path("./data/RecombineGvcfs/gvcf/")) val sampleCRanges = resizedRecombinationMap["sampleC"]!! +// val sampleCRanges = resizedRecombinationMap["sampleB"]!! val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputDir)) val gvcfReader = VCFFileReader(File(gvcfFile), false) val refString = "A".repeat(30) //Length should be 30 From e1ada2e64407970bf30cd6fd1db37bfca089d23b Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 16 Jan 2026 15:27:02 -0500 Subject: [PATCH 12/14] Finishing up unit tests --- data/RecombineGvcfs/ref.fa | 1 + .../commands/RecombineGvcfsTest.kt | 63 +++++++++++++------ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/data/RecombineGvcfs/ref.fa b/data/RecombineGvcfs/ref.fa index 08b6f77..bbcf083 100644 --- a/data/RecombineGvcfs/ref.fa +++ b/data/RecombineGvcfs/ref.fa @@ -1 +1,2 @@ +>chr1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index cf33e63..0c477a7 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -26,18 +26,45 @@ class RecombineGvcfsTest { val homeDir = System.getProperty("user.home").replace('\\', '/') - val outputDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/" + val outputGvcfDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/gvcf/" + val outputBedDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/bed/" @BeforeTest fun setupTest() { // Runs before each test in common code - File(outputDir).mkdirs() + File(outputGvcfDir).mkdirs() + File(outputBedDir).mkdirs() } @AfterTest fun teardownTest() { // Runs after each test in common code - File(outputDir).deleteRecursively() + File(outputGvcfDir).deleteRecursively() + File(outputBedDir).deleteRecursively() + } + + @Test + fun testRecombineGvcfs() { + //recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, refFile: Path, outputDir: Path, outputBedDir: Path) + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + val refFile = "./data/RecombineGvcfs/ref.fa" + val recombineGvcfs = RecombineGvcfs() + recombineGvcfs.recombineGvcfs( + Path(bedDir), + Path(gvcfFile), + Path(refFile), + Path(outputGvcfDir), + Path(outputBedDir) + ) + //Check that the output files were created + val outputFiles = File(outputGvcfDir).listFiles().filter { it.name.endsWith(".gvcf") } + assertEquals( + "There should be 3 output gvcf files", + 3, + outputFiles.size + ) + checkAllResults(outputGvcfDir) } @Test @@ -470,15 +497,15 @@ class RecombineGvcfsTest { val recombineGvcfs = RecombineGvcfs() - recombineGvcfs.writeResizedBedFiles(recombinationMap, Path(outputDir)) + recombineGvcfs.writeResizedBedFiles(recombinationMap, Path(outputGvcfDir)) //Check that the files were created and have the correct content - val targetSampleABedFile = File("$outputDir/Sample1_resized.bed") + val targetSampleABedFile = File("$outputGvcfDir/Sample1_resized.bed") assertEquals("TargetSampleA.bed file was not created", true, targetSampleABedFile.isFile) val targetSampleAContent = targetSampleABedFile.readText().trim() val expectedTargetSampleAContent = "chr1\t99\t200\tTargetSampleA\nchr1\t200\t300\tTargetSampleB" assertEquals("TargetSampleA.bed content is incorrect", expectedTargetSampleAContent, targetSampleAContent) - val targetSampleBBedFile = File("$outputDir/Sample2_resized.bed") + val targetSampleBBedFile = File("$outputGvcfDir/Sample2_resized.bed") assertEquals("TargetSampleB.bed file was not created", true, targetSampleBBedFile.isFile) val targetSampleBContent = targetSampleBBedFile.readText().trim() val expectedTargetSampleBContent = "chr1\t99\t200\tTargetSampleB\nchr1\t200\t300\tTargetSampleA" @@ -504,7 +531,7 @@ class RecombineGvcfsTest { fun testBuildOutputWriterMap() { val sampleNames = listOf("Sample1", "Sample2", "Sample3") val recombineGvcfs = RecombineGvcfs() - val writerMap = recombineGvcfs.buildOutputWriterMap(sampleNames, Path(outputDir)) + val writerMap = recombineGvcfs.buildOutputWriterMap(sampleNames, Path(outputGvcfDir)) //Check that the map has the correct number of writers assertEquals( "Writer map should have 3 writers", @@ -528,7 +555,7 @@ class RecombineGvcfsTest { //close out the writers writerMap[sampleName]?.close() //Open up the file and check that the sample name is right - val outputFile = File("$outputDir/${sampleName}_recombined.gvcf") + val outputFile = File("$outputGvcfDir/${sampleName}_recombined.gvcf") assertEquals( "Output file for $sampleName was not created", true, @@ -558,7 +585,7 @@ class RecombineGvcfsTest { val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) //Need to resize the recombination map first val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path(gvcfDir)) - val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputDir)) + val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputGvcfDir)) val refString = "A".repeat(30) //Length should be 30 val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) recombineGvcfs.processGvcfsAndWrite( @@ -570,7 +597,7 @@ class RecombineGvcfsTest { //Close out the writers outputWriters.values.forEach { it.close() } //Now check that the output files were created - checkAllResults(outputDir) + checkAllResults(outputGvcfDir) } fun checkAllResults(outputDir: String) { @@ -710,7 +737,7 @@ class RecombineGvcfsTest { val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path("./data/RecombineGvcfs/gvcf/")) val sampleCRanges = resizedRecombinationMap["sampleC"]!! // val sampleCRanges = resizedRecombinationMap["sampleB"]!! - val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputDir)) + val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputGvcfDir)) val gvcfReader = VCFFileReader(File(gvcfFile), false) val refString = "A".repeat(30) //Length should be 30 val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) @@ -725,7 +752,7 @@ class RecombineGvcfsTest { //Now check that the output files have the correct variants //We will have 3 output files //targetSampleZ whould have 2 variants: refBlock from 1-8 and Indel at 9 - 11 - val targetSampleZFile = File("$outputDir/sampleZ_recombined.gvcf") + val targetSampleZFile = File("$outputGvcfDir/sampleZ_recombined.gvcf") assertEquals("sampleZ output file was not created", true, targetSampleZFile.isFile) val targetSampleZReader = VCFFileReader(targetSampleZFile,false) val targetSampleZVariants = targetSampleZReader.iterator().toList() @@ -743,7 +770,7 @@ class RecombineGvcfsTest { assertEquals("sampleZ second variant alt allele does not match", "A", secondVariantZ.alternateAlleles[0].baseString) targetSampleZReader.close() //targetSampleX should have 3 variant: Deletion at 17-19 and SNP at 20 and a refBlock from 12-16 - val targetSampleXFile = File("$outputDir/sampleX_recombined.gvcf") + val targetSampleXFile = File("$outputGvcfDir/sampleX_recombined.gvcf") assertEquals("sampleX output file was not created", true, targetSampleXFile.isFile) val targetSampleXReader = VCFFileReader(targetSampleXFile,false) val targetSampleXVariants = targetSampleXReader.iterator().toList() @@ -768,7 +795,7 @@ class RecombineGvcfsTest { assertEquals("sampleX third variant alt allele does not match", "T", thirdVariantX.alternateAlleles[0].baseString) targetSampleXReader.close() //targetSampleY should have 1 variant: refBlock from 21-30 - val targetSampleYFile = File("$outputDir/sampleY_recombined.gvcf") + val targetSampleYFile = File("$outputGvcfDir/sampleY_recombined.gvcf") assertEquals("sampleY output file was not created", true, targetSampleYFile.isFile) val targetSampleYReader = VCFFileReader(targetSampleYFile,false) val targetSampleYVariants = targetSampleYReader.iterator().toList() @@ -783,7 +810,7 @@ class RecombineGvcfsTest { @Test fun testProcessRefBlockOverlap() { val recombineGvcfs = RecombineGvcfs() - val outputWriters = recombineGvcfs.buildOutputWriterMap(listOf("sampleX", "sampleY", "sampleZ"), Path(outputDir)) + val outputWriters = recombineGvcfs.buildOutputWriterMap(listOf("sampleX", "sampleY", "sampleZ"), Path(outputGvcfDir)) //Build a rangeMap -> Target map val rangeMap = TreeRangeMap.create() @@ -805,7 +832,7 @@ class RecombineGvcfsTest { //Now check that the output files have the correct refBlocks //We will have 3 output files each with one variant in them. //sampleX should have a refBlock from 5-10 - val sampleXFile = File("$outputDir/sampleX_recombined.gvcf") + val sampleXFile = File("$outputGvcfDir/sampleX_recombined.gvcf") assertEquals("sampleX output file was not created", true, sampleXFile.isFile) val sampleXReader = VCFFileReader(sampleXFile,false) val sampleXVariants = sampleXReader.iterator().toList() @@ -816,7 +843,7 @@ class RecombineGvcfsTest { assertEquals("sampleX variant end does not match", 10, sampleXVariant.end) sampleXReader.close() //sampleY should have a refBlock from 11-20 - val sampleYFile = File("$outputDir/sampleY_recombined.gvcf") + val sampleYFile = File("$outputGvcfDir/sampleY_recombined.gvcf") assertEquals("sampleY output file was not created", true, sampleYFile.isFile) val sampleYReader = VCFFileReader(sampleYFile,false) val sampleYVariants = sampleYReader.iterator().toList() @@ -827,7 +854,7 @@ class RecombineGvcfsTest { assertEquals("sampleY variant end does not match", 20, sampleYVariant.end) sampleYReader.close() //sampleZ should have a refBlock from 21-25 - val sampleZFile = File("$outputDir/sampleZ_recombined.gvcf") + val sampleZFile = File("$outputGvcfDir/sampleZ_recombined.gvcf") assertEquals("sampleZ output file was not created", true, sampleZFile.isFile) val sampleZReader = VCFFileReader(sampleZFile,false) val sampleZVariants = sampleZReader.iterator().toList() From 7ef28b046d91019abbd1ef5829f3f150b49e8ff3 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Tue, 3 Feb 2026 14:13:27 -0500 Subject: [PATCH 13/14] Updating version --- build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle.kts b/build.gradle.kts index 5d32bf3..5b0ddbf 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.7" +version = "0.2.8" repositories { mavenCentral() From a77117b9b1ba299254fc61a27cf3494df87be16a Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 6 Feb 2026 13:45:28 -0500 Subject: [PATCH 14/14] Updating with review comments. --- src/main/kotlin/Main.kt | 6 +++++- .../net/maizegenetics/commands/MutateAssemblies.kt | 2 +- .../net/maizegenetics/commands/RecombineGvcfs.kt | 12 ++++++++---- .../maizegenetics/commands/MutateAssembliesTest.kt | 1 - .../net/maizegenetics/commands/RecombineGvcfsTest.kt | 3 +-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/main/kotlin/Main.kt b/src/main/kotlin/Main.kt index c3aa04a..70e27a6 100644 --- a/src/main/kotlin/Main.kt +++ b/src/main/kotlin/Main.kt @@ -20,6 +20,8 @@ import net.maizegenetics.commands.PickCrossovers import net.maizegenetics.commands.RopeBwtChrIndex import net.maizegenetics.commands.RopeBwtMem import net.maizegenetics.commands.SetupEnvironment +import net.maizegenetics.commands.MutateAssemblies +import net.maizegenetics.commands.RecombineGvcfs class SeqSim : CliktCommand() { override fun run() = Unit @@ -43,6 +45,8 @@ fun main(args: Array) = SeqSim() RopeBwtMem(), BuildSplineKnots(), ConvertRopebwt2Ps4g(), - ExtractChromIds() + ExtractChromIds(), + MutateAssemblies(), + RecombineGvcfs() ) .main(args) diff --git a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt index 50a9790..a8da658 100644 --- a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt @@ -1,4 +1,4 @@ -package net.maizegenetics.net.maizegenetics.commands +package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt index 0c09d2f..e93d85a 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -1,4 +1,4 @@ -package net.maizegenetics.net.maizegenetics.commands +package net.maizegenetics.commands import biokotlin.seq.NucSeq import biokotlin.seq.NucSeqRecord @@ -25,6 +25,7 @@ import net.maizegenetics.utils.SimpleVariant import net.maizegenetics.utils.VariantContextUtils import java.io.File import java.nio.file.Path +import kotlin.collections.iterator class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { @@ -49,6 +50,8 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { .required() + val pattern = Regex("""^(.+?)\.g(?:\.?vcf)(?:\.gz)?$""") + override fun run() { // Implementation goes here recombineGvcfs(inputBedDir, inputGvcfDir, refFile, outputDir, outputBedDir) @@ -128,7 +131,9 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { } fun findAllOverlappingIndels(recombinationMap: Map>, gvcfDir: Path): List> { - return gvcfDir.toFile().listFiles()?.flatMap { gvcfFile -> + return gvcfDir.toFile().listFiles()?.filter{ + pattern.matches(it.name) + }?.flatMap { gvcfFile -> val sampleName = gvcfFile.name.replace(".g.vcf","").replace(".gvcf","").replace(".gz","") val ranges = @@ -328,7 +333,6 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { outputWriters: Map, refSeq :Map ) { - val pattern = Regex("""^(.+?)\.g(?:\.?vcf|vcs)(?:\.gz)?$""") inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> val match = pattern.matchEntire(gvcfFile.name) @@ -395,7 +399,7 @@ class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { endPos: Position, ranges: RangeMap, outputWriters: Map, - refSeq: Map, + refSeq: Map, vc: VariantContext ){ val subRanges = ranges.subRangeMap(Range.closed(startPos, endPos)) diff --git a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt index 7af992e..9da0993 100644 --- a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt @@ -6,7 +6,6 @@ import com.google.common.collect.TreeRangeMap import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.vcf.VCFFileReader -import net.maizegenetics.net.maizegenetics.commands.MutateAssemblies import net.maizegenetics.utils.Position import net.maizegenetics.utils.SimpleVariant import net.maizegenetics.utils.VariantContextUtils diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt index 0c477a7..b2dc329 100644 --- a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -10,7 +10,6 @@ import htsjdk.variant.variantcontext.GenotypeBuilder import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.VariantContextWriter import htsjdk.variant.vcf.VCFFileReader -import net.maizegenetics.net.maizegenetics.commands.RecombineGvcfs import net.maizegenetics.utils.Position import net.maizegenetics.utils.SimpleVariant import java.io.File @@ -822,7 +821,7 @@ class RecombineGvcfsTest { val refBlock = recombineGvcfs.buildRefBlock("chr1", 5, 25, "A", "sampleA") val refString = "A".repeat(30) //Length should be 30 - val refSeq = mapOf(Pair("chr1",NucSeq(refString))) + val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) recombineGvcfs.processRefBlockOverlap(Position("chr1",5),Position("chr1",25),rangeMap,outputWriters, refSeq , refBlock )