diff --git a/build.gradle.kts b/build.gradle.kts index 5d32bf3..5b0ddbf 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.7" +version = "0.2.8" repositories { mavenCentral() diff --git a/data/RecombineGvcfs/bed/sampleA.bed b/data/RecombineGvcfs/bed/sampleA.bed new file mode 100644 index 0000000..727c4ec --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleA.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleX +chr1 10 20 sampleY +chr1 20 30 sampleZ \ No newline at end of file diff --git a/data/RecombineGvcfs/bed/sampleB.bed b/data/RecombineGvcfs/bed/sampleB.bed new file mode 100644 index 0000000..d18fd5a --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleB.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleY +chr1 10 20 sampleZ +chr1 20 30 sampleX \ No newline at end of file diff --git a/data/RecombineGvcfs/bed/sampleC.bed b/data/RecombineGvcfs/bed/sampleC.bed new file mode 100644 index 0000000..219ca9e --- /dev/null +++ b/data/RecombineGvcfs/bed/sampleC.bed @@ -0,0 +1,3 @@ +chr1 0 10 sampleZ +chr1 10 20 sampleX +chr1 20 30 sampleY \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleA.gvcf b/data/RecombineGvcfs/gvcf/sampleA.gvcf new file mode 100644 index 0000000..14dfdac --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleA.gvcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA +chr1 1 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleB.gvcf b/data/RecombineGvcfs/gvcf/sampleB.gvcf new file mode 100644 index 0000000..24a3ca5 --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleB.gvcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleB +chr1 1 . A . . END=4 GT 0 +chr1 5 . A T . . . GT 1 +chr1 6 . A . . END=14 GT 0 +chr1 15 . A T . . . GT 1 +chr1 16 . A . . END=24 GT 0 +chr1 25 . A T . . . GT 1 +chr1 26 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/gvcf/sampleC.gvcf b/data/RecombineGvcfs/gvcf/sampleC.gvcf new file mode 100644 index 0000000..a276ee1 --- /dev/null +++ b/data/RecombineGvcfs/gvcf/sampleC.gvcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleC +chr1 1 . A . . END=8 GT 0 +chr1 9 . AAA A . . . GT 1 +chr1 12 . A . . END=16 GT 0 +chr1 17 . AAA A . . . GT 1 +chr1 20 . A T . . . GT 1 +chr1 21 . A . . END=30 GT 0 \ No newline at end of file diff --git a/data/RecombineGvcfs/ref.fa b/data/RecombineGvcfs/ref.fa new file mode 100644 index 0000000..bbcf083 --- /dev/null +++ b/data/RecombineGvcfs/ref.fa @@ -0,0 +1,2 @@ +>chr1 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/src/main/kotlin/Main.kt b/src/main/kotlin/Main.kt index c3aa04a..70e27a6 100644 --- a/src/main/kotlin/Main.kt +++ b/src/main/kotlin/Main.kt @@ -20,6 +20,8 @@ import net.maizegenetics.commands.PickCrossovers import net.maizegenetics.commands.RopeBwtChrIndex import net.maizegenetics.commands.RopeBwtMem import net.maizegenetics.commands.SetupEnvironment +import net.maizegenetics.commands.MutateAssemblies +import net.maizegenetics.commands.RecombineGvcfs class SeqSim : CliktCommand() { override fun run() = Unit @@ -43,6 +45,8 @@ fun main(args: Array) = SeqSim() RopeBwtMem(), BuildSplineKnots(), ConvertRopebwt2Ps4g(), - ExtractChromIds() + ExtractChromIds(), + MutateAssemblies(), + RecombineGvcfs() ) .main(args) diff --git a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt index 2230077..a8da658 100644 --- a/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt @@ -1,4 +1,4 @@ -package net.maizegenetics.net.maizegenetics.commands +package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option @@ -14,44 +14,15 @@ import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.Options import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder import htsjdk.variant.vcf.VCFFileReader -import htsjdk.variant.vcf.VCFFormatHeaderLine -import htsjdk.variant.vcf.VCFHeader -import htsjdk.variant.vcf.VCFHeaderLine -import htsjdk.variant.vcf.VCFHeaderLineCount -import htsjdk.variant.vcf.VCFHeaderLineType -import htsjdk.variant.vcf.VCFInfoHeaderLine +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils import java.io.File -import java.util.HashSet import kotlin.io.path.createDirectories import kotlin.io.path.exists -data class Position(val contig: String, val position: Int) : Comparable { - override fun compareTo(other: Position): Int { - if (this.contig == other.contig) { - return this.position.compareTo(other.position) - } - - val thisContig = this.contig.replace("chr", "", ignoreCase = true).trim() - val otherContig = other.contig.replace("chr", "", ignoreCase = true).trim() - - return try { - thisContig.toInt() - otherContig.toInt() - } catch (e: NumberFormatException) { - // If we can't convert contigs to an int, then compare the strings - contig.compareTo(other.contig) - } - - } - - override fun toString(): String { - return "$contig:$position" - } -} -data class SimpleVariant(val refStart: Position, val refEnd: Position, - val refAllele: String, val altAllele: String, - val isAddedMutation: Boolean = false) @OptIn(kotlin.io.path.ExperimentalPathApi::class) class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { @@ -156,7 +127,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { return } - if (isIndel(currentSimpleVariant) && isIndel(overlappingVariant)) { + if (VariantContextUtils.isIndel(currentSimpleVariant) && VariantContextUtils.isIndel(overlappingVariant)) { //skip as it will be tricky/slow to handle return //TODO figure out a way to handle this well } @@ -260,7 +231,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) .build().use { writer -> - writer.writeHeader(createGenericHeader(listOf(sampleName), emptySet())) + writer.writeHeader(VariantContextUtils.createGenericHeader(listOf(sampleName), emptySet())) val sortedVariants = baseVariantMap.asMapOfRanges().toList().sortedBy { it.first.lowerEndpoint() } @@ -294,40 +265,6 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") { } } - fun createGenericHeader(taxaNames: List, altLines:Set): VCFHeader { - val headerLines = createGenericHeaderLineSet() as MutableSet - headerLines.addAll(altLines) - return VCFHeader(headerLines, taxaNames) - } - - fun createGenericHeaderLineSet(): Set { - val headerLines: MutableSet = HashSet() - headerLines.add(VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")) - headerLines.add( - VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)") - ) - headerLines.add(VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")) - headerLines.add(VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")) - headerLines.add( - VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") - ) - headerLines.add(VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth")) - headerLines.add(VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data")) - headerLines.add(VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency")) - headerLines.add(VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")) - // These last 4 are needed for assembly g/hvcfs, but not for reference. I am keeping them in as header - // lines but they will only be added to the data lines if they are present in the VariantContext. - headerLines.add(VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome")) - headerLines.add(VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position")) - headerLines.add(VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position")) - headerLines.add(VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand")) - return headerLines - } - fun isIndel(variant: SimpleVariant?): Boolean { - if(variant == null) return false - return !((variant.refAllele.length == 1 && variant.altAllele.length == 1) || //SNP - (variant.refAllele.length == 1 && variant.altAllele == "")) //RefBlock - } } \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt new file mode 100644 index 0000000..e93d85a --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/RecombineGvcfs.kt @@ -0,0 +1,478 @@ +package net.maizegenetics.commands + +import biokotlin.seq.NucSeq +import biokotlin.seq.NucSeqRecord +import biokotlin.seqIO.NucSeqIO +import biokotlin.util.bufferedReader +import com.github.ajalt.clikt.core.CliktCommand +import com.github.ajalt.clikt.parameters.options.option +import com.github.ajalt.clikt.parameters.options.required +import com.github.ajalt.clikt.parameters.types.path +import com.google.common.collect.Range +import com.google.common.collect.RangeMap +import com.google.common.collect.TreeRangeMap +import htsjdk.variant.variantcontext.Allele +import htsjdk.variant.variantcontext.GenotypeBuilder +import htsjdk.variant.variantcontext.VariantContext +import htsjdk.variant.variantcontext.VariantContextBuilder +import htsjdk.variant.variantcontext.writer.Options +import htsjdk.variant.variantcontext.writer.VariantContextWriter +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder +import htsjdk.variant.vcf.VCFFileReader +import htsjdk.variant.vcf.VCFReader +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils +import java.io.File +import java.nio.file.Path +import kotlin.collections.iterator + +class RecombineGvcfs : CliktCommand(name = "recombine-gvcfs") { + + private val inputBedDir by option(help = "Input Bed dir") + .path(canBeFile = false, canBeDir = true) + .required() + + private val inputGvcfDir by option(help = "Input GVCF dir") + .path(canBeFile = false, canBeDir = true) + .required() + + private val outputDir by option(help = "Output dir") + .path(canBeFile = false, canBeDir = true) + .required() + + private val refFile by option(help = "Ref file") + .path(canBeFile = true, canBeDir = false) + .required() + + private val outputBedDir by option(help = "Output Bed dir") + .path(canBeFile = false, canBeDir = true) + .required() + + + val pattern = Regex("""^(.+?)\.g(?:\.?vcf)(?:\.gz)?$""") + + override fun run() { + // Implementation goes here + recombineGvcfs(inputBedDir, inputGvcfDir, refFile, outputDir, outputBedDir) + } + + fun recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, refFile: Path, outputDir: Path, outputBedDir: Path) { + println("Loading in the reference Genome from $refFile") + val refSeq = NucSeqIO(refFile.toFile().path).readAll() + + // Placeholder for the actual recombination logic + println("Recombining GVCFs from $inputGvcfDir using BED files from $inputBedDir into $outputDir") + + //Build BedFile Map + val (recombinationMapBeforeResize, sampleNames) = buildRecombinationMap(inputBedDir) + + println("Resizing recombination maps for large indels from the GVCF files") + val recombinationMap = resizeRecombinationMapsForIndels(recombinationMapBeforeResize, inputGvcfDir) + println("Writing out the new BED files to $outputBedDir") + writeResizedBedFiles(recombinationMap, outputBedDir) + + //Build Output writers for each sample name + val outputWriters = buildOutputWriterMap(sampleNames, outputDir) + //Process GVCFs and write out recombined files + processGvcfsAndWrite(recombinationMap, inputGvcfDir, outputWriters, refSeq) + //Close the GVCF writers + outputWriters.values.forEach { it.close() } + + //Sort the gvcfs + } + + fun buildRecombinationMap(inputBedDir: Path): Pair>, List> { + //loop through each file in the inputBedDir + val recombinationMap = mutableMapOf>() + val targetNames = mutableSetOf() + inputBedDir.toFile().listFiles()?.forEach { bedFile -> + val bedFileSampleName = bedFile.name.substringBeforeLast("_").replace(".bed","") + bedFile.forEachLine { line -> + val parts = line.split("\t") + if (parts.size >= 4) { + val chrom = parts[0] + val start = parts[1].toInt() + 1 //BED is 0 based, VCF is 1 based + val end = parts[2].toInt() + val targetSampleName = parts[3] + + val range = Range.closed(Position(chrom, start), Position(chrom, end)) + recombinationMap.computeIfAbsent(bedFileSampleName) { TreeRangeMap.create() }.put(range, targetSampleName) + targetNames.add(targetSampleName) + } + } + } + return Pair(recombinationMap, targetNames.toList()) + } + + + fun resizeRecombinationMapsForIndels(recombinationMap: Map>, gvcfDir: Path): Map> { + println("Collecting indels that require resizing of recombination ranges") + + + //loop through the gvcf files and process them to find indels that will require resizing + //Do this quickly by just parsing the file like normal and not use htsjdk + val indelsForResizing = findAllOverlappingIndels(recombinationMap, gvcfDir) + + if(indelsForResizing.isEmpty()) { + println("No overlapping indels found that require resizing of recombination ranges") + return recombinationMap + } + + //Need to flip the region map so we can follow the target sample names when we have an overlapping indel + val flippedRecombinationMap = flipRecombinationMap(recombinationMap) + + //Now we can loop through the indels and resize the original ranges + val resizedMap = resizeMaps(indelsForResizing, recombinationMap,flippedRecombinationMap) + + //reflip the maps + return flipRecombinationMap(resizedMap) + + } + + fun findAllOverlappingIndels(recombinationMap: Map>, gvcfDir: Path): List> { + return gvcfDir.toFile().listFiles()?.filter{ + pattern.matches(it.name) + }?.flatMap { gvcfFile -> + val sampleName = gvcfFile.name.replace(".g.vcf","").replace(".gvcf","").replace(".gz","") + + val ranges = + recombinationMap[sampleName] ?: return@flatMap emptyList>() + + findOverlappingIndelsInGvcf(sampleName, gvcfFile, ranges) + }?: emptyList() + } + + fun findOverlappingIndelsInGvcf(sampleName: String, gvcfFile: File, ranges: RangeMap): List> { + val reader = bufferedReader(gvcfFile.absolutePath) + + var currentLine = reader.readLine() + val overlappingIndels = mutableListOf>() + while(currentLine != null) { + if(currentLine.startsWith("#")) { + currentLine = reader.readLine() + continue + } + val parts = currentLine.split("\t") + val chrom = parts[0] + val pos = parts[1].toInt() + val ref = parts[3] + val alt = parts[4] + + val startPos = Position(chrom, pos) + val endPos = Position(chrom, pos + ref.length - 1) + + //check if its an indel first + if(ref.length != alt.length && alt != "") { + //It's an indel + //Now check to see if it overlaps more than one range. This can be done by checking the ranges coming out of the start and end positions + val startRange = ranges.getEntry(startPos) + val endRange = ranges.getEntry(endPos) + if(startRange != null && endRange != null && startRange != endRange) { + //It overlaps more than one range + val simpleVariant = SimpleVariant(startPos, endPos, ref, alt) + println("Found overlapping indel at $chrom:$pos $ref->$alt") + overlappingIndels.add(Triple(sampleName, startRange.value, simpleVariant)) + } + } + currentLine = reader.readLine() + } + reader.close() + return overlappingIndels + } + + fun flipRecombinationMap(recombinationMap: Map>): Map> { + val flippedMap = mutableMapOf>() + + for((sampleName, rangeMap) in recombinationMap) { + for(entry in rangeMap.asMapOfRanges().entries) { + val range = entry.key + val targetSampleName = entry.value + + flippedMap.computeIfAbsent(targetSampleName) { TreeRangeMap.create() }.put(range, sampleName) + } + } + + return flippedMap + } + + + + fun resizeMaps( + indelsForResizing: List>, + originalRecombinationMap: Map>, + flippedRecombinationMap: Map> + ): Map> { + + //work completely with flippedRecombinationMap then we need to flip back at the end + val resizedMap = flippedRecombinationMap.toMutableMap() + //Walk through the indelsForResizing list + for((sourceSampleName, targetSampleName, indel) in indelsForResizing) { + //Get the source and target range maps + + val targetRangeMap = resizedMap[targetSampleName] ?: continue + val sourceRangeMap = originalRecombinationMap[sourceSampleName] ?: continue + + //this list will hold all of the ranges and their corresponding source sample names that need to be added back in after resizing + val newRanges = mutableSetOf, String>>() //Triple of (targetSampleName, range, sourceSampleName) + val toDeleteRanges = mutableSetOf>>() //Pair of (targetSampleName, range) + + //Now that we have this we need to first resize the sourceRangeMap's entry for the indels position + val startRangeEntry = targetRangeMap.getEntry(indel.refStart) + + //Double check to make sure that we need to resize aka check that the indel's end is outside of the start range + if(startRangeEntry == null || startRangeEntry.key.contains(indel.refEnd)) continue + + //for the original entry we can just resize it. + //add existing range to delete list + toDeleteRanges.add(Pair(targetSampleName,startRangeEntry.key)) + //create new resized range + val resizedRange = Range.closed( + startRangeEntry.key.lowerEndpoint(), + Position( + startRangeEntry.key.upperEndpoint().contig, + indel.refEnd.position)) + + newRanges.add(Triple(targetSampleName,resizedRange, startRangeEntry.value)) + + //This makes sure that we don't have any overlapping sites in the output gvcf + //Then we need to follow the target Sample's range map and resize/remove any that are overlapping + var currentPos = Position(startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position) + while(currentPos <= indel.refEnd) { + val currentRangeEntry = targetRangeMap.getEntry(currentPos) ?: break + //add existing range to delete list + toDeleteRanges.add(Pair(targetSampleName, currentRangeEntry.key)) + //Check to see if this is the last overlapping range + if (currentRangeEntry.key.contains(indel.refEnd) && currentRangeEntry.key.upperEndpoint()!= indel.refEnd) { + //Resize this range + val resizedLastRange = Range.closed( + Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position+1), //Need to shift the position up by 1 + currentRangeEntry.key.upperEndpoint() + ) + newRanges.add(Triple(targetSampleName,resizedLastRange, currentRangeEntry.value)) + break + } else { + //This range is fully contained within the indel, so we skip adding it back in + } + } + + //We also need to adjust the source sample's range map to account for the resized regions otherwise we will + // have overlapping regions in the BED file after its reflipped + //Can use a similar process as above but instead we need to walk through the original recombination map and get the next ranges target then find that range in the target map and add to the delete and resize the add + //resetting our current pos to the indel start + currentPos = Position(startRangeEntry.key.upperEndpoint().contig, indel.refEnd.position) + while(currentPos <= indel.refEnd) { + val currentSourceRangeEntry = sourceRangeMap.getEntry(currentPos) ?: break + //Find the corresponding range in the target map + val currentRangeEntry = resizedMap[currentSourceRangeEntry.value]?.getEntry(currentSourceRangeEntry.key.lowerEndpoint()) ?: break + //add existing range to delete list + toDeleteRanges.add(Pair(currentSourceRangeEntry.value,currentRangeEntry.key)) + //Check to see if this is the last overlapping range + if (currentRangeEntry.key.contains(indel.refEnd) && currentRangeEntry.key.upperEndpoint()!= indel.refEnd) { + //Resize this range + val resizedLastRange = Range.closed( + Position(currentRangeEntry.key.lowerEndpoint().contig, indel.refEnd.position+1), //Need to shift the position up by 1 + currentRangeEntry.key.upperEndpoint() + ) + newRanges.add(Triple(currentSourceRangeEntry.value,resizedLastRange, currentRangeEntry.value)) + break + } else { + //This range is fully contained within the indel, so we skip adding it back in + } + } + + //Now we can remove the old ranges from the targetRangeMap + for((target,range) in toDeleteRanges) { + resizedMap[target]!!.remove(range) + } + //Now we can add back in the new resized ranges + for((target, range, sourceSample) in newRanges) { + resizedMap[target]!!.put(range, sourceSample) + } + } + return resizedMap + } + + fun writeResizedBedFiles(recombinationMap: Map>, outputBedDir: Path) { + println("Writing resized BED files") + for((sampleName, rangeMap) in recombinationMap) { + val outputBedFile = File(outputBedDir.toFile(), "${sampleName}_resized.bed") + outputBedFile.bufferedWriter().use { writer -> + for(entry in rangeMap.asMapOfRanges().entries) { + val range = entry.key + val targetSampleName = entry.value + val chrom = range.lowerEndpoint().contig + val start = range.lowerEndpoint().position - 1 //Convert back to 0 based for BED + val end = range.upperEndpoint().position + + writer.write("$chrom\t$start\t$end\t$targetSampleName\n") + } + } + } + } + + + fun buildOutputWriterMap(sampleNames: List, outputDir: Path): Map { + return sampleNames.associateWith { sampleName -> + val outputFile = outputDir.resolve("${sampleName}_recombined.gvcf") + val writer = VariantContextWriterBuilder() + .unsetOption(Options.INDEX_ON_THE_FLY) + .setOutputFile(outputFile.toFile()) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) + .build() + + writer.writeHeader(VariantContextUtils.createGenericHeader(listOf(sampleName), emptySet())) + writer + } + } + + fun processGvcfsAndWrite( + recombinationMap: Map>, + inputGvcfDir: Path, + outputWriters: Map, + refSeq :Map + ) { + + inputGvcfDir.toFile().listFiles()?.forEach { gvcfFile -> + val match = pattern.matchEntire(gvcfFile.name) + if (match == null) { + println("Skipping file ${gvcfFile.name} as it does not match expected GVCF naming pattern.") + return@forEach + } + val sampleName = match.groupValues[1] + val ranges = recombinationMap[sampleName] ?: return@forEach + + VCFFileReader(gvcfFile, false).use { gvcfReader -> + processSingleGVCFFile(gvcfReader, ranges, outputWriters, refSeq) + } + } + } + + fun processSingleGVCFFile( + gvcfReader: VCFReader, + ranges: RangeMap, + outputWriters: Map, + refSeq :Map + ) { + //Need to loop through each range and each gvcf record. + //We need to see if the variant falls within the range. If so, write it to the appropriate output writer. + //There are edge cases where the variant can span multiple ranges(Due to RefBlock) which is valid just need to resize the variant and write out. + //We do need to make sure that if an indel spans a range boundary that we correctly handle it by assigning it to the left most range but not the right + + //This might actually be very simple, If we convert the List into a Range Map, we only need to check the start position. + //Then if the variant is an RefBlock we need to resize it to start at the end of the range and move to the next range. Continue doing this + val iterator = gvcfReader.iterator() + while (iterator.hasNext()) { + val vc = iterator.next() + val startPos = Position(vc.contig, vc.start) + val endPos = Position(vc.contig, vc.end) + + var targetSampleName = ranges.get(startPos) ?: continue + + var outputWriter = outputWriters[targetSampleName] ?: continue + + if((vc.reference.length() == 1) && vc.reference.length() == vc.alternateAlleles[0].length()) { + // SNP polymorphism + //can write out directly as it is only 1 position and will not overlap but will need to change the genotype name + val newVc = changeSampleName(vc, targetSampleName) + outputWriter.add(newVc) + } + else if(vc.reference.length() == 1 && vc.alternateAlleles.first().displayString == ""){ + //This is a RefBlock + //We need to 'walk through' the refBlock by splitting it up into multiple refBlocks and write out to the correct output writer + //Get the current start position + processRefBlockOverlap(startPos, endPos, ranges, outputWriters, refSeq, vc) + } + else { + //Its an indel or complex polymorphism + //Assign it to the left most range only + //This is ok to do as we have already resized the recombination ranges to account for overlapping indels + val newVc = changeSampleName(vc, targetSampleName) + outputWriter.add(newVc) + } + } + } + + fun processRefBlockOverlap( + startPos: Position, + endPos: Position, + ranges: RangeMap, + outputWriters: Map, + refSeq: Map, + vc: VariantContext + ){ + val subRanges = ranges.subRangeMap(Range.closed(startPos, endPos)) + var currentStartPos = startPos + var currentRangeIdx = 0 + val rangeEntries = subRanges.asMapOfRanges().entries.toList() + while (currentStartPos <= endPos && currentRangeIdx < rangeEntries.size) { + val rangeEntry = rangeEntries[currentRangeIdx] + val range = rangeEntry.key + val targetSampleName = rangeEntry.value + if(currentStartPos + GenotypeBuilder(genotype) + .name(newSampleName) + .make() + } + builder.genotypes(genotypes) + return builder.make() + } + + fun buildRefBlock(chrom: String, start: Int, end: Int, refAllele: String, sampleName: String): VariantContext { + return VariantContextBuilder() + .chr(chrom) + .start(start.toLong()) + .stop(end.toLong()) + .attribute("END", end) + .alleles(listOf(refAllele, "")) + .genotypes( + listOf( + GenotypeBuilder(sampleName).alleles(listOf(Allele.create(refAllele,true))).make() + ) + ).make() + } + +} \ No newline at end of file diff --git a/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt new file mode 100644 index 0000000..4a9df2e --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/utils/VariantContextUtils.kt @@ -0,0 +1,76 @@ +package net.maizegenetics.utils + +import htsjdk.variant.vcf.VCFFormatHeaderLine +import htsjdk.variant.vcf.VCFHeader +import htsjdk.variant.vcf.VCFHeaderLine +import htsjdk.variant.vcf.VCFHeaderLineCount +import htsjdk.variant.vcf.VCFHeaderLineType +import htsjdk.variant.vcf.VCFInfoHeaderLine +import java.util.HashSet + +data class Position(val contig: String, val position: Int) : Comparable { + override fun compareTo(other: Position): Int { + + if (this.contig == other.contig) { + return this.position.compareTo(other.position) + } + + val thisContig = this.contig.replace("chr", "", ignoreCase = true).trim() + val otherContig = other.contig.replace("chr", "", ignoreCase = true).trim() + + return try { + thisContig.toInt() - otherContig.toInt() + } catch (e: NumberFormatException) { + // If we can't convert contigs to an int, then compare the strings + contig.compareTo(other.contig) + } + + } + + override fun toString(): String { + return "$contig:$position" + } +} +data class SimpleVariant(val refStart: Position, val refEnd: Position, + val refAllele: String, val altAllele: String, + val isAddedMutation: Boolean = false) + +class VariantContextUtils { + companion object { + fun createGenericHeader(taxaNames: List, altLines:Set): VCFHeader { + val headerLines = createGenericHeaderLineSet() as MutableSet + headerLines.addAll(altLines) + return VCFHeader(headerLines, taxaNames) + } + + fun createGenericHeaderLineSet(): Set { + val headerLines: MutableSet = HashSet() + headerLines.add(VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")) + headerLines.add( + VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)") + ) + headerLines.add(VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")) + headerLines.add(VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")) + headerLines.add( + VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") + ) + headerLines.add(VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth")) + headerLines.add(VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data")) + headerLines.add(VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency")) + headerLines.add(VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")) + // These last 4 are needed for assembly g/hvcfs, but not for reference. I am keeping them in as header + // lines but they will only be added to the data lines if they are present in the VariantContext. + headerLines.add(VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome")) + headerLines.add(VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position")) + headerLines.add(VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position")) + headerLines.add(VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand")) + return headerLines + } + + fun isIndel(variant: SimpleVariant?): Boolean { + if(variant == null) return false + return !((variant.refAllele.length == 1 && variant.altAllele.length == 1) || //SNP + (variant.refAllele.length == 1 && variant.altAllele == "")) //RefBlock + } + } +} \ No newline at end of file diff --git a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt index 40862c4..9da0993 100644 --- a/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/MutateAssembliesTest.kt @@ -6,9 +6,9 @@ import com.google.common.collect.TreeRangeMap import htsjdk.variant.variantcontext.Allele import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.vcf.VCFFileReader -import net.maizegenetics.net.maizegenetics.commands.MutateAssemblies -import net.maizegenetics.net.maizegenetics.commands.Position -import net.maizegenetics.net.maizegenetics.commands.SimpleVariant +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import net.maizegenetics.utils.VariantContextUtils import java.io.File import kotlin.test.Test import kotlin.test.assertEquals @@ -306,16 +306,16 @@ class MutateAssembliesTest { val mutateAssemblies = MutateAssemblies() val refBlockVariant = SimpleVariant(Position("chr1", 100), Position("chr1", 150), "A", "") - assert(!mutateAssemblies.isIndel(refBlockVariant)) + assert(!VariantContextUtils.isIndel(refBlockVariant)) val snpVariant = SimpleVariant(Position("chr1", 150), Position("chr1", 150), "C", "G") - assert(!mutateAssemblies.isIndel(snpVariant)) + assert(!VariantContextUtils.isIndel(snpVariant)) val insertionVariant = SimpleVariant(Position("chr1", 200), Position("chr1", 200), "A", "AGG") - assert(mutateAssemblies.isIndel(insertionVariant)) + assert(VariantContextUtils.isIndel(insertionVariant)) val deletionVariant = SimpleVariant(Position("chr1", 201), Position("chr1", 205), "GGGGG", "G") - assert(mutateAssemblies.isIndel(deletionVariant)) + assert(VariantContextUtils.isIndel(deletionVariant)) } private fun createSimpleBaseVariantMap(): RangeMap { diff --git a/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt new file mode 100644 index 0000000..b2dc329 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/RecombineGvcfsTest.kt @@ -0,0 +1,921 @@ +package net.maizegenetics.commands + +import biokotlin.seq.NucSeq +import biokotlin.seq.NucSeqRecord +import com.google.common.collect.Range +import com.google.common.collect.RangeMap +import com.google.common.collect.TreeRangeMap +import htsjdk.variant.variantcontext.Allele +import htsjdk.variant.variantcontext.GenotypeBuilder +import htsjdk.variant.variantcontext.VariantContextBuilder +import htsjdk.variant.variantcontext.writer.VariantContextWriter +import htsjdk.variant.vcf.VCFFileReader +import net.maizegenetics.utils.Position +import net.maizegenetics.utils.SimpleVariant +import java.io.File +import kotlin.io.path.Path +import kotlin.test.AfterTest +import kotlin.test.BeforeTest +import kotlin.test.DefaultAsserter.assertEquals +import kotlin.test.DefaultAsserter.assertTrue +import kotlin.test.Test +import kotlin.test.assertFalse + +class RecombineGvcfsTest { + + val homeDir = System.getProperty("user.home").replace('\\', '/') + + val outputGvcfDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/gvcf/" + val outputBedDir = "$homeDir/temp/seq_sim/recombine_gvcf_test/bed/" + + @BeforeTest + fun setupTest() { + // Runs before each test in common code + File(outputGvcfDir).mkdirs() + File(outputBedDir).mkdirs() + } + + @AfterTest + fun teardownTest() { + // Runs after each test in common code + File(outputGvcfDir).deleteRecursively() + File(outputBedDir).deleteRecursively() + } + + @Test + fun testRecombineGvcfs() { + //recombineGvcfs(inputBedDir: Path, inputGvcfDir: Path, refFile: Path, outputDir: Path, outputBedDir: Path) + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + val refFile = "./data/RecombineGvcfs/ref.fa" + val recombineGvcfs = RecombineGvcfs() + recombineGvcfs.recombineGvcfs( + Path(bedDir), + Path(gvcfFile), + Path(refFile), + Path(outputGvcfDir), + Path(outputBedDir) + ) + //Check that the output files were created + val outputFiles = File(outputGvcfDir).listFiles().filter { it.name.endsWith(".gvcf") } + assertEquals( + "There should be 3 output gvcf files", + 3, + outputFiles.size + ) + checkAllResults(outputGvcfDir) + } + + @Test + fun testBuildRecombinationMap() { + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + //Check that the map has the correct number of samples + assertEquals( + "Recombination map should have 3 samples", + 3, + recombinationMap.size + ) + + //Check to make sure that the recombination maps are correct + checkRecombinationMapContents(recombinationMap, "sampleA", listOf("sampleX", "sampleY", "sampleZ")) + checkRecombinationMapContents(recombinationMap, "sampleB", listOf("sampleY", "sampleZ", "sampleX")) + checkRecombinationMapContents(recombinationMap, "sampleC", listOf("sampleZ", "sampleX", "sampleY")) + + + //targetNameList should also have 3 samples + assertEquals( + "Target name list should have 3 samples", + 3, + targetNameList.size + ) + + //Should have sampleX, sampleY, sampleZ + assertTrue( + "Target name list does not contain sampleX", + targetNameList.contains("sampleX") + ) + assertTrue( + "Target name list does not contain sampleY", + targetNameList.contains("sampleY") + ) + assertTrue( + "Target name list does not contain sampleZ", + targetNameList.contains("sampleZ") + ) + } + + private fun checkRecombinationMapContents(recombinationMap: Map>, sampleName: String, expectedTargetSamples: List) { + val sampleMap = recombinationMap[sampleName] + require(sampleMap != null) { "Range map for $sampleName is null" } + assertEquals( + "${sampleName} should have 3 ranges", + 3, + sampleMap.asMapOfRanges().size + ) + //From 1 -10 should be sampleX, from 10 to 19 should be sampleY, from 20 to 29 should be sampleZ + val range1 = sampleMap.getEntry(Position("chr1", 5)) + assertEquals( + "${sampleName} first range not correct", + Range.closed(Position("chr1", 1), Position("chr1", 10)), + range1?.key + ) + assertEquals( + "${sampleName} first range target not correct", + expectedTargetSamples[0], + range1?.value + ) + val range2 = sampleMap.getEntry(Position("chr1", 15)) + assertEquals( + "${sampleName} second range not correct", + Range.closed(Position("chr1", 11), Position("chr1", 20)), + range2?.key + ) + assertEquals( + "${sampleName} second range target not correct", + expectedTargetSamples[1], + range2?.value + ) + val range3 = sampleMap.getEntry(Position("chr1", 25)) + assertEquals( + "${sampleName} third range not correct", + Range.closed(Position("chr1", 21), Position("chr1", 30)), + range3?.key + ) + assertEquals( + "${sampleName} third range target not correct", + expectedTargetSamples[2], + range3?.value + ) + } + + + @Test + fun testResizeRecombinationMapsForIndels() { + //Each of the units are tested in other functions. Here we just need to test that the overall resizing works as expected + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + val resizedMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path(gvcfFile)) + + //resized map should be the same except for the first range of sampleC which should be resized to 1-11 for target sampleZ + //And the second range of sampleB which should be resized to 12-20 for target sampleZ + //Check the parts that have not changed first + //SampleA should be unchanged + val sampleARangeMap = resizedMap["sampleA"] + assertEquals( + "SampleA should have 3 ranges", + 3, + sampleARangeMap?.asMapOfRanges()?.size + ) + assertEquals("SampleA matches original", recombinationMap["sampleA"], sampleARangeMap) + + //SamplesB and C should have one resized region each + val sampleBRangeMap = resizedMap["sampleB"] + assertEquals( + "SampleB should have 3 ranges", + 3, + sampleBRangeMap?.asMapOfRanges()?.size + ) + val sampleBFirstRange = sampleBRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "SampleB first range should be unchanged", + Range.closed(Position("chr1",1), Position("chr1",10)), + sampleBFirstRange?.key + ) + assertEquals( + "SampleB first range target should be unchanged", + "sampleY", + sampleBFirstRange?.value + ) + val sampleBSecondRange = sampleBRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "SampleB second range should be resized to 12-20", + Range.closed(Position("chr1",12), Position("chr1",20)), + sampleBSecondRange?.key + ) + assertEquals( + "SampleB second range target should be sampleZ", + "sampleZ", + sampleBSecondRange?.value + ) + val sampleBThirdRange = sampleBRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "SampleB third range should be unchanged", + Range.closed(Position("chr1",21), Position("chr1",30)), + sampleBThirdRange?.key + ) + assertEquals( + "SampleB third range target should be unchanged", + "sampleX", + sampleBThirdRange?.value + ) + + val sampleCRangeMap = resizedMap["sampleC"] + assertEquals( + "SampleC should have 3 ranges", + 3, + sampleCRangeMap?.asMapOfRanges()?.size + ) + val sampleCFirstRange = sampleCRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "SampleC first range should be resized to 1-11", + Range.closed(Position("chr1",1), Position("chr1",11)), + sampleCFirstRange?.key + ) + assertEquals( + "SampleC first range target should be sampleZ", + "sampleZ", + sampleCFirstRange?.value + ) + val sampleCSecondRange = sampleCRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "SampleC second range has changed", + Range.closed(Position("chr1",12), Position("chr1",20)), + sampleCSecondRange?.key + ) + assertEquals( + "SampleC second range target should be unchanged", + "sampleX", + sampleCSecondRange?.value + ) + val sampleCThirdRange = sampleCRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "SampleC third range should be unchanged", + Range.closed(Position("chr1",21), Position("chr1",30)), + sampleCThirdRange?.key + ) + assertEquals( + "SampleC third range target should be unchanged", + "sampleY", + sampleCThirdRange?.value + ) + } + + @Test + fun testFindOverlappingIndelsInGvcf() { + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfFile = "./data/RecombineGvcfs/gvcf/" + + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + + //Test SampleB and Sample C. SampleB does not have an overlapping indel so the list should be empty + val sampleBIndels = recombineGvcfs.findOverlappingIndelsInGvcf("sampleB", File("$gvcfFile/sampleB.gvcf"),recombinationMap["sampleB"]!!) + + assertEquals( + "SampleB should have 0 overlapping indels", + 0, + sampleBIndels.size + ) + + //SampleC should have one overlapping indel at chr1:9-11 hitting position 10 in the recombination map + val sampleCIndels = recombineGvcfs.findOverlappingIndelsInGvcf("sampleC", File("$gvcfFile/sampleC.gvcf"),recombinationMap["sampleC"]!!) + assertEquals( + "SampleC should have 1 overlapping indel", + 1, + sampleCIndels.size + ) + val indel = sampleCIndels[0] + + assertEquals( + "Indel contig does not match", + "chr1", + indel.third.refStart.contig + ) + assertEquals( + "Indel start does not match", + 9, + indel.third.refStart.position + ) + assertEquals( + "Indel end does not match", + 11, + indel.third.refEnd.position + ) + //check source and target sample names + assertEquals( + "Indel source sample name does not match", + "sampleC", + indel.first + ) + assertEquals( + "Indel target sample name does not match", + "sampleZ", + indel.second + ) + } + + @Test + fun testFindAllOverlappingIndels() { + val bedDir = "./data/RecombineGvcfs/bed/" + val gvcfDir = "./data/RecombineGvcfs/gvcf/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + val allIndels = recombineGvcfs.findAllOverlappingIndels(recombinationMap, Path(gvcfDir)) + //There should be only one indel from sampleC + assertEquals( + "There should be 1 overlapping indel in total", + 1, + allIndels.size + ) + val indel = allIndels[0] + assertEquals( + "Indel source sample name does not match", + "sampleC", + indel.first + ) + assertEquals( + "Indel target sample name does not match", + "sampleZ", + indel.second + ) + assertEquals( + "Indel contig does not match", + "chr1", + indel.third.refStart.contig + ) + assertEquals( + "Indel start does not match", + 9, + indel.third.refStart.position + ) + assertEquals( + "Indel end does not match", + 11, + indel.third.refEnd.position + ) + + } + + @Test + fun testFlipRecombinationMap() { + //This needs to take a recombination map Map> and flip it to Map> + val recombinationMap = buildSimpleRecombinationMap() + + val recombineGvcfs = RecombineGvcfs() + val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) + //Now check that the flipped map is correct + //Check TargetSampleA + val targetSampleARangeMap = flippedMap["TargetSampleA"] + assertEquals( + "TargetSampleA should have 2 ranges", + 2, + targetSampleARangeMap?.asMapOfRanges()?.size + ) + val targetSampleARanges = targetSampleARangeMap?.asMapOfRanges() + //Check that the ranges are correct + val range1 = targetSampleARanges?.keys?.find { it.contains(Position("chr1", 150)) } + assertEquals( + "TargetSampleA should have range from Sample1", + Range.closed(Position("chr1", 100), Position("chr1", 200)), + range1 + ) + val range2 = targetSampleARanges?.keys?.find { it.contains(Position("chr1", 250)) } + assertEquals( + "TargetSampleA should have range from Sample2", + Range.closed(Position("chr1", 201), Position("chr1", 300)), + range2 + ) + //Check TargetSampleB + val targetSampleBRangeMap = flippedMap["TargetSampleB"] + assertEquals( + "TargetSampleB should have 2 ranges", + 2, + targetSampleBRangeMap?.asMapOfRanges()?.size + ) + val targetSampleBRanges = targetSampleBRangeMap?.asMapOfRanges() + //Check that the ranges are correct + val range3 = targetSampleBRanges?.keys?.find { it.contains(Position("chr1", 150)) } + assertEquals( + "TargetSampleB should have range from Sample2", + Range.closed(Position("chr1", 100), Position("chr1", 200)), + range3 + ) + val range4 = targetSampleBRanges?.keys?.find { it.contains(Position("chr1", 250)) } + assertEquals( + "TargetSampleB should have range from Sample1", + Range.closed(Position("chr1", 201), Position("chr1", 300)), + range4 + ) + + //Flip it back and see if the original map is recovered + val reflippedMap = recombineGvcfs.flipRecombinationMap(flippedMap) + assertEquals( + "Reflipped map should be equal to original map", + recombinationMap, + reflippedMap + ) + + } + + + @Test + fun testResizeMaps() { + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + + //Need to flip the map + val flippedMap = recombineGvcfs.flipRecombinationMap(recombinationMap) + val indelsForResizingEmpty = emptyList>() + + val nonResizedMap = recombineGvcfs.resizeMaps(indelsForResizingEmpty, recombinationMap,flippedMap) + //Check that the map is the same as the flipped map + assertEquals( + "No-resize map should be equal to flipped map", + flippedMap, + nonResizedMap + ) + + //Make an overlapping indel SampleC is the donor for TargetSampleZ at chr1:9-11 + //chr1 9 . AAA A . . GT 1 + val indelToResize = Triple( + "sampleC", + "sampleZ", + SimpleVariant( + Position("chr1",9), + Position("chr1",11), + refAllele = "AAA", + altAllele = "A" + ) + ) + + val indelsForResizing = listOf(indelToResize) + val zIndel1ResizedMap = recombineGvcfs.resizeMaps(indelsForResizing, recombinationMap,flippedMap) + //Now check that the map has been resized correctly + + val targetSampleZRangeMap = zIndel1ResizedMap["sampleZ"] + assertEquals( + "sampleZ should have 3 ranges", + 3, + targetSampleZRangeMap?.asMapOfRanges()?.size + ) + val firstRegion = targetSampleZRangeMap?.getEntry(Position("chr1",5)) + assertEquals( + "First region should be from 1-11", + Range.closed(Position("chr1",1), Position("chr1",11)), + firstRegion?.key + ) + assertEquals( + "First region should be from sampleC", + "sampleC", + firstRegion?.value + ) + val secondRegion = targetSampleZRangeMap?.getEntry(Position("chr1",15)) + assertEquals( + "Second region should be from 12-20", + Range.closed(Position("chr1",12), Position("chr1",20)), + secondRegion?.key + ) + assertEquals( + "Second region should be from sampleB", + "sampleB", + secondRegion?.value + ) + val thirdRegion = targetSampleZRangeMap?.getEntry(Position("chr1",25)) + assertEquals( + "Third region should be from 21-30", + Range.closed(Position("chr1",21), Position("chr1",30)), + thirdRegion?.key + ) + assertEquals( + "Third region should be from sampleA", + "sampleA", + thirdRegion?.value + ) + } + + + + @Test + fun testWriteResizedBedFiles() { + val recombinationMap = buildSimpleRecombinationMap() + + val recombineGvcfs = RecombineGvcfs() + + recombineGvcfs.writeResizedBedFiles(recombinationMap, Path(outputGvcfDir)) + + //Check that the files were created and have the correct content + val targetSampleABedFile = File("$outputGvcfDir/Sample1_resized.bed") + assertEquals("TargetSampleA.bed file was not created", true, targetSampleABedFile.isFile) + val targetSampleAContent = targetSampleABedFile.readText().trim() + val expectedTargetSampleAContent = "chr1\t99\t200\tTargetSampleA\nchr1\t200\t300\tTargetSampleB" + assertEquals("TargetSampleA.bed content is incorrect", expectedTargetSampleAContent, targetSampleAContent) + val targetSampleBBedFile = File("$outputGvcfDir/Sample2_resized.bed") + assertEquals("TargetSampleB.bed file was not created", true, targetSampleBBedFile.isFile) + val targetSampleBContent = targetSampleBBedFile.readText().trim() + val expectedTargetSampleBContent = "chr1\t99\t200\tTargetSampleB\nchr1\t200\t300\tTargetSampleA" + assertEquals("TargetSampleB.bed content is incorrect", expectedTargetSampleBContent, targetSampleBContent) + } + + private fun buildSimpleRecombinationMap(): Map> { + val recombinationMap = mutableMapOf>() + //Build the recombination map here + val sample1RangeMap = TreeRangeMap.create() + sample1RangeMap.put(Range.closed(Position("chr1", 100), Position("chr1", 200)), "TargetSampleA") + sample1RangeMap.put(Range.closed(Position("chr1", 201), Position("chr1", 300)), "TargetSampleB") + recombinationMap["Sample1"] = sample1RangeMap + val sample2RangeMap = TreeRangeMap.create() + sample2RangeMap.put(Range.closed(Position("chr1", 100), Position("chr1", 200)), "TargetSampleB") + sample2RangeMap.put(Range.closed(Position("chr1", 201), Position("chr1", 300)), "TargetSampleA") + recombinationMap["Sample2"] = sample2RangeMap + return recombinationMap + } + + + @Test + fun testBuildOutputWriterMap() { + val sampleNames = listOf("Sample1", "Sample2", "Sample3") + val recombineGvcfs = RecombineGvcfs() + val writerMap = recombineGvcfs.buildOutputWriterMap(sampleNames, Path(outputGvcfDir)) + //Check that the map has the correct number of writers + assertEquals( + "Writer map should have 3 writers", + 3, + writerMap.size + ) + //Check that the writers are correctly named + sampleNames.forEach { sampleName -> + checkSampleNameToOutputFile(sampleName, writerMap) + } + } + + private fun checkSampleNameToOutputFile( + sampleName: String, + writerMap: Map + ) { + assertTrue( + "Writer map should contain writer for $sampleName", + writerMap.containsKey(sampleName) + ) + //close out the writers + writerMap[sampleName]?.close() + //Open up the file and check that the sample name is right + val outputFile = File("$outputGvcfDir/${sampleName}_recombined.gvcf") + assertEquals( + "Output file for $sampleName was not created", + true, + outputFile.isFile + ) + + val vcfReader = VCFFileReader(outputFile,false) + val headerSampleNames = vcfReader.fileHeader.sampleNamesInOrder + assertEquals( + "Output VCF for $sampleName should have 1 sample", + 1, + headerSampleNames.size + ) + assertEquals( + "Output VCF for $sampleName has incorrect sample name", + sampleName, + headerSampleNames.first() + ) + vcfReader.close() + } + + @Test + fun testProcessGvcfsAndWrite(){ + val gvcfDir = "./data/RecombineGvcfs/gvcf/" + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + //Need to resize the recombination map first + val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path(gvcfDir)) + val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputGvcfDir)) + val refString = "A".repeat(30) //Length should be 30 + val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) + recombineGvcfs.processGvcfsAndWrite( + resizedRecombinationMap, + Path(gvcfDir), + outputWriters, + refSeq + ) + //Close out the writers + outputWriters.values.forEach { it.close() } + //Now check that the output files were created + checkAllResults(outputGvcfDir) + } + + fun checkAllResults(outputDir: String) { + val expectedSamples = listOf("sampleX", "sampleY", "sampleZ") + //Check sampleX's output + //load in sampleX_recombined.gvcf and check that it has the expected variants but the variants are not sorted so we need to sort by start position + //It will have A refBlock from 1-10, A RefBlock from 12-16, a Deletion at 17-19 and a SNP at 20 then A refBlock from 21-25 a SNP at 15 and a refBlock from 26-30 + val sampleXFile = File("$outputDir/sampleX_recombined.gvcf") + assertEquals("sampleX output file was not created", true, sampleXFile.isFile) + val sampleXReader = VCFFileReader(sampleXFile,false) + val sampleXVariants = sampleXReader.iterator().toList().sortedBy { it.start } + assertEquals("sampleX should have 7 variants", 7, sampleXVariants.size) + //Check first variant + val firstVariantX = sampleXVariants[0] + assertEquals("sampleX first variant contig does not match", "chr1", firstVariantX.contig) + assertEquals("sampleX first variant start does not match", 1, firstVariantX.start) + assertEquals("sampleX first variant end does not match", 10, firstVariantX.end) + //Check second variant + val secondVariantX = sampleXVariants[1] + assertEquals("sampleX second variant contig does not match", "chr1", secondVariantX.contig) + assertEquals("sampleX second variant start does not match", 12, secondVariantX.start) + assertEquals("sampleX second variant end does not match", 16, secondVariantX.end) + //Check third variant + val thirdVariantX = sampleXVariants[2] + assertEquals("sampleX third variant contig does not match", "chr1", thirdVariantX.contig) + assertEquals("sampleX third variant start does not match", 17 , thirdVariantX.start) + assertEquals("sampleX third variant end does not match", 19 , thirdVariantX.end) + //Check fourth variant + val fourthVariantX = sampleXVariants[3] + assertEquals("sampleX fourth variant contig does not match", "chr1", fourthVariantX.contig) + assertEquals("sampleX fourth variant start does not match", 20 , fourthVariantX.start) + assertEquals("sampleX fourth variant end does not match", 20 , fourthVariantX.end) + //Check fifth variant + val fifthVariantX = sampleXVariants[4] + assertEquals("sampleX fifth variant contig does not match", "chr1", fifthVariantX.contig) + assertEquals("sampleX fifth variant start does not match", 21 , fifthVariantX.start) + assertEquals("sampleX fifth variant end does not match", 24 , fifthVariantX.end) + //Check sixth variant + val sixthVariantX = sampleXVariants[5] + assertEquals("sampleX sixth variant contig does not match", "chr1", sixthVariantX.contig) + assertEquals("sampleX sixth variant start does not match", 25 , sixthVariantX.start) + assertEquals("sampleX sixth variant end does not match", 25 , sixthVariantX.end) + //Check seventh variant + val seventhVariantX = sampleXVariants[6] + assertEquals("sampleX sixth variant contig does not match", "chr1", seventhVariantX.contig) + assertEquals("sampleX sixth variant start does not match", 26 , seventhVariantX.start) + assertEquals("sampleX sixth variant end does not match", 30 , seventhVariantX.end) + sampleXReader.close() + + + //Check sampleY's output + //There is a Refblock from 1-4, a SNP at 5, A ref block from 6-10, then ref block from 11-20 and ref block from 21-30 + val sampleYFile = File("$outputDir/sampleY_recombined.gvcf") + assertEquals("sampleY output file was not created", true, sampleYFile.isFile) + val sampleYReader = VCFFileReader(sampleYFile,false) + val sampleYVariants = sampleYReader.iterator().toList().sortedBy { it.start } + assertEquals("sampleY should have 5 variants", 5, sampleYVariants.size) + //Check first variant + val firstVariantY = sampleYVariants[0] + assertEquals("sampleY first variant contig does not match", "chr1", firstVariantY.contig) + assertEquals("sampleY first variant start does not match", 1, firstVariantY.start) + assertEquals("sampleY first variant end does not match", 4, firstVariantY.end) + //Check second variant + val secondVariantY = sampleYVariants[1] + assertEquals("sampleY second variant contig does not match", "chr1", secondVariantY.contig) + assertEquals("sampleY second variant start does not match", 5, secondVariantY.start) + assertEquals("sampleY second variant end does not match", 5, secondVariantY.end) + //Check third variant + val thirdVariantY = sampleYVariants[2] + assertEquals("sampleY third variant contig does not match", "chr1", thirdVariantY.contig) + assertEquals("sampleY third variant start does not match", 6 , thirdVariantY.start) + assertEquals("sampleY third variant end does not match", 10 , thirdVariantY.end) + //Check fourth variant + val fourthVariantY = sampleYVariants[3] + assertEquals("sampleY fourth variant contig does not match", "chr1", fourthVariantY.contig) + assertEquals("sampleY fourth variant start does not match", 11 , fourthVariantY.start) + assertEquals("sampleY fourth variant end does not match", 20 , fourthVariantY.end) + //Check fifth variant + val fifthVariantY = sampleYVariants[4] + assertEquals("sampleY fifth variant contig does not match", "chr1", fifthVariantY.contig) + assertEquals("sampleY fifth variant start does not match", 21 , fifthVariantY.start) + assertEquals("sampleY fifth variant end does not match", 30 , fifthVariantY.end) + sampleYReader.close() + + //Check sampleZ's output + //There is a ref block from 1-8, an indel at 9-11, ref block from 12-14, SNP at 15, ref block from 15-20, ref block from 21-30 + val sampleZFile = File("$outputDir/sampleZ_recombined.gvcf") + assertEquals("sampleZ output file was not created", true, sampleZFile.isFile) + val sampleZReader = VCFFileReader(sampleZFile,false) + val sampleZVariants = sampleZReader.iterator().toList().sortedBy { it.start } + assertEquals("sampleZ should have 6 variants", 6, sampleZVariants.size) + //Check first variant + val firstVariantZ = sampleZVariants[0] + assertEquals("sampleZ first variant contig does not match", "chr1", firstVariantZ.contig) + assertEquals("sampleZ first variant start does not match", 1, firstVariantZ.start) + assertEquals("sampleZ first variant end does not match", 8, firstVariantZ.end) + //Check second variant + val secondVariantZ = sampleZVariants[1] + assertEquals("sampleZ second variant contig does not match", "chr1", secondVariantZ.contig) + assertEquals("sampleZ second variant start does not match", 9, secondVariantZ.start) + assertEquals("sampleZ second variant end does not match", 11, secondVariantZ.end) + //Check ref and alt alleles for the indel + assertEquals("sampleZ second variant ref allele does not match", "AAA", secondVariantZ.reference.baseString) + assertEquals("sampleZ second variant alt allele does not match", "A", secondVariantZ.alternateAlleles[0].baseString) + //Check third variant + val thirdVariantZ = sampleZVariants[2] + assertEquals("sampleZ third variant contig does not match", "chr1", thirdVariantZ.contig) + assertEquals("sampleZ third variant start does not match", 12 , thirdVariantZ.start) + assertEquals("sampleZ third variant end does not match", 14 , thirdVariantZ.end) + //Check fourth variant + val fourthVariantZ = sampleZVariants[3] + assertEquals("sampleZ fourth variant contig does not match", "chr1", fourthVariantZ.contig) + assertEquals("sampleZ fourth variant start does not match", 15 , fourthVariantZ.start) + assertEquals("sampleZ fourth variant end does not match", 15 , fourthVariantZ.end) + //Check fifth variant + val fifthVariantZ = sampleZVariants[4] + assertEquals("sampleZ fifth variant contig does not match", "chr1", fifthVariantZ.contig) + assertEquals("sampleZ fifth variant start does not match", 16 , fifthVariantZ.start) + assertEquals("sampleZ fifth variant end does not match", 20 , fifthVariantZ.end) + //Check sixth variant + val sixthVariantZ = sampleZVariants[5] + assertEquals("sampleZ sixth variant contig does not match", "chr1", sixthVariantZ.contig) + assertEquals("sampleZ sixth variant start does not match", 21 , sixthVariantZ.start) + assertEquals("sampleZ sixth variant end does not match", 30 , sixthVariantZ.end) + sampleZReader.close() + } + + @Test + fun testProcessSingleGVCFFile() { + //Lets test SampleC as it has an overlapping index, a standard indel and a SNP so we should hit all the edge cases we handle + val gvcfFile = "./data/RecombineGvcfs/gvcf/sampleC.gvcf" +// val gvcfFile = "./data/RecombineGvcfs/gvcf/sampleB.gvcf" + val bedDir = "./data/RecombineGvcfs/bed/" + val recombineGvcfs = RecombineGvcfs() + val (recombinationMap, targetNameList) = recombineGvcfs.buildRecombinationMap(Path(bedDir)) + //Need to resize the recombination map first + val resizedRecombinationMap = recombineGvcfs.resizeRecombinationMapsForIndels(recombinationMap, Path("./data/RecombineGvcfs/gvcf/")) + val sampleCRanges = resizedRecombinationMap["sampleC"]!! +// val sampleCRanges = resizedRecombinationMap["sampleB"]!! + val outputWriters = recombineGvcfs.buildOutputWriterMap(targetNameList, Path(outputGvcfDir)) + val gvcfReader = VCFFileReader(File(gvcfFile), false) + val refString = "A".repeat(30) //Length should be 30 + val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) + recombineGvcfs.processSingleGVCFFile( + gvcfReader, + sampleCRanges, + outputWriters, + refSeq + ) + //Close out the writers + outputWriters.values.forEach { it.close() } + //Now check that the output files have the correct variants + //We will have 3 output files + //targetSampleZ whould have 2 variants: refBlock from 1-8 and Indel at 9 - 11 + val targetSampleZFile = File("$outputGvcfDir/sampleZ_recombined.gvcf") + assertEquals("sampleZ output file was not created", true, targetSampleZFile.isFile) + val targetSampleZReader = VCFFileReader(targetSampleZFile,false) + val targetSampleZVariants = targetSampleZReader.iterator().toList() + assertEquals("sampleZ should have 2 variants", 2, targetSampleZVariants.size) + val firstVariantZ = targetSampleZVariants[0] + assertEquals("sampleZ first variant contig does not match", "chr1", firstVariantZ.contig) + assertEquals("sampleZ first variant start does not match", 1, firstVariantZ.start) + assertEquals("sampleZ first variant end does not match", 8, firstVariantZ.end) + val secondVariantZ = targetSampleZVariants[1] + assertEquals("sampleZ second variant contig does not match", "chr1", secondVariantZ.contig) + assertEquals("sampleZ second variant start does not match", 9, secondVariantZ.start) + assertEquals("sampleZ second variant end does not match", 11, secondVariantZ.end) + //check ref and alt alleles for the indel + assertEquals("sampleZ second variant ref allele does not match", "AAA", secondVariantZ.reference.baseString) + assertEquals("sampleZ second variant alt allele does not match", "A", secondVariantZ.alternateAlleles[0].baseString) + targetSampleZReader.close() + //targetSampleX should have 3 variant: Deletion at 17-19 and SNP at 20 and a refBlock from 12-16 + val targetSampleXFile = File("$outputGvcfDir/sampleX_recombined.gvcf") + assertEquals("sampleX output file was not created", true, targetSampleXFile.isFile) + val targetSampleXReader = VCFFileReader(targetSampleXFile,false) + val targetSampleXVariants = targetSampleXReader.iterator().toList() + assertEquals("sampleX should have 3 variants", 3, targetSampleXVariants.size) + val firstVariantX = targetSampleXVariants[0] + assertEquals("sampleX first variant contig does not match", "chr1", firstVariantX.contig) + assertEquals("sampleX first variant start does not match", 12, firstVariantX.start) + assertEquals("sampleX first variant end does not match", 16, firstVariantX.end) + val secondVariantX = targetSampleXVariants[1] + assertEquals("sampleX second variant contig does not match", "chr1", secondVariantX.contig) + assertEquals("sampleX second variant start does not match", 17, secondVariantX.start) + assertEquals("sampleX second variant end does not match", 19, secondVariantX.end) + //check ref and alt alleles for the deletion + assertEquals("sampleX second variant ref allele does not match", "AAA", secondVariantX.reference.baseString) + assertEquals("sampleX second variant alt allele does not match", "A", secondVariantX.alternateAlleles[0].baseString) + val thirdVariantX = targetSampleXVariants[2] + assertEquals("sampleX third variant contig does not match", "chr1", thirdVariantX.contig) + assertEquals("sampleX third variant start does not match", 20, thirdVariantX.start) + assertEquals("sampleX third variant end does not match", 20, thirdVariantX.end) + //check ref and alt alleles for the SNP + assertEquals("sampleX third variant ref allele does not match", "A", thirdVariantX.reference.baseString) + assertEquals("sampleX third variant alt allele does not match", "T", thirdVariantX.alternateAlleles[0].baseString) + targetSampleXReader.close() + //targetSampleY should have 1 variant: refBlock from 21-30 + val targetSampleYFile = File("$outputGvcfDir/sampleY_recombined.gvcf") + assertEquals("sampleY output file was not created", true, targetSampleYFile.isFile) + val targetSampleYReader = VCFFileReader(targetSampleYFile,false) + val targetSampleYVariants = targetSampleYReader.iterator().toList() + assertEquals("sampleY should have 1 variant", 1, targetSampleYVariants.size) + val firstVariantY = targetSampleYVariants[0] + assertEquals("sampleY variant contig does not match", "chr1", firstVariantY.contig) + assertEquals("sampleY variant start does not match", 21, firstVariantY.start) + assertEquals("sampleY variant end does not match", 30, firstVariantY.end) + targetSampleYReader.close() + } + + @Test + fun testProcessRefBlockOverlap() { + val recombineGvcfs = RecombineGvcfs() + val outputWriters = recombineGvcfs.buildOutputWriterMap(listOf("sampleX", "sampleY", "sampleZ"), Path(outputGvcfDir)) + + //Build a rangeMap -> Target map + val rangeMap = TreeRangeMap.create() + rangeMap.put(Range.closed(Position("chr1", 1), Position("chr1", 10)), "sampleX") + rangeMap.put(Range.closed(Position("chr1",11), Position("chr1", 20)), "sampleY") + rangeMap.put(Range.closed(Position("chr1",21), Position("chr1", 30)), "sampleZ") + + //Build a refBlock from 5-25 + val refBlock = recombineGvcfs.buildRefBlock("chr1", 5, 25, "A", "sampleA") + + val refString = "A".repeat(30) //Length should be 30 + val refSeq = mapOf(Pair("chr1",NucSeqRecord(NucSeq(refString), "chr1"))) + + recombineGvcfs.processRefBlockOverlap(Position("chr1",5),Position("chr1",25),rangeMap,outputWriters, refSeq , refBlock ) + + //Close out the writers + outputWriters.values.forEach { it.close() } + + //Now check that the output files have the correct refBlocks + //We will have 3 output files each with one variant in them. + //sampleX should have a refBlock from 5-10 + val sampleXFile = File("$outputGvcfDir/sampleX_recombined.gvcf") + assertEquals("sampleX output file was not created", true, sampleXFile.isFile) + val sampleXReader = VCFFileReader(sampleXFile,false) + val sampleXVariants = sampleXReader.iterator().toList() + assertEquals("sampleX should have 1 variant", 1, sampleXVariants.size) + val sampleXVariant = sampleXVariants[0] + assertEquals("sampleX variant contig does not match", "chr1", sampleXVariant.contig) + assertEquals("sampleX variant start does not match", 5, sampleXVariant.start) + assertEquals("sampleX variant end does not match", 10, sampleXVariant.end) + sampleXReader.close() + //sampleY should have a refBlock from 11-20 + val sampleYFile = File("$outputGvcfDir/sampleY_recombined.gvcf") + assertEquals("sampleY output file was not created", true, sampleYFile.isFile) + val sampleYReader = VCFFileReader(sampleYFile,false) + val sampleYVariants = sampleYReader.iterator().toList() + assertEquals("sampleY should have 1 variant", 1, sampleYVariants.size) + val sampleYVariant = sampleYVariants[0] + assertEquals("sampleY variant contig does not match", "chr1", sampleYVariant.contig) + assertEquals("sampleY variant start does not match", 11, sampleYVariant.start) + assertEquals("sampleY variant end does not match", 20, sampleYVariant.end) + sampleYReader.close() + //sampleZ should have a refBlock from 21-25 + val sampleZFile = File("$outputGvcfDir/sampleZ_recombined.gvcf") + assertEquals("sampleZ output file was not created", true, sampleZFile.isFile) + val sampleZReader = VCFFileReader(sampleZFile,false) + val sampleZVariants = sampleZReader.iterator().toList() + assertEquals("sampleZ should have 1 variant", 1, sampleZVariants.size) + val sampleZVariant = sampleZVariants[0] + assertEquals("sampleZ variant contig does not match", "chr1", sampleZVariant.contig) + assertEquals("sampleZ variant start does not match", 21, sampleZVariant.start) + assertEquals("sampleZ variant end does not match", 25, sampleZVariant.end) + sampleZReader.close() + } + + @Test + fun testChangeSampleName() { + + val recombineGvcfs = RecombineGvcfs() + + val originalVariantContext = VariantContextBuilder() + .chr("1") + .start(100) + .stop(100) + .id("rs123") + .alleles(listOf(Allele.REF_A, Allele.ALT_C)) + .genotypes( + listOf( + GenotypeBuilder("Sample1").alleles(listOf(Allele.REF_A, Allele.REF_A)).make() + ) + ) + .make() + + val newSampleName = "NewSample" + val renamed = recombineGvcfs.changeSampleName(originalVariantContext, newSampleName) + + assertEquals("SampleName was not updated", newSampleName, renamed.genotypes.sampleNames.first()) + assertEquals( + "Alleles were not preserved", + originalVariantContext.genotypes.first().alleles, + renamed.genotypes.first().alleles + ) + //check that the rest of the variant matches and that Sample1 does not exist + assertEquals("Contigs do not match", originalVariantContext.contig, renamed.contig) + assertEquals("Start positions do not match", originalVariantContext.start, renamed.start) + assertEquals("Stop positions do not match", originalVariantContext.end, renamed.end) + assertEquals("IDs do not match", originalVariantContext.id, renamed.id) + assertEquals("Alleles do not match", originalVariantContext.alleles, renamed.alleles) + assertFalse(renamed.genotypes.sampleNames.contains("Sample1"), "Old sample name still exists") + } + + @Test + fun testBuildRefBlock() { + //Build a sample refBlock + val recombineGvcfs = RecombineGvcfs() + val refBlock = recombineGvcfs.buildRefBlock("chr1", 100, 200, "A","SampleA") + assertEquals("Contig does not match","chr1", refBlock.contig) + assertEquals("Start position does not match",100, refBlock.start) + assertEquals("End position does not match", 200, refBlock.end) + assertEquals("RefBlock should have 2 alleles", 2, refBlock.alleles.size) + assertEquals("Ref allele does not match", Allele.REF_A, refBlock.alleles[0]) + assertEquals("Alt allele does not match",Allele.create("",false), refBlock.alleles[1]) + assertEquals("There should be one sample in the refBlock", 1, refBlock.genotypes.sampleNames.size) + assertEquals("Sample name does not match","SampleA", refBlock.genotypes.sampleNames.first()) + val genotype = refBlock.genotypes.first() + assertEquals("Genotype should have 1 alleles", 1, genotype.alleles.size) + assertEquals("Genotype ref allele does not match", Allele.REF_A, genotype.alleles[0]) + } +} \ No newline at end of file