Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ plugins {
}

group = "net.maizegenetics"
version = "0.2.7"
version = "0.2.8"

repositories {
mavenCentral()
Expand Down
3 changes: 3 additions & 0 deletions data/RecombineGvcfs/bed/sampleA.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr1 0 10 sampleX
chr1 10 20 sampleY
chr1 20 30 sampleZ
3 changes: 3 additions & 0 deletions data/RecombineGvcfs/bed/sampleB.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr1 0 10 sampleY
chr1 10 20 sampleZ
chr1 20 30 sampleX
3 changes: 3 additions & 0 deletions data/RecombineGvcfs/bed/sampleC.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr1 0 10 sampleZ
chr1 10 20 sampleX
chr1 20 30 sampleY
16 changes: 16 additions & 0 deletions data/RecombineGvcfs/gvcf/sampleA.gvcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleA
chr1 1 . A <NON_REF> . . END=30 GT 0
22 changes: 22 additions & 0 deletions data/RecombineGvcfs/gvcf/sampleB.gvcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleB
chr1 1 . A <NON_REF> . . END=4 GT 0
chr1 5 . A T . . . GT 1
chr1 6 . A <NON_REF> . . END=14 GT 0
chr1 15 . A T . . . GT 1
chr1 16 . A <NON_REF> . . END=24 GT 0
chr1 25 . A T . . . GT 1
chr1 26 . A <NON_REF> . . END=30 GT 0
21 changes: 21 additions & 0 deletions data/RecombineGvcfs/gvcf/sampleC.gvcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sampleC
chr1 1 . A <NON_REF> . . END=8 GT 0
chr1 9 . AAA A . . . GT 1
chr1 12 . A <NON_REF> . . END=16 GT 0
chr1 17 . AAA A . . . GT 1
chr1 20 . A T . . . GT 1
chr1 21 . A <NON_REF> . . END=30 GT 0
2 changes: 2 additions & 0 deletions data/RecombineGvcfs/ref.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
6 changes: 5 additions & 1 deletion src/main/kotlin/Main.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import net.maizegenetics.commands.PickCrossovers
import net.maizegenetics.commands.RopeBwtChrIndex
import net.maizegenetics.commands.RopeBwtMem
import net.maizegenetics.commands.SetupEnvironment
import net.maizegenetics.commands.MutateAssemblies
import net.maizegenetics.commands.RecombineGvcfs

class SeqSim : CliktCommand() {
override fun run() = Unit
Expand All @@ -43,6 +45,8 @@ fun main(args: Array<String>) = SeqSim()
RopeBwtMem(),
BuildSplineKnots(),
ConvertRopebwt2Ps4g(),
ExtractChromIds()
ExtractChromIds(),
MutateAssemblies(),
RecombineGvcfs()
)
.main(args)
75 changes: 6 additions & 69 deletions src/main/kotlin/net/maizegenetics/commands/MutateAssemblies.kt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package net.maizegenetics.net.maizegenetics.commands
package net.maizegenetics.commands

import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option
Expand All @@ -14,44 +14,15 @@ import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.Options
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.variant.vcf.VCFFormatHeaderLine
import htsjdk.variant.vcf.VCFHeader
import htsjdk.variant.vcf.VCFHeaderLine
import htsjdk.variant.vcf.VCFHeaderLineCount
import htsjdk.variant.vcf.VCFHeaderLineType
import htsjdk.variant.vcf.VCFInfoHeaderLine
import net.maizegenetics.utils.Position
import net.maizegenetics.utils.SimpleVariant
import net.maizegenetics.utils.VariantContextUtils
import java.io.File
import java.util.HashSet
import kotlin.io.path.createDirectories
import kotlin.io.path.exists


data class Position(val contig: String, val position: Int) : Comparable<Position> {
override fun compareTo(other: Position): Int {

if (this.contig == other.contig) {
return this.position.compareTo(other.position)
}

val thisContig = this.contig.replace("chr", "", ignoreCase = true).trim()
val otherContig = other.contig.replace("chr", "", ignoreCase = true).trim()

return try {
thisContig.toInt() - otherContig.toInt()
} catch (e: NumberFormatException) {
// If we can't convert contigs to an int, then compare the strings
contig.compareTo(other.contig)
}

}

override fun toString(): String {
return "$contig:$position"
}
}
data class SimpleVariant(val refStart: Position, val refEnd: Position,
val refAllele: String, val altAllele: String,
val isAddedMutation: Boolean = false)

@OptIn(kotlin.io.path.ExperimentalPathApi::class)
class MutateAssemblies : CliktCommand(name = "mutate-assemblies") {
Expand Down Expand Up @@ -156,7 +127,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") {
return
}

if (isIndel(currentSimpleVariant) && isIndel(overlappingVariant)) {
if (VariantContextUtils.isIndel(currentSimpleVariant) && VariantContextUtils.isIndel(overlappingVariant)) {
//skip as it will be tricky/slow to handle
return //TODO figure out a way to handle this well
}
Expand Down Expand Up @@ -260,7 +231,7 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") {
.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
.build().use { writer ->

writer.writeHeader(createGenericHeader(listOf(sampleName), emptySet()))
writer.writeHeader(VariantContextUtils.createGenericHeader(listOf(sampleName), emptySet()))

val sortedVariants = baseVariantMap.asMapOfRanges().toList().sortedBy { it.first.lowerEndpoint() }

Expand Down Expand Up @@ -294,40 +265,6 @@ class MutateAssemblies : CliktCommand(name = "mutate-assemblies") {
}
}

fun createGenericHeader(taxaNames: List<String>, altLines:Set<VCFHeaderLine>): VCFHeader {
val headerLines = createGenericHeaderLineSet() as MutableSet<VCFHeaderLine>
headerLines.addAll(altLines)
return VCFHeader(headerLines, taxaNames)
}

fun createGenericHeaderLineSet(): Set<VCFHeaderLine> {
val headerLines: MutableSet<VCFHeaderLine> = HashSet()
headerLines.add(VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed"))
headerLines.add(
VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")
)
headerLines.add(VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"))
headerLines.add(VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"))
headerLines.add(
VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")
)
headerLines.add(VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth"))
headerLines.add(VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data"))
headerLines.add(VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency"))
headerLines.add(VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval"))
// These last 4 are needed for assembly g/hvcfs, but not for reference. I am keeping them in as header
// lines but they will only be added to the data lines if they are present in the VariantContext.
headerLines.add(VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome"))
headerLines.add(VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position"))
headerLines.add(VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position"))
headerLines.add(VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand"))
return headerLines
}

fun isIndel(variant: SimpleVariant?): Boolean {
if(variant == null) return false
return !((variant.refAllele.length == 1 && variant.altAllele.length == 1) || //SNP
(variant.refAllele.length == 1 && variant.altAllele == "<NON_REF>")) //RefBlock
}

}
Loading