diff --git a/modules/local/freebayes.nf b/modules/local/freebayes.nf new file mode 100644 index 0000000..1cb23a0 --- /dev/null +++ b/modules/local/freebayes.nf @@ -0,0 +1,33 @@ +process freebayes { + label 'process_high' + conda "bioconda::freebayes=1.3.6 bioconda::bcftools=1.21" + tag "$library" + + input: + tuple val(library), path(bam), path(bai) + path(fasta) + path(fai) + val(target_bed) + + output: + tuple val(library), path("*.vcf.gz"), path("*.vcf.gz.tbi"), emit: vcf + tuple val(library), path("*.flt.vcf.gz"), path("*.flt.vcf.gz.tbi"), emit: filtered_vcf + + script: + + """ + freebayes-parallel <(fasta_generate_regions.py ${fai} 100000) $task.cpus \\ + -f $fasta \\ + $bam \\ + --min-base-quality 1 \\ + > ${library}.vcf + + cat ${library}.vcf | bcftools view -i 'QUAL > $params.freebayes_qual_filter' > ${library}.flt.vcf + bgzip ${library}.flt.vcf + tabix ${library}.flt.vcf.gz + + bgzip ${library}.vcf + tabix ${library}.vcf.gz + + """ +} \ No newline at end of file diff --git a/modules/local/happy_concordance.nf b/modules/local/happy_concordance.nf index 454ac8e..1a8ab60 100644 --- a/modules/local/happy_concordance.nf +++ b/modules/local/happy_concordance.nf @@ -1,7 +1,6 @@ process happyConcordance { label 'process_high' conda "bioconda::hap.py=0.3.15" - publishDir "$params.outdir/happy/", mode: 'copy' tag { library } input: diff --git a/modules/local/parse_happy_vcf.nf b/modules/local/parse_happy_vcf.nf index a6247fb..0519d56 100644 --- a/modules/local/parse_happy_vcf.nf +++ b/modules/local/parse_happy_vcf.nf @@ -1,6 +1,5 @@ process parseHappyVcf { label 'process_single' - publishDir "$params.outdir/happy/", mode: 'copy' tag { library } input: @@ -38,27 +37,4 @@ process parseHappyVcf { done """ -} -// echo "Count Ref Alt Type" > !{library}.happy.snp_mutations.txt - -// if [[ -n "!{happy_bed}" ]]; then -// regions_opt='grep "CONF" |' -// else -// regions_opt='' -// fi - -// zcat *.vcf.gz | grep "SNP" | $regions_opt grep -v NoPass | grep TP |\ -// awk -v FS='\t' -v OFS=' ' '{print \$4, \$5, "Correct"}' \ -// | sort | uniq -c |sed 's/^[ \\t]*//' >> !{library}.happy.snp_mutations.txt - -// zcat *.vcf.gz | grep "SNP" | $regions_opt grep -v NoPass | grep FN | grep -v NOCALL |\ -// awk -v FS='\t' -v OFS=' ' '{print \$4, \$5, "Incorrect"}' \ -// | sort | uniq -c |sed 's/^[ \\t]*//' >> !{library}.happy.snp_mutations.txt - -// zcat *.vcf.gz | grep "SNP" | $regions_opt grep -v NoPass | grep FN | grep NOCALL |\ -// awk -v FS='\t' -v OFS=' ' '{print \$4, \$5, "Missed"}' \ -// | sort | uniq -c |sed 's/^[ \\t]*//' >> !{library}.happy.snp_mutations.txt - -// zcat *.vcf.gz | grep "SNP" | $regions_opt grep NoPass | grep TP |\ -// awk -v FS='\t' -v OFS=' ' '{print \$4, \$5, "Missed"}' \ -// | sort | uniq -c |sed 's/^[ \\t]*//' >> !{library}.happy.snp_mutations.txt \ No newline at end of file +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index de24e67..3b8e092 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,8 @@ params { outdir = 'results' tracedir = "${params.outdir}/pipeline_info" run_strelka = true + run_freebayes = true + freebayes_qual_filter = 15 // default quality filter for freebayes output. run_happy = true target_bed = null happy_truth_vcf = null diff --git a/tests/main.nf.test b/tests/main.nf.test index b900181..5dd13a4 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -21,9 +21,12 @@ nextflow_pipeline { def calcmd_bam = path("${launchDir}/test_output/calcMD/200ng-V2-1.md.calcmd.bam").md5 def masked_bam = path("${launchDir}/test_output/revelio/200ng-V2-1.md.masked.bam").md5 def strelka_vcf = path("${launchDir}/test_output/strelka/200ng-V2-1.md.strelka_vcf.gz").vcf.summary - def happy_vcf = path("${launchDir}/test_output/happy/200ng-V2-1.md_vs_truth.vcf.happy.vcf.gz").vcf.summary - def happy_summary = path("${launchDir}/test_output/happy/200ng-V2-1.md_vs_truth.vcf.happy.summary.csv").text.tokenize('\n') - def snp_breakdown = path("${launchDir}/test_output/happy/200ng-V2-1.md.happy.snp_mutations.txt").text.tokenize('\n') + def freebayes_vcf = path("${launchDir}/test_output/freebayes/200ng-V2-1.md.flt.vcf.gz").vcf.summary + def happy_vcf_strelka = path("${launchDir}/test_output/happyConcordanceStrelka/200ng-V2-1.md_vs_truth.vcf.happy.vcf.gz").vcf.summary + def happy_summary_strelka = path("${launchDir}/test_output/happyConcordanceFreebayes/200ng-V2-1.md_vs_truth.vcf.happy.summary.csv").text.tokenize('\n') + def happy_vcf_freebayes = path("${launchDir}/test_output/happyConcordanceFreebayes/200ng-V2-1.md_vs_truth.vcf.happy.vcf.gz").vcf.summary + def happy_summary_freebayes = path("${launchDir}/test_output/happyConcordanceFreebayes/200ng-V2-1.md_vs_truth.vcf.happy.summary.csv").text.tokenize('\n') + def snp_breakdown = path("${launchDir}/test_output/parseHappyVcf/200ng-V2-1.md.happy.snp_mutations.txt").text.tokenize('\n') assertAll( { assert workflow.success }, @@ -32,8 +35,11 @@ nextflow_pipeline { ["calcmd_bam", calcmd_bam], ["masked_bam", masked_bam], ["strelka_vcf", strelka_vcf], - ["happy_vcf", happy_vcf], - ["happy_summary", happy_summary], + ["freebayes_vcf", freebayes_vcf], + ["happy_vcf_strelka", happy_vcf_strelka], + ["happy_summary_strelka", happy_summary_strelka], + ["happy_vcf_freebayes", happy_vcf_freebayes], + ["happy_summary_freebayes", happy_summary_freebayes], ["snp_breakdown", snp_breakdown] ).match() } diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 097923d..d5981a6 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -3,8 +3,8 @@ "content": [ { "tasksFailed": 0, - "tasksCount": 11, - "tasksSucceeded": 11 + "tasksCount": 15, + "tasksSucceeded": 15 }, [ "calcmd_bam", @@ -19,15 +19,35 @@ "VcfFile [chromosomes=[chr21], sampleCount=1, variantCount=23, phased=false, phasedAutodetect=false]" ], [ - "happy_vcf", + "freebayes_vcf", + "VcfFile [chromosomes=[chr21], sampleCount=1, variantCount=11, phased=false, phasedAutodetect=false]" + ], + [ + "happy_vcf_strelka", "VcfFile [chromosomes=[chr21], sampleCount=2, variantCount=23, phased=false, phasedAutodetect=false]" ], [ - "happy_summary", + "happy_summary_strelka", + [ + "Type,Filter,TRUTH.TOTAL,TRUTH.TP,TRUTH.FN,QUERY.TOTAL,QUERY.FP,QUERY.UNK,FP.gt,FP.al,METRIC.Recall,METRIC.Precision,METRIC.Frac_NA,METRIC.F1_Score,TRUTH.TOTAL.TiTv_ratio,QUERY.TOTAL.TiTv_ratio,TRUTH.TOTAL.het_hom_ratio,QUERY.TOTAL.het_hom_ratio", + "INDEL,ALL,0,0,0,2,2,0,0,1,0.0,0.0,0.0,,,,,", + "INDEL,PASS,0,0,0,2,2,0,0,1,0.0,0.0,0.0,,,,,", + "SNP,ALL,1,1,0,14,13,0,0,1,1.0,0.07142899999999999,0.0,0.13333299999999998,0.0,0.75,0.0,1.3333333333333333", + "SNP,PASS,1,1,0,14,13,0,0,1,1.0,0.07142899999999999,0.0,0.13333299999999998,0.0,0.75,0.0,1.3333333333333333" + ] + ], + [ + "happy_vcf_freebayes", + "VcfFile [chromosomes=[chr21], sampleCount=2, variantCount=16, phased=false, phasedAutodetect=false]" + ], + [ + "happy_summary_freebayes", [ "Type,Filter,TRUTH.TOTAL,TRUTH.TP,TRUTH.FN,QUERY.TOTAL,QUERY.FP,QUERY.UNK,FP.gt,FP.al,METRIC.Recall,METRIC.Precision,METRIC.Frac_NA,METRIC.F1_Score,TRUTH.TOTAL.TiTv_ratio,QUERY.TOTAL.TiTv_ratio,TRUTH.TOTAL.het_hom_ratio,QUERY.TOTAL.het_hom_ratio", - "SNP,ALL,1,1,0,23,22,0,0,0,1.0,0.043477999999999996,0.0,0.083333,0.0,0.5333333333333333,0.0,22.0", - "SNP,PASS,1,1,0,4,3,0,0,0,1.0,0.25,0.0,0.4,0.0,0.0,0.0,3.0" + "INDEL,ALL,0,0,0,2,2,0,0,1,0.0,0.0,0.0,,,,,", + "INDEL,PASS,0,0,0,2,2,0,0,1,0.0,0.0,0.0,,,,,", + "SNP,ALL,1,1,0,14,13,0,0,1,1.0,0.07142899999999999,0.0,0.13333299999999998,0.0,0.75,0.0,1.3333333333333333", + "SNP,PASS,1,1,0,14,13,0,0,1,1.0,0.07142899999999999,0.0,0.13333299999999998,0.0,0.75,0.0,1.3333333333333333" ] ], [ @@ -42,6 +62,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.2" }, - "timestamp": "2025-04-15T11:26:45.336212952" + "timestamp": "2025-04-17T16:25:23.194977669" } } \ No newline at end of file diff --git a/workflows/emseq_variant_calling.nf b/workflows/emseq_variant_calling.nf index 4b35516..bfa7620 100644 --- a/workflows/emseq_variant_calling.nf +++ b/workflows/emseq_variant_calling.nf @@ -34,7 +34,9 @@ include { downloadRevelio } from '../modules/local/download_revelio.nf' include { calcMD } from '../modules/local/calc_md.nf' include { revelio } from '../modules/local/revelio.nf' include { strelka } from '../modules/local/strelka.nf' -include { happyConcordance } from '../modules/local/happy_concordance.nf' +include { freebayes } from '../modules/local/freebayes.nf' +include { happyConcordance as happyConcordanceStrelka; + happyConcordance as happyConcordanceFreebayes } from '../modules/local/happy_concordance.nf' include { parseHappyVcf } from '../modules/local/parse_happy_vcf.nf' workflow emseq_variant_calling { @@ -72,13 +74,13 @@ workflow emseq_variant_calling { fai, target_bed // call regions (optional) ) - + // // Module: Run hap.py to compare variants to a "truth" vcf // if (params.run_happy) { - happyConcordance( + happyConcordanceStrelka( strelka.out.vcf, happy_bed, // comparison regions (optional) fasta, @@ -91,13 +93,39 @@ workflow emseq_variant_calling { // Module: Parse hap.py vcf output to break down SNPs by ref/alt bases. // parseHappyVcf( - happyConcordance.out.vcf, + happyConcordanceStrelka.out.vcf, happy_bed // analyze breakdown in called regions if provided ) } } + // + // Module: Run freebayes to call germline variants + // + if (params.run_freebayes) { + freebayes ( + revelio.out.masked, + fasta, + fai, + target_bed // call regions (optional) + ) + + // + // Module: Run hap.py to compare variants to a "truth" vcf + // + if (params.run_happy) { + + happyConcordanceFreebayes( + freebayes.out.filtered_vcf, + happy_bed, // comparison regions (optional) + fasta, + fai, + happy_truth_vcf, + happy_truth_tbi + ) + } + } } workflow {