nf-core · TCLamnidis · Mar 7, 2025 · Mar 28, 2025 · Apr 4, 2025 · Apr 4, 2025
diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json
@@ -132,6 +132,27 @@
                 "pattern": "^\\S+\\.vcf$",
                 "exists": true,
                 "errorMessage": "SNP annotation files for GATK must not contain any spaces and have file extension '.vcf'."
+            },
+            "consensus_multivcfanalyzer_reference_gff_annotations": {
+                "type": "string",
+                "format": "file-path",
+                "pattern": "^\\S+\\.gff(\\.gz)?$",
+                "exists": true,
+                "errorMessage": "GFF annotation files for MultiVCFAnalyzer must not contain any spaces and have file extensions '.gff' or '.gff.gz'."
+            },
+            "consensus_multivcfanalyzer_reference_gff_exclude": {
+                "type": "string",
+                "format": "file-path",
+                "pattern": "^\\S+\\.bed(\\.gz)?$",
+                "exists": true,
+                "errorMessage": "BED files for exclusion in MultiVCFAnalyzer must not contain any spaces and have file extensions '.bed' or '.bed.gz'."
+            },
+            "consensus_multivcfanalyzer_reference_snpeff_results": {
+                "type": "string",
+                "format": "file-path",
+                "pattern": "^\\S+\\.txt(\\.gz)?$",
+                "exists": true,
+                "errorMessage": "SNPeff results files for MultiVCFAnalyzer must not contain any spaces and have file extensions '.txt' or '.txt.gz'."
             }
         },
         "required": ["reference_name", "fasta"],

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -73,6 +73,18 @@
                 "type": "string",
                 "meta": ["bam_reference_id"],
                 "errorMessage": "A BAM reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a BAM file."
+            },
+            "vcf": {
+                "type": "string",
+                "format": "file-path",
+                "pattern": "^\\S+\\.vcf\\.gz$",
+                "exists": true,
+                "errorMessage": "VCF files cannot contain any spaces, must have extension '.vcf.gz', and must be gzip compressed."
+            },
+            "vcf_reference_id": {
+                "type": "string",
+                "meta": ["vcf_reference_id"],
+                "errorMessage": "A VCF reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a VCF file."
             }
         },
         "required": [
@@ -90,18 +102,24 @@
             },
             {
                 "required": ["bam"]
+            },
+            {
+                "required": ["vcf"]
             }
         ],
         "dependentRequired": {
             "r2": ["r1"],
             "bam": ["bam_reference_id"],
-            "bam_reference_id": ["bam"]
+            "bam_reference_id": ["bam"],
+            "vcf": ["vcf_reference_id"],
+            "vcf_reference_id": ["vcf"]
         }
     },
     "allOf": [
         { "uniqueEntries": ["lane", "library_id"] },
         { "uniqueEntries": "r1" },
         { "uniqueEntries": "r2" },
-        { "uniqueEntries": "bam" }
+        { "uniqueEntries": "bam" },
+        { "uniqueEntries": "vcf" }
     ]
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -1757,4 +1757,42 @@ process {
             ]
         ]
     }
+
+    withName: UG_BGZIP {
+        tag = { "${meta.reference}|${meta.sample_id}" }
+        ext.args = "-d"
+    }
+
+    withName: REF_MVA_GUNZIP {
+        tag = { "${meta.reference}" }
+        ext.args = "-d"
+    }
+
+    withName: MULTIVCFANALYZER {
+        tag = { "${meta.reference}" }
+        ext.prefix = { "multivcfanalyzer_${meta.reference}" }
+        publishDir = [
+            [
+                // data
+                path: { "${params.outdir}/consensus_sequence/multivcfanalyzer/data/" },
+                mode: params.publish_dir_mode,
+                enabled: true,
+                pattern: '*.fasta.gz'
+            ],
+            [
+                // data
+                path: { "${params.outdir}/consensus_sequence/multivcfanalyzer/data/" },
+                mode: params.publish_dir_mode,
+                enabled: true,
+                pattern: '*{snpTable,structure}*.tsv'
+            ],
+            [
+                // stats
+                path: { "${params.outdir}/consensus_sequence/multivcfanalyzer/stats/" },
+                mode: params.publish_dir_mode,
+                enabled: true,
+                pattern: '*{.txt,snpStatistics.tsv}'
+            ]
+        ]
+    }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -63,4 +63,9 @@ params {
     metagenomics_profiling_tool     = 'metaphlan'
     metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/metaphlan/metaphlan4_database.tar.gz'
     metagenomics_run_postprocessing = true
+
+    // Consensus sequence generation
+    run_consensus_sequence                              = true
+    consensus_tool                                      = 'multivcfanalyzer'
+    consensus_multivcfanalyzer_write_allele_frequencies = true
 }
diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
@@ -1123,3 +1123,21 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --
 ## Expect: BAM input shows up in FastQC -> mapping results.
 nextflow run main.nf -profile test,docker --outdir ./results -w work/ --convert_inputbam --skip_deduplication -resume -ansi-log false -dump-channels
 ```
+
+# MultiVCFAnalyzer
+
+Based on GATK_UG test, but with added consensus sequence.
+
+```bash
+## Gatk UG on raw reads
+## Expect: MVCFA runs. Results include three fasta.gz files, 3 snpTable TSV files and 2 structureGenotypes TSV files in the data section, and two tsv/txt files in the stats section of the results.
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels --run_consensus_sequence --consensus_tool 'multivcfanalyzer'
+```
+
+Based on test_microbial, but forcing GATK_UG genotyping. Multi Reference.
+
+```bash
+## Gatk UG on raw reads. Use BWA because circularmapper runs into issues with BWA, due to difference in MT chrom length. Custom input to test multiple samples/lanes.
+## Expect: MVCFA runs. Two sets of outputs, one per reference. MVCFA runs into potential out of memory issues with the multi contig reference hs37d5_chr22-MT, so I had to run this on an HPC, with increased resources.
+nextflow run main.nf -profile test_microbial,eva_grace --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels --run_consensus_sequence --consensus_tool 'multivcfanalyzer' --mapping_tool 'bwaaln' --input "https://github.com/nf-core/test-datasets/raw/refs/heads/eager/testdata/Mammoth/samplesheet_v3.tsv" --deduplication_tool 'markduplicates'
+```
diff --git a/docs/output.md b/docs/output.md
@@ -752,3 +752,28 @@ When using pileupCaller for genotyping, single-stranded and double-stranded libr
 </details>
 
 [ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is a software for analyzing next generation sequencing data. It can estimate genotype likelihoods and allele frequencies from next-generation sequencing data. The output provided is a bgzipped genotype likelihood file, containing likelihoods across all samples per reference. Users can specify the model used for genotype likelihood estimation, as well as the output format. For more information on the available options, see the [ANGSD](https://www.popgen.dk/angsd/index.php/Genotype_Likelihoods).
+
+### Consensus calling
+
+#### MultiVCFAnalyzer
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `consensus_sequence/multivcfanalyzer/`
+  - `data/`
+    - `*fullAlignment.fasta.gz`: FASTA file of all positions contained in the VCF files i.e. including ref calls
+    - `*snpAlignment.fasta.gz`: FASTA file of only SNP positions including only the calls of the samples.
+    - `*snpAlignmentIncludingRefGenome.fasta.gz`: FASTA file of just SNP positions including reference genome calls.
+    - `*snpTableForSnpEff.tsv`: Input file for SnpEff.
+    - `*snpTable.tsv`: SNP table of combined positions taken from each VCF file, in TSV format.
+    - `*snpTableWithUncertaintyCalls.tsv`: SNP table of combined positions taken from each VCF file, in TSV format, but with lower case characters indicating uncertain calls
+    - `*structureGenotypes_noMissingData-Columns.tsv`: Alternate input file for STRUCTURE.
+    - `*structureGenotypes.tsv`: Input file for STRUCTURE.
+  - `stats/`
+    - `*info.txt`: File with information about the run
+    - `*snpStatistics.tsv`: File containing basic statistics about the SNP calls of each sample.
+
+</details>
+
+[MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) is a SNP filtering and SNP alignment generation tool, designed around (but not limited to) low coverage ancient DNA data. MultiVCFAnalyzer reads multiple VCF files as produced by GATK UnifiedGenotyper, performs filtering based on a number of criteria, and provides the combined genotype calls in a number of formats that are suitable for follow-up analyses such as phylogenetic reconstruction, SNP effect analyses, population genetic analyses etc. Furthermore, the results are provided in the form of various tables for manual inspection and presentation/publication purposes.
diff --git a/main.nf b/main.nf
@@ -45,6 +45,7 @@ workflow NFCORE_EAGER {
     take:
     samplesheet_fastqs // channel: samplesheet read in from --input
     samplesheet_bams
+    samplesheet_vcfs
 
     main:
 
@@ -53,7 +54,8 @@ workflow NFCORE_EAGER {
     //
     EAGER (
         samplesheet_fastqs,
-        samplesheet_bams
+        samplesheet_bams,
+        samplesheet_vcfs
     )
     emit:
     multiqc_report = EAGER.out.multiqc_report // channel: /path/to/multiqc_report.html
@@ -88,6 +90,7 @@ workflow {
     NFCORE_EAGER (
         PIPELINE_INITIALISATION.out.samplesheet_fastqs,
         PIPELINE_INITIALISATION.out.samplesheet_bams,
+        PIPELINE_INITIALISATION.out.samplesheet_vcfs
     )
     //
     // SUBWORKFLOW: Run completion tasks

diff --git a/modules.json b/modules.json
@@ -235,6 +235,11 @@
                         "git_sha": "af27af1be706e6a2bb8fe454175b0cdf77f47b49",
                         "installed_by": ["modules"]
                     },
+                    "multivcfanalyzer": {
+                        "branch": "master",
+                        "git_sha": "de26d9dc50ba49a6ffaac64d473f784ab97a9d61",
+                        "installed_by": ["modules"]
+                    },
                     "picard/createsequencedictionary": {
                         "branch": "master",
                         "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447",
@@ -340,6 +345,11 @@
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "tabix/bgzip": {
+                        "branch": "master",
+                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "installed_by": ["modules"]
+                    },
                     "taxpasta/merge": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",

diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml
diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf