diff --git a/modules/local/UMICollapse/main.nf b/modules/local/UMICollapse/main.nf new file mode 100644 index 0000000..037c9c3 --- /dev/null +++ b/modules/local/UMICollapse/main.nf @@ -0,0 +1,37 @@ +process UMICOLLAPSE { + tag "$meta.id" + label "process_high" + + container 'elly1502/umicollapse:latest' + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + java -jar /UMICollapse/umicollapse.jar \\ + bam \\ + -i $bam \\ + -o ${prefix}.bam \\ + $args + + mv .command.log ${prefix}_UMICollapse.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umicollapse: NA + END_VERSIONS + """ +} + diff --git a/modules/local/umitools_dedup/meta.yml b/modules/local/UMICollapse/meta.yml similarity index 53% rename from modules/local/umitools_dedup/meta.yml rename to modules/local/UMICollapse/meta.yml index 3d3c642..fbd3918 100644 --- a/modules/local/umitools_dedup/meta.yml +++ b/modules/local/UMICollapse/meta.yml @@ -1,14 +1,13 @@ -name: umitools_dedup +name: umicollapse description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. keywords: - - umitools + - umicollapse - deduplication tools: - - umi_tools: + - umicollapse: description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes - documentation: https://umi-tools.readthedocs.io/en/latest/ + UMICollapse contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs). + documentation: https://github.com/Daniel-Liu-c0deb0t/UMICollapse license: ["MIT"] input: - meta: @@ -36,28 +35,11 @@ output: type: file description: BAM file with deduplicated UMIs. pattern: "*.{bam}" - - tsv_edit_distance: - type: file - description: Reports the (binned) average edit distance between the UMIs at each position. - pattern: "*edit_distance.tsv" - - tsv_per_umi: - type: file - description: UMI-level summary statistics. - pattern: "*per_umi.tsv" - - tsv_umi_per_position: - type: file - description: Tabulates the counts for unique combinations of UMI and position. - pattern: "*per_position.tsv" - - log: - type: file - description: Log file - pattern: "*.log" - versions: type: file description: File containing software versions pattern: "versions.yml" authors: - - "@drpatelh" - - "@grst" - - "@klkeys" + - "@Daniel-Liu-c0deb0t" + - "@CharlotteAnne" diff --git a/modules/local/clipqc/templates/clipqc.py b/modules/local/clipqc/templates/clipqc.py index c1c2caa..c759d45 100755 --- a/modules/local/clipqc/templates/clipqc.py +++ b/modules/local/clipqc/templates/clipqc.py @@ -89,18 +89,19 @@ with open(dedup_log, 'r') as logfile: - exp = re.sub('.log', '', os.path.basename(dedup_log)) + exp = re.sub('_UMICollapse.log', '', os.path.basename(dedup_log)) lines = logfile.readlines() - input_reads = [i for i in lines if 'INFO Reads: Input Reads:' in i] + input_reads = [i for i in lines if 'Number of input reads' in i] input_reads = int(re.findall(r'\\d+', input_reads[0])[-1]) - output_reads = [i for i in lines if 'Number of reads out:' in i] + output_reads = [i for i in lines if 'Number of reads after deduplicating' in i] output_reads = int(re.findall(r'\\d+', output_reads[0])[-1]) - mean_umis = [i for i in lines if 'Mean number of unique UMIs per position:' in i] - mean_umis = float(re.findall(r'\\d+', mean_umis[0])[-1]) + mean_umis = [i for i in lines if 'Average number of UMIs per alignment position' in i] + mean_umis = float(re.findall(r'\\d+\\.*\\d*', mean_umis[0])[-1]) + mean_umis = np.round(mean_umis, 2) dedup['exp'].append(exp) dedup['input_reads'].append(input_reads) diff --git a/modules/local/du/main.nf b/modules/local/du/main.nf deleted file mode 100644 index 5b0b825..0000000 --- a/modules/local/du/main.nf +++ /dev/null @@ -1,20 +0,0 @@ -process DU { - label "min_cores" - label "min_mem" - label "regular_queue" - - tag "$meta.id" - - container "biocontainers/biocontainers:v1.2.0_cv1" - - input: - tuple val(meta), path(input_file) - - output: - tuple val(meta), stdout, emit: size - - script: - """ - echo -n "\$(du -kL $input_file | awk '{print(\$1)}')" - """ -} diff --git a/modules/local/du/meta.yml b/modules/local/du/meta.yml deleted file mode 100644 index a4b95dc..0000000 --- a/modules/local/du/meta.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: du -description: Runs du to determine the size of the input file in KB -tools: - - du: - description: Estimates file space usage -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input_file: - type: file - description: The file to determine the size of. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - size: - type: string - description: The size of the file in KB -authors: - - "@marc-jones" \ No newline at end of file diff --git a/modules/local/get_umi_length/main.nf b/modules/local/get_umi_length/main.nf deleted file mode 100644 index 7159d1d..0000000 --- a/modules/local/get_umi_length/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process GET_UMI_LENGTH { - tag "$meta.id" - label 'process_low' - - container "quay.io/biocontainers/pysam:0.19.0--py39h5030a8b_0" - - input: - tuple val(meta), path(bam), path(bai) - - output: - tuple val(meta), stdout, emit: length - - script: - def umi_separator = task.ext.args ?: ":" - """ - #!/usr/bin/env python3 - - import pysam - import sys - - file_path = "$bam" - umi_separator = "$umi_separator" - - bam_file = pysam.AlignmentFile(file_path, "rb") - - max_umi_len = 0 - for read in bam_file.fetch(): - if umi_separator in read.query_name: - max_umi_len = max(max_umi_len, len(read.query_name.split(umi_separator)[-1])) - - sys.stdout.write(str(max_umi_len)) - """ -} diff --git a/modules/local/get_umi_length/meta.yml b/modules/local/get_umi_length/meta.yml deleted file mode 100644 index 5efdd47..0000000 --- a/modules/local/get_umi_length/meta.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: get_umi_length -description: Determines the UMI length used for aligned reads -tools: - - pysam: - description: | - Pysam is a python module for reading and manipulating files in the - SAM/BAM format. The SAM/BAM format is a way to store efficiently large - numbers of alignments, such as those routinely created by - next-generation sequencing methods. Pysam is a lightweight wrapper of - the samtools C-API. Pysam also includes an interface for tabix. - homepage: https://github.com/pysam-developers/pysam - documentation: https://pysam.readthedocs.io - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - - bai: - type: file - description: BAM/CRAM/SAM index file - pattern: "*.{bai,crai,sai}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - size: - type: length - description: The size of the UMI -authors: - - "@marc-jones" \ No newline at end of file diff --git a/modules/local/umitools_dedup/main.nf b/modules/local/umitools_dedup/main.nf deleted file mode 100644 index 19694f9..0000000 --- a/modules/local/umitools_dedup/main.nf +++ /dev/null @@ -1,41 +0,0 @@ -process UMITOOLS_DEDUP { - tag "$meta.id" - label "process_high" - - conda (params.enable_conda ? "bioconda::umi_tools=1.1.2" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : - 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" - - input: - tuple val(meta), path(bam), path(bai) - - output: - tuple val(meta), path("*.bam") , emit: bam - tuple val(meta), path("*.log") , emit: log - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "" : "--paired" - def low_mem = meta.low_memory ? "--method unique" : "" - """ - umi_tools \\ - dedup \\ - -I $bam \\ - -S ${prefix}.bam \\ - --log=${prefix}.log \\ - $paired \\ - $low_mem \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') - END_VERSIONS - """ -} diff --git a/subworkflows/clipqualitycheck.nf b/subworkflows/clipqualitycheck.nf index b1eb271..de6e5ae 100644 --- a/subworkflows/clipqualitycheck.nf +++ b/subworkflows/clipqualitycheck.nf @@ -15,7 +15,7 @@ workflow CLIP_QUALITY_CHECK { trimgalore_log bowtie_align_log star_align_log_final - umitools_dedup_log + umicollapse_log crosslinks icount_peaks paraclu_peaks @@ -26,7 +26,7 @@ workflow CLIP_QUALITY_CHECK { CLIPQC ( bowtie_align_log.map(strip_meta).collect(), star_align_log_final.map(strip_meta).collect(), - umitools_dedup_log.map(strip_meta).collect(), + umicollapse_log.map(strip_meta).collect(), crosslinks.map(strip_meta).collect(), icount_peaks.map(strip_meta).collect(), paraclu_peaks.map(strip_meta).collect(), diff --git a/subworkflows/primaryclipanalysis.config b/subworkflows/primaryclipanalysis.config index 3328ae1..2fab21a 100644 --- a/subworkflows/primaryclipanalysis.config +++ b/subworkflows/primaryclipanalysis.config @@ -1,6 +1,6 @@ includeConfig '../conf/base.config' -def umi_separator = params.containsKey("umi_separator") ? params.umi_separator : "rbc" +def umi_separator = params.containsKey("umi_separator") ? params.umi_separator : "rbc:" process { withName: "TRIMGALORE" { @@ -15,15 +15,11 @@ process { ext.args = "--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --outFilterMultimapNmax 1 --outFilterMultimapScoreRange 1 --outSAMattributes All --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --alignIntronMin 20 --alignIntronMax 1000000 --outFilterScoreMin 10 --alignEndsType Extend5pOfRead1 --twopassMode Basic" } - withName: "UMITOOLS_DEDUP" { - ext.args = "--umi-separator='${umi_separator}:'" + withName: "UMICOLLAPSE" { + ext.args = "--umi-sep '${umi_separator}' --two-pass" } - withName: "TOME_UMITOOLS_DEDUP" { - ext.args = "--umi-separator='${umi_separator}:'" - } - - withName: "GET_UMI_LENGTH" { - ext.args = "${umi_separator}:" + withName: "TOME_UMICOLLAPSE" { + ext.args = "--umi-sep '${umi_separator}' --two-pass" } } \ No newline at end of file diff --git a/subworkflows/primaryclipanalysis.json b/subworkflows/primaryclipanalysis.json index 8e12b9e..123956e 100644 --- a/subworkflows/primaryclipanalysis.json +++ b/subworkflows/primaryclipanalysis.json @@ -31,7 +31,7 @@ "format": "text", "required": false, "default": "rbc", - "description": "Separator used in UMI-tools Dedup." + "description": "Separator used in UMICollapse." } } } diff --git a/subworkflows/primaryclipanalysis.nf b/subworkflows/primaryclipanalysis.nf index fa9da2a..986a59b 100644 --- a/subworkflows/primaryclipanalysis.nf +++ b/subworkflows/primaryclipanalysis.nf @@ -5,21 +5,17 @@ nextflow.enable.dsl=2 include { TRIMGALORE } from '../modules/nf-core/modules/trimgalore/main' include { BOWTIE_ALIGN } from '../modules/nf-core/modules/bowtie/align/main' include { STAR_ALIGN } from '../modules/nf-core/modules/star/align/main' -include { DU } from '../modules/local/du/main' -include { GET_UMI_LENGTH } from '../modules/local/get_umi_length/main' -include { UMITOOLS_DEDUP } from '../modules/local/umitools_dedup/main' +include { UMICOLLAPSE } from '../modules/local/UMICollapse/main' include { SAMTOOLS_INDEX as STAR_SAMTOOLS_INDEX} from '../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_INDEX as UMITOOLS_SAMTOOLS_INDEX} from '../modules/nf-core/modules/samtools/index/main' +include { SAMTOOLS_INDEX as UMICOLLAPSE_SAMTOOLS_INDEX} from '../modules/nf-core/modules/samtools/index/main' include { GET_CROSSLINKS } from '../modules/local/get_crosslinks/main' include { CROSSLINKS_COVERAGE } from '../modules/luslab/nf-core-modules/crosslinks/coverage/main' include { CROSSLINKS_NORMCOVERAGE } from '../modules/luslab/nf-core-modules/crosslinks/normcoverage/main' include { FILTER_TRANSCRIPTS } from '../modules/local/filter_transcriptome_bam/main' -include { DU as TOME_DU } from '../modules/local/du/main' -include { GET_UMI_LENGTH as TOME_GET_UMI_LENGTH } from '../modules/local/get_umi_length/main' -include { UMITOOLS_DEDUP as TOME_UMITOOLS_DEDUP } from '../modules/local/umitools_dedup/main' +include { UMICOLLAPSE as TOME_UMICOLLAPSE } from '../modules/local/UMICollapse/main' include { SAMTOOLS_INDEX as TOME_STAR_SAMTOOLS_INDEX } from '../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_INDEX as TOME_UMITOOLS_SAMTOOLS_INDEX } from '../modules/nf-core/modules/samtools/index/main' +include { SAMTOOLS_INDEX as TOME_UMICOLLAPSE_SAMTOOLS_INDEX } from '../modules/nf-core/modules/samtools/index/main' include { GET_CROSSLINKS as TOME_GET_CROSSLINKS } from '../modules/local/get_crosslinks/main' include { CROSSLINKS_COVERAGE as TOME_CROSSLINKS_COVERAGE } from '../modules/luslab/nf-core-modules/crosslinks/coverage/main' include { CROSSLINKS_NORMCOVERAGE as TOME_CROSSLINKS_NORMCOVERAGE } from '../modules/luslab/nf-core-modules/crosslinks/normcoverage/main' @@ -34,15 +30,6 @@ include { PARACLU_PARACLU } from '../modules/luslab/nf-core-modules/paraclu/para include { PARACLU_CUT } from '../modules/luslab/nf-core-modules/paraclu/cut/main' include { PEKA } from '../modules/luslab/nf-core-modules/peka/main' -// Closure to annotate UMITools Input -annotate_umitools_input = { it -> - def meta = it[0].clone() - if (it[3].toInteger() >= params.max_kilobytes & - it[4].toInteger() >= params.max_umi_length) { - meta["low_memory"] = true - } - return [meta, it[1], it[2]] -} workflow { // If running straight from command line, will need to construct the @@ -146,25 +133,12 @@ workflow PRIMARY_CLIP_ANALYSIS { TOME_STAR_SAMTOOLS_INDEX ( FILTER_TRANSCRIPTS.out.filtered_bam ) FILTER_TRANSCRIPTS.out.filtered_bam.join(TOME_STAR_SAMTOOLS_INDEX.out.bai) .set{ tome_ch_umi_input } - - // Determine if UMITools needs to be run in "low_memory" mode - TOME_DU ( tome_ch_umi_input.map{it -> it[0, 1]} ) - TOME_GET_UMI_LENGTH ( tome_ch_umi_input ) - tome_ch_umi_input - .join( TOME_DU.out.size ) - .join( TOME_GET_UMI_LENGTH.out.length ) - .map( annotate_umitools_input ) - .set{ tome_ch_umi_input_annotated } - - TOME_UMITOOLS_DEDUP ( tome_ch_umi_input_annotated ) - TOME_UMITOOLS_DEDUP.out.bam - .map{ it -> [it[0].findAll{key, val -> key != "low_memory"}, it[1]] } - .set{ ch_tome_umitools_bam } - TOME_UMITOOLS_SAMTOOLS_INDEX ( ch_tome_umitools_bam ) + TOME_UMICOLLAPSE ( tome_ch_umi_input) + TOME_UMICOLLAPSE_SAMTOOLS_INDEX ( TOME_UMICOLLAPSE.out.bam ) reads.map{triplet -> [ triplet[0], file(triplet[2] + "/FIND_LONGEST_TRANSCRIPT/*.fa.fai") ]}.set{ ch_longest_transcript_index } - tome_ch_xl_input = ch_tome_umitools_bam.join(TOME_UMITOOLS_SAMTOOLS_INDEX.out.bai) + tome_ch_xl_input = TOME_UMICOLLAPSE.out.bam.join(TOME_UMICOLLAPSE_SAMTOOLS_INDEX.out.bai) tome_ch_xl_input.join( ch_longest_transcript_index ).set{ tome_with_index } tome_with_index.multiMap { tuple -> bam: [tuple[0], tuple[1], tuple[2]] @@ -172,43 +146,21 @@ workflow PRIMARY_CLIP_ANALYSIS { }.set { ch_tome_input } TOME_GET_CROSSLINKS ( ch_tome_input.bam, ch_tome_input.transcript ) TOME_CROSSLINKS_COVERAGE ( TOME_GET_CROSSLINKS.out.crosslinkBed ) - TOME_CROSSLINKS_NORMCOVERAGE ( TOME_GET_CROSSLINKS.out.crosslinkBed ) - + TOME_CROSSLINKS_NORMCOVERAGE ( TOME_GET_CROSSLINKS.out.crosslinkBed ) // Get crosslinks STAR_SAMTOOLS_INDEX ( STAR_ALIGN.out.bam_sorted ) ch_umi_input = STAR_ALIGN.out.bam_sorted.combine(STAR_SAMTOOLS_INDEX.out.bai, by: 0) - // Determine if UMITools needs to be run in "low_memory" mode - DU ( ch_umi_input.map{it -> it[0, 1]} ) - GET_UMI_LENGTH ( ch_umi_input ) - ch_umi_input - .join( DU.out.size ) - .join( GET_UMI_LENGTH.out.length ) - .map( annotate_umitools_input ) - .set{ ch_umi_input_annotated } - - UMITOOLS_DEDUP ( ch_umi_input_annotated ) - - // Strip out the low_memory key from the meta value so that the later joins - // actually work - UMITOOLS_DEDUP.out.bam - .map{ it -> [it[0].findAll{key, val -> key != "low_memory"}, it[1]] } - .set{ ch_umitools_bam } - // Keep a channel for converting between meta with the low_memory key and - // without, in case in the future you want to keep track of which files were - // run as low mem - UMITOOLS_DEDUP.out.bam - .map{ it -> [it[0].findAll{key, val -> key != "low_memory"}, it[0]] } - .set{ ch_meta_conversion } - - UMITOOLS_SAMTOOLS_INDEX ( ch_umitools_bam ) + UMICOLLAPSE ( ch_umi_input ) + + UMICOLLAPSE_SAMTOOLS_INDEX ( UMICOLLAPSE.out.bam ) reads.map{triplet -> [ triplet[0], file(triplet[2] + "/SAMTOOLS_FAIDX/*.fa.fai") ]}.set{ ch_genome_fai } - ch_xl_input = ch_umitools_bam.join(UMITOOLS_SAMTOOLS_INDEX.out.bai) + ch_xl_input = UMICOLLAPSE.out.bam.join(UMICOLLAPSE_SAMTOOLS_INDEX.out.bai) ch_xl_input.join( ch_genome_fai ).set{ ch_with_index } @@ -295,9 +247,9 @@ workflow PRIMARY_CLIP_ANALYSIS { trimgalore_log = TRIMGALORE.out.log bowtie_align_log = BOWTIE_ALIGN.out.log star_align_log_final = STAR_ALIGN.out.log_final - umitools_dedup_log = UMITOOLS_DEDUP.out.log + umicollapse_log = UMICOLLAPSE.out.log crosslinks = GET_CROSSLINKS.out.crosslinkBed icount_peaks = ICOUNT_PEAKS.out.peaks paraclu_peaks = PARACLU_CONVERT.out.peaks clippy_peaks = CLIPPY.out.peaks -} +} \ No newline at end of file diff --git a/tests/test_primary_clip_analysis_pipeline.py b/tests/test_primary_clip_analysis_pipeline.py index 5ba17c7..8c0e6a1 100644 --- a/tests/test_primary_clip_analysis_pipeline.py +++ b/tests/test_primary_clip_analysis_pipeline.py @@ -18,20 +18,7 @@ def test_can_run_pipeline_with_genome_that_has_gzip(self): "Hs_genome": "assets/human_genome", }, profile=["iMaps", "local", "test"]) self.assertEqual(execution.status, "OK", msg=execution.stdout) - self.assertEqual(len(execution.process_executions), 29) - - # Default UMI Separator is rbc - for proc in execution.process_executions: - if "UMITOOLS_DEDUP" in proc.process: - p1, p2 = proc.hash.split("/") - subdirs = os.listdir(os.path.join("work", p1)) - subdir = [d for d in subdirs if d.startswith(p2)][0] - meta_id = proc.name[proc.name.find("(") + 1 : proc.name.find(")")] - with open(os.path.join("work", p1, subdir, "{}.log".format(meta_id))) as f: - self.assertIn( - "--umi-separator=rbc:", f.read(), - "Default umi-separator was not 'rbc'" - ) + self.assertEqual(len(execution.process_executions), 25) def test_can_run_pipeline_with_genome_that_has_no_gzip(self): @@ -49,7 +36,7 @@ def test_can_run_pipeline_with_genome_that_has_no_gzip(self): "Hs_genome": "assets/human_genome_no_gzip", }, profile=["iMaps", "local", "test"]) self.assertEqual(execution.status, "OK", msg=execution.stdout) - self.assertEqual(len(execution.process_executions), 29) + self.assertEqual(len(execution.process_executions), 25) finally: if os.path.exists("assets/human_genome_no_gzip"): shutil.rmtree("assets/human_genome_no_gzip") diff --git a/workflows/demuxandanalyse.nf b/workflows/demuxandanalyse.nf index 247aed1..cb0b983 100644 --- a/workflows/demuxandanalyse.nf +++ b/workflows/demuxandanalyse.nf @@ -40,7 +40,7 @@ workflow { PRIMARY_CLIP_ANALYSIS.out.trimgalore_log, PRIMARY_CLIP_ANALYSIS.out.bowtie_align_log, PRIMARY_CLIP_ANALYSIS.out.star_align_log_final, - PRIMARY_CLIP_ANALYSIS.out.umitools_dedup_log, + PRIMARY_CLIP_ANALYSIS.out.umicollapse_log, PRIMARY_CLIP_ANALYSIS.out.crosslinks, PRIMARY_CLIP_ANALYSIS.out.icount_peaks, PRIMARY_CLIP_ANALYSIS.out.paraclu_peaks,