From 0ec5df588b0646056519825aad4bd09563266cd9 Mon Sep 17 00:00:00 2001
From: "Kevin Bonham, PhD"
Date: Wed, 13 Nov 2024 11:32:56 -0500
Subject: [PATCH 01/10] mp v4 working

---
Review notes:
- Series-wide: indentation inside the hunks (and which side of a
  whitespace-only -/+ pair carries the whitespace) is reconstructed;
  apply with `git am --ignore-whitespace` or regenerate the series with
  `git format-patch` from the original commits.
- processes/metaphlan.nf drops the `unmatched` input here, but main.nf
  (see the context lines in PATCH 02) still calls metaphlan() with three
  arguments until PATCH 09. main.nf presumably does not run on the
  intermediate commits -- confirm whether that matters for bisecting.
- metaphlan_only.nf assigns the result of metaphlan_bzip(...) to a
  variable that is also named metaphlan_bzip; consider a distinct name
  so the process name is not shadowed inside the workflow.

 aws-params.yaml        |  4 ++--
 metaphlan_only.nf      | 18 ++++++++++++++++++
 processes/metaphlan.nf | 11 +++--------
 3 files changed, 23 insertions(+), 10 deletions(-)
 create mode 100644 metaphlan_only.nf

diff --git a/aws-params.yaml b/aws-params.yaml
index ba0ae3e..377f1ad 100644
--- a/aws-params.yaml
+++ b/aws-params.yaml
@@ -1,8 +1,8 @@
 readsdir: "s3://vkc-nextflow/rawfastq/"
 outdir: "s3://vkc-nextflow/output/"
 human_genome: "s3://biobakery-databases/kneaddata_databases/"
-metaphlan_db: "s3://biobakery-databases/metaphlan_databases/"
+metaphlan_db: "s3://biobakery-databases/metaphlan_v4_databases/"
 humann_bowtie_db: "s3://biobakery-databases/humann_databases/chocophlan"
 humann_protein_db: "s3://biobakery-databases/humann_databases/uniref"
 humann_utility_db: "s3://biobakery-databases/humann_databases/utility_mapping"
-filepattern: "*_L00{1,2,3,4}_R{1,2}_001.fastq.gz"
\ No newline at end of file
+filepattern: "*_L00{1,2,3,4}_R{1,2}_001.fastq.gz"
diff --git a/metaphlan_only.nf b/metaphlan_only.nf
new file mode 100644
index 0000000..e4f02fc
--- /dev/null
+++ b/metaphlan_only.nf
@@ -0,0 +1,18 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+include { metaphlan; metaphlan_bzip } from './processes/metaphlan.nf'
+
+workflow {
+
+    read_pairs_ch = Channel
+        .fromFilePairs(
+            [ "$params.readsdir/$params.filepattern",
+              "$params.readsdir/*_kneaddata.fastq.gz" ],
+            size:-1)
+
+    metaphlan_db = params.metaphlan_db
+
+    metaphlan_out = metaphlan(read_pairs_ch, metaphlan_db)
+    metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
+}
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index e081e44..ad35952 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -4,7 +4,6 @@ process metaphlan {
 
     input:
     tuple val(sample), path(kneads)
-    path unmatched
     path metaphlan_db
 
     output:
@@ -15,13 +14,9 @@ process metaphlan {
     path "${sample}.sam"
 
     script:
-    def forward = kneads[0]
-    def reverse = kneads[1]
-    def unf = unmatched[0]
-    def unr = unmatched[1]
-
+    
     """
-    cat $forward $reverse $unf $unr > ${sample}_grouped.fastq.gz
+    cat $kneads > ${sample}_grouped.fastq.gz
 
     metaphlan ${sample}_grouped.fastq.gz ${sample}_profile.tsv \
         --bowtie2out ${sample}_bowtie2.tsv \
@@ -49,4 +44,4 @@ process metaphlan {
     """
     bzip2 -v $sam
     """
-}
\ No newline at end of file
+}
From c277c986897853faf5139fa47ab304b0eb8ea22b Mon Sep 17 00:00:00 2001
From: "Kevin Bonham, PhD"
Date: Thu, 2 Jan 2025 16:12:41 -0500
Subject: [PATCH 02/10] modify main and metaphlan

---
Review notes:
- Re-adds `path unmatched` to the metaphlan inputs although the part of
  the script visible in PATCH 01 no longer references it; PATCH 04
  removes it again. metaphlan_only.nf passes two arguments, so it
  presumably fails between this commit and PATCH 04 -- TODO confirm.
- The `tuple ...` and `path metaphlan_db` lines change in whitespace
  only.
- The humann steps are removed from the main.nf workflow; the humann_*
  entries remain in aws-params.yaml.

 main.nf                | 6 ------
 processes/metaphlan.nf | 5 +++--
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index e99896d..aa11be5 100755
--- a/main.nf
+++ b/main.nf
@@ -14,14 +14,8 @@ workflow {
 
     human_genome = params.human_genome
     metaphlan_db = params.metaphlan_db
-    humann_bowtie_db = params.humann_bowtie_db
-    humann_protein_db = params.humann_protein_db
-    humann_utility_db = params.humann_utility_db
 
     knead_out = kneaddata(read_pairs_ch, human_genome)
     metaphlan_out = metaphlan(knead_out[0], knead_out[1], metaphlan_db)
     metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
-    humann_out = humann(metaphlan_out[0], metaphlan_out[1], metaphlan_out[2], humann_bowtie_db, humann_protein_db)
-    regroup_out = humann_regroup(humann_out[0], humann_out[1], humann_utility_db)
-    humann_rename(regroup_out, humann_utility_db)
 }
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index ad35952..ecc1a40 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -3,8 +3,9 @@ process metaphlan {
     publishDir "$params.outdir/metaphlan", pattern: "{*.tsv}"
 
     input:
-    tuple val(sample), path(kneads)
-    path metaphlan_db
+        tuple val(sample), path(kneads)
+        path unmatched
+        path metaphlan_db
 
     output:
     val sample , emit: sample
From 6051b4948b04948183f0da69d51a92e8f7500d72 Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:41:43 -0500
Subject: [PATCH 03/10] added `prefetch_and_split` process

---
Review notes:
- publishDir is declared after the output: block. The Nextflow process
  documentation places directives at the top of the process body,
  before input/output/script; verify that this placement is honoured by
  the Nextflow version in use, or move it next to `tag`.
- Both gzip calls hard-code the _1/_2 suffixes, so this presumably
  assumes paired-end accessions -- TODO confirm behaviour for
  single-end runs.
- The echo messages mention S3, but no S3 destination appears in the
  prefetch / fasterq-dump commands; the only publishing shown is via
  publishDir into params.readsdir.

 processes/prefetch_and_split.nf | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 processes/prefetch_and_split.nf

diff --git a/processes/prefetch_and_split.nf b/processes/prefetch_and_split.nf
new file mode 100644
index 0000000..6a4bd06
--- /dev/null
+++ b/processes/prefetch_and_split.nf
@@ -0,0 +1,24 @@
+process prefetch_and_split {
+    tag "$sra_id"
+
+    input:
+    val sra_id
+
+    output:
+    tuple val(sra_id), path("${sra_id}_*.fastq.gz")
+
+    publishDir "${params.readsdir}", mode: 'copy'
+
+    script:
+    """
+    echo "Prefetching $sra_id to S3"
+    prefetch $sra_id -v -v
+
+    echo "Splitting $sra_id into FASTQ files directly on S3"
+    fasterq-dump $sra_id --split-files --threads ${task.cpus} -v -v
+
+    echo "Compressing FASTQ files"
+    gzip ${sra_id}_1.fastq
+    gzip ${sra_id}_2.fastq
+    """
+}
\ No newline at end of file
From fd0b3f0455c4e33f44c105869c12dd7610b7d4ba Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:42:21 -0500
Subject: [PATCH 04/10] removed unmatched from MP input

---
 processes/metaphlan.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index ecc1a40..47ba0b7 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -4,7 +4,6 @@ process metaphlan {
 
     input:
         tuple val(sample), path(kneads)
-        path unmatched
         path metaphlan_db
 
     output:
From e12687ad3753d8759b9663923907c5d37d9115c1 Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:42:38 -0500
Subject: [PATCH 05/10] added workflow to use the SRA process

---
Review notes:
- Lines read from params.sra_list are trimmed but blank lines are not
  filtered out, so an empty line would presumably reach
  prefetch_and_split as an empty accession; consider adding
  `.filter { it }` after the `.map`.
- The humann include is not used by the workflow body added here.
- As in metaphlan_only.nf, the result of metaphlan_bzip(...) is stored
  in a variable named metaphlan_bzip.

 tubular.nf | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 tubular.nf

diff --git a/tubular.nf b/tubular.nf
new file mode 100644
index 0000000..631af1a
--- /dev/null
+++ b/tubular.nf
@@ -0,0 +1,26 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+include { prefetch_and_split } from './processes/prefetch_and_split.nf'
+include { kneaddata } from './processes/kneaddata.nf'
+include { metaphlan; metaphlan_bzip } from './processes/metaphlan.nf'
+include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
+
+workflow {
+
+    // Step 1: Read the SRA accession list and create a channel
+    sra_ch = Channel.fromPath(params.sra_list)
+        .splitText()
+        .map { it.trim() }
+
+    // Step 2: Prefetch and split FASTQ files
+    fastq_pairs_ch = prefetch_and_split(sra_ch)
+
+    human_genome = params.human_genome
+    metaphlan_db = params.metaphlan_db
+
+    knead_out = kneaddata(fastq_pairs_ch, human_genome)
+    metaphlan_out = metaphlan(knead_out[0], metaphlan_db)
+    metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
+}
From 7816c29dac94562e1c21790ebbd7ebe85f5b8ce1 Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:43:09 -0500
Subject: [PATCH 06/10] changed config for MP to increase memory size and update image to MP4

---
 nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 0097e86..e3fa473 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -59,10 +59,10 @@ profiles {
 
             withName: metaphlan {
                 maxForks = 4
-                memory = '8.G'
+                memory = '32.G'
                 time = '12.h'
                 cpus = 8
-                container = 'public.ecr.aws/j5i5h1i5/metaphlan-nodb:mamba-v3.1'
+                container = 'public.ecr.aws/j5i5h1i5/metaphlan-nodb:mamba-v4.1'
                 queue = 'Nextflow-metaphlan'
             }
 
From 6a6447288bc72ac7a1849dc003edc230dbb6af57 Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:43:28 -0500
Subject: [PATCH 07/10] added configuration for prefetch_and_split on AWS

---
 nextflow.config | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index e3fa473..906670b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,6 +47,15 @@ profiles {
 
             executor = 'awsbatch'
             
+            withName: prefetch_and_split {
+                maxForks = 1
+                memory = '8.G'
+                time = '2.h'
+                cpus = 2
+                container = 'public.ecr.aws/j5i5h1i5/bzip2sra:mamba-v1.0'
+                queue = 'Nextflow-IOPS'
+            }
+
             withName: kneaddata {
                 maxForks = 4
                 memory = '16.G'
From 349998bbb4195c047a2bd59c3d9f2b013a782d6a Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:43:47 -0500
Subject: [PATCH 08/10] cleanup

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 906670b..25c1d71 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -46,7 +46,7 @@ profiles {
         process {
 
             executor = 'awsbatch'
-            
+
             withName: prefetch_and_split {
                 maxForks = 1
                 memory = '8.G'
From 359469370e22ba96707c5baa29668fbf1d168635 Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:44:21 -0500
Subject: [PATCH 09/10] changed number of arguments on metaphlan call to comply with removal of unmatched input

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index aa11be5..32fdf79 100755
--- a/main.nf
+++ b/main.nf
@@ -16,6 +16,6 @@ workflow {
     metaphlan_db = params.metaphlan_db
 
     knead_out = kneaddata(read_pairs_ch, human_genome)
-    metaphlan_out = metaphlan(knead_out[0], knead_out[1], metaphlan_db)
+    metaphlan_out = metaphlan(knead_out[0], metaphlan_db)
     metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
 }
From d3e94c02e871417c2498f1031a57540ce8d4f23d Mon Sep 17 00:00:00 2001
From: Guilherme Fabur Bottino
Date: Wed, 22 Jan 2025 11:46:07 -0500
Subject: [PATCH 10/10] added sra_list to the parameter file, changed filepattern and readsdir

---
Review notes:
- readsdir and filepattern are shared parameters: metaphlan_only.nf
  (PATCH 01) builds its input channel from both, so this change alters
  that workflow's inputs as well. NOTE(review): main.nf's channel
  construction is not visible in this series -- confirm it is unaffected
  or intentionally retargeted.
- prefetch_and_split (PATCH 03) publishes into params.readsdir, which
  now points at the scratch prefix.
- The file loses its trailing newline again (PATCH 01 had added it).

 aws-params.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/aws-params.yaml b/aws-params.yaml
index 377f1ad..5c2aeea 100644
--- a/aws-params.yaml
+++ b/aws-params.yaml
@@ -1,8 +1,9 @@
-readsdir: "s3://vkc-nextflow/rawfastq/"
+readsdir: "s3://vkc-nextflow/scratch/"
 outdir: "s3://vkc-nextflow/output/"
 human_genome: "s3://biobakery-databases/kneaddata_databases/"
 metaphlan_db: "s3://biobakery-databases/metaphlan_v4_databases/"
 humann_bowtie_db: "s3://biobakery-databases/humann_databases/chocophlan"
 humann_protein_db: "s3://biobakery-databases/humann_databases/uniref"
 humann_utility_db: "s3://biobakery-databases/humann_databases/utility_mapping"
-filepattern: "*_L00{1,2,3,4}_R{1,2}_001.fastq.gz"
+filepattern: "*_{1,2}.fastq.gz"
+sra_list: "s3://vkc-nextflow/sra_accessions.txt"
\ No newline at end of file