From fb844b8392a71ac3cde6d3d2fcd2f028370d28d4 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Mon, 9 Jun 2025 13:38:52 -0400
Subject: [PATCH 01/35] add tufts config

---
 nextflow.config | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 0097e86..d4bdfa6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -7,6 +7,36 @@ profiles {
         process.executor = 'local'
     }
 
+    tufts_hpc {
+        process {
+
+            executor = 'slurm'
+            queue = 'batch,preempt'
+    
+            withName: kneaddata {
+                memory = '8.G'
+                time   = '8.h'
+                cpus  =  8
+                
+            }
+
+            withName: metaphlan {
+                memory = '16.G'
+                time   = '4h'
+                cpus  =  8
+                
+            }
+
+            withName: humann {
+                memory = '16G'
+                time   = '12h'
+                cpus  =  16
+            }
+
+        }
+
+    }
+
     engaging {
         process {
 

From 26076fd2e54379d1d0d89a59d357cf975bed6269 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Mon, 9 Jun 2025 13:39:13 -0400
Subject: [PATCH 02/35] paths for stuff on tufts hpc

---
 tuftshpc-params.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 tuftshpc-params.yaml

diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
new file mode 100644
index 0000000..129554b
--- /dev/null
+++ b/tuftshpc-params.yaml
@@ -0,0 +1,8 @@
+readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/fastq"
+outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed/"
+human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata/"
+metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan/"
+humann_bowtie_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
+humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
+humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
+filepattern: "*{_1,_2,}.fastq"

From 479e8e55031e1be1403ab4e70e573bf5b0f8e040 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Mon, 9 Jun 2025 13:53:41 -0400
Subject: [PATCH 03/35] start working on bam conversion

---
 main.nf                |  6 +++---
 processes/bam2fastq.nf | 26 ++++++++++++++++++++++++++
 tuftshpc-params.yaml   |  2 +-
 3 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 processes/bam2fastq.nf

diff --git a/main.nf b/main.nf
index e99896d..5da9633 100755
--- a/main.nf
+++ b/main.nf
@@ -9,8 +9,8 @@ include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
 
 workflow {
     
-    read_pairs_ch = Channel
-        .fromFilePairs("$params.readsdir/$params.filepattern", size: 2)
+    read_ch = Channel
+        .fromPath("$params.readsdir/$params.filepattern")
 
     human_genome      = params.human_genome
     metaphlan_db      = params.metaphlan_db
@@ -18,7 +18,7 @@ workflow {
     humann_protein_db = params.humann_protein_db
     humann_utility_db = params.humann_utility_db
     
-    knead_out     = kneaddata(read_pairs_ch, human_genome)
+    knead_out     = kneaddata(read_ch, human_genome)
     metaphlan_out = metaphlan(knead_out[0], knead_out[1], metaphlan_db)
     metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
     humann_out    = humann(metaphlan_out[0], metaphlan_out[1], metaphlan_out[2], humann_bowtie_db, humann_protein_db)
diff --git a/processes/bam2fastq.nf b/processes/bam2fastq.nf
new file mode 100644
index 0000000..61dd391
--- /dev/null
+++ b/processes/bam2fastq.nf
@@ -0,0 +1,26 @@
+process bam2fastq {
+    tag "bam2fastq $sample"
+    time { workflow.profile == 'standard' ? null : time * task.attempt }
+    memory { workflow.profile == 'standard' ? null : memory * task.attempt }
+
+    errorStrategy 'retry'
+    maxRetries 3
+
+    input:
+    tuple val(sample), path(reads)
+
+    output:
+
+    shell:
+    
+    """
+    echo $sample
+
+    kneaddata --input ${reads[0]} --input ${reads[1]} \
+              --reference-db $human_genome --output ./ \
+              --processes ${task.cpus} --output-prefix ${sample}_kneaddata \
+              --trimmomatic /opt/conda/share/trimmomatic
+
+    gzip *.fastq
+    """  
+}
diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index 129554b..dbfba0c 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -5,4 +5,4 @@ metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan/"
 humann_bowtie_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
-filepattern: "*{_1,_2,}.fastq"
+filepattern: "*.bam"

From 646b40a0712eb21680619ec3972b97258e954457 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Mon, 9 Jun 2025 16:00:50 -0400
Subject: [PATCH 04/35] Use emit better, rejigger some stuff

---
 main.nf                | 12 +++++++-----
 processes/bam2fastq.nf |  8 ++------
 processes/humann.nf    |  7 ++++---
 processes/kneaddata.nf |  6 +++---
 processes/metaphlan.nf | 31 +++++++++++++------------------
 5 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/main.nf b/main.nf
index 5da9633..1791b7e 100755
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,7 @@
 
 nextflow.enable.dsl=2
 
+include { bam2fastq } from './processes/bam2fastq.nf'
 include { kneaddata } from './processes/kneaddata.nf'
 include { metaphlan; metaphlan_bzip } from './processes/metaphlan.nf'
 include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
@@ -18,10 +19,11 @@ workflow {
     humann_protein_db = params.humann_protein_db
     humann_utility_db = params.humann_utility_db
     
-    knead_out     = kneaddata(read_ch, human_genome)
-    metaphlan_out = metaphlan(knead_out[0], knead_out[1], metaphlan_db)
-    metaphlan_bzip = metaphlan_bzip(metaphlan_out[0], metaphlan_out[4])
-    humann_out    = humann(metaphlan_out[0], metaphlan_out[1], metaphlan_out[2], humann_bowtie_db, humann_protein_db)
-    regroup_out   = humann_regroup(humann_out[0], humann_out[1], humann_utility_db)
+    bam_out       = bam2fastq(read_ch)
+    knead_out     = kneaddata(bam_out, human_genome)
+    metaphlan_out = metaphlan(knead_out.fastq, metaphlan_db)
+    metaphlan_bzip = metaphlan_bzip(metaphlan_out.sample, metaphlan_out.sam)
+    humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile, humann_bowtie_db, humann_protein_db)
+    regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies, humann_utility_db)
     humann_rename(regroup_out, humann_utility_db)
 }
diff --git a/processes/bam2fastq.nf b/processes/bam2fastq.nf
index 61dd391..482cff6 100644
--- a/processes/bam2fastq.nf
+++ b/processes/bam2fastq.nf
@@ -10,17 +10,13 @@ process bam2fastq {
     tuple val(sample), path(reads)
 
     output:
+    tuple val(sample), path("{sample}.fastq")
 
     shell:
     
     """
     echo $sample
 
-    kneaddata --input ${reads[0]} --input ${reads[1]} \
-              --reference-db $human_genome --output ./ \
-              --processes ${task.cpus} --output-prefix ${sample}_kneaddata \
-              --trimmomatic /opt/conda/share/trimmomatic
-
-    gzip *.fastq
+    samtools fastq -@ {task.cpus} > {sample}.fastq
     """  
 }
diff --git a/processes/humann.nf b/processes/humann.nf
index a56073e..b1c84f2 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -46,9 +46,10 @@ process humann_regroup {
 
     output:
     val  sample , emit: sample
-    path "${sample}_ecs.tsv"
-    path "${sample}_kos.tsv"
-    path "${sample}_pfams.tsv"
+    path "${sample}_ecs.tsv", emit: ecs
+    path "${sample}_kos.tsv", emit: kos
+    path "${sample}_pfams.tsv", emit: pfams
+
 
     script:
 
diff --git a/processes/kneaddata.nf b/processes/kneaddata.nf
index 6ec2c90..fa9aad9 100644
--- a/processes/kneaddata.nf
+++ b/processes/kneaddata.nf
@@ -12,8 +12,8 @@ process kneaddata {
     path human_genome
 
     output:
-    tuple val(sample), path("${sample}_kneaddata_paired_{1,2}.fastq.gz")
-    path "${sample}_kneaddata_unmatched_{1,2}.fastq.gz"
+    val(sample), emit: sample
+    path("${sample}_kneaddata.fastq.gz"), emit: fastq
     path "${sample}_kneaddata*.fastq.gz" , optional:true , emit: others
     path "${sample}_kneaddata.log"                       , emit: log
 
@@ -22,7 +22,7 @@ process kneaddata {
     """
     echo $sample
 
-    kneaddata --input ${reads[0]} --input ${reads[1]} \
+    kneaddata --input $reads \
               --reference-db $human_genome --output ./ \
               --processes ${task.cpus} --output-prefix ${sample}_kneaddata \
               --trimmomatic /opt/conda/share/trimmomatic
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index e081e44..f92ded2 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -1,6 +1,6 @@
 process metaphlan {
     tag "metaphlan on $sample"
-    publishDir "$params.outdir/metaphlan", pattern: "{*.tsv}"
+    publishDir "$params.outdir/metaphlan", pattern: "*.tsv"
 
     input:
     tuple val(sample), path(kneads)
@@ -10,21 +10,14 @@ process metaphlan {
     output:
     val  sample                  , emit: sample
     path "${sample}_profile.tsv" , emit: profile
-    path "${sample}_grouped.fastq.gz"
-    path "${sample}_bowtie2.tsv"
-    path "${sample}.sam"
+    path "${sample}_bowtie2.tsv" , emit: bowtie2
+    path "${sample}.sam"         , emit: sam
 
-    script:
-    def forward = kneads[0]
-    def reverse = kneads[1]
-    def unf = unmatched[0]
-    def unr = unmatched[1]
 
+    script:
     """
-    cat $forward $reverse $unf $unr > ${sample}_grouped.fastq.gz
-    
-    metaphlan ${sample}_grouped.fastq.gz ${sample}_profile.tsv \
-        --bowtie2out ${sample}_bowtie2.tsv \
+    metaphlan $kneads ${sample}_profile.tsv \
+        --mapout ${sample}_bowtie2.tsv \
         --samout ${sample}.sam \
         --input_type fastq \
         --nproc ${task.cpus} \
@@ -32,8 +25,8 @@ process metaphlan {
     """
 }
  
- process metaphlan_bzip {
-    tag "metaphlan_bzip on $sample"
+ process metaphlan_bam {
+    tag "metaphlan_bam on $sample"
     publishDir "$params.outdir/metaphlan"
     stageInMode "copy"
 
@@ -42,11 +35,13 @@ process metaphlan {
     path sam
 
     output:
-    val  sample                  , emit: sample
-    path "${sample}.sam.bz2"
+    val  sample          , emit: sample
+    path "${sample}.bam" , emit: bam
+
+    when:
 
     script:
     """
-    bzip2 -v $sam
+    samtools -bS $sam -o ${sample}.bam
     """
 }
\ No newline at end of file

From 79bd96fbb9cfd36aaf357b75c220ec0c848e5eb9 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Wed, 11 Jun 2025 15:59:37 -0400
Subject: [PATCH 05/35] Bunch of changes for v4s

---
 main.nf                | 15 +++++----------
 processes/humann.nf    | 20 +++++++++-----------
 processes/kneaddata.nf |  2 +-
 processes/metaphlan.nf | 12 +++++++-----
 4 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/main.nf b/main.nf
index 1791b7e..a3082c1 100755
--- a/main.nf
+++ b/main.nf
@@ -13,17 +13,12 @@ workflow {
     read_ch = Channel
         .fromPath("$params.readsdir/$params.filepattern")
 
-    human_genome      = params.human_genome
-    metaphlan_db      = params.metaphlan_db
-    humann_bowtie_db  = params.humann_bowtie_db
-    humann_protein_db = params.humann_protein_db
-    humann_utility_db = params.humann_utility_db
     
     bam_out       = bam2fastq(read_ch)
-    knead_out     = kneaddata(bam_out, human_genome)
-    metaphlan_out = metaphlan(knead_out.fastq, metaphlan_db)
+    knead_out     = kneaddata(bam_out)
+    metaphlan_out = metaphlan(knead_out.fastq, metaphlan_db, metaphlan_index)
     metaphlan_bzip = metaphlan_bzip(metaphlan_out.sample, metaphlan_out.sam)
-    humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile, humann_bowtie_db, humann_protein_db)
-    regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies, humann_utility_db)
-    humann_rename(regroup_out, humann_utility_db)
+    humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile)
+    regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies)
+    humann_rename(regroup_out)
 }
diff --git a/processes/humann.nf b/processes/humann.nf
index b1c84f2..8904ec1 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -14,8 +14,6 @@ process humann {
     val  sample
     path profile
     path catkneads
-    path humann_bowtie_db
-    path humann_protein_db
 
     output:
     val  sample                       , emit: sample
@@ -26,12 +24,14 @@ process humann {
     script:
 
     """
-    humann_config --update database_folders nucleotide `realpath $humann_bowtie_db`
-    humann_config --update database_folders protein `realpath $humann_protein_db`
-
     humann --input $catkneads --taxonomic-profile $profile --output ./ \
-        --threads ${task.cpus} --remove-temp-output --search-mode uniref90 \
-        --output-basename $sample
+        --threads ${task.cpus} --remove-temp-output \
+        --output-basename $sample \
+        --protein-database ${params.humann_protein_db} \
+        --nucleotide-database ${params.humann_nucleotide_db} \
+        --utility-mapping ${params.humann_utility_db} \
+        --metaphlan-options="--index mpa_vOct22_CHOCOPhlAnSGB_202403 --bowtie2db ${params.metaphlan_db} -t rel_ab_w_read_stats"
+    # NOTE: add --search-mode uniref90 to the humann call above for 3.7
     """
 }
 
@@ -42,7 +42,6 @@ process humann_regroup {
     input:
     val  sample
     path genefamilies
-    path humann_utility_db
 
     output:
     val  sample , emit: sample
@@ -54,7 +53,7 @@ process humann_regroup {
     script:
 
     """
-    humann_config --update database_folders utility_mapping `realpath $humann_utility_db`
+    humann_config --update database_folders utility_mapping `realpath ${params.humann_utility_db}`
     humann_regroup_table --input $genefamilies --output ${sample}_ecs.tsv --groups uniref90_level4ec
     humann_regroup_table --input $genefamilies --output ${sample}_kos.tsv --groups uniref90_ko
     humann_regroup_table --input $genefamilies --output ${sample}_pfams.tsv --groups uniref90_pfam
@@ -70,7 +69,6 @@ process humann_rename {
     path ecs
     path kos
     path pfams
-    path humann_utility_db
 
     output:
     val  sample , emit: sample
@@ -81,7 +79,7 @@ process humann_rename {
     script:
 
     """
-    humann_config --update database_folders utility_mapping `realpath $humann_utility_db`
+    humann_config --update database_folders utility_mapping `realpath ${params.humann_utility_db}`
     humann_rename_table --input $ecs   --output ${sample}_ecs_rename.tsv   --names ec
     humann_rename_table --input $kos   --output ${sample}_kos_rename.tsv   --names kegg-orthology
     humann_rename_table --input $pfams --output ${sample}_pfams_rename.tsv --names pfam
diff --git a/processes/kneaddata.nf b/processes/kneaddata.nf
index fa9aad9..f27f5f6 100644
--- a/processes/kneaddata.nf
+++ b/processes/kneaddata.nf
@@ -23,7 +23,7 @@ process kneaddata {
     echo $sample
 
     kneaddata --input $reads \
-              --reference-db $human_genome --output ./ \
+              --reference-db ${params.human_genome} --output ./ \
               --processes ${task.cpus} --output-prefix ${sample}_kneaddata \
               --trimmomatic /opt/conda/share/trimmomatic
 
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index f92ded2..c4cd503 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -3,9 +3,8 @@ process metaphlan {
     publishDir "$params.outdir/metaphlan", pattern: "*.tsv"
 
     input:
-    tuple val(sample), path(kneads)
-    path unmatched
-    path metaphlan_db
+    val(sample)
+    path(kneads)
 
     output:
     val  sample                  , emit: sample
@@ -21,7 +20,9 @@ process metaphlan {
         --samout ${sample}.sam \
         --input_type fastq \
         --nproc ${task.cpus} \
-        --bowtie2db $metaphlan_db
+        --db_dir ${params.metaphlan_db} \
+        --index ${params.metaphlan_index} \
+        -t rel_ab_w_read_stats
     """
 }
  
@@ -42,6 +43,7 @@ process metaphlan {
 
     script:
     """
-    samtools -bS $sam -o ${sample}.bam
+    samtools -b $sam -o ${sample}.bam
+    rm $sam
     """
 }
\ No newline at end of file

From 37292eff14b3c1f92611e1bebc5d57c6bb281352 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Wed, 11 Jun 2025 16:02:19 -0400
Subject: [PATCH 06/35] fix params

---
 tuftshpc-params.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index dbfba0c..b0204af 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -1,7 +1,7 @@
-readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/fastq"
-outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed/"
-human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata/"
-metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan/"
+readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/bam"
+outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed"
+human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata"
+metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan"
 humann_bowtie_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"

From 62de8ecf34c4118146381edb687385a092c9b8fe Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Wed, 11 Jun 2025 16:50:45 -0400
Subject: [PATCH 07/35] Few more humann tweaks

---
 processes/humann.nf | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/processes/humann.nf b/processes/humann.nf
index 8904ec1..8f164f7 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -17,9 +17,10 @@ process humann {
 
     output:
     val  sample                       , emit: sample
-    path "${sample}_genefamilies.tsv" , emit: genefamilies
-    path "${sample}_pathabundance.tsv"
-    path "${sample}_pathcoverage.tsv"
+    path "${sample}_2_genefamilies.tsv" , emit: genefamilies
+    path "${sample}_0.log"
+    path "${sample}_3_reactions.tsv"
+    path "${sample}_4_pathabundance.tsv"
 
     script:
 
@@ -53,10 +54,9 @@ process humann_regroup {
     script:
 
     """
-    humann_config --update database_folders utility_mapping `realpath ${params.humann_utility_db}`
-    humann_regroup_table --input $genefamilies --output ${sample}_ecs.tsv --groups uniref90_level4ec
-    humann_regroup_table --input $genefamilies --output ${sample}_kos.tsv --groups uniref90_ko
-    humann_regroup_table --input $genefamilies --output ${sample}_pfams.tsv --groups uniref90_pfam
+    humann_regroup_table --input $genefamilies --output ${sample}_ecs.tsv --custom ${params.humann_utility_db}/map_level4ec_uniclust90.txt.gz
+    humann_regroup_table --input $genefamilies --output ${sample}_kos.tsv --custom ${params.humann_utility_db}/map_ko_uniclust90.txt.gz
+    humann_regroup_table --input $genefamilies --output ${sample}_pfams.tsv --custom ${params.humann_utility_db}/map_pfam_uniref90.txt.gz
     """
 }   
 
@@ -79,9 +79,8 @@ process humann_rename {
     script:
 
     """
-    humann_config --update database_folders utility_mapping `realpath ${params.humann_utility_db}`
-    humann_rename_table --input $ecs   --output ${sample}_ecs_rename.tsv   --names ec
-    humann_rename_table --input $kos   --output ${sample}_kos_rename.tsv   --names kegg-orthology
-    humann_rename_table --input $pfams --output ${sample}_pfams_rename.tsv --names pfam
+    humann_rename_table --input $ecs   --output ${sample}_ecs_rename.tsv   --custom ${params.humann_utility_db}/map_level4ec_name.txt.gz
+    humann_rename_table --input $kos   --output ${sample}_kos_rename.tsv   --custom ${params.humann_utility_db}/map_ko_name.txt.gz
+    humann_rename_table --input $pfams --output ${sample}_pfams_rename.tsv --custom ${params.humann_utility_db}/map_pfam_name.txt.gz
     """
 }
\ No newline at end of file

From 326c33d19e78ebcf395b914ba7ebce9f6298fc1b Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 16:58:43 -0400
Subject: [PATCH 08/35] fix mp stuff

---
 main.nf         | 4 ++--
 nextflow.config | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index a3082c1..d49aef6 100755
--- a/main.nf
+++ b/main.nf
@@ -4,7 +4,7 @@ nextflow.enable.dsl=2
 
 include { bam2fastq } from './processes/bam2fastq.nf'
 include { kneaddata } from './processes/kneaddata.nf'
-include { metaphlan; metaphlan_bzip } from './processes/metaphlan.nf'
+include { metaphlan; metaphlan_bam } from './processes/metaphlan.nf'
 include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
 
 
@@ -17,7 +17,7 @@ workflow {
     bam_out       = bam2fastq(read_ch)
     knead_out     = kneaddata(bam_out)
     metaphlan_out = metaphlan(knead_out.fastq, metaphlan_db, metaphlan_index)
-    metaphlan_bzip = metaphlan_bzip(metaphlan_out.sample, metaphlan_out.sam)
+    metaphlan_bam = metaphlan_bam(metaphlan_out.sample, metaphlan_out.sam)
     humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile)
     regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies)
     humann_rename(regroup_out)
diff --git a/nextflow.config b/nextflow.config
index d4bdfa6..5edf9bd 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,15 +21,15 @@ profiles {
             }
 
             withName: metaphlan {
-                memory = '16.G'
-                time   = '4h'
+                memory = '32.G'
+                time   = '4.h'
                 cpus  =  8
                 
             }
 
             withName: humann {
-                memory = '16G'
-                time   = '12h'
+                memory = '16.G'
+                time   = '8.h'
                 cpus  =  16
             }
 

From 69f78c1d5a0380ad2575cb4353cd170c47f1f63e Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Wed, 11 Jun 2025 17:04:32 -0400
Subject: [PATCH 09/35] fix kneaddata inputs

---
 processes/kneaddata.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/kneaddata.nf b/processes/kneaddata.nf
index f27f5f6..78da988 100644
--- a/processes/kneaddata.nf
+++ b/processes/kneaddata.nf
@@ -8,8 +8,8 @@ process kneaddata {
     maxRetries 3
 
     input:
-    tuple val(sample), path(reads)
-    path human_genome
+    val(sample)
+    path(reads)
 
     output:
     val(sample), emit: sample

From bc78841cc92b908c927ae2de7c4d362421b618a9 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 17:08:12 -0400
Subject: [PATCH 10/35] typos

---
 main.nf                | 2 +-
 processes/kneaddata.nf | 5 ++---
 processes/metaphlan.nf | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index d49aef6..48b97b6 100755
--- a/main.nf
+++ b/main.nf
@@ -16,7 +16,7 @@ workflow {
     
     bam_out       = bam2fastq(read_ch)
     knead_out     = kneaddata(bam_out)
-    metaphlan_out = metaphlan(knead_out.fastq, metaphlan_db, metaphlan_index)
+    metaphlan_out = metaphlan(knead_out.sample, knead_out.fastq)
     metaphlan_bam = metaphlan_bam(metaphlan_out.sample, metaphlan_out.sam)
     humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile)
     regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies)
diff --git a/processes/kneaddata.nf b/processes/kneaddata.nf
index 78da988..c579de6 100644
--- a/processes/kneaddata.nf
+++ b/processes/kneaddata.nf
@@ -8,8 +8,7 @@ process kneaddata {
     maxRetries 3
 
     input:
-    val(sample)
-    path(reads)
+    tuple val(sample), path(reads)
 
     output:
     val(sample), emit: sample
@@ -29,4 +28,4 @@ process kneaddata {
 
     gzip *.fastq
     """  
-}
\ No newline at end of file
+}
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index c4cd503..be9da29 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -46,4 +46,4 @@ process metaphlan {
     samtools -b $sam -o ${sample}.bam
     rm $sam
     """
-}
\ No newline at end of file
+}

From a3878854fe7cfcb804b8bb36a90ad7d1bb738eea Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 17:21:53 -0400
Subject: [PATCH 11/35] wrap params in braces

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 48b97b6..4043dad 100755
--- a/main.nf
+++ b/main.nf
@@ -11,7 +11,7 @@ include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
 workflow {
     
     read_ch = Channel
-        .fromPath("$params.readsdir/$params.filepattern")
+        .fromPath("${params.readsdir}/${params.filepattern}")
 
     
     bam_out       = bam2fastq(read_ch)

From ac550b5a94b18ae1f55347a55f8050e180a12f6f Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 17:35:23 -0400
Subject: [PATCH 12/35] trying stuff

---
 main.nf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 4043dad..c91ca92 100755
--- a/main.nf
+++ b/main.nf
@@ -11,8 +11,10 @@ include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
 workflow {
     
     read_ch = Channel
-        .fromPath("${params.readsdir}/${params.filepattern}")
-
+        .fromFilePairs("${params.readsdir}/${params.filepattern}", size: 1)
+	
+ 
+    read_ch.view()
     
     bam_out       = bam2fastq(read_ch)
     knead_out     = kneaddata(bam_out)

From 3eabbd454e403294180a49590a2bbeaff059d9bf Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 17:42:07 -0400
Subject: [PATCH 13/35] something works

---
 main.nf | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index c91ca92..72b3a98 100755
--- a/main.nf
+++ b/main.nf
@@ -11,10 +11,12 @@ include { humann; humann_regroup; humann_rename } from './processes/humann.nf'
 workflow {
     
     read_ch = Channel
-        .fromFilePairs("${params.readsdir}/${params.filepattern}", size: 1)
-	
- 
-    read_ch.view()
+        .fromPath("${params.readsdir}/${params.filepattern}")
+        .map { file -> 
+            def sample = file.baseName  // ERR3405856.bam -> ERR3405856
+            return tuple(sample, file)
+        }
+    
     
     bam_out       = bam2fastq(read_ch)
     knead_out     = kneaddata(bam_out)

From 21727da1a3004d6ea330a0a8078bac6a7e870f4c Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Wed, 11 Jun 2025 21:43:34 -0400
Subject: [PATCH 14/35] claude-suggested

---
 nextflow.config        | 5 +++++
 processes/bam2fastq.nf | 8 ++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 5edf9bd..f07a33a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,6 +13,11 @@ profiles {
             executor = 'slurm'
             queue = 'batch,preempt'
     
+	    // Default settings for all processes
+            memory = '4.G'
+            time = '2.h'
+            cpus = 2
+
             withName: kneaddata {
                 memory = '8.G'
                 time   = '8.h'
diff --git a/processes/bam2fastq.nf b/processes/bam2fastq.nf
index 482cff6..8e87aa5 100644
--- a/processes/bam2fastq.nf
+++ b/processes/bam2fastq.nf
@@ -1,7 +1,7 @@
 process bam2fastq {
     tag "bam2fastq $sample"
-    time { workflow.profile == 'standard' ? null : time * task.attempt }
-    memory { workflow.profile == 'standard' ? null : memory * task.attempt }
+    time { workflow.profile == 'standard' ? null : task.time * task.attempt }
+    memory { workflow.profile == 'standard' ? null : task.memory * task.attempt }
 
     errorStrategy 'retry'
     maxRetries 3
@@ -10,13 +10,13 @@ process bam2fastq {
     tuple val(sample), path(reads)
 
     output:
-    tuple val(sample), path("{sample}.fastq")
+    tuple val(sample), path("${sample}.fastq")
 
     shell:
     
     """
     echo $sample
 
-    samtools fastq -@ {task.cpus} > {sample}.fastq
+    samtools fastq -@ ${task.cpus} ${reads} > ${sample}.fastq
     """  
 }

From 5fe9fd16da4ff98fe8d6a4fbbadf5c7f816ef137 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <dev@bonham.ch>
Date: Wed, 11 Jun 2025 21:47:50 -0400
Subject: [PATCH 15/35] return explicit stuff for bam

---
 nextflow.config | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index f07a33a..c779fa5 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,11 +13,18 @@ profiles {
             executor = 'slurm'
             queue = 'batch,preempt'
     
-	    // Default settings for all processes
+	        // Default settings for all processes
             memory = '4.G'
             time = '2.h'
             cpus = 2
 
+            withName: bam2fastq {
+                memory = '8.G'
+                time   = '8.h'
+                cpus  =  8
+                
+            }
+
             withName: kneaddata {
                 memory = '8.G'
                 time   = '8.h'

From b8c39a23c0d0e1e41d516be8baad865bbfbbda1c Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Thu, 12 Jun 2025 11:48:33 -0400
Subject: [PATCH 16/35] more tweaks from failures

---
 nextflow.config        |  6 ++++++
 processes/humann.nf    |  4 ++--
 processes/kneaddata.nf | 10 +++++-----
 processes/metaphlan.nf |  6 +++---
 tuftshpc-params.yaml   |  3 ++-
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index c779fa5..4387f24 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,6 +47,12 @@ profiles {
 
         }
 
+	apptainer {
+	    enabled = true
+            autoMounts = true
+	    runOptions = '--no-home --bind /cluster'
+	}
+
     }
 
     engaging {
diff --git a/processes/humann.nf b/processes/humann.nf
index 8f164f7..eccf4f8 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -12,8 +12,8 @@ process humann {
 
     input:
     val  sample
-    path profile
     path catkneads
+    path profile
 
     output:
     val  sample                       , emit: sample
@@ -83,4 +83,4 @@ process humann_rename {
     humann_rename_table --input $kos   --output ${sample}_kos_rename.tsv   --custom ${params.humann_utility_db}/map_ko_name.txt.gz
     humann_rename_table --input $pfams --output ${sample}_pfams_rename.tsv --custom ${params.humann_utility_db}/map_pfam_name.txt.gz
     """
-}
\ No newline at end of file
+}
diff --git a/processes/kneaddata.nf b/processes/kneaddata.nf
index c579de6..0c27509 100644
--- a/processes/kneaddata.nf
+++ b/processes/kneaddata.nf
@@ -4,8 +4,8 @@ process kneaddata {
     time { workflow.profile == 'standard' ? null : time * task.attempt }
     memory { workflow.profile == 'standard' ? null : memory * task.attempt }
 
-    errorStrategy 'retry'
-    maxRetries 3
+    //errorStrategy 'retry'
+    //maxRetries 3
 
     input:
     tuple val(sample), path(reads)
@@ -21,11 +21,11 @@ process kneaddata {
     """
     echo $sample
 
-    kneaddata --input $reads \
+    kneaddata --unpaired $reads \
               --reference-db ${params.human_genome} --output ./ \
               --processes ${task.cpus} --output-prefix ${sample}_kneaddata \
-              --trimmomatic /opt/conda/share/trimmomatic
+              --trimmomatic /cluster/tufts/bonhamlab/shared/conda-envs/metaphlan_v4.2/.CondaPkg/.pixi/envs/default/share/trimmomatic
 
-    gzip *.fastq
+    gzip ${sample}_kneaddata*.fastq
     """  
 }
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index be9da29..4bec369 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -15,7 +15,7 @@ process metaphlan {
 
     script:
     """
-    metaphlan $kneads ${sample}_profile.tsv \
+    metaphlan $kneads -o ${sample}_profile.tsv \
         --mapout ${sample}_bowtie2.tsv \
         --samout ${sample}.sam \
         --input_type fastq \
@@ -37,13 +37,13 @@ process metaphlan {
 
     output:
     val  sample          , emit: sample
-    path "${sample}.bam" , emit: bam
+    path "${sample}_markers.bam" , emit: bam
 
     when:
 
     script:
     """
-    samtools -b $sam -o ${sample}.bam
+    samtools view -bS $sam -o ${sample}_markers.bam
     rm $sam
     """
 }
diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index b0204af..a4b19c4 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -2,7 +2,8 @@ readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/bam"
 outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed"
 human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata"
 metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan"
+metaphlan_index: "mpa_vOct22_CHOCOPhlAnSGB_202403"
 humann_bowtie_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
-filepattern: "*.bam"
+filepattern: "SRR*.bam"

From 2506c985c9e2e3a4feee5c6b32489613b87f8263 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Thu, 12 Jun 2025 12:57:24 -0400
Subject: [PATCH 17/35] remove bam for now

---
 main.nf                |  2 +-
 processes/humann.nf    |  6 ++----
 processes/metaphlan.nf | 10 ++++++++--
 tuftshpc-params.yaml   |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index 72b3a98..d89a0a6 100755
--- a/main.nf
+++ b/main.nf
@@ -21,7 +21,7 @@ workflow {
     bam_out       = bam2fastq(read_ch)
     knead_out     = kneaddata(bam_out)
     metaphlan_out = metaphlan(knead_out.sample, knead_out.fastq)
-    metaphlan_bam = metaphlan_bam(metaphlan_out.sample, metaphlan_out.sam)
+    // metaphlan_bam = metaphlan_bam(metaphlan_out.sample, metaphlan_out.sam) // not working because of headers
     humann_out    = humann(metaphlan_out.sample, knead_out.fastq, metaphlan_out.profile)
     regroup_out   = humann_regroup(humann_out.sample, humann_out.genefamilies)
     humann_rename(regroup_out)
diff --git a/processes/humann.nf b/processes/humann.nf
index eccf4f8..4f648e6 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -26,13 +26,11 @@ process humann {
 
     """
     humann --input $catkneads --taxonomic-profile $profile --output ./ \
-        --threads ${task.cpus} --remove-temp-output \ # add --search-mode uniref90 for 3.7
-        --output-basename $sample \
+        --threads ${task.cpus} --remove-temp-output \ 
         --protein-database ${params.humann_protein_db} \
         --nucleotide-database ${params.humann_nucleotide_db} \
         --utility-mapping ${params.humann_utility_db} \
-        --metaphlan-options="--index mpa_vOct22_CHOCOPhlAnSGB_202403 --bowtie2db ${params.metaphlan_db} -t rel_ab_with_read_stats"
-
+        --output-basename $sample 
     """
 }
 
diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index 4bec369..8e618f9 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -1,6 +1,7 @@
 process metaphlan {
     tag "metaphlan on $sample"
-    publishDir "$params.outdir/metaphlan", pattern: "*.tsv"
+    // publishDir "$params.outdir/metaphlan", pattern: "*.tsv" // once fix for sam compression is found
+    publishDir "$params.outdir/metaphlan" // keeps sam file
 
     input:
     val(sample)
@@ -43,7 +44,12 @@ process metaphlan {
 
     script:
     """
-    samtools view -bS $sam -o ${sample}_markers.bam
+    # Duplicate headers in output - skipping header validation
+    samtools view -bS --no-PG ${sam} -o ${sample}_markers.bam
+ 
+    # Alternative approach: strip and rebuild header
+    # samtools view -S ${sam} | samtools view -b -o ${sample}_markers.bam
+    
     rm $sam
     """
 }
diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index a4b19c4..46efaae 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -3,7 +3,7 @@ outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed"
 human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata"
 metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan"
 metaphlan_index: "mpa_vOct22_CHOCOPhlAnSGB_202403"
-humann_bowtie_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
+humann_nucleotide_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
 filepattern: "SRR*.bam"

From 3cf41ccb34a45316fb73963afcda3debd935dea3 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Thu, 12 Jun 2025 13:02:53 -0400
Subject: [PATCH 18/35] more fixing humann

---
 processes/humann.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/humann.nf b/processes/humann.nf
index 4f648e6..8bd37b2 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -26,10 +26,10 @@ process humann {
 
     """
     humann --input $catkneads --taxonomic-profile $profile --output ./ \
-        --threads ${task.cpus} --remove-temp-output \ 
+        --threads ${task.cpus} --remove-temp-output \
         --protein-database ${params.humann_protein_db} \
         --nucleotide-database ${params.humann_nucleotide_db} \
-        --utility-mapping ${params.humann_utility_db} \
+        --utility-database ${params.humann_utility_db} \
         --output-basename $sample 
     """
 }

From 72dda1a511bbb7756ad7b125df8ae5ddd699a13f Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Thu, 12 Jun 2025 14:46:28 -0400
Subject: [PATCH 19/35] it works

---
 processes/humann.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/processes/humann.nf b/processes/humann.nf
index 8bd37b2..95589eb 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -53,7 +53,7 @@ process humann_regroup {
 
     """
     humann_regroup_table --input $genefamilies --output ${sample}_ecs.tsv --custom ${params.humann_utility_db}/map_level4ec_uniclust90.txt.gz
-    humann_regroup_table --input $genefamilies --output ${sample}_kos.tsv --custom ${params.humann_utility_db}/map_ko_uniclust90.txt.gz
+    humann_regroup_table --input $genefamilies --output ${sample}_kos.tsv --custom ${params.humann_utility_db}/map_ko_uniref90.txt.gz
     humann_regroup_table --input $genefamilies --output ${sample}_pfams.tsv --custom ${params.humann_utility_db}/map_pfam_uniref90.txt.gz
     """
 }   

From 57d70e3af456068c86c710c761af23350e82678f Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Thu, 12 Jun 2025 15:18:16 -0400
Subject: [PATCH 20/35] update params to run all

---
 tuftshpc-params.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index 46efaae..2cee90b 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -6,4 +6,4 @@ metaphlan_index: "mpa_vOct22_CHOCOPhlAnSGB_202403"
 humann_nucleotide_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
-filepattern: "SRR*.bam"
+filepattern: "*.bam"

From 64c692b3b11e86e7b932498ae5c0ede0b2f77da1 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kbonha01@tufts.edu>
Date: Mon, 16 Jun 2025 17:22:13 -0400
Subject: [PATCH 21/35] trying to solve random issues

---
 nextflow.config        | 6 +++---
 processes/bam2fastq.nf | 2 --
 processes/humann.nf    | 8 ++++----
 tuftshpc-params.yaml   | 4 ++--
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 4387f24..0492127 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -20,8 +20,8 @@ profiles {
 
             withName: bam2fastq {
                 memory = '8.G'
-                time   = '8.h'
-                cpus  =  8
+                time   = '1.h'
+                cpus  =  4
                 
             }
 
@@ -40,7 +40,7 @@ profiles {
             }
 
             withName: humann {
-                memory = '16.G'
+                memory = '32.G'
                 time   = '8.h'
                 cpus  =  16
             }
diff --git a/processes/bam2fastq.nf b/processes/bam2fastq.nf
index 8e87aa5..0f076be 100644
--- a/processes/bam2fastq.nf
+++ b/processes/bam2fastq.nf
@@ -1,7 +1,5 @@
 process bam2fastq {
     tag "bam2fastq $sample"
-    time { workflow.profile == 'standard' ? null : task.time * task.attempt }
-    memory { workflow.profile == 'standard' ? null : task.memory * task.attempt }
 
     errorStrategy 'retry'
     maxRetries 3
diff --git a/processes/humann.nf b/processes/humann.nf
index 95589eb..e2094a5 100644
--- a/processes/humann.nf
+++ b/processes/humann.nf
@@ -3,11 +3,11 @@
 process humann {
     tag "humann on $sample"
     publishDir "$params.outdir/humann/main"
-    memory { workflow.profile == 'standard' ? null : memory * task.attempt }
-    cpus { workflow.profile == 'standard' ? null : cpus * task.attempt }
+    // memory { workflow.profile == 'standard' ? null : memory * task.attempt }
+    // cpus { workflow.profile == 'standard' ? null : cpus * task.attempt }
 
-    errorStrategy { task.exitStatus in 134..140 ? 'retry' : 'terminate' }
-    maxRetries 3
+    // errorStrategy { task.exitStatus in 134..140 ? 'retry' : 'terminate' }
+    // maxRetries 3
 
 
     input:
diff --git a/tuftshpc-params.yaml b/tuftshpc-params.yaml
index 2cee90b..9821466 100644
--- a/tuftshpc-params.yaml
+++ b/tuftshpc-params.yaml
@@ -1,9 +1,9 @@
 readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/bam"
-outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed"
+outdir: "/cluster/tufts/bonhamlab/shared/sequencing/nf-staging"
 human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata"
 metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan"
 metaphlan_index: "mpa_vOct22_CHOCOPhlAnSGB_202403"
 humann_nucleotide_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
 humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
 humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
-filepattern: "*.bam"
+filepattern: "SRR*.bam"

From 93e1409f55cfc3396bb2676e342accc6491d95b7 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Thu, 19 Jun 2025 20:56:51 -0400
Subject: [PATCH 22/35] scope out master-params file and update metaphlan.nf
 params based on diff metaphlan versions

---
 processes/metaphlan.nf | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/processes/metaphlan.nf b/processes/metaphlan.nf
index 8e618f9..151b122 100644
--- a/processes/metaphlan.nf
+++ b/processes/metaphlan.nf
@@ -15,13 +15,23 @@ process metaphlan {
 
 
     script:
+    // metphlan4 changed metaphlan db variable from bowtie2db to db_dir
+    // also changed from bowtie2out to mapout
+    if (params.metaphaln_ver == 'metaphlan4') {
+    db_arg = 'db_dir'
+    out_arg = 'mapout'}
+    else (params.metaphaln_ver == 'metaphlan3.1.0'){
+    db_arg = 'bowtie2db'
+    out_arg = 'bowtie2out'
+    }
+    
     """
     metaphlan $kneads -o ${sample}_profile.tsv \
-        --mapout ${sample}_bowtie2.tsv \
+        --${out_arg} ${sample}_bowtie2.tsv \
         --samout ${sample}.sam \
         --input_type fastq \
         --nproc ${task.cpus} \
-        --db_dir ${params.metaphlan_db} \
+        --${dbarg} ${params.metaphlan_db} \
         --index ${params.metaphlan_index} \
         -t rel_ab_w_read_stats
     """

From dd332d426039c5b1ec6a28041f45a3318e159546 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Sat, 21 Jun 2025 22:03:02 -0400
Subject: [PATCH 23/35] add documentation to the README

---
 README.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 36db0f9..d031f47 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,87 @@
 # Nextflow pipeline for running the bioBakery
 
-by Kevin Bonham, PhD
+by Kevin Bonham, PhD 
 
-- `KneadData`
+bioBakery
+
+- `KneadData`: a data quality-control pipeline that removes host genomic data within our metagenomic samples. Particularly, this pipeline uses a database containing a reference human genome so that all human DNA is removed from the samples. Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
 - `MetaPhlAn`
 - `HUMAnN`
 
 ## Setup
+Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](LINK TO BE ADDED). 
+
+Computing environments on the Tufts HPC and AWS should already be set-up with apptainer environments.
+
+## Running the pipeline
+This nextflow pipeline can be run on three different types of machines: 
+1) Locally
+2) Tufts high performance cluster (HPC)
+3) Amazon Web Services (AWS) cloud
+
+Based on the profiles described in `nextflow.config`, we can run the pipeline with the following Nextflow commands:
+
+[NEED TO DOUBLE CHECK THIS PART]
+### Running locally
+`nextflow main.nf --local` 
+
+### Running on the HPC
+TO DO: Still need to figure out the exact nextflow syntax
+
+Jobs on the Tufts HPC can be run in two different ways:
+- **Batch**: the job will be sent to the queue and it will be completed based on how many resources you have requested, current cluster load, and fairshare (have you recently used the cluster) 
+    - `nextflow main.nf --tufts_hpc --batch` 
+
+- **Preempt**: this allows you to run your job preemptively using free nodes from another lab that paid for these compute resources. However, if they are already running a job, your job will be killed and you'll have to resubmit it.
+
+    - `nextflow main.nf --tufts_hpc --preempt` 
+
+
+### Running on AWS
+`nextflow main.nf --amazon` 
+
+> Kevin may want to add additional comments here about different ways to run the pipeline
+
+## Databases
+Several databases must be installed to run the pipeline. 
+
+### Kneaddata
+- A database containing a reference human genome so that unwanted human DNA can be removed from our metagenomic samples.
+
+### MetaPhlAn
+- `mpa_vOct22_CHOCOPhlAnSGB_202403` is the most recent MetaPhlAn database that is compatible with the versions of HUMAnN we are using
+- Note: there is a more up-to-date version (released in January 2025) that we will probably eventually want to shift to once HUMAnN is able to support it.
+
+### HUMAnN
+
+
+## Information on software versions
+This pipeline supports the following versions of MetaPhlAn and HUMAnN:
+ 
+ ### MetaPhlAn
+- MetaPhlAn 3.1.0
+- MetaPhlAn 4
+
+### HUMAnN
+- HUMAnN3 3.7
+- HUMAnN3 4 alpha
+
+## Testing the pipeline
+There are some raw fastq files in `test/` which can be processed through the pipeline
+
+## Using the `master-params.yaml` file
+The `master-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should not be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
 
-**TODO**
\ No newline at end of file
+### Overview of parameters in `master-params.yaml`
+- `paired_end`: True or False, given the type of input data
+- `metaphlan_ver`: MetaPhlAn software version (either `metaphlan3.1.0` or `metaphlan4`)
+- `humann_ver`: HUMAnN3 software version (either `humann3.7` or `humann4_alpha`)
+- `readsdir`: path to directory that contains raw data (bam files)
+- `outdir`: path to directory where processed results will be saved
+- `human_genome`: path to directory that contains human reference database used during Kneaddata 
+- `metaphlan_db`: path to directory that contains metaphlan databases
+- `metaphlan_index`: 
+- `humann_nucleotide_db`: 
+- `humann_protein_db`: 
+- `humann_utility_db`: 
+- `filepattern`: regex describing samples should be named (relative to the input raw data)
\ No newline at end of file

From b1da87c5157e1fdf173b50159399737659b67305 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Sat, 21 Jun 2025 22:06:01 -0400
Subject: [PATCH 24/35] add param for paired-end reads

---
 master-params.yaml | 52 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 master-params.yaml

diff --git a/master-params.yaml b/master-params.yaml
new file mode 100644
index 0000000..fb97efb
--- /dev/null
+++ b/master-params.yaml
@@ -0,0 +1,52 @@
+### Data type
+# paired end data
+paired_end: "True"
+
+### Metaphlan version
+# metaphlan3.1.0 params
+metaphlan_ver : "metaphlan3.1.0"
+# metaphlan4 params
+metaphlan_ver : "metaphlan4"
+
+# humann3.7 params
+humann_ver : "humann3.7"
+# humann4alpha params
+humann_ver : "humann4_alpha"
+
+
+
+### Computing environment
+# local params (will need to fill out yourself based on the location of files on your personal computer)
+
+# readsdir: 
+# outdir: 
+# human_genome: 
+# metaphlan_db: 
+# metaphlan_index: 
+# humann_nucleotide_db: 
+# humann_protein_db: 
+# humann_utility_db: 
+
+# Tufts HPC params
+readsdir: "/cluster/tufts/bonhamlab/shared/sequencing/bam"
+outdir: "/cluster/tufts/bonhamlab/shared/sequencing/processed"
+human_genome: "/cluster/tufts/bonhamlab/shared/databases/biobakery/kneaddata"
+metaphlan_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/metaphlan"
+metaphlan_index: "mpa_vOct22_CHOCOPhlAnSGB_202403"
+humann_nucleotide_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/chocophlan"
+humann_protein_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/uniref"
+humann_utility_db: "/cluster/tufts/bonhamlab/shared/databases/biobakery/humann/utility_mapping"
+
+
+# AWS params
+readsdir: "s3://vkc-nextflow/rawfastq/"
+outdir: "s3://vkc-nextflow/output/"
+human_genome: "s3://biobakery-databases/kneaddata_databases/"
+metaphlan_db: "s3://biobakery-databases/metaphlan_databases/"
+humann_bowtie_db: "s3://biobakery-databases/humann_databases/chocophlan"
+humann_protein_db: "s3://biobakery-databases/humann_databases/uniref"
+humann_utility_db: "s3://biobakery-databases/humann_databases/utility_mapping"
+
+
+# Global params (same regardless of computer environment)
+filepattern: "*.bam" # need to adjust if bam or fastq
\ No newline at end of file

From 91064dfc673ac66c99952893b5df2c27fe7faf19 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Sun, 22 Jun 2025 22:07:42 -0400
Subject: [PATCH 25/35] add more documentation to README

---
 README.md | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d031f47..757f83a 100644
--- a/README.md
+++ b/README.md
@@ -2,14 +2,14 @@
 
 by Kevin Bonham, PhD 
 
-bioBakery
+[bioBakery](https://github.com/biobakery): software, documentation, and tutorials for microbial community profiling (created and maintained by the Huttenhower lab)
 
-- `KneadData`: a data quality-control pipeline that removes host genomic data within our metagenomic samples. Particularly, this pipeline uses a database containing a reference human genome so that all human DNA is removed from the samples. Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
-- `MetaPhlAn`
-- `HUMAnN`
+- [`KneadData`](https://github.com/biobakery/kneaddata): a data quality-control pipeline that removes host genomic data within our metagenomic samples. Particularly, this pipeline uses a database containing a reference human genome so that all human DNA is removed from the samples. Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
+- [`MetaPhlAn`](https://github.com/biobakery/MetaPhlAn): a computational tool for species-level microbial profiling (bacteria, archaea, eukaryotes, and viruses) from metagenomic shotgun sequencing data. Link to more information here:(https://huttenhower.sph.harvard.edu/metaphlan)
+- [`HUMAnN`](https://github.com/biobakery/humann): a pipeline for efficiently and accurately profiling the presence/absence and abundance of microbial pathways in a community from metagenomic or metatranscriptomic sequencing data (typically millions of short DNA/RNA reads). This process, referred to as functional profiling, aims to describe the metabolic potential of a microbial community and its members. Link to more information here:(https://huttenhower.sph.harvard.edu/humann)
 
 ## Setup
-Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](LINK TO BE ADDED). 
+Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](https://github.com/BonhamLab/daniellepinto/blob/main/PeriodicMeetings/2025-06-17.md#danielles-personal-notes). 
 
 Computing environments on the Tufts HPC and AWS should already be set-up with apptainer environments.
 
@@ -47,12 +47,18 @@ Several databases must be installed to run the pipeline.
 
 ### Kneaddata
 - A database containing a reference human genome so that unwanted human DNA can be removed from our metagenomic samples.
+    - The `Homo_sapiens_hg39_T2T_Bowtie2_v0.1` bowtie2 database can be downloaded from [here](https://huttenhower.sph.harvard.edu/kneadData_databases/Homo_sapiens_hg39_T2T_Bowtie2_v0.1.tar.gz).
+        - This version of the database can be used for all analyses and there shouldn't be a big need to upgrade the database (unless we have an updated human genome!)
+- Other reference databases can be added as well if other types of data need to be removed (e.g., human transcriptome, mouse genome, etc.)
 
 ### MetaPhlAn
-- `mpa_vOct22_CHOCOPhlAnSGB_202403` is the most recent MetaPhlAn database that is compatible with the versions of HUMAnN we are using
+- `mpa_vOct22_CHOCOPhlAnSGB_202403` is the most recent MetaPhlAn database that is compatible with the versions of HUMAnN we are using.
+    - It can be downloaded from [here](http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/).
 - Note: there is a more up-to-date version (released in January 2025) that we will probably eventually want to shift to once HUMAnN is able to support it.
 
 ### HUMAnN
+- Looks like there is only version available
+    - Database can be downloaded [here](http://cmprod1.cibio.unitn.it/databases/HUMAnN/).
 
 
 ## Information on software versions
@@ -80,7 +86,7 @@ The `master-params.yaml` file defines all input parameters that you may want to
 - `outdir`: path to directory where processed results will be saved
 - `human_genome`: path to directory that contains human reference database used during Kneaddata 
 - `metaphlan_db`: path to directory that contains metaphlan databases
-- `metaphlan_index`: 
+- `metaphlan_index`: database version (database must exist within `metaphlan_db`)
 - `humann_nucleotide_db`: 
 - `humann_protein_db`: 
 - `humann_utility_db`: 

From 5dd757ee4476119f1481910d3fff70ba984da86d Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Mon, 23 Jun 2025 21:42:47 -0400
Subject: [PATCH 26/35] final first draft of README

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 757f83a..20caba2 100644
--- a/README.md
+++ b/README.md
@@ -57,8 +57,7 @@ Several databases must be installed to run the pipeline.
 - Note: there is a more up-to-date version (released in January 2025) that we will probably eventually want to shift to once HUMAnN is able to support it.
 
 ### HUMAnN
-- Looks like there is only version available
-    - Database can be downloaded [here](http://cmprod1.cibio.unitn.it/databases/HUMAnN/).
+- Database can be downloaded [here](http://cmprod1.cibio.unitn.it/databases/HUMAnN/).
 
 
 ## Information on software versions
@@ -87,7 +86,7 @@ The `master-params.yaml` file defines all input parameters that you may want to
 - `human_genome`: path to directory that contains human reference database used during Kneaddata 
 - `metaphlan_db`: path to directory that contains metaphlan databases
 - `metaphlan_index`: database version (database must exist within `metaphlan_db`)
-- `humann_nucleotide_db`: 
-- `humann_protein_db`: 
+- `humann_nucleotide_db`: path to directory containing chocophlan database
+- `humann_protein_db`: path to directory containing UniRef database
 - `humann_utility_db`: 
 - `filepattern`: regex describing samples should be named (relative to the input raw data)
\ No newline at end of file

From 0758a1c40f20f661baeb90a64310cc271576f05a Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Tue, 24 Jun 2025 20:33:00 -0400
Subject: [PATCH 27/35] rename master-params to template-params

---
 README.md                                  | 6 +++---
 master-params.yaml => template-params.yaml | 0
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename master-params.yaml => template-params.yaml (100%)

diff --git a/README.md b/README.md
index 20caba2..9983548 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,10 @@ This pipeline supports the following versions of MetaPhlAn and HUMAnN:
 ## Testing the pipeline
 There are some raw fastq files in `test/` which can be processed through the pipeline
 
-## Using the `master-params.yaml` file
-The `master-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should not be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
+## Using the `template-params.yaml` file
+The `template-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should not be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
 
-### Overview of parameters in `master-params.yaml`
+### Overview of parameters in `template-params.yaml`
 - `paired_end`: True or False, given the type of input data
 - `metaphlan_ver`: MetaPhlAn software version (either `metaphlan3.1.0` or `metaphlan4`)
 - `humann_ver`: HUMAnN3 software version (either `humann3.7` or `humann4_alpha`)
diff --git a/master-params.yaml b/template-params.yaml
similarity index 100%
rename from master-params.yaml
rename to template-params.yaml

From fe48fb880898188a8a9b5a205c015df66e5bf390 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Tue, 24 Jun 2025 21:32:15 -0400
Subject: [PATCH 28/35] update nextflow commands in README

---
 README.md            | 22 ++++++++++++----------
 template-params.yaml |  3 +++
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 9983548..5fc9f7d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ by Kevin Bonham, PhD
 - [`MetaPhlAn`](https://github.com/biobakery/MetaPhlAn): a computational tool for species-level microbial profiling (bacteria, archaea, eukaryotes, and viruses) from metagenomic shotgun sequencing data. Link to more information here:(https://huttenhower.sph.harvard.edu/metaphlan)
 - [`HUMAnN`](https://github.com/biobakery/humann): a pipeline for efficiently and accurately profiling the presence/absence and abundance of microbial pathways in a community from metagenomic or metatranscriptomic sequencing data (typically millions of short DNA/RNA reads). This process, referred to as functional profiling, aims to describe the metabolic potential of a microbial community and its members. Link to more information here:(https://huttenhower.sph.harvard.edu/humann)
 
-## Setup
+## Environment setup
 Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](https://github.com/BonhamLab/daniellepinto/blob/main/PeriodicMeetings/2025-06-17.md#danielles-personal-notes). 
 
 Computing environments on the Tufts HPC and AWS should already be set-up with apptainer environments.
@@ -23,27 +23,28 @@ Based on the profiles described in `nextflow.config`, we can run the pipeline wi
 
 [NEED TO DOUBLE CHECK THIS PART]
 ### Running locally
-`nextflow main.nf --local` 
+`nextflow run main.nf -profile local -params-file params.yaml` 
 
 ### Running on the HPC
-TO DO: Still need to figure out the exact nextflow syntax
-
 Jobs on the Tufts HPC can be run in two different ways:
 - **Batch**: the job will be sent to the queue and it will be completed based on how many resources you have requested, current cluster load, and fairshare (have you recently used the cluster) 
-    - `nextflow main.nf --tufts_hpc --batch` 
 
 - **Preempt**: this allows you to run your job preemptively using free nodes from another lab that paid for these compute resources. However, if they are already running a job, your job will be killed and you'll have to resubmit it.
 
-    - `nextflow main.nf --tufts_hpc --preempt` 
+With how the HPC environment is currently defined in `nextflow.config`, jobs will first be submitted to the batch queue. If there are not any available resources, it will be processed preemptively. 
+
 
+- `nextflow run main.nf -profile tufts_hpc -params-file params.yaml` 
 
 ### Running on AWS
-`nextflow main.nf --amazon` 
+`nextflow run main.nf -profile amazon -params-file params.yaml` 
 
 > Kevin may want to add additional comments here about different ways to run the pipeline
 
+> Note: We can also process samples on the MIT `engaging` cluster, but that should probably not be used without permission
+
 ## Databases
-Several databases must be installed to run the pipeline. 
+Several databases must be installed to run this pipeline. 
 
 ### Kneaddata
 - A database containing a reference human genome so that unwanted human DNA can be removed from our metagenomic samples.
@@ -75,13 +76,14 @@ This pipeline supports the following versions of MetaPhlAn and HUMAnN:
 There are some raw fastq files in `test/` which can be processed through the pipeline
 
 ## Using the `template-params.yaml` file
-The `template-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should not be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
+The `template-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should **not** be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, input data etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
 
 ### Overview of parameters in `template-params.yaml`
+- `data_type`: type of input data (either `fastq` or `bam`)
 - `paired_end`: True or False, given the type of input data
 - `metaphlan_ver`: MetaPhlAn software version (either `metaphlan3.1.0` or `metaphlan4`)
 - `humann_ver`: HUMAnN3 software version (either `humann3.7` or `humann4_alpha`)
-- `readsdir`: path to directory that contains raw data (bam files)
+- `readsdir`: path to directory that contains raw data 
 - `outdir`: path to directory where processed results will be saved
 - `human_genome`: path to directory that contains human reference database used during Kneaddata 
 - `metaphlan_db`: path to directory that contains metaphlan databases
diff --git a/template-params.yaml b/template-params.yaml
index fb97efb..5eedd90 100644
--- a/template-params.yaml
+++ b/template-params.yaml
@@ -1,7 +1,10 @@
 ### Data type
+data_type: "fastq"
+data_type: "bam"
 # paired end data
 paired_end: "True"
 
+
 ### Metaphlan version
 # metaphlan3.1.0 params
 metaphlan_ver : "metaphlan3.1.0"

From 7242da6cb70eef43cc022458b79f7538989729c0 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <danielle.peterson101@gmail.com>
Date: Wed, 25 Jun 2025 16:06:23 -0400
Subject: [PATCH 29/35] update according to Kevin's github feedback

---
 README.md            | 15 ++++++++-------
 template-params.yaml | 19 ++++++++++---------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 5fc9f7d..df33e0d 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ This nextflow pipeline can be run on three different types of machines:
 
 Based on the profiles described in `nextflow.config`, we can run the pipeline with the following Nextflow commands:
 
-[NEED TO DOUBLE CHECK THIS PART]
+
 ### Running locally
 `nextflow run main.nf -profile local -params-file params.yaml` 
 
@@ -54,7 +54,7 @@ Several databases must be installed to run this pipeline.
 
 ### MetaPhlAn
 - `mpa_vOct22_CHOCOPhlAnSGB_202403` is the most recent MetaPhlAn database that is compatible with the versions of HUMAnN we are using.
-    - It can be downloaded from [here](http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/).
+    - It can be browsed and downloaded manually from [here](http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/). The easiest way to download it is to run `metaphlan --install` (along with any other arguments you need).
 - Note: there is a more up-to-date version (released in January 2025) that we will probably eventually want to shift to once HUMAnN is able to support it.
 
 ### HUMAnN
@@ -79,10 +79,12 @@ There are some raw fastq files in `test/` which can be processed through the pip
 The `template-params.yaml` file defines all input parameters that you may want to use to run the Nextflow pipeline. The file should **not** be used directly to run the pipeline. Rather, the user should select the params they need from the file based on how they would like to use the pipeline (software versions of MetaPhlAn or HUMAnN, computing environment, databases, input data etc. ), and paste these into a separate yaml file. This second yaml file can be used to run the Nextflow pipeline. 
 
 ### Overview of parameters in `template-params.yaml`
-- `data_type`: type of input data (either `fastq` or `bam`)
+- `input_data_type`: type of input data (either `fastq` or `bam`)
 - `paired_end`: True or False, given the type of input data
-- `metaphlan_ver`: MetaPhlAn software version (either `metaphlan3.1.0` or `metaphlan4`)
-- `humann_ver`: HUMAnN3 software version (either `humann3.7` or `humann4_alpha`)
+- `filepattern`: glob pattern (e.g. `*.bam` or `*.fastq.gz`) matching the input file names; it must agree with `input_data_type`
+
+- `metaphlan_version`: MetaPhlAn software version (either `metaphlan_v3` or `metaphlan_v4`)
+- `humann_version`: HUMAnN software version (either `humann_v37` or `humann_v4a`)
 - `readsdir`: path to directory that contains raw data 
 - `outdir`: path to directory where processed results will be saved
 - `human_genome`: path to directory that contains human reference database used during Kneaddata 
@@ -90,5 +92,4 @@ The `template-params.yaml` file defines all input parameters that you may want t
 - `metaphlan_index`: database version (database must exist within `metaphlan_db`)
 - `humann_nucleotide_db`: path to directory containing chocophlan database
 - `humann_protein_db`: path to directory containing UniRef database
-- `humann_utility_db`: 
-- `filepattern`: regex describing samples should be named (relative to the input raw data)
\ No newline at end of file
+- `humann_utility_db`: path to directory containing the utility mapping databases, which convert between different protein annotations (e.g. UniRef90 to KO or EC) and provide human-readable names for the annotations that have them
diff --git a/template-params.yaml b/template-params.yaml
index 5eedd90..8c0696e 100644
--- a/template-params.yaml
+++ b/template-params.yaml
@@ -1,20 +1,24 @@
 ### Data type
-data_type: "fastq"
-data_type: "bam"
+input_data_type: "bam"
+input_data_type: "fastq"
 # paired end data
 paired_end: "True"
 
+filepattern: "*.bam" # must match input_data_type: use this for bam input, or one of the fastq patterns below
+# filepattern: "*.fastq" 
+# filepattern: "*.fastq.gz" 
+
 
 ### Metaphlan version
 # metaphlan3.1.0 params
-metaphlan_ver : "metaphlan3.1.0"
+metaphlan_version : "metaphlan_v3"
 # metaphlan4 params
-metaphlan_ver : "metaphlan4"
+metaphlan_version : "metaphlan_v4"
 
 # humann3.7 params
-humann_ver : "humann3.7"
+humann_version : "humann_v37"
 # humann4alpha params
-humann_ver : "humann4_alpha"
+humann_version : "humann_v4a"
 
 
 
@@ -50,6 +54,3 @@ humann_bowtie_db: "s3://biobakery-databases/humann_databases/chocophlan"
 humann_protein_db: "s3://biobakery-databases/humann_databases/uniref"
 humann_utility_db: "s3://biobakery-databases/humann_databases/utility_mapping"
 
-
-# Global params (same regardless of computer environment)
-filepattern: "*.bam" # need to adjust if bam or fastq
\ No newline at end of file

From 9e6a84b7ee96afabdbc90044fd92a42089057af1 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:06:55 -0400
Subject: [PATCH 30/35] add line breaks

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index df33e0d..0236379 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,13 @@ Based on the profiles described in `nextflow.config`, we can run the pipeline wi
 `nextflow run main.nf -profile local -params-file params.yaml` 
 
 ### Running on the HPC
+
 Jobs on the Tufts HPC can be run in two different ways:
-- **Batch**: the job will be sent to the queue and it will be completed based on how many resources you have requested, current cluster load, and fairshare (have you recently used the cluster) 
+
+- **Batch**: the job will be sent to the queue
+  and it will be completed based on how many resources you have requested,
+  current cluster load,
+  and fairshare (have you recently used the cluster) 
 
 - **Preempt**: this allows you to run your job preemptively using free nodes from another lab that paid for these compute resources. However, if they are already running a job, your job will be killed and you'll have to resubmit it.
 

From 6e978425afaae39915c0fa283d973f2891484a0d Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:07:28 -0400
Subject: [PATCH 31/35] update README with Kevin's suggestions about containers

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0236379..dc1cba6 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ by Kevin Bonham, PhD
 ## Environment setup
 Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](https://github.com/BonhamLab/daniellepinto/blob/main/PeriodicMeetings/2025-06-17.md#danielles-personal-notes). 
 
-Computing environments on the Tufts HPC and AWS should already be set-up with apptainer environments.
+Computing environments on the Tufts HPC and AWS should already be set up with container-based (Docker, Apptainer) or conda environments.
 
 ## Running the pipeline
 This nextflow pipeline can be run on three different types of machines: 

From d2d32a691f65ceaf63deca4a8442a63fb03a41d3 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:08:14 -0400
Subject: [PATCH 32/35] add line breaks to Kneaddata description

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dc1cba6..c7d9cb1 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,12 @@ by Kevin Bonham, PhD
 
 [bioBakery](https://github.com/biobakery): software, documentation, and tutorials for microbial community profiling (created and mantained by the Huttenhower lab)
 
-- [`KneadData`](https://github.com/biobakery/kneaddata): a data quality-control pipeline that removes host genomic data within our metagenomic samples. Particularly, this pipeline uses a database containing a reference human genome so that all human DNA is removed from the samples. Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
+- [`KneadData`](https://github.com/biobakery/kneaddata): 
+  a data quality-control pipeline that trims low-quality reads
+  and removes host genomic data from our metagenomic samples.
+  Particularly, this pipeline uses a database containing a reference human genome
+  so that all human DNA is removed from the samples.
+  Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
 - [`MetaPhlAn`](https://github.com/biobakery/MetaPhlAn): a computational tool for species-level microbial profiling (bacteria, archaea, eukaryotes, and viruses) from metagenomic shotgun sequencing data. Link to more information here:(https://huttenhower.sph.harvard.edu/metaphlan)
 - [`HUMAnN`](https://github.com/biobakery/humann): a pipeline for efficiently and accurately profiling the presence/absence and abundance of microbial pathways in a community from metagenomic or metatranscriptomic sequencing data (typically millions of short DNA/RNA reads). This process, referred to as functional profiling, aims to describe the metabolic potential of a microbial community and its members. Link to more information here:(https://huttenhower.sph.harvard.edu/humann)
 

From 2f1bb4246593c5f8ecfd5382232faf1ccd252dbb Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:50:18 -0400
Subject: [PATCH 33/35] Update README.md with formatting suggestions

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c7d9cb1..d5a239c 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,17 @@ by Kevin Bonham, PhD
   Particularly, this pipeline uses a database containing a reference human genome
   so that all human DNA is removed from the samples.
   Link to more information here: (https://huttenhower.sph.harvard.edu/kneaddata/).
-- [`MetaPhlAn`](https://github.com/biobakery/MetaPhlAn): a computational tool for species-level microbial profiling (bacteria, archaea, eukaryotes, and viruses) from metagenomic shotgun sequencing data. Link to more information here:(https://huttenhower.sph.harvard.edu/metaphlan)
-- [`HUMAnN`](https://github.com/biobakery/humann): a pipeline for efficiently and accurately profiling the presence/absence and abundance of microbial pathways in a community from metagenomic or metatranscriptomic sequencing data (typically millions of short DNA/RNA reads). This process, referred to as functional profiling, aims to describe the metabolic potential of a microbial community and its members. Link to more information here:(https://huttenhower.sph.harvard.edu/humann)
+- [`MetaPhlAn`](https://github.com/biobakery/MetaPhlAn): 
+  a computational tool for species-level microbial profiling (bacteria, archaea, eukaryotes, and viruses)
+  from metagenomic shotgun sequencing data.
+  Link to more information here: (https://huttenhower.sph.harvard.edu/metaphlan).
+- [`HUMAnN`](https://github.com/biobakery/humann): 
+  a pipeline for efficiently and accurately profiling the presence/absence and abundance of microbial pathways
+  in a community from metagenomic or metatranscriptomic sequencing data
+  (typically millions of short DNA/RNA reads).
+  This process, referred to as functional profiling,
+  aims to describe the metabolic potential of a microbial community and its members.
+  Link to more information here: (https://huttenhower.sph.harvard.edu/humann).
 
 ## Environment setup
 Instructions for setting up a local environment to run the pipeline can be found on Danielle's notebook [here](https://github.com/BonhamLab/daniellepinto/blob/main/PeriodicMeetings/2025-06-17.md#danielles-personal-notes). 

From 4fd5d1e46d01cff8a6418780286dec0b2eb61839 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:52:18 -0400
Subject: [PATCH 34/35] Update README.md

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d5a239c..1093aa7 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,8 @@ Jobs on the Tufts HPC can be run in two different ways:
   current cluster load,
   and fairshare (have you recently used the cluster) 
 
-- **Preempt**: this allows you to run your job preemptively using free nodes from another lab that paid for these compute resources. However, if they are already running a job, your job will be killed and you'll have to resubmit it.
+- **Preempt**: this allows you to run your job using free nodes from another lab that paid for these compute resources.
+  However, if they attempt to queue a job, your job will be preempted and killed, so you'll have to resubmit it.
 
 With how the HPC environment is currently defined in `nextflow.config`, jobs will first be submitted to the batch queue. If there are not any available resources, it will be processed preemptively. 
 

From 1eaf4f62630aeadca9e417c428f17009d5240bd1 Mon Sep 17 00:00:00 2001
From: Danielle Pinto <108756057+danielle-pinto@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:52:37 -0400
Subject: [PATCH 35/35] Update README.md

Co-authored-by: Kevin Bonham <kevbonham@gmail.com>
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1093aa7..a885657 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,8 @@ Jobs on the Tufts HPC can be run in two different ways:
 - **Preempt**: this allows you to run your job using free nodes from another lab that paid for these compute resources.
   However, if they attempt to queue a job, your job will be preempted and killed, so you'll have to resubmit it.
 
-With how the HPC environment is currently defined in `nextflow.config`, jobs will first be submitted to the batch queue. If there are not any available resources, it will be processed preemptively. 
+With how the HPC environment is currently defined in `nextflow.config`,
+jobs are submitted to both the `batch` and `preempt` queues and will run on whichever has resources available first.
 
 
 - `nextflow run main.nf -profile tufts_hpc -params-file params.yaml`