diff --git a/.gitignore b/.gitignore index a3d6820..7e53725 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ *.fastq.gz .nextflow* **/work/* +input/human_genome +nextflow/kneaddata +nextflow/tutorial-inprogress.nf +input/humann_databases/full_chocophlan.v201901_v31.tar.gz diff --git a/components.typ b/components.typ index 00b8d78..8d03bd0 100644 --- a/components.typ +++ b/components.typ @@ -19,7 +19,9 @@ #v(2em) - Kevin Bonham, PhD + Kevin Bonham \ + Sagun Maharjan \ + Emily Green 2025-07-11 ] @@ -30,14 +32,15 @@ 1. Tasks (processes, rules, etc): single step in a workflow (typically one command) - inputs - outputs - - command + - command#pause 2. Workflow: Order of steps (instructions to build computational graph) + - Also a way to find / specify inputs#pause 3. Parameters: variables that may change from run to run - may be global (eg output directory) - or task-specific (eg memory to allocate) -== Workflows builds computational graph +== Workflows build computational graph #let bent-edge(from, to, ..args) = { let midpoint = (from, 50%, to) @@ -50,10 +53,12 @@ edge(..vertices, "-|>", ..args) } -#set text(16pt) +#text(16pt)[ #diagram( - node-stroke: luma(80%), + node-stroke: blue, edge-corner-radius: none, + edge-stroke: purple.darken(50%), + label-sep:0.2em, spacing: (10pt, 47pt), // Nodes @@ -67,7 +72,7 @@ node((0,2), [Task 3 output 1], name: ), node((1,2), [Task 3 output 2], name: ), - node((2.5,2), [Task3 output], name: ), + node((2.5,2), [Task 4 output], name: ), node((3.5,3), [Report], name: ), @@ -76,17 +81,225 @@ node((8,2), [outputs...], name: ), + edge((-0.3,3),(0.1,3), "-|>"), + node((0.4,3), [Task], stroke:none), + node((0,3.5), [], width:1em,height:1em), + node((0.4,3.5), [File ], stroke: none), + // Edges bent-edge(, , [_Task 1_]), bent-edge(, , [_Task 2_]), bent-edge(, , [_Task 3_]), bent-edge(, ), edge(, , "-|>", [_Task 4_]), - edge(, , "-|>"), edge(, , "-|>"), - edge(, , "-|>"), edge(, , "--|>", [Tasks...]), edge(, , "--|>", [Tasks...]), edge(, , "-|>"), edge(, , "-|>"), ) +] + +== In anadama, tasks are added to workflows + +#slide(composer: (40%,60%))[ +#set text(16pt) +```python +workflow.add_task( + "cat [depends[0]] > [targets[0]]", + depends = input_files, + targets = output_files +) +``` + +][ +#set text(16pt) +#diagram( + node-stroke: blue, + edge-corner-radius: none, + edge-stroke: purple.darken(50%), + label-sep:0.2em, + spacing: (10pt, 47pt), + + // Nodes + node((1.5,0), [*input_file*], stroke:3pt, name: ), + node((0.5,1), [*output_file*], stroke:3pt, name: ), + node((2.5,1), [Task 2 output], name: ), + node((0,2), [Task 3 output 1], name: ), + node((1,2), [Task 3 output 2], name: ), + + node((2.5,2), [Task 4 output], name: ), + bent-edge(, , [*_task1_*]), + bent-edge(, , [_Task 2_]), + bent-edge(, , [_Task 3_]), + bent-edge(, ), + edge(, , "-|>", [_Task 4_]), + + node(stroke:teal+2pt, enclose:(,)) +) +] +== In nextflow, tasks are `process`es + +#slide(composer: (40%,60%))[ +#set text(16pt) + +```groovy +process task1 { + input: + path input_file + + output: + path output_file + + shell: + """ + cat $input_file > $output_file + """ +} +``` + +][ +#set text(16pt) +#diagram( + node-stroke: blue, + edge-corner-radius: none, + edge-stroke: purple.darken(50%), + label-sep:0.2em, + spacing: (10pt, 47pt), + + // Nodes + node((1.5,0), [*input_file*], stroke:3pt, name: ), + node((0.5,1), [*output_file*], stroke:3pt, name: ), + node((2.5,1), [Task 2 output], name: ), + node((0,2), [Task 3 output 1], name: ), + node((1,2), [Task 3 output 2], name: ), + + node((2.5,2), [Task 4 output], name: ), + bent-edge(, , [*_task1_*]), + bent-edge(, , [_Task 2_]), + bent-edge(, , [_Task 3_]), + bent-edge(, ), + edge(, , "-|>", [_Task 4_]), + + node(stroke:teal+2pt, enclose:(,)) +) +] + +#slide(composer: (40%,60%))[ + +#set text(16pt) +```groovy +process task3 { + input: + path in_from_1 + + output: + path out1 + path out2 + + shell: + """ + cat $input_file > $out1 + echo "I'm done!" > $out2 + """ +} +``` + +][ +#set text(16pt) +#diagram( + node-stroke: blue, + edge-corner-radius: none, + edge-stroke: purple.darken(50%), + label-sep:0.2em, + spacing: (10pt, 47pt), + + // Nodes + node((1.5,0), [Input file 1], name: ), + node((0.5,1), [*in_from_1*], name: , stroke:3pt), + node((3.5,1), [Task 2 output], name: ), + node((0,2), [*out1*], name: , stroke:3pt), + node((1.5,2), [*out2*], name: , stroke:3pt), + + node((3.5,2), [Task 4 output], name: ), + bent-edge(, , [_Task 1_]), + bent-edge(, , [_Task 2_]), + bent-edge(, , [*_task3_*], label-pos:0.6), + bent-edge(, ), + edge(, , "-|>", [_Task 4_]), + node(stroke:teal+2pt, enclose:(,,)) +) +] + +== Processes are called like functions + +#slide(composer: (43%,50%))[ +#set text(15pt) +```groovy +workflow { + input_ch = Channel + .fromPath("inputs/*.txt") + + t1_out = task1(input_ch) + t2_out = task2(input_ch) + + t3_out = task3(t1_out) + t4_out = task4(t2_out) + + report = report_task(t3_out + .collect().map { + t3-> t3[1] + }) + +} +``` + +][ +#set text(13pt) +#diagram( + node-stroke: blue, + edge-corner-radius: none, + edge-stroke: purple.darken(50%), + label-sep:0.2em, + spacing: (10pt, 47pt), + + // Nodes + node((1.5,0), [input_ch[0]], name: ), + node((0.5,1), [t1_out], name: ), + node((2.5,1), [t2_out], name: ), + node((6, 0), [input_ch[n]], name:), + + node((5.5, 1), [...], stroke:none), + + node((0,2), [t3_out[0]], name: ), + node((1,2), [t3_out[1]], name: ), + + node((2.5,2), [t4_out], name: ), + + node((2.5,3), [Report], name: ), + + node((5,0), [input_ch[1]], name: ), + node((5,2), [outputs...], name: ), + + node((6,2), [outputs...], name: ), + + edge((-0.3,3),(0.1,3), "-|>"), + node((0.4,3), [Task], stroke:none), + node((0,3.5), [], width:1em,height:1em), + node((0.4,3.5), [File ], stroke: none), + + // Edges + bent-edge(, , [_Task 1_]), + bent-edge(, , [_Task 2_]), + bent-edge(, , [_Task 3_]), + bent-edge(, ), + edge(, , "-|>", [_Task 4_]), + edge(, , "-|>"), + edge(, , "--|>", []), + edge(, , "--|>", []), + edge(, , "-|>"), + edge(, , "-|>"), +) +] + + diff --git a/nextflow/.tutorial-answers.nf b/nextflow/.tutorial-answers.nf new file mode 100644 index 0000000..31a9648 --- /dev/null +++ b/nextflow/.tutorial-answers.nf @@ -0,0 +1,77 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +workflow { + reads_ch = Channel.fromPath("../input/*.fastq.gz") + + kneaddata_out = kneaddata(reads_ch) + metaphlan_out = metaphlan(kneaddata_out[0], kneaddata_out[1]) + humann_out = humann(metaphlan_out[0], metaphlan_out[1], metaphlan_out[2]) +} + +process kneaddata { + publishDir "kneaddata/" + input: + path file + + output: + val sample + path "${sample}_kneaddata.fastq.gz" + path "${sample}_kneaddata*.fastq.gz" + path "${sample}_kneaddata.log" + + + script: + sample = file.name.replaceAll(".fastq.gz", "") + + """ + kneaddata --unpaired $file --output ./ --output-prefix ${sample}_kneaddata \ + --reference-db ${params.kneaddata_db} + + gzip *.fastq + """ +} + +process metaphlan { + publishDir "metaphlan/", mode: copy + + input: + val sample + path knead_out + + output: + val sample + path knead_out + path "${sample}_profile.tsv" + + shell: + + """ + metaphlan $knead_out -o ${sample}_profile.tsv \ + --input_type fastq + """ +} + +process humann { + publishDir "humann/", mode: copy + + input: + val sample + path knead_out + path profile + + output: + path "${sample}_genefamilies.tsv" + path "${sample}_pathabundance.tsv" + path "${sample}_pathcoverage.tsv" + + shell: + + """ + humann --input $knead_out -o ./ --taxonomic-profile $profile \ + --remove-temp-output --search-mode uniref90 \ + --output-basename $sample + """ +} + diff --git a/nextflow/main.nf b/nextflow/main.nf deleted file mode 100644 index 7358474..0000000 --- a/nextflow/main.nf +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -include { metaphlan } from './process.nf' - -workflow { - reads = Channel - .fromPath("${params.readsdir}/*.fastq.gz") - .map { - file -> - def sample = file.baseName.split("_")[0] // HD32R1_subsample.fastq.gz -> HD32R1 - return tuple(sample, file) - } - - kneaddata(reads) -} - -process kneaddata { - input: - tuple val(sample), path(reads) - - output: - path "${sample}_kneaddata.fastq.gz" - path "${sample}_kneaddata*.fastq.gz" - path "${sample}_kneaddata.log" - - - shell: - - """ - kneaddata --unpaired $reads --output ./ --output-prefix ${sample}_kneaddata - """ -} diff --git a/nextflow/nextflow-tutorial.md b/nextflow/nextflow-tutorial.md new file mode 100644 index 0000000..abafda6 --- /dev/null +++ b/nextflow/nextflow-tutorial.md @@ -0,0 +1,354 @@ +# Nextflow Biobakery Tutorial + +In this version of the workflow tutorial, +we're going to be building a workflow +that takes single-end read files and runs 3 steps + +1. `kneaddata` for quality control +2. `metaphlan` for taxonomic profiling +3. `humann` for functional profiling + +This is what it would look like for a single file, +call it `XXXX_subsample.fastq.gz`. + +First, we run `kneaddata`: + +```sh +kneaddata --unpaired XXXX_subsample.fastq.gz \ + --output ./ \Starting files + + --output-prefix XXXX_kneaddata +``` + +This will create a few different files, + +- `XXXX_kneaddata.fastq.gz`: this is the main output we care about for downstream applications +- `XXXX_kneaddata*.fastq.gz`: there may be a few different are outputs from quality control (eg trimmed/low quality reads) +- `XXXX_kneaddata.log`: log files that contains info on read depth etc (useful for reporting later) + +We then want to take the first of these outputs, and run metaphlan + +```sh +metaphlan XXXX_kneaddata.fastq.gz --output XXXX_profile.tsv \ + --input_type fastq +``` + +This will create the file `XXXX_profile.tsv`. + +Finally, we want to take the first output from `kneaddata` +and the output from `metaphlan` +and run `humann`: + +```sh +humann --input XXXX_kneaddata.fastq.gz \ + --taxonomic-profile XXXX_profile.tsv \ + --output ./ \ + --output-basename XXXX +``` + +## The structure of a nextflow workflow + +Nextflow scripts are written in groovy - +a dialect of Java. + +```groovy +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +// This is a comment +workflow { + + // Channel factory to get starting values / files / etc + input_ch = Channel.fromPath("../input/*.fastq.gz") + + // processes to call + p1_out = process1(input_ch) + p2_out = process2(p1_out) +} + +// process definitions +process p1 { + input + val some_var + + output + val other_var + + script: + """ + echo $some_var > $other var + """ +} + +process p2 { + // etc... +} +``` + +The basic structure is: + +1. preamble - shebang to define the language, + and a declaration of the nextflow language version +2. a workflow definition - at it's most basic, + this just calls a series of functions +3. process definitions - these can go before or after the workflow + +A few optional components: + +- Other files with eg process definitions can be brought in using `include`, + eg `include { my_process } from "./other_processes.nf"` +- If you'd like to define additional variables / parameters, + you can do so in a `yaml` file. + For example, you can have a `my_params.yaml` file: + + ```yaml + input_directory: /path/on/hpc/ + file_pattern: "*_R{1,2}.fastq.gz" + ``` + + and then refer to them as `"${params.input_directory}"` and + `"${params.file_pattern}"` in your nextflow workflow. + - point to the params file with `-params my_params.yaml` when calling the script + - or you can also pass `params` variables when calling the nextflow script + using command-line flags, eg `--input_directory "/path/on/hpc" --file_pattern "*_R{1,2}.fastq.gz` +- You can include additional parameters on a per-process basis + or globally, either by including them within a process definition, or using a config file. + + +## Channel Factories + +The first thing when defining a script is to determine the inputs. +`nextflow` uses "channels", which are just iterators over multiple values. +When you call a process on a channel, the output of that process is another channel, +which you can pass to another process. + +Helpfully, `nextflow` has a number of ["channel factories"][channel-factory] +that can make channels using common patterns, like pairs of files +or sequencess of numbers. +Because we're using single-end files for this tutorial, +we're using the `fromPath` channel, +which finds all files matching a given glob pattern, +in this case, `*.fastq.gz`. + +[channel-factory]: https://www.nextflow.io/docs/latest/reference/channel.html#channel-factory + +Take a look in the `tutorial.nf` file - in the `workflow` definition, +you'll see: + +```groovy +reads_ch = Channel.fromPath("*.fastq.gz") +``` + +## Processes + +The bulk of the logic for a workflow occurs in a `process` definition. +Processes need `input:`, `output:` and `script:` directives[^script]. + +[^script]: You can also use `exec:` which will run `groovy` code +rather than shell code. + +### Script + +Let's start at the end - take a look at the `kneaddata` process: + +```groovy +process kneaddata { + // ... + + script: + sample = file.name.replaceAll("fastq.gz", "") + + """ + kneaddata --unpaired $file --output ./ --output-prefix ${sample}_kneaddata \ + --reference-db ${params.kneaddata_db} + + gzip *.fastq + """ +} +``` + +Here, we define a new variable `sample` that is the input file name +without the `fastq.gz` extension. +For example, the first input `HD32R1_subsample.fastq.gz` +will have `sample = HD32R1_subsample`. + +Below that, we use triple-quotes to enter a multi-line string - +this is what will get passed to the shell. +Variables can be interpolated with `$`, so eg `$file` will get replaced +with the contents of the variable `file`. +The `--output-prefix` flag is given `${sample}_kneaddata` with surrounding curly braces +since `$sample_kneaddata` would look for the variable `sample_kneaddata`, +rather than appending `_kneaddata` to the variable `sample`. +You can also use curly braces if your variable is an accessor or a function, +eg `${params.some_var}` will get the `some_var` field from the `params` variable, +while `$params.some_var` (without curly braces) in a string will try to append the string `.some_var` +to the variable `params`. + +Otherwise, this looks just like a shell command. +Here, we're running kneaddata and then gzipping the outputs. + +### Input and output + +Now, let's look at the beginning of the process: + +```groovy +process kneaddata { + input: + path file + + output: + val sample + path "${sample}_kneaddata.fastq.gz" + path "${sample}_kneaddata*.fastq.gz" + path "${sample}_kneaddata.log" + + // ... +} +``` + +Here, we're defining the inputs and outputs to the process. +In this case, we have a single input - the `file` path +that comes from the `fromFile` channel factory. +And we'll generate several outputs. +Recall that the `sample` variable (`val` can be a string or number etc) +is defined below in the `script` section. +We will return that value so that downstream processes can use it, +as well as 3 paths + +1. `${sample}_kneaddata.fastq.gz` is the primary output file +2. `${sample}_kneaddata*.fastq.gz` will get all of the other files + matching this glob pattern in an array +3. `${sample}_kneaddata.log` the log file + +## Running the workflow + +In it's current state, if you make this directory +(`hutlab_reproWF/nextflow`) +the working directory, you should be able to run: + +```sh +$ nextflow tutorial.nf --kneaddata_db ../input/human_genome/ +``` + +This will find the 6 fastq files and run `kneaddata` on them. +Logs for the run are found in `.nextflow.log`, +and all of the products of the run +will be found in a "work directory" - by default `work/`. + +This is probably not want you want though - +chances are you want them in a specific location. +For that, we can add a `publishDir` to the process definition. + +Eg: + +```groovy +process kneaddata { + publishDir "kneaddata/" + + // ... +} +``` + +After adding this, run the workflow again, +including the `-resume` flag, eg: + +```groovy +$ nextflow tutorial.nf --kneaddata_db ../input/human_genome/ -resume +``` + +Your previous run should be cached, but now the outputs will be linked +to the `kneaddata/` directory. +Notice that these are symlinks - run `ls -l kneaddata/` +and you will see that they still point into the work directory. + +If you want to change this behavior, you can change the `mode` parameter of `publishDir`. +For example, `publishDir "kneaddata/", mode: 'link'` will use hard links instead of symlinks. +`mode: 'copy'` will make copies of files instead. +You can also use `mode: 'move'`, but I would avoid this in general - +if you need the files in later steps, this will break the workflow. + +## Adding a new process + +We are now ready to add in the next step - `metaphlan`. +Recall that the shell command we want to run is + +```sh +metaphlan XXXX_kneaddata.fastq.gz --output XXXX_profile.tsv \ + --input_type fastq +``` + +First, uncomment the call to the `metaphlan` process in the workflow +(remove the leading `//`). +Notice that there are two arguments being passed: `kneaddata_out[0]` and `kneaddata_out[1]`. +These refer to the first and second outputs of the `kneaddata` process, +the `sample` value and the `knead_out` path. + +Take a look at the process definition already completed - + +```groovy +process metaphlan { + input: + val sample + path knead_out + + output: + val sample + path knead_out + path "${sample}_profile.tsv" + + script: + """ + # Your code here... + + """ + +} +``` + +Here, `input:` has these two arguments, +*and so does `output:`*. +This is because we'll need to pass these variables to `humann` as well - +unfortunately, if you try to re-use the `kneaddata` outputs in a subsequent step, +the order of the iterators get out of whack +and you end up mixing outputs from different samples together. + +Enter the correct command in the `script` section - +remember to replace `XXXX` with the correct string interpolation syntax. + +Once you're ready to try it, run the exact same command as above (including `-resume`!). +You should see something like: + +```sh +nextflow tutorial.nf --kneaddata_db ../input/human_genome/ -resume + + N E X T F L O W ~ version 24.04.3 + +Launching `tutorial-inprogress.nf` [sick_shockley] DSL2 - revision: e0e9bc543a + +executor > local (6) +[4b/a32e45] kneaddata (4) [100%] 6 of 6, cached: 6 ✔ +[76/547136] metaphlan (2) [ 0%] 0 of 6 +``` + +Did you remember to add a `publishDir`? +If not, don't sweat it! +Add it now, and run it again with `-resume`, +your previous results should not need to re-run. + +## Make the humann process and call it + +You now know all of the pieces! +Can you create a process to run `humann`? +This is the shell command: + +```sh +humann --input XXXX_kneaddata.fastq.gz \ + --taxonomic-profile XXXX_profile.tsv \ + --output ./ \ + --output-basename XXXX +``` + +The input and taxonomic profiles should be outputs from previous processes. +How will you get the output basename? + diff --git a/nextflow/process.nf b/nextflow/process.nf deleted file mode 100644 index c082f04..0000000 --- a/nextflow/process.nf +++ /dev/null @@ -1,16 +0,0 @@ -process metaphlan { - input: - val sample - path knead_out - - output: - path "${sample}_profile.tsv" - - shell: - - """ - metaphlan $knead_out -o ${sample}_profile.tsv \ - --input_type fastq - -t rel_ab_w_read_stats - """ -} diff --git a/nextflow/tutorial.nf b/nextflow/tutorial.nf new file mode 100644 index 0000000..39ee69a --- /dev/null +++ b/nextflow/tutorial.nf @@ -0,0 +1,68 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +workflow { + reads_ch = Channel.fromPath("../input/*.fastq.gz") + + kneaddata_out = kneaddata(reads_ch) + // metaphlan_out = metaphlan(kneaddata_out[0], kneaddata_out[1]) + + // What to add to run the humann process? +} + +process kneaddata { + input: + path file + + output: + val sample + path "${sample}_kneaddata.fastq.gz" + path "${sample}_kneaddata*.fastq.gz" + path "${sample}_kneaddata.log" + + + script: + sample = file.name.replaceAll(".fastq.gz", "") + + """ + kneaddata --unpaired $file --output ./ --output-prefix ${sample}_kneaddata \ + --reference-db ${params.kneaddata_db} + + gzip *.fastq + """ +} + +process metaphlan { + input: + val sample + path knead_out + + output: + val sample + path knead_out + path "${sample}_profile.tsv" + + script: + """ + # Your code here... + + """ + +} + +process humann { + input: + // what inputs do you knead? + + output: + // what outputs do you need? + + script: + + """ + # your code here: + + """ +} +