Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions WDL/wmgx.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
version 1.0

# Batch metagenomics workflow: every input FASTQ is cleaned by kneaddata_task,
# and the cleaned reads are then handed to metaphlan_task and humann_task.
workflow metagenomics_batch_pipeline {
    input {
        Array[File] raw_reads_fastqs
        String kneaddata_db
        String output_dir
        Int threads = 4
    }

    scatter (fastq in raw_reads_fastqs) {
        # Sample name is the file name with a trailing ".fastq" removed.
        String sample_name = basename(fastq, ".fastq")

        # Per-sample, per-tool output directory paths: <output_dir>/<sample>/<tool>
        String kneaddata_sample_out_dir = "~{output_dir}/~{sample_name}/kneaddata"
        String metaphlan_sample_out_dir = "~{output_dir}/~{sample_name}/metaphlan"
        String humann_sample_out_dir = "~{output_dir}/~{sample_name}/humann"

        # Step 1: clean the raw reads.
        call kneaddata_task {
            input:
                raw_reads = fastq,
                db = kneaddata_db,
                threads = threads,
                out_dir = kneaddata_sample_out_dir
        }

        # Step 2a: taxonomic profiling of the cleaned reads.
        call metaphlan_task {
            input:
                knead_out_fastq = kneaddata_task.cleaned_fastq,
                threads = threads,
                out_dir = metaphlan_sample_out_dir
        }

        # Step 2b: functional profiling of the same cleaned reads.
        call humann_task {
            input:
                knead_out_fastq = kneaddata_task.cleaned_fastq,
                threads = threads,
                out_dir = humann_sample_out_dir
        }
    }

    output {
        # Values declared inside the scatter are gathered into one array entry
        # per sample. The *_dirs outputs are the path strings that were passed
        # to the tasks, not File objects; only metaphlan_profiles holds files.
        Array[String] kneaddata_dirs = kneaddata_sample_out_dir
        Array[File] metaphlan_profiles = metaphlan_task.profile_txt
        Array[String] humann_dirs = humann_sample_out_dir
    }
}

# Runs KneadData on one single-end FASTQ file.
#
# Inputs:
#   raw_reads - FASTQ handed to `kneaddata --unpaired`
#   db        - value handed to `--reference-db`
#   threads   - value handed to `--threads`, also requested as runtime cpu
#   out_dir   - directory that is created (mkdir -p) and handed to `--output`
# Outputs:
#   sample_base_name - file name of raw_reads without a trailing ".fastq"
#   cleaned_fastq    - <out_dir>/<sample_base_name>_kneaddata.fastq
task kneaddata_task {
    input {
        File raw_reads
        String db
        Int threads
        String out_dir
    }

    # Pin the output prefix explicitly so the file declared in `output` always
    # matches what the command writes, instead of relying on the tool's default
    # naming of its results.
    String output_prefix = sub(basename(raw_reads), "\\.fastq$", "") + "_kneaddata"

    command <<<
        set -euo pipefail
        # Paths are quoted so that whitespace in a path does not split arguments.
        mkdir -p "~{out_dir}"
        kneaddata \
            --unpaired "~{raw_reads}" \
            --reference-db "~{db}" \
            --output "~{out_dir}" \
            --output-prefix "~{output_prefix}" \
            --threads ~{threads} \
            --bypass-trf
    >>>

    output {
        # Only newly generated values are exposed, not the input 'out_dir' itself.
        String sample_base_name = sub(basename(raw_reads), "\\.fastq$", "")
        File cleaned_fastq = "~{out_dir}/~{output_prefix}.fastq"
    }

    runtime {
        cpu: threads
    }
}

# Runs MetaPhlAn on one FASTQ file and exposes the resulting profile.
#
# Inputs:
#   knead_out_fastq - FASTQ given to metaphlan as its input (--input_type fastq)
#   threads         - value handed to `--nproc`, also requested as runtime cpu
#   out_dir         - directory that is created (mkdir -p) and receives
#                     metaphlan.bowtie2.bz2 and profile.txt
# Outputs:
#   profile_txt - <out_dir>/profile.txt, the file written through `-o`
task metaphlan_task {
    input {
        File knead_out_fastq
        Int threads
        String out_dir
    }

    command <<<
        set -euo pipefail
        # Paths are quoted so that whitespace in a path does not split arguments.
        mkdir -p "~{out_dir}"
        metaphlan \
            "~{knead_out_fastq}" \
            --input_type fastq \
            --nproc ~{threads} \
            --bowtie2out "~{out_dir}/metaphlan.bowtie2.bz2" \
            -o "~{out_dir}/profile.txt"
    >>>

    output {
        # Only the newly generated file is exposed, not the input 'out_dir' itself.
        File profile_txt = "~{out_dir}/profile.txt"
    }

    runtime {
        cpu: threads
    }
}

# Runs HUMAnN on one FASTQ file.
#
# Inputs:
#   knead_out_fastq - FASTQ handed to `humann --input`
#   threads         - value handed to `--threads`, also requested as runtime cpu
#   out_dir         - directory that is created (mkdir -p) and handed to `--output`
# Outputs:
#   output_tables - every *.tsv file found directly under out_dir after the run
task humann_task {
    input {
        File knead_out_fastq
        Int threads
        String out_dir
    }

    command <<<
        set -euo pipefail
        # Paths are quoted so that whitespace in a path does not split arguments.
        mkdir -p "~{out_dir}"
        humann \
            --input "~{knead_out_fastq}" \
            --output "~{out_dir}" \
            --threads ~{threads}
    >>>

    output {
        # WDL 1.0 has no Directory type, so the result tables are collected with
        # a glob instead of returning the directory itself. Declaring them lets
        # the engine track this task's results; previously nothing was declared.
        # NOTE(review): assumes HUMAnN writes its result tables as .tsv files
        # directly under out_dir and that out_dir is relative to the task's
        # execution directory - confirm for the HUMAnN version in use.
        Array[File] output_tables = glob("~{out_dir}/*.tsv")
    }

    runtime {
        cpu: threads
    }
}
9 changes: 9 additions & 0 deletions WDL/wmgx_inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"metagenomics_batch_pipeline.raw_reads_fastqs": [
"/home/hutlab_public/Tutorials/hutlab_reproWF/anadama2/input/HD32R1_subsample.fastq",
"/home/hutlab_public/Tutorials/hutlab_reproWF/anadama2/input/HD42R4_subsample.fastq"
],
"metagenomics_batch_pipeline.kneaddata_db": "/home/hutlab_public/Tutorials/hutlab_reproWF/input/human_genome",
"metagenomics_batch_pipeline.output_dir": "analysis_output",
"metagenomics_batch_pipeline.threads": 2
}
63 changes: 63 additions & 0 deletions anadama2/anadama2_tutorials.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# AnADAMA2 workflow tutorial:



### (Section 1.) AnADAMA2 and general example workflow (Python executable + R executable + Data table + PDF generation task)
This tutorial shows how to use AnADAMA2 to build a fully tracked, reproducible workflow. It uses trim.py to trim raw TSV files, plot.py to generate figures, and analysis.R to perform statistical analysis in R, while also backing up intermediate text outputs, and generating a PDF report from a Jinja template. AnADAMA2 handles directory creation, tracks both your scripts and input files, and only reruns tasks when inputs or code have changed to maintain efficiency and reproducibility.

```
cd ~/Tutorials/hutlab_reproWF/anadama2
python3 run.py --input ./input/ --output output --lines 10 --metadata ./input/metadata.tsv
```

### (Section 2.) AnADAMA2 running metaphlan for multiple samples
This tutorial shows how to set up an AnADAMA2 pipeline to run MetaPhlAn on all your samples with a single command. You start by importing AnADAMA2 and creating a `Workflow` instance, then add an optional argument for file extensions and parse the command-line inputs. Next you collect all input files matching that extension and automatically generate corresponding output names tagged with “metaphlan_taxonomy.” By defining a single task group that runs MetaPhlAn on each input and directs its output to the matching target file, you can then launch everything at once with `workflow.go()`, letting AnADAMA2 handle dependency tracking, parallel execution, and only re-running steps when inputs or code change.
```
cd ~/Tutorials/hutlab_reproWF/anadama2
python3 run_metaphlan_workflow.py --input ./input/ --output output_metaphlan_workflow
```
### (Section 3.) AnADAMA2 running metaphlan for multiple samples in a cluster

- **Step 1:** Change `add_task_group` to `add_task_group_gridable` in the above workflow code.
- **Step 2:** Remove the bowtie2 intermediate files generated in the previous section using `rm -rf ~/Tutorials/hutlab_reproWF/anadama2/input/*bowtie2out.txt`
- **Note:** Since no grid is available in the VM, the tasks will run locally.
```
cd ~/Tutorials/hutlab_reproWF/anadama2
python3 run_metaphlan_workflow_grid.py --input ./input/ --output output_metaphlan_workflow_grid --grid-jobs 2
```

### (Section 4.) AnADAMA2 - Running KneadData + MetaPhlAn + HUMAnN
This script provides an end-to-end AnADAMA2 custom biobakery workflow that automatically discovers all your FASTQ files, cleans them with KneadData using your specified human reference database, generates taxonomic profiles with MetaPhlAn, and produces functional annotations with HUMAnN, all in one go.
AnADAMA2 will track your input files and scripts, create per-sample output directories, and only rerun steps when the underlying data or code have changed, ensuring an efficient, reproducible metagenomics pipeline.
```
cd ~/Tutorials/hutlab_reproWF/anadama2
python3 run_custom_biobakery_workflow.py --input ./input --output output_custom_biobakery_workflows
```


# WDL workflow tutorial:

### Introduction

The Workflow Description Language (WDL) is a concise, human-readable DSL designed to specify bioinformatics and data-science pipelines in terms of **tasks** (individual command-line steps) and **workflows** (how those tasks connect). By pairing a WDL script with the Cromwell execution engine, you get a fully reproducible, portable pipeline: Cromwell reads your `*.wdl` file plus a JSON of runtime inputs, spins up each task in order (or in parallel when you use `scatter`), tracks inputs and outputs, and, when call caching is enabled, reruns only what’s changed.
In this example, the `wmgx.wdl` script defines the `metagenomics_batch_pipeline` workflow, which:
1. **Scatters** over all your input FASTQ files
2. **Cleans** reads with KneadData (`kneaddata_task`)
3. **Profiles** taxonomy via MetaPhlAn3 (`metaphlan_task`)
4. **Profiles** function via HUMAnN3 (`humann_task`)

Each sample’s results live in its own subdirectory under your chosen `output_dir`: each task creates its directory with `mkdir -p` and requests the number of CPUs given by `threads` through its `runtime` section, and Cromwell collects the outputs you declare.

### Requirements

- **Java** (to run Cromwell’s JAR)
- **Cromwell** (`cromwell.jar`)
- All command-line tools on your `PATH`: `kneaddata`, `metaphlan`, `humann`


### Running WDL Locally with Cromwell
```
cd ~/Tutorials/hutlab_reproWF/WDL
cromwell run wmgx.wdl --inputs wmgx_inputs.json
```
This single command tells Cromwell to load your WDL, ingest the inputs from `wmgx_inputs.json`, and execute the entire metagenomics batch pipeline, parallelizing across samples, managing dependencies, and delivering a reproducible set of outputs.
30 changes: 30 additions & 0 deletions anadama2/doc/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# README

### Dependencies:
- R version 3.6.3
- Python version >=3.7


##### Step 1: All default values are configured in `etc/config.ini`. Please add or update the existing tasks for your private analysis workflow.

##### Usage:

```python run.py --help```

```
usage: run.py [-h] [--version] [--lines LINES] -o OUTPUT [-i INPUT]
[--config CONFIG] [--local-jobs JOBS] [--grid-jobs GRID_JOBS]
[--grid GRID] [--grid-partition GRID_PARTITION]
[--grid-benchmark {on,off}] [--grid-options GRID_OPTIONS]
[--grid-environment GRID_ENVIRONMENT]
[--grid-scratch GRID_SCRATCH] [--dry-run] [--skip-nothing]
[--quit-early] [--until-task UNTIL_TASK]
[--exclude-task EXCLUDE_TASK] [--target TARGET]
[--exclude-target EXCLUDE_TARGET]
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
```
## Step 2: Running the workflow
The input and the output arguments are required for the workflow.
```
python run.py -i input/data.tsv -o output
```
83 changes: 83 additions & 0 deletions anadama2/doc/template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#' **Feel free to edit/replace the template code below here and start working on the analysis.**
#'
#+ echo=False
# import pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
from anadama2 import PweaveDocument
# Handle to the AnADAMA2 Pweave document helper used by this report template.
document = PweaveDocument()
# Variables made available to this template; they are looked up by name in the
# text below (e.g. vars["introduction_text"]).
# NOTE(review): presumably these are the values supplied by the workflow's
# document task - confirm. The name shadows Python's built-in vars().
vars = document.get_vars()

#' Date: <%= time.strftime("%d, %b %Y") %>
#'
#' # Demo Analysis Study Title
#'
#' # Introduction
#' <% print(vars["introduction_text"]) %>Please follow the following code pattern while doing an analysis.
#' This is an example of a document that can be published using Pweave. Text is written using markdown (`#'`)
#' and python code between <%=%> is executed and its results are included in the resulting pdf document.
#' You can define various options for code chunks to control code execution and formatting [see
#' Pweave docs](https://anadama2.readthedocs.io/en/latest/document.html).



#' # Examples
#' ### read_table example
#'
#' ```
#' Description:
#' Read the table from a text file with the first line
#' the column names and the first column the row names.
#'
#' Parameters:
#' file (str) – The file to read
#' invert (bool) – Invert the table rows/columns after reading
#' delimiter (str) – The delimiter present in the file
#' only_data_columns (bool) – Remove the header and row names
#' format_data (function) – A function to use to format the data
#' ```
#' ##### Example Output:
#'
#' ### Displaying images from visualization modules-example
#' The above boxplot is an example visualization output from plots.py
#' displayed as the markdown image in the Pweave pdf report.
#'
#' The above bar plot is an example visualization output from plots.py
#' displayed as the markdown image in the Pweave pdf report.

#'
#' # MISC Markdown Examples
#'
#' # h1 Heading
#' ## h2 Heading
#' ### h3 Heading
#' #### h4 Heading
#' ##### h5 Heading
#' ###### h6 Heading

#' ## Tables

#' | Option | Description |
#' | ------ | ----------- |
#' | data | path to data files to supply the data that will be passed into templates. |
#' | engine | engine to be used for processing templates. Handlebars is the default. |
#' | ext | extension to be used for dest files. |

#' Right aligned columns

#' | Option | Description |
#' | ------:| -----------:|
#' | data | path to data files to supply the data that will be passed into templates. |
#' | engine | engine to be used for processing templates. Handlebars is the default. |
#' | ext | extension to be used for dest files. |


#' ## Emphasis

#' **This is bold text**
#' __This is bold text__
#' *This is italic text*
#' _This is italic text_
#' ~~Strikethrough~~
4 changes: 4 additions & 0 deletions anadama2/etc/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[report]
title = Demo Analysis Report
project = Demo Analysis
introduction_text = This is a demo report
Loading