From 127a32a7083aade240b0d168cb1b3d40651c1735 Mon Sep 17 00:00:00 2001 From: Kevin-Brockers Date: Mon, 27 Apr 2026 21:04:06 +0200 Subject: [PATCH 1/5] Clean up whitespaces --- subworkflows/local/prepare_genome/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index c23a7316..9768301f 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -50,7 +50,7 @@ workflow PREPARE_GENOME { // ch_fasta = channel.empty() if (fasta.endsWith('.gz')) { - ch_fasta = GUNZIP_FASTA([[:], fasta]).gunzip.map { it[1] } + ch_fasta = GUNZIP_FASTA([[:], fasta]).gunzip.map { it[1] } } else { ch_fasta = channel.value(file(fasta, checkIfExists: true)) @@ -62,18 +62,18 @@ workflow PREPARE_GENOME { ch_gtf = channel.empty() if (gtf) { if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF([[:], gtf]).gunzip.map { it[1] } + ch_gtf = GUNZIP_GTF([[:], gtf]).gunzip.map { it[1] } } else { ch_gtf = channel.value(file(gtf, checkIfExists: true)) } } else if (gff) { if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF([[:], file(gff, checkIfExists: true)]).gunzip.map { it[1] } + ch_gff = GUNZIP_GFF([[:], file(gff, checkIfExists: true)]).gunzip.map { it[1] } } else { ch_gff = channel.value(file(gff, checkIfExists: true)).map { [ [:], it ] } } - ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] } + ch_gtf = GFFREAD(ch_gff, []).gtf.map { it[1] } } // From fa49f4621729de7ab6a597b1309f77f570596e7d Mon Sep 17 00:00:00 2001 From: Kevin-Brockers Date: Sat, 2 May 2026 22:34:53 +0200 Subject: [PATCH 2/5] restrict pipeline to use one ip replicate and control replicate, update usage on sample sheet design --- CHANGELOG.md | 2 ++ bin/check_samplesheet.py | 9 +++++++++ docs/usage.md | 42 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 034988a2..aea5f971 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[PR #493](https://github.com/nf-core/chipseq/pull/493)] - Follow up to #487. - [[#492](https://github.com/nf-core/chipseq/issues/492), [#417](https://github.com/nf-core/chipseq/issues/417)] - Refactor local modules to nf-core standard. - [[#416](https://github.com/nf-core/chipseq/issues/416)] - Moved the KHMER_UNIQUEKMERS logic to prepare_genome +- [[#510](https://github.com/nf-core/chipseq/issues/510)] - Restrict the usage to one IP replicate against one control see: [#440](https://github.com/nf-core/chipseq/issues/440) + replicate. ### Parameters diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index d34f42ed..359ee4a0 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -212,9 +212,12 @@ def check_samplesheet(file_in, file_out): sample, ) + set_control_replicates = set() for idx, val in enumerate(sample_mapping_dict[sample][replicate]): control = "_REP".join(val[-1].split("_REP")[:-1]) control_replicate = val[-1].split("_REP")[-1] + set_control_replicates.update(control_replicate) + if control and ( control not in sample_mapping_dict.keys() or int(control_replicate) not in sample_mapping_dict[control].keys() @@ -225,6 +228,12 @@ def check_samplesheet(file_in, file_out): val[-1], ) + # Check that a given sample-replicate have only one control replicate + if len(set_control_replicates) > 1: + print_error( + f"Sample: {sample}, replicate {replicate} has more than one control replicate! 
Revise the experimental design, see: 'Note on IP and control replicates'" + ) + ## Write to file for idx in range(len(sample_mapping_dict[sample][replicate])): fastq_files = sample_mapping_dict[sample][replicate][idx] diff --git a/docs/usage.md b/docs/usage.md index d8eaf22b..8f48e2b8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -47,6 +47,48 @@ WT_INPUT,BLA203A30_S21_L002_R1_001.fastq.gz,,2,,, WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, ``` +### Note on IP and control replicates + +The pipeline is designed to handle one IP and matching control replicate, see section above. However there can be +situations where one might want to make multiple comparisons of the IP sample against several different controls. In +those cases it is advisable to encode these comparisons either in the sample column or as another replicate. + +- Encoding in sample names: + +```csv title="samplesheet.csv" +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP_CONTROL_2,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP_CONTROL_3,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +``` + +- Encoding as new biological replicates: + +```csv title="samplesheet.csv" +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,2,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,3,BCATENIN,WT_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +``` + +- The following design, one IP replicate against more than one control replicate, is not allowed: + 
+```csv title="samplesheet.csv" +sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,1 +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,2 +WT_BCATENIN_IP,BLA203A1_S27_L006_R1_001.fastq.gz,,1,BCATENIN,WT_INPUT,3 +WT_INPUT,BLA203A6_S32_L006_R1_001.fastq.gz,,1,,, +WT_INPUT,BLA203A30_S21_L001_R1_001.fastq.gz,,2,,, +WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, +``` + ### Full design The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 7 columns to match those defined in the table below. From 2728f21b9daf9b2a3ea72bdf208ffb80cf142b41 Mon Sep 17 00:00:00 2001 From: Kevin-Brockers Date: Sat, 2 May 2026 23:19:04 +0200 Subject: [PATCH 3/5] Sample sheet check will raise error when one sample-replicate has more than one antibody specified, added further explanation in usage --- CHANGELOG.md | 6 ++++-- bin/check_samplesheet.py | 15 +++++++++++++-- docs/usage.md | 18 +++++++++--------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aea5f971..c8b1cd36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,8 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[PR #493](https://github.com/nf-core/chipseq/pull/493)] - Follow up to #487. - [[#492](https://github.com/nf-core/chipseq/issues/492), [#417](https://github.com/nf-core/chipseq/issues/417)] - Refactor local modules to nf-core standard. - [[#416](https://github.com/nf-core/chipseq/issues/416)] - Moved the KHMER_UNIQUEKMERS logic to prepare_genome -- [[#510](https://github.com/nf-core/chipseq/issues/510)] - Restrict the usage to one IP replicate against one control see: [#440](https://github.com/nf-core/chipseq/issues/440) - replicate. 
+- [[#440](https://github.com/nf-core/chipseq/issues/440), [#510](https://github.com/nf-core/chipseq/issues/510)] - Fix + naming collisions when sample and replicate combination is identical for multiple antibodies. +- [[#467](https://github.com/nf-core/chipseq/issues/467), [#510](https://github.com/nf-core/chipseq/issues/510)] - + Restrict the usage to one IP against one control replicate. ### Parameters diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 359ee4a0..463c7927 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -212,11 +212,13 @@ def check_samplesheet(file_in, file_out): sample, ) + set_antibodies = set() set_control_replicates = set() + for idx, val in enumerate(sample_mapping_dict[sample][replicate]): control = "_REP".join(val[-1].split("_REP")[:-1]) control_replicate = val[-1].split("_REP")[-1] - set_control_replicates.update(control_replicate) + set_control_replicates.add(control_replicate) if control and ( control not in sample_mapping_dict.keys() @@ -228,10 +230,19 @@ def check_samplesheet(file_in, file_out): val[-1], ) + for x in sample_mapping_dict[sample][replicate]: + set_antibodies.add(x[4]) + + # Check that a given sample replicate only uses one antibody + if len(set_antibodies) > 1: + print_error( + f"Sample: {sample}, replicate {replicate} has more than one antibody specified!" + ) + # Check that a given sample-replicate have only one control replicate if len(set_control_replicates) > 1: print_error( - f"Sample: {sample}, replicate {replicate} has more than one control replicate! Revise the experimental design, see: 'Note on IP and control replicates'" + f"Sample: {sample}, replicate {replicate} has more than one control replicate specified! 
Revise the experimental design, see: 'Note on IP and control replicates'" ) ## Write to file diff --git a/docs/usage.md b/docs/usage.md index 8f48e2b8..41a8285c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -119,15 +119,15 @@ NAIVE_INPUT,BLA203A48_S39_L001_R1_001.fastq.gz,,2,,, NAIVE_INPUT,BLA203A49_S1_L006_R1_001.fastq.gz,,3,,, ``` -| Column | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `replicate` | Integer representing replicate number. This will be identical for re-sequenced libraries. Must start from `1..`. | -| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. | -| `control` | Sample name for control sample. | -| `control_replicate` | Integer representing replicate number for control sample. | +| Column | Description | +| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). 
It should be unique and contain the antibody name. E.g: `{Treatment or cell type}_{antibody}_IP` | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `replicate` | Integer representing replicate number. This will be identical for re-sequenced libraries. Must start from `1..`. | +| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. | +| `control` | Sample name for control sample. | +| `control_replicate` | Integer representing replicate number for control sample. | Example design files have been provided with the pipeline for [paired-end](../assets/samplesheet_pe.csv) and [single-end](../assets/samplesheet_se.csv) data. From f83b1d6efbe050e96c4e3bd3b5f75dc1afb09b6c Mon Sep 17 00:00:00 2001 From: Kevin-Brockers Date: Sat, 2 May 2026 23:50:25 +0200 Subject: [PATCH 4/5] Added Kevin Brockers as a contributor --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d022f5fc..40eb1447 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,13 @@ These scripts were originally written by Chuan Wang ([@chuan-wang](https://githu The pipeline workflow diagram was designed by Sarah Guinchard ([@G-Sarah](https://github.com/G-Sarah)). 
-Many thanks to others who have helped out and contributed along the way too, including (but not limited to): [@apeltzer](https://github.com/apeltzer), [@bc2zb](https://github.com/bc2zb), [@bjlang](https://github.com/bjlang), [@crickbabs](https://github.com/crickbabs), [@drejom](https://github.com/drejom), [@houghtos](https://github.com/houghtos), [@KevinMenden](https://github.com/KevinMenden), [@mashehu](https://github.com/mashehu), [@pditommaso](https://github.com/pditommaso), [@Rotholandus](https://github.com/Rotholandus), [@sofiahaglund](https://github.com/sofiahaglund), [@tiagochst](https://github.com/tiagochst) and [@winni2k](https://github.com/winni2k). +Many thanks to others who have helped out and contributed along the way too, including (but not limited to): +[@apeltzer](https://github.com/apeltzer), [@bc2zb](https://github.com/bc2zb), [@bjlang](https://github.com/bjlang), +[@crickbabs](https://github.com/crickbabs), [@drejom](https://github.com/drejom), +[@houghtos](https://github.com/houghtos), [@KevinMenden](https://github.com/KevinMenden), +[@mashehu](https://github.com/mashehu), [@pditommaso](https://github.com/pditommaso), +[@Rotholandus](https://github.com/Rotholandus), [@sofiahaglund](https://github.com/sofiahaglund), +[@tiagochst](https://github.com/tiagochst), [@winni2k](https://github.com/winni2k) and [@Kevin-Brockers](https://github.com/Kevin-Brockers). 
## Contributions and Support From 0436058a36ed5e33da70877700a233a63b0a7141 Mon Sep 17 00:00:00 2001 From: Kevin-Brockers Date: Wed, 6 May 2026 23:42:50 +0200 Subject: [PATCH 5/5] Further explanations in usage --- docs/usage.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 41a8285c..0ebc6cf9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -47,11 +47,13 @@ WT_INPUT,BLA203A30_S21_L002_R1_001.fastq.gz,,2,,, WT_INPUT,BLA203A31_S21_L003_R1_001.fastq.gz,,3,,, ``` -### Note on IP and control replicates +### Note on IP and control replicates - Comparisons of one IP sample against multiple controls The pipeline is designed to handle one IP and matching control replicate, see section above. However there can be situations where one might want to make multiple comparisons of the IP sample against several different controls. In -those cases it is advisable to encode these comparisons either in the sample column or as another replicate. +those cases it is advisable to encode these comparisons either in the sample column or as another replicate. Since it is +rather unusual in ChIP-Seq experiments, this feature is considered experimental. Please open a github issue in case you +need further assistance. - Encoding in sample names: @@ -119,15 +121,15 @@ NAIVE_INPUT,BLA203A48_S39_L001_R1_001.fastq.gz,,2,,, NAIVE_INPUT,BLA203A49_S1_L006_R1_001.fastq.gz,,3,,, ``` -| Column | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). 
It should be unique and contain the antibody name. E.g: `{Treatment or cell type}_{antibody}_IP` | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `replicate` | Integer representing replicate number. This will be identical for re-sequenced libraries. Must start from `1..`. | -| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. | -| `control` | Sample name for control sample. | -| `control_replicate` | Integer representing replicate number for control sample. | +| Column | Description | +| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). It should be unique per sample and contain sufficient informations, such as the antibody name. E.g: `{Treatment or cell type}_{antibody}_IP` -> `{WT/NAIVE}_{BCATENIN}_IP` | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `replicate` | Integer representing replicate number. This will be identical for re-sequenced libraries. Must start from `1..`. 
| +| `antibody` | Antibody name. This is required to segregate downstream analysis for different antibodies. Required when `control` is specified. | +| `control` | Sample name for control sample. | +| `control_replicate` | Integer representing replicate number for control sample. | Example design files have been provided with the pipeline for [paired-end](../assets/samplesheet_pe.csv) and [single-end](../assets/samplesheet_se.csv) data.