-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdownsample.nf
More file actions
320 lines (257 loc) · 9.03 KB
/
downsample.nf
File metadata and controls
320 lines (257 loc) · 9.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
/* ======================================================================== *
 * Nextflow pipeline for downsampling BAMs                                  *
 * ======================================================================== */

// Help text printed for --help (placeholder for now)
help_message = "TODO"

/* ------------------------------------------------------------------------ *
 *   Config
 * ------------------------------------------------------------------------ */

// Show help and stop when --help is given
params.help = false
if ( params.help ) {
    log.info help_message
    exit 0
}

// Default every expected parameter to false so presence can be tested below
params.outdir      = false
params.input_bam   = false
params.num_repeats = false
params.num_reads   = false
params.range_reads = false

// Required inputs
if ( !params.input_bam )   { log.error "ERROR: input_bam is required"; exit 1 }
if ( !params.num_repeats ) { log.error "ERROR: num_repeats is required"; exit 1 }

// Exactly one of num_reads / range_reads must be supplied
if ( !params.range_reads && !params.num_reads ) {
    log.error "ERROR: one of num_reads or range_reads is required"; exit 1
}
if ( params.range_reads && params.num_reads ) {
    log.error "ERROR: only one of num_reads or range_reads is required, not both"; exit 1
}

// Parse the read targets: either a "min,max,step" range or an explicit
// comma-separated list of counts
if ( params.range_reads ) {
    range_parts = params.range_reads.split(',')
    range_min   = range_parts[0].toInteger()
    range_max   = range_parts[1].toInteger()
    range_step  = range_parts[2].toInteger()
    target_reads = (range_min..range_max).step(range_step)
} else {
    target_reads = params.num_reads.split(',')
}

// TODO - check optional input data
//if ( !params.outdir ) {
//    log.warn "output directory not set, defaulting to current working directory";
//    params.outdir = baseDir
//}

// Stage the input BAM, failing fast when the path does not exist
input_bam = file("$params.input_bam", checkIfExists: true)

// Run summary for the log file
def summary = [:]
summary["Input BAM"]   = params.input_bam
summary["Reads"]       = target_reads.join(',')
summary["Num repeats"] = params.num_repeats
summary["Output dir"]  = params.outdir
log.info """
==============================================================================
Inputs:
${ summary.collect { k, v -> " ${k.padRight(12)}: $v"}.join("\n") }
==============================================================================
"""
/* ------------------------------------------------------------------------ *
 *   Pipeline
 * ------------------------------------------------------------------------ */

// Initial channels: one element per requested depth (formatted "<n>reads")
// and one per repeat (formatted "rep<i>")
depths_ch = Channel.from( target_reads ).map { n -> "${n}reads" }
reps_ch   = Channel.from( 1..params.num_repeats.toInteger() ).map { r -> "rep$r" }
/* ------------------------------------------------------------------------ *
 * Check input BAM
 * ------------------------------------------------------------------------ */
// TODO - limit to bed region
// Count the total number of reads in the input BAM (samtools view -c) and
// pass the count downstream through the READ_COUNT environment variable.
process get_input_read_count {
    conda "$baseDir/env/downsample.yml"

    input:
        file(bam) from input_bam

    output:
        // single-element channel carrying the read count (as a string)
        env(READ_COUNT) into input_bam_read_count_ch

    script:
    """
    READ_COUNT=\$(samtools view -c $bam)
    """
}
// Fail early when any requested read count exceeds what the input BAM holds.
// TODO? - check inputs are integers
process input_bam_qc {
    conda "$baseDir/env/downsample.yml"

    input:
        val(input_count) from input_bam_read_count_ch

    output:
        // the validated count is passed through for downstream use
        val(input_count) into input_check_ch

    script:
    // $target_reads interpolates the Groovy collection as a Python list
    // literal (e.g. [1000, 2000]). The comparison is a strict '>' so that
    // requesting exactly the input read count (a downsample fraction of
    // 1.0) is allowed, matching the wording of the error message.
    """
    #!/usr/bin/env python
    for requested_count in $target_reads:
        if int(requested_count) > $input_count:
            raise ValueError(f'Requested number of reads ({requested_count}) is greater than the input ({$input_count})')
    """
}
// TODO - run depthofcoverage too?
// TODO - total reads-duplicates - need to mark duplicates first

/* ------------------------------------------------------------------------ *
 * Downsample BAM
 * ------------------------------------------------------------------------ */
// Work out the fraction of the input each target depth corresponds to.
// NOTE: input_check_ch is turned into a value channel with .first() so it
// can be read once per depth. As a plain single-element queue channel it
// would pair with only the first element of depths_ch and this process
// would run exactly once.
process calc_percent_downsample {
    conda "$baseDir/env/downsample.yml"
    tag "$depth"

    input:
        val(depth) from depths_ch
        val(input_count) from input_check_ch.first()

    output:
        tuple val(depth),
              file("${depth}_percent.txt") into downsample_percent_ch

    script:
    // replace() is used rather than strip('reads'): strip() removes any of
    // the characters r/e/a/d/s from both ends and only works here by
    // accident because the prefix is all digits.
    """
    #!/usr/bin/env python
    target = int('$depth'.replace('reads', '')) / $input_count
    with open('${depth}_percent.txt', 'w') as f:
        f.write(str(target))
    """
}
// Downsample the input BAM to each target depth, once per repeat.
// NOTE: the input BAM is staged as a process input (file(bam) from
// input_bam) instead of being referenced via $params.input_bam, so the
// task works on executors without a shared filesystem and the file is
// included in the task's cache hash.
process downsample {
    conda "$baseDir/env/downsample.yml"
    publishDir "${params.outdir}/downsample/$depth", mode: "copy"
    tag "${depth}_${rep}"

    input:
        tuple val(depth),
              file(percent_file) from downsample_percent_ch
        each(rep) from reps_ch
        file(bam) from input_bam

    output:
        tuple val(depth),
              val(rep),
              file("${depth}_${rep}.bam"),
              file("${depth}_${rep}.downsample_metrics") into downsampled_bams_ch
        // index is published alongside the BAM but not consumed downstream
        file("${depth}_${rep}.bai")

    script:
    // RANDOM_SEED=null asks Picard for a fresh seed per run so repeats differ
    """
    percent=\$(cat $percent_file)
    picard DownsampleSam \
        I=$bam \
        O=${depth}_${rep}.bam \
        M=${depth}_${rep}.downsample_metrics \
        CREATE_INDEX=true \
        RANDOM_SEED=null \
        P=\$percent
    """
}
// calculate total reads with duplicates removed
// NOTE(review): Picard MarkDuplicates is run without REMOVE_DUPLICATES, so
// the .rmdup outputs presumably still contain the duplicate reads, only
// flagged - confirm the "rmdup" naming reflects the intended behaviour.
process mark_dups {
    conda "$baseDir/env/downsample.yml"
    publishDir "${params.outdir}/downsample/$depth", mode: "copy"
    tag "${depth}_${rep}"

    input:
        tuple val(depth),
              val(rep),
              file(bam),
              file(bam_metrics) from downsampled_bams_ch

    output:
        // marked BAM + index are published but not consumed downstream
        file("${depth}_${rep}.rmdup.bam")
        file("${depth}_${rep}.rmdup.bai")
        // keep the downsample metrics travelling with the rmdup metrics so
        // combine_metrics receives both files for the same depth/repeat
        tuple val(depth),
              val(rep),
              file("${depth}_${rep}.rmdup_metrics"),
              file(bam_metrics) into rmdup_metrics_ch

    script:
    """
    picard MarkDuplicates \
        I=$bam \
        O=${depth}_${rep}.rmdup.bam \
        M=${depth}_${rep}.rmdup_metrics \
        CREATE_INDEX=true
    """
}
// TODO - run depthofcoverage?
// Merge the DownsampleSam and MarkDuplicates metrics for one depth/repeat
// into a single one-line CSV. The script assumes the single data row of
// each Picard metrics file sits on line 8 (hence head -n8 | tail -n1) -
// true for standard Picard metric tables, but worth confirming if the
// Picard version changes.
process combine_metrics {
    tag "${depth}_${rep}"
    publishDir "${params.outdir}/downsample/$depth", mode: "copy"

    input:
        tuple val(depth),
              val(rep),
              file(rmdup),
              file(ds) from rmdup_metrics_ch

    output:
        // one CSV row per depth/repeat, collected later by combine_samples
        file("${depth}_${rep}_metrics.csv") into combined_metrics_ch

    script:
    """
    # gather metics from downsampling metrics file
    ds_total_reads=\$(head -n8 $ds | tail -n1 | cut -f1)
    ds_pf_reads=\$(head -n8 $ds | tail -n1 | cut -f2)
    ds_read_length=\$(head -n8 $ds | tail -n1 | cut -f3)
    ds_total_bases=\$(head -n8 $ds | tail -n1 | cut -f4)
    ds_pf_bases=\$(head -n8 $ds | tail -n1 | cut -f5)
    # gather metrics from rmdup metics file
    rmdup_unpaired_reads_examined=\$(head -n8 $rmdup | tail -n1 | cut -f2)
    rmdup_read_pairs_examined=\$(head -n8 $rmdup | tail -n1 | cut -f3)
    rmdup_secondary_or_supplementary_reads=\$(head -n8 $rmdup | tail -n1 | cut -f4)
    rmdup_unmapped_reads=\$(head -n8 $rmdup | tail -n1 | cut -f5)
    rmdup_unpaired_read_duplicates=\$(head -n8 $rmdup | tail -n1 | cut -f6)
    rmdup_read_pair_duplicates=\$(head -n8 $rmdup | tail -n1 | cut -f7)
    rmdup_read_pair_optical_duplicates=\$(head -n8 $rmdup | tail -n1 | cut -f8)
    rmdup_percent_duplication=\$(head -n8 $rmdup | tail -n1 | cut -f9)
    rmdup_estimated_library_size=\$(head -n8 $rmdup | tail -n1 | cut -f10)
    # print metrics to file
    echo -e "${depth}_${rep},\\
    \$(echo ${depth} | tr -cd '0-9'),\\
    \$(echo ${rep} | tr -cd '0-9'),\\
    \$ds_total_reads,\\
    \$ds_pf_reads,\\
    \$ds_read_length,\\
    \$ds_total_bases,\\
    \$ds_pf_bases,\\
    \$rmdup_unpaired_reads_examined,\\
    \$rmdup_read_pairs_examined,\\
    \$rmdup_secondary_or_supplementary_reads,\\
    \$rmdup_unmapped_reads,\\
    \$rmdup_unpaired_read_duplicates,\\
    \$rmdup_read_pair_duplicates,\\
    \$rmdup_read_pair_optical_duplicates,\\
    \$rmdup_percent_duplication,\\
    \$rmdup_estimated_library_size" > ${depth}_${rep}_metrics.csv
    """
}
// TODO - add input BAM
// Concatenate every per-depth/repeat metrics CSV under a single header row
// to produce one combined table for the whole run.
process combine_samples {
    publishDir "${params.outdir}/downsample/", mode: "copy"

    input:
        // collect() gathers all per-sample CSVs into one task invocation
        file(metrics_file) from combined_metrics_ch.collect()

    output:
        file("combined_metrics.csv")

    script:
    """
    # make header
    echo -e "sample,\\
    target_reads,\\
    rep,\\
    ds_total_reads,\\
    ds_pf_reads,\\
    ds_read_length,\\
    ds_total_bases,\\
    ds_pf_bases,\\
    rmdup_unpaired_reads_examined,\\
    rmdup_read_pairs_examined,\\
    rmdup_secondary_or_supplementary_reads,\\
    rmdup_unmapped_reads,\\
    rmdup_unpaired_read_duplicates,\\
    rmdup_read_pair_duplicates,\\
    rmdup_read_pair_optical_duplicates,\\
    rmdup_percent_duplication,\\
    rmdup_estimated_library_size" > combined_metrics.csv
    # add sample
    cat $metrics_file >> combined_metrics.csv
    """
}