#!/bin/bash
#
#   Configuration for the dev_RNAseq pipeline (emilyyaklich/dev_RNAseq).
#   Every setting in this file is a plain shell variable assignment.

#   Name this project
PROJECT="lettuce_dev"

#   What email should we use for job notifications?
#   (Interpolated into the --mail-user option of the *_SBATCH settings below.)
EMAIL="ely67071@uga.edu"

#   Is data paired-end? ("True" or "False")
PE="True"

#   Are you submitting the job with qsub from PBS (Portable Batch System) or with sbatch from Slurm?
#       Choose from: "PBS" or "Slurm"
QUEUE="Slurm"

############################################
##########  Reference Genome Info ##########
############################################

#### These three settings are shared by the 'Genome Index' and 'Reference Prep' steps.

#   Full file path of the genome FASTA file.
GEN_FASTA='/scratch/ely67071/lettuce_dev_data/genomes/Hildegardhap1v1.fasta'

#   Full file path of the genome annotation file (GFF3 or GTF).
GEN_ANN='/scratch/ely67071/lettuce_dev_data/genomes/Hildegardhap1v1_2.gtf'

#   File format of the annotation named in GEN_ANN: "GTF" or "GFF3".
ANNOTATION_FORMAT='GTF'

############################################
##########   Quality_Assessment   ##########
############################################

#   QUEUE=PBS: qsub resource request for Quality_Assessment (recommended settings).
QA_QSUB='mem=1gb,nodes=1:ppn=4,walltime=12:00:00'

#   QUEUE=Slurm: sbatch options for Quality_Assessment (recommended settings).
#       PARTITION is assigned again by later sections of this file, so it must
#       be set here, before QA_SBATCH expands it.
#       EMAIL comes from the general settings at the top of the file.
PARTITION='batch'
QA_SBATCH="--nodes=1 --ntasks=4 --mem=5gb --time=06:00:00 --mail-type=BEGIN,END,FAIL --mail-user=${EMAIL} --partition=${PARTITION}"

#   File name suffix shared by the files to be assessed.
SUFFIX='fq.gz'

#   Directory holding the data to be assessed.
#       It may contain multiple sub-directories, each with a forward and a
#       reverse read (the layout of raw data from the GGBC).
#       Include the full file path, e.g. path/to/directory (with no trailing forward slash)
QA_INPUTDIR='/scratch/ely67071/lettuce_dev_data/raw_rna_seq'

#   Directory that receives the FastQC results.
#       Include the full file path, e.g. path/to/directory (with no trailing forward slash)
QA_OUTPUTDIR='/scratch/ely67071/lettuce_dev_data/raw_rna_seq/quality'

#   Scratch directory where FastQC may store temporary files.
QA_TEMP='/scratch/ely67071/temp'

############################################
##########    Adapter_Trimming    ##########
############################################

#   NOTE(review): the paths in this section point at "sunflower_dev_data" while
#   every other section of this file uses "lettuce_dev_data", and the
#   R1_001/R2_001 ".fastq.gz" naming below differs from the "fq.gz" suffixes
#   used elsewhere. Possibly left over from another project -- confirm before
#   running Adapter_Trimming.

#   QUEUE=PBS: qsub resource request for Adapter_Trimming (recommended settings).
AT_QSUB='mem=10gb,nodes=1:ppn=4,walltime=12:00:00'

#   QUEUE=Slurm: sbatch options for Adapter_Trimming (recommended settings).
#       EMAIL comes from the general settings at the top of the file.
AT_PARTITION='batch'
AT_SBATCH="--nodes=1 --ntasks=4 --mem=13gb --time=12:00:00 --mail-type=BEGIN,END,FAIL --mail-user=${EMAIL} --partition=${AT_PARTITION}"

#   What to trim. This handler accepts EITHER
#       - a text file listing the samples to trim, each with its full file path
#         (for paired-end data only the forward reads need to be listed), OR
#       - a directory of raw data; it may contain multiple sub-directories, each
#         with a forward and a reverse read (the layout of raw data from the GGBC).
AT_INPUT='/scratch/ely67071/sunflower_dev_data/raw_rna_seq'

#   Directory that receives the trimmed samples.
#       Include the full file path, e.g. path/to/directory (with no trailing forward slash)
AT_OUTPUTDIR='/scratch/ely67071/sunflower_dev_data/trimmed_reads'

#   Adapter file, given as a full file path.
ADAPTERFILE='/scratch/ely67071/sunflower_dev_data/illumina_adapters.txt'

#   File name/path where job IDs should be recorded.
AT_JOB_LOG='/scratch/ely67071/sunflower_dev_data/AT_job_log.txt'

#   Suffix shared by all forward samples.
#       Example: _1_sequence.txt.gz
FORWARD_NAMING='R1_001.fastq.gz'

#   Suffix shared by all reverse samples (only relevant for paired-end data).
#       Example: _2_sequence.txt.gz
REVERSE_NAMING='R2_001.fastq.gz'

#   Maximum mismatch count allowed for the "seed" (a small section of adapter);
#   a seed match causes the entire alignment between the read and the adapter
#   to be scored.
SEEDMISMATCH='2'

#   Minimum alignment score threshold for clipping adapter sequence when the
#   palindrome approach is used to check for adapter 'read-through'.
#   That strategy is only used for paired-end data, but a value must still be
#   supplied for single-end data.
PALINDROMECLIP='30'

#   Minimum alignment score threshold for clipping adapter sequence.
SIMPLECLIP='10'

#   Minimum length of adapter sequence to be removed
#   (only relevant for paired-end data).
MINADAPTERLEN='1'

#   Whether to keep the reverse read when adapter read-through has been detected
#   by palindrome mode; the default behavior is to drop the reverse read entirely
#   (only relevant for paired-end data).
KEEPREADS='true'

#   Low quality bases are removed from the beginning of the sequence:
#   minimum quality value required to keep a base at the beginning.
LEADCUT='3'

#   Low quality bases are removed from the end of the sequence:
#   minimum quality value required to keep a base at the end.
TRAILCUT='3'

#   Minimum length a read must have to be kept; shorter reads are dropped
#   after the other processing steps.
MINLENGTH='20'

############################################
##########    Genome_Index        ##########
############################################

##### IMPORTANT:
# "ANNOTATION_FORMAT", "GEN_ANN" and "GEN_FASTA" MUST also be supplied
# (above, under 'Reference Genome Info').

#   QUEUE=PBS: qsub resource request for Genome_Index (recommended settings).
#       Note that STAR needs at least 30gb of memory to run.
GI_QSUB='mem=50gb,nodes=1:ppn=4,walltime=450:00:00'

#   QUEUE=Slurm: sbatch options for Genome_Index (recommended settings).
#       PARTITION must be assigned before GI_SBATCH expands it.
#       EMAIL comes from the general settings at the top of the file.
PARTITION='batch'
GI_SBATCH="--nodes=1 --ntasks=4 --mem=80gb --time=96:00:00 --mail-type=BEGIN,END,FAIL --mail-user=${EMAIL} --partition=${PARTITION}"

#   Number of threads to be used.
#       This must be set to the number of available cores on the server node.
NTHREAD='4'

#   Directory for the genome index files (full file path).
#       This directory has to be created before the program will run and needs
#       write permission. The same path is also used by the read mapping step.
GEN_DIR='/scratch/ely67071/lettuce_dev_data/genomes/genome_indices_gtf'

#   Length of genomic sequence to be used in constructing the splice junctions
#   database. It should equal ReadLength-1, where ReadLength is the length of
#   the reads (e.g. for 2x100bp paired-end reads, the ideal value is 99).
SPLICE_JUN='149'

#   Tag name to be used as the exons' gene-parents
#   (in GTF files this defaults to "gene_id").
#       You only need to fill this out if:
#           1.) you are using a .gff3 file for annotations AND
#           2.) you want GeneCounts output from read mapping
#       (The tag name used for transcript-parents for a .gff3 file is 'Parent')
#       This variable will be ignored if the annotation file is in GTF format.
GENE_PARENT='Parent'

############################################
########    Collect_Junctions        #######
############################################

### (Optional)
### First-pass mapping step with STAR to detect novel splice junctions.
### The novel junctions can be used to regenerate the genome index for
### better mapping sensitivity.

###### IMPORTANT: This step shares variables with Read-Mapping (below)

#   File name/path where job IDs should be recorded.
CJ_JOB_LOG='/scratch/ely67071/lettuce_dev_data/CJ_job_log.txt'

#   Directory that receives the output files (full file path).
CJ_OUTPUTDIR='/scratch/ely67071/lettuce_dev_data/STAR_output_gtf/junction_data'

############################################
########    Filter_Junctions        ########
############################################

### Filters the identified junctions for accuracy.
### Outputs a filtered list that can be added to Read Mapping to improve alignment.

#   QUEUE=PBS: qsub resource request for Filter_Junctions (recommended settings).
FJ_QSUB='mem=22gb,nodes=1:ppn=4,walltime=4:00:00'

#   QUEUE=Slurm: sbatch options for Filter_Junctions (recommended settings).
#       PARTITION must be assigned before FJ_SBATCH expands it.
#       EMAIL comes from the general settings at the top of the file.
PARTITION='batch'
FJ_SBATCH="--nodes=1 --ntasks=4 --mem=22gb --time=04:00:00 --mail-type=BEGIN,END,FAIL --mail-user=${EMAIL} --partition=${PARTITION}"

#   Directory holding the "SJ.out.tab" files (full file path).
#       The concatenated filtered list is also written here.
JUNCTIONDIR='/scratch/ely67071/lettuce_dev_data/STAR_output_gtf/junction_data'

#   Name of the final list (will end in _SJ.filtered.tab).
SJ_LISTNAME='lettuce_dev_'

## FILTERS:

#   Remove junctions identified in scaffold sequences?
###     If yes, give the string that denotes non-chromosomal sequences,
###     e.g. "Chr00"; otherwise put NA.
SCAFFOLD_STRING='NA'

###     Filter out non-canonical junctions? (yes or no)
REMOVE_NC_JUNC='yes'

###     Minimum number of uniquely mapping reads needed to support a junction
###     (within or across samples).
UNIQUE_NUM='2'


############################################
######    Read_Mapping  and QUANT    #######
############################################

#####IMPORTANT:
# Make sure that the "GEN_DIR" variable is supplied above (under Genome Index)
# This directory will also be used for read mapping

#	What are our QSub settings for Read_Mapping?
#	Below are the recommended settings
#	Note that STAR needs at least 30gb of memory to run
RM_QSUB="mem=50gb,nodes=1:ppn=16,walltime=450:00:00"

#   QUEUE=Slurm: What are our sbatch settings?
#       Below are the recommended settings
RM_PARTITION="batch"
RM_SBATCH="--nodes=1 --ntasks=16 --mem=50gb --time=45:00:00 --partition=${PARTITION}"


#   Where are the trimmed files? This handler will accept EITHER a text file list of forward samples to map (include the full file path for each sample) OR
#	a directory of input files (include the full file path). This may be the same as your output directory defined in adapter trimming (AT_OUTPUTDIR)
RM_INPUT="/scratch/ely67071/lettuce_dev_data/raw_rna_seq"
#       File name/path of where JOB IDs should be recorded
RM_JOB_LOG="/scratch/ely67071/lettuce_dev_data/RM_job_log_2nd_pass.txt"

#   What shared suffix do the forward samples have?
#       Example: _R1_paired.fq.gz
FORWARD="_1.fq.gz"

#   What shared suffix do the reverse samples have? (leave blank if single-end)
#       Example: _R2_paired.fq.gz
REVERSE="_2.fq.gz"

#   Where do you want your output files to go? Include the full file path
RM_OUTPUTDIR="/scratch/ely67071/lettuce_dev_data/STAR_output_gtf"

#	Define the number of threads to be used.
#	This must be set to the number of available cores on the server node
RM_NTHREAD=16

#	What is the maximum number of mismatches allowed?
#	Alignment will output only if it has no more mismatches than this value
MAX_MIS=10

#	What is the maximum number of loci the read is allowed to map to?
#	All alignments will output only if the read maps to no more than this value
#	If greater than this value, read counted as "mapped to too many loci"
MAX_N=10

#	Do you want unmapped or partially mapped reads to output in separate fasta/fastq files?
#	If not, put "None", if yes, "Fastx"
UNMAP_F=Fastx

#	What is the minimum score needed for alignment?
#	Normalized by read length- (sum of mates' lengths for paired-end reads)
MINSCORE_READL=0.66

#	What is the minimum number of matched bases needed for alignment?
#	Normalized by read length- (sum of mates' lengths for paired-end reads)
MINMATCH_READL=0.66

#	What is the maximum length of read to use for seed search start points?
#	Reducing this parameter will increase the overall sensitivity of mapping.
SEEDSEARCH=50

#####	For adding read group (@RG) tags to mapped sequence:
#	This information is useful for differentiating reads coming from different runs/lanes even after merging

#	Name of Flowcell (or other name that differentiates a particular run, e.g. "Run1")
FLOWCELL_NAME="Run4"

# sequencing platform used
PLATFORM="ILLUMINA"

### Variables not shared with Collect_Junctions handler:

#	STAR outputs alignment in genome coordinates as well as transcript coordinates. The latter is used for
#	RSEM (step 7). If you plan on using the genomic coordinate alignment files for SNP calling, you will need
#	sorted BAM files. If you want STAR to output genome alignments as sorted BAM files, put "yes" here. Note that this
#	increases the time it takes for read mapping to run. If "no", the genome alignment file will be an unsorted SAM.
GENOMIC_COORDINATE_BAMSORTED="yes"

#	The type of quantification requested
#	Either "TranscriptomeSAM" -outputs SAM/BAM alignments to transcriptome in a separate file
#	Or "GeneCounts" - read counts per gene
#	Can also put "-" for none
QUANT="GeneCounts"

#	2nd-pass read mapping will use junctions discovered in the first pass (Collect_Junctions)
#	This variable can be one of two options:
#		1.) The filtered junction list output from Collect_Junctions handler, OR
#		2.) If you have more than one filtered junction list (if Collect_Junctions was run with multiple sample groups),
#			this variable can be a text file listing the full file paths for each junction file you want included
#	If not using additional junctions, leave blank
FILTERED_JUNC_LIST="/scratch/ely67071/lettuce_dev_data/STAR_output_gtf/junction_data/lettuce_dev__SJ.filtered.tab"

############################################
##########      Dependencies      ##########
############################################