scripts/genbank_parse.py at master · bmichanderson/scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
#!/usr/bin/env python

##########################
# Author: B. Anderson
# Date: 18 May 2020
# Modified: Mar 2021
# Description: Parse a genbank file to extract information
##########################


import sys			# allows access to command line arguments
import argparse
from Bio import SeqIO		# SeqIO is part of Biopython for parsing files


# instantiate the parser
parser = argparse.ArgumentParser(description = 'A script to extract specified regions from a genbank/multigenbank file or list features present')


# add arguments to parse
# gb_file is the only positional argument; every option below selects or tunes one of the
# script's modes (-l listing, -g/-c single extraction, -f named regions, -i intergenic)
parser.add_argument('gb_file', type=str, help='A genbank/multigenbank file to parse')
parser.add_argument('-f', type=str, dest='regions_file', help='File with regions for extracting (lower case, one per line)')
parser.add_argument('-t', type=str, dest='type', help='The type of extract output: nucl [default] or prot (for use with the -f option)')
parser.add_argument('-l', type=str, dest='list', help='Specify a specific type of feature to list: CDS, rRNA, tRNA or gene')
parser.add_argument('-g', type=str, dest='gene', help='Specify the name of a gene to extract the sequence(s)')
parser.add_argument('-c', type=str, dest='coords', help='Specify the coordinates of a sequence to extract: start..end, with start > end for compliment')
parser.add_argument('-i', action='store_true', help='Flag to extract all intergenic regions and introns')
parser.add_argument('-m', type=int, dest='min', help='Specify minimum length of intergenic region to keep (default = 20)')


# parse the command line
if len(sys.argv[1:]) == 0:		# if there are no arguments, show the help rather than an argparse error
	parser.print_help(sys.stderr)
	sys.exit(1)

args = parser.parse_args()

# copy the parsed options into the module-level names used by the rest of the script
gb_file = args.gb_file
regions_file = args.regions_file
type_extract = args.type
list_type = args.list
gene_name = args.gene
coords = args.coords
intergenic = args.i
min_inter = args.min

# only fall back to the default when -m was not given at all; a truthiness test
# would silently replace an explicit "-m 0" with 20
if min_inter is None:
	min_inter = 20


# if -l is present, run the listing and exit

def _list_squash(value):
	"""Return value lower-cased with every whitespace character removed."""
	return ''.join(value.lower().split())


def _list_plain_name(feature, use_product):
	"""Return the listing name for a CDS, rRNA or gene feature, or None if it has no usable name.

	The /gene qualifier is preferred and gets a '-pseudo' suffix when the feature carries a
	/pseudo or /pseudogene qualifier. When use_product is true and there is no /gene
	qualifier, the /product qualifier is used instead (without any suffix).
	"""
	quals = feature.qualifiers
	if 'gene' in quals:
		name = _list_squash(quals['gene'][0])
		if 'pseudo' in quals or 'pseudogene' in quals:
			name = name + '-pseudo'
		return name
	if use_product and 'product' in quals:
		return _list_squash(quals['product'][0])
	return None


def _list_trna_name(feature):
	"""Return the listing name for a tRNA feature, or None if it has neither /gene nor /product.

	The base name comes from /gene (or /product); a parenthesised part such as 'trnx(cau)' is
	rewritten to 'trnx-cau'. A /pseudo feature is returned as '<name>-pseudo' with no further
	changes. Otherwise an anticodon, taken from the /anticodon qualifier or from an
	'anticodon:XXX' entry in /note, is appended unless the name already ends in two base
	letters, and '-cp' is appended when /note contains 'plast'.
	"""
	quals = feature.qualifiers
	if 'gene' in quals:
		name = _list_squash(quals['gene'][0])
	elif 'product' in quals:
		name = _list_squash(quals['product'][0])
	else:
		return None

	if '(' in name:
		name = name.replace('(', '-').replace(')', '')

	if 'pseudo' in quals:
		return name + '-pseudo'

	anti_codon = None
	if 'anticodon' in quals:
		# keep the text after the last ':' of the last comma-separated field, minus trailing ')'
		anti_codon = quals['anticodon'][0].rstrip(')').split(',')[-1].split(':')[-1]
	elif 'note' in quals:
		note = str(quals['note'])
		marker = note.find('anticodon:')
		# require the literal 'anticodon:' tag; a note that merely mentions 'anticodon'
		# would make find() return -1 and yield an unrelated slice of the note
		if marker != -1:
			anti_codon = note[marker + 10: marker + 13].lower()

	if anti_codon is not None:
		bases = ('a', 'c', 'g', 't', 'u')
		# names shorter than two characters cannot already end in a codon
		ends_in_codon = len(name) >= 2 and name[-2] in bases and name[-1] in bases
		if not ends_in_codon:
			if name[-3:] == '-cp':		# keep the '-cp' marker at the very end
				name = name[:-3] + '-' + anti_codon + '-cp'
			else:
				name = name + '-' + anti_codon

	if 'note' in quals and 'plast' in str(quals['note']) and '-cp' not in name:
		name = name + '-cp'

	return name


if list_type:
	# map the case-insensitive user choice onto the feature type spelling used in the records
	known_types = {'cds': 'CDS', 'rrna': 'rRNA', 'trna': 'tRNA', 'gene': 'gene'}
	feature_type = known_types.get(list_type.lower())
	if feature_type is None:
		sys.exit('Specify a feature type to list as CDS, rRNA, tRNA or gene')

	print_list = []
	for gbk in SeqIO.parse(gb_file, 'genbank'):
		for feature in gbk.features:
			if feature.type != feature_type:
				continue

			if feature_type == 'tRNA':
				listed = _list_trna_name(feature)
			else:
				# only CDS and rRNA fall back to /product; gene features need a /gene qualifier
				listed = _list_plain_name(feature, feature_type != 'gene')

			if listed is not None:
				print_list.append(listed)

	for gene in sorted(print_list):
		print(gene)
	sys.exit()


# If -g or -c are present, extract the region and exit

if any([gene_name, coords]):
	gbks = list(SeqIO.parse(gb_file, 'genbank'))
	if not gbks:		# nothing to name the output file after, and nothing to extract from
		sys.exit('No genbank records found in ' + gb_file)

	# -g takes precedence over -c; when only -c is given, check it completely before the
	# output file is created so that a bad request does not leave an empty fasta behind
	if not gene_name:
		if len(gbks) > 1:		# if this is a multigenbank file, coords shouldn't work
			sys.exit('Cannot specify coordinates to extract from a multigenbank file')
		if '..' not in coords:
			sys.exit('Coordinates specified incorrectly. Need to be in the form start..end')
		fields = coords.split('..')
		try:
			start = int(fields[0])
			end = int(fields[1])
		except ValueError:
			sys.exit('Coordinates specified incorrectly. Need to be in the form start..end')
		if start == end:
			sys.exit('Ensure the start and end coordinates are not the same')
		# coordinates are 1-based; anything below 1 would make the slice count from the
		# end of the sequence (a coordinate past the end is simply clipped by slicing)
		if min(start, end) < 1:
			sys.exit('Coordinates specified incorrectly. Need to be in the form start..end')

	with open(gbks[0].annotations['organism'].split()[0] + '_extract.fasta', 'w') as out_file:
		if gene_name:
			wanted = gene_name.lower()
			found = 0
			for gbk in gbks:
				for feature in gbk.features:
					if feature.type != 'gene' or 'gene' not in feature.qualifiers:
						continue
					name = ''.join(feature.qualifiers['gene'][0].lower().split())		# lower case, no spaces
					if name == wanted:
						out_file.write(">%s from %s\n%s\n" % (name, gbk.name + ' ' + gbk.annotations['organism'],
											feature.location.extract(gbk).seq))
						found = found + 1
			if not found:		# make an empty result visible instead of silently writing nothing
				sys.stderr.write('No gene named ' + gene_name + ' found in ' + gb_file + '\n')
		else:
			record = gbks[0]
			if start < end:
				region = record.seq[start-1:end]
			else:		# start > end requests the opposite strand
				region = record.seq[end-1:start].reverse_complement()
			out_file.write(">%s from %s\n%s\n" % ('sequence_' + coords, record.name + ' ' + record.annotations['organism'],
								region))

	sys.exit()


# If a file is provided to extract named regions, proceed

def _region_squash(value):
	"""Return value lower-cased with every whitespace character removed."""
	return ''.join(value.lower().split())


def _region_trna_name(feature):
	"""Return the name a tRNA feature is matched under, or None if it has neither /gene nor /product.

	The base name comes from /gene (or /product); a parenthesised part such as 'trnx(cau)' is
	rewritten to 'trnx-cau'. A /pseudo feature is returned as '<name>-pseudo' with no further
	changes. Otherwise an anticodon, taken from the /anticodon qualifier or from an
	'anticodon:XXX' entry in /note, is appended unless the name already ends in two base
	letters, and '-cp' is appended when /note contains 'plast'.
	"""
	quals = feature.qualifiers
	if 'gene' in quals:
		name = _region_squash(quals['gene'][0])
	elif 'product' in quals:
		name = _region_squash(quals['product'][0])
	else:
		return None

	if '(' in name:
		name = name.replace('(', '-').replace(')', '')

	# stop here for pseudo tRNAs (as the -l listing does) so the feature is matched and
	# written once, under '<name>-pseudo' only
	if 'pseudo' in quals:
		return name + '-pseudo'

	anti_codon = None
	if 'anticodon' in quals:
		# keep the text after the last ':' of the last comma-separated field, minus trailing ')'
		anti_codon = quals['anticodon'][0].rstrip(')').split(',')[-1].split(':')[-1]
	elif 'note' in quals:
		note = str(quals['note'])
		marker = note.find('anticodon:')
		# require the literal 'anticodon:' tag; a note that merely mentions 'anticodon'
		# would make find() return -1 and yield an unrelated slice of the note
		if marker != -1:
			anti_codon = note[marker + 10: marker + 13].lower()

	if anti_codon is not None:
		bases = ('a', 'c', 'g', 't', 'u')
		# names shorter than two characters cannot already end in a codon
		ends_in_codon = len(name) >= 2 and name[-2] in bases and name[-1] in bases
		if not ends_in_codon:
			if name[-3:] == '-cp':		# keep the '-cp' marker at the very end
				name = name[:-3] + '-' + anti_codon + '-cp'
			else:
				name = name + '-' + anti_codon

	if 'note' in quals and 'plast' in str(quals['note']) and '-cp' not in name:
		name = name + '-cp'

	return name


def _region_write(out_file, name, gbk, seq):
	"""Write one fasta entry headed '>name from <record name> <organism>' followed by seq."""
	out_file.write(">%s from %s\n%s\n" % (name, gbk.name + ' ' + gbk.annotations['organism'], seq))


if regions_file:
	with open(regions_file, 'r') as gene_list:
		genes = set(line.rstrip() for line in gene_list)		# a set also removes duplicates

	# settle the output type before any file is created; an unrecognised value is an
	# error rather than a silent fall-back to nucl
	if type_extract:
		type_extract = type_extract.lower()
		if type_extract not in ('nucl', 'prot'):
			sys.exit('Type of extraction (nucl or prot) specified incorrectly')
	else:
		type_extract = 'nucl'

	gbks = list(SeqIO.parse(gb_file, 'genbank'))
	if not gbks:		# nothing to name the output file after, and nothing to extract from
		sys.exit('No genbank records found in ' + gb_file)

	with open(gbks[0].annotations['organism'].split()[0] + '_' + str(type_extract) + '_extract.fasta', 'w') as out_file:
		copy_num = {}			# how many times each requested CDS name has been seen so far
		multi_parts = []		# (name, parts) for CDS features whose location references another record
		for gbk in gbks:
			for feature in gbk.features:
				quals = feature.qualifiers

				# CDS
				if feature.type == 'CDS' and 'gene' in quals:
					name = _region_squash(quals['gene'][0])
					# /pseudogene is treated like /pseudo so the names match what -l prints
					if 'pseudo' in quals or 'pseudogene' in quals:
						name = name + '-pseudo'

					if name not in genes:
						continue

					# second and later copies get a numeric suffix: name_2, name_3, ...
					copy_num[name] = copy_num.get(name, 0) + 1
					if copy_num[name] > 1:
						print('Duplicate feature: ' + name + ' detected')
						name = name + '_' + str(copy_num[name])

					if type_extract == 'prot':
						# a CDS without a /translation qualifier is skipped
						if 'translation' in quals:
							_region_write(out_file, name, gbk, quals['translation'][0])
					elif any(part.ref for part in feature.location.parts):
						# at least one part lives in another record: tag the local parts with
						# this record's id and defer assembly until every record is available
						# (the order of the parts is kept, which matters for trans-splicing)
						parts = []
						for part in feature.location.parts:
							if not part.ref:
								part.ref = gbk.id
							parts.append(part)
						multi_parts.append((name, parts))
					else:
						_region_write(out_file, name, gbk, feature.location.extract(gbk).seq)

				# rRNA
				elif feature.type == 'rRNA':
					if 'gene' in quals:
						name = _region_squash(quals['gene'][0])
					elif 'product' in quals:
						name = _region_squash(quals['product'][0])
					else:
						continue

					if name in genes:
						_region_write(out_file, name, gbk, feature.location.extract(gbk).seq)

				# tRNA
				elif feature.type == 'tRNA':
					name = _region_trna_name(feature)
					if name is not None and name in genes:
						_region_write(out_file, name, gbk, feature.location.extract(gbk).seq)

		# second pass: stitch together the CDS features that span several records
		# (multi_parts is only filled for nucl output)
		for name, parts in multi_parts:
			seq = ''
			for part in parts:
				for gbk in gbks:
					if gbk.id == part.ref:
						part.ref = ''		# clear the reference before extracting from the matching record
						if seq:
							seq = seq + part.extract(gbk.seq)
						else:
							seq = part.extract(gbk.seq)		# for the first part, to initiate a sequence object
						break

			out_file.write(">%s from %s\n%s\n" % (name, 'multi-record ' + gbks[0].annotations['organism'], seq))
elif not intergenic:
	# no mode was selected at all
	parser.print_help(sys.stderr)
	sys.exit(1)


# if the intergenic flag is set, proceed to extract all intergenic regions and introns

if intergenic:
	gbks = list(SeqIO.parse(gb_file, 'genbank'))
	if not gbks:		# nothing to name the output file after, and nothing to extract from
		sys.exit('No genbank records found in ' + gb_file)

	with open(gbks[0].annotations['organism'].split()[0] + '_intergenic_extract.fasta', 'w') as out_file:
		for gbk in gbks:
			gene_locs = []		# [label, start, end] for every gene (or trans-spliced gene piece)
			seen = set()		# the same entries as tuples, to drop exact duplicates cheaply
			for feature in gbk.features:
				quals = feature.qualifiers

				# label by /gene, falling back to /locus_tag and then a placeholder, so that
				# features lacking a /gene qualifier do not abort the run
				if 'gene' in quals:
					label = quals['gene'][0]
				elif 'locus_tag' in quals:
					label = quals['locus_tag'][0]
				else:
					label = 'unknown'

				# introns
				if feature.type == 'intron':
					number = quals['number'][0] if 'number' in quals else 'unknown'
					out_file.write(">%s from %s\n%s\n" % (label + '_intron_' + number,
										gbk.name + ' ' + gbk.annotations['organism'],
										feature.location.extract(gbk).seq))
					# note that this will output the entire intron, even if there is a CDS inside, e.g. matK

				# intergenic
				elif feature.type == 'gene':
					if 'trans_splicing' in quals:
						# each piece of a trans-spliced gene is placed separately as label_1, label_2, ...
						entries = [(label + '_' + str(index), int(piece.start), int(piece.end))
								for index, piece in enumerate(feature.location.parts, 1)]
					else:
						entries = [(label, int(feature.location.start), int(feature.location.end))]

					for entry in entries:
						if entry not in seen:		# remove any duplicates
							seen.add(entry)
							gene_locs.append(list(entry))

			# a record without gene features has no intergenic regions to report
			if not gene_locs:
				continue

			# sort by start location and output the intergenic sequences
			gene_locs.sort(key = lambda k: k[1])
			previous_gene = gene_locs[0][0]
			previous_end = gene_locs[0][2]
			for this_gene, this_start, this_end in gene_locs[1:]:
				# the gap is written only when it is strictly longer than min_inter
				if this_start - previous_end > min_inter:
					out_file.write(">%s from %s\n%s\n" % (previous_gene + '--' + this_gene,
										gbk.name + ' ' + gbk.annotations['organism'],
										gbk.seq[previous_end:this_start]))
				# only advance when this gene ends later, so a gene nested inside the
				# previous one does not pull the boundary backwards
				if this_end > previous_end:
					previous_gene = this_gene
					previous_end = this_end