-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathannotations_tbl.py
More file actions
executable file
·149 lines (139 loc) · 6.13 KB
/
annotations_tbl.py
File metadata and controls
executable file
·149 lines (139 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
##########################
# Author: B. Anderson
# Date: 3 Apr 2020
# Modified: Oct 2020, Apr 2021 (for pseudo and notes)
# Description: generate a Sequin format table of annotations for use with a .fsa file and tbl2asn then asn2gb to ultimately create a GenBank flatfile
##########################
import sys # allows access to command line arguments
def help():
print('A script to generate a Sequin format table of annotations from a file with names of genes and locations, one per line.')
print('')
print('Format of the tab-delimited locations file (after headers):')
print('gene_name note(; delimited) start1 end1 start2 end2 start3 end3 ...')
print('')
print('Introns should be annotated similarly to coding sequences, with keyword \'intron\' in the note')
print('')
print('Also include a tab-delimited key file for naming conventions and gene products of four columns with headers, in the format:')
print('gene_name formal_name type product')
print('')
print('Gene type should be one of CDS, rRNA or tRNA')
print('')
print('Usage: ' + str(sys.argv[0]) + ' locations_file key_file')
print('')
# Ensure there are command args
if len(sys.argv[1:]) > 1:
print('Arguments are locations_file: ' + str(sys.argv[1]) + ', and key_file: ' + str(sys.argv[2]))
else:
sys.exit(help())
# Read in the key file and create a dictionary for looking up name, type and product
key_dict = {}
with open(sys.argv[2], 'r') as key_file:
for line_no, line in enumerate(key_file, start=1):
if line_no == 1: # first line is the header row
continue
entry = line.rstrip().split('\t')
key_dict[entry[0].lower()] = '\t'.join(entry[1:4])
# Read in the locations file and output the Sequin format table
with open(sys.argv[1], 'r') as loc_file, open('features.tbl', 'w') as out_file:
out_file.write('>Features SeqID\n') # write header
locus_index = 1
for line_no, line in enumerate(loc_file, start=1):
if line_no == 1: # first line is the header row
continue
entries = [entry.rstrip() for entry in line.rstrip().split('\t')] # to get rid of whitespace
start1 = entries[2]
start2 = entries[3]
gene = entries[0].lower()
gene_name = key_dict[gene].split('\t')[0]
gene_type = key_dict[gene].split('\t')[1]
gene_prod = key_dict[gene].split('\t')[2]
is_pseudo = False
pieces = []
note_entry = ''
note_list = []
if len(entries) > 4: # i.e. if the gene/feature is in more than one piece
loc_iter = iter(entries[4:])
for loc in loc_iter:
pieces.append((loc, next(loc_iter)))
if len(str(entries[1])) > 0: # parse notes if the second entry has text
notes = [note.lstrip().rstrip().lower() for note in str(entries[1]).split(';')]
if 'pseudo' in notes:
is_pseudo = True
note_entry = '\t\t\tpseudo\n'
note_list.append(gene_prod)
for note in notes:
if 'pseudo' in note:
continue
elif 'trans-splicing' in note:
if is_pseudo:
continue
else:
note_entry = note_entry + '\t\t\ttrans_splicing\n'
elif 'codon_start' in note:
if is_pseudo:
continue
else:
note_entry = note_entry + '\t\t\tcodon_start\t' + note.split(' ')[1] + '\n'
else:
note_list.append(note)
if len(note_list) > 0:
note_entry = note_entry + '\t\t\tnote\t' + ('; ').join(note_list) + '\n'
if not any((gene_type == 'CDS', gene_type == 'rRNA', gene_type == 'tRNA')):
sys.exit('Incorrectly specified gene type')
# if gene_type == 'CDS': # prepend a protein product identifier when annotating a CDS
# if not is_pseudo:
# note_entry = '\t\t\tprotein_id\tgnl|DBNAME|LOCUSTAG_' + format(locus_index, '03d') + '\n' + note_entry
if len(pieces) > 0:
if 'trans_splicing' in note_entry:
out_file.write(str(start1) + '\t' + str(start2) + '\tgene\n' +
'\n'.join('\t'.join(x) for x in pieces) + '\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
'\t\t\ttrans_splicing\n' +
str(start1) + '\t' + str(start2) + '\t' + gene_type + '\n' +
'\n'.join('\t'.join(x) for x in pieces) + '\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
'\t\t\tproduct\t' + gene_prod + '\n' +
note_entry +
str(start1) + '\t' + str(start2) + '\texon\n' +
'\t\t\tnumber\t1\n' +
'\n'.join(('\t'.join(x) + '\texon\n\t\t\tnumber\t' + str(index)) for index, x in enumerate(pieces, start=2)) + '\n')
elif 'intron' in note_entry:
out_file.write(str(start1) + '\t' + str(start2) + '\tintron\n' +
'\t\t\tnumber\t1\n' +
'\n'.join(('\t'.join(x) + '\tintron\n\t\t\tnumber\t' + str(index)) for index, x in enumerate(pieces, start=2)) + '\n')
else:
out_file.write(str(start1) + '\t' + str(pieces[-1][1]) + '\tgene\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
str(start1) + '\t' + str(start2) + '\t' + gene_type + '\n' +
'\n'.join('\t'.join(x) for x in pieces) + '\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
'\t\t\tproduct\t' + gene_prod + '\n' +
note_entry +
str(start1) + '\t' + str(start2) + '\texon\n' +
'\t\t\tnumber\t1\n' +
'\n'.join(('\t'.join(x) + '\texon\n\t\t\tnumber\t' + str(index)) for index, x in enumerate(pieces, start=2)) + '\n')
else:
if 'intron' in note_entry:
out_file.write(str(start1) + '\t' + str(start2) + '\tintron\n' +
'\t\t\tnumber\t1\n')
elif is_pseudo:
out_file.write(str(start1) + '\t' + str(start2) + '\tgene\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
note_entry)
else:
out_file.write(str(start1) + '\t' + str(start2) + '\tgene\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
str(start1) + '\t' + str(start2) + '\t' + gene_type + '\n' +
'\t\t\tgene\t' + gene_name + '\n' +
'\t\t\tlocus_tag\tLOCUSTAG_' + format(locus_index, '03d') + '\n' +
'\t\t\tproduct\t' + gene_prod + '\n' +
note_entry)
if 'intron' not in note_entry:
locus_index = locus_index + 1