-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgetexons.py
More file actions
executable file
·162 lines (133 loc) · 4.71 KB
/
getexons.py
File metadata and controls
executable file
·162 lines (133 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python
"""
Created: 2016-03-05
@author: irisdianauy
This tool extractS CDS position info from NCBI features table/s.
The output is an exon file for use in trainGlimmerHMM.
Future edits:
[ 1] Provide support for a multi-Genbank file.
[ 2] Provide support for hybrid input: feat table and multi-Genbank
[ 3] Provide option to specify inclusion of fuzzy starts/ends.
"""
from __future__ import print_function
import os.path
import argparse
import re
#"""
# Define regular expressions
# Regex that matches a whole feature block in an NCBI features table
reFeatBlock = re.compile(">Feature[\s][\d\|\w\.]+\n(?:(?:[\t]|[\>\<](?=\d)|[\d])[ \t\S]+\n)+")
# Regex for capturing the ID of a whole feature block
reId = re.compile(">Feature[\s](?P<id>[\S]+)\n")
# Regex that matches a CDS block in a feature block
reCdsBlock = re.compile("^[\d]+\t[\d]+\tCDS\n(?:[\d]+\t[\d]+\n)+(?=\t)", re.M)
# Regex that matches a CDS block in a feature block, with fuzz
reCdsBlkfz = re.compile("^[\>\<\d]+\t[\>\<\d]+\tCDS\n(?:[\>\<\d]+\t[\>\<\d]+\n)+(?=\t)", re.M)
# Regex that matches an exon pair in a CDS block
rePosPair = re.compile("(?P<start>[\S]+)\t(?P<stop>[\S]+)")
#"""
#"""
# Define classes & functions
def getExons(sId, lCoords):
"""This function assumes that lCoords is a list of dictionaries
containing the positions of exons in a gene."""
sGene = ""
for dCoords in lCoords:
dCoords["id"] = sId
sGene += "{id} {start} {stop}\n".format(**dCoords)
return sGene
class oFeatBlock:
def __init__(self, sLines):
rsId = reId.search(sLines)
dId = rsId.groupdict()
self.id = dId["id"]
self.lfeats = reCdsBlock.findall(sLines)
self.numfeats = len(self.lfeats)
def printFeats(self, fExon):
for sFeat in self.lfeats:
lExons = [e.groupdict() for e in rePosPair.finditer(sFeat)]
print(getExons(self.id, lExons), file=fExon)
#"""
#"""
# Set and parse command line arguments
sDescTool = '''This tool extracts CDS positions from
one or more Genbank feature tables and generates
an exon file suitable for trainGlimmerHMM.'''
sHelpIn = '''Input Genbank features table file/s.
For multiple files, specify this option multiple times.
This option is required.'''
sHelpOut = '''Name of new file in which results are written.
Defaults to the name of the input file suffixed with '.exon'.
This option is required.'''
oOpts = argparse.ArgumentParser(description = sDescTool)
oOpts.add_argument("-i", "--infeat",
metavar = "file.features",
action = "append",
help = sHelpIn)
oOpts.add_argument("-o", "--outexon",
metavar = "file.exons",
help = sHelpOut)
oParsedOpts = oOpts.parse_args()
#"""
#"""
# Checkpoints
# Check if options which can't be set as positional args are supplied
# Check supplied options for conflict
print("\nChecking options now...")
dReqd = {"sInfeat" : [oParsedOpts.infeat, "input features table/s"],
"sOutexon": [oParsedOpts.outexon, "output exons file path/name"]}
def checkOpts(lParam):
if lParam[0]:
if isinstance(lParam[0], list):
return len(lParam[0])
else:
return 1
else:
print("No {} provided.".format(lParam[1]))
return 0
for sReqd in dReqd:
iReqd = checkOpts(dReqd[sReqd])
if iReqd == 0:
print("Supply all required inputs.")
break
else:
dReqd[sReqd].append(iReqd)
# Check if outfile exists
bOutExists = os.path.isfile(oParsedOpts.outexon)
if bOutExists:
print("Output filename exits. Supply a new one.")
# Include all conditions in a compound checkpt for single check
bProceed = False
if iReqd != 0 and not bOutExists:
bProceed = True
#"""
#"""
# Proceed if all things check out
if bProceed:
print("Requirements supplied. Proceeding.")
with open(oParsedOpts.outexon, "w") as fOutExon:
for pIn in oParsedOpts.infeat:
fIn = open(pIn).read()
sIn = str(fIn)
lFeats = reFeatBlock.findall(sIn)
for sFeats in lFeats:
oFeat = oFeatBlock(sFeats)
oFeat.printFeats(fOutExon)
else:
pass
#"""
# ==============================================================================
#"""
# Regular expressions: Notes
# Regex that matches a whole feature block:
# ">Feature[\s][\d\|\w\.]+\n(?:(?:[\t]|[\>\<](?=\d)|[\d])[ \t\S]+\n)+"
## Broken down into parts:
#### First line of the block
#### ">Feature[\s][\d\|\w\.]+\n"
#### Succeeding lines
#### "(?:(?:[\t]|[\>\<](?=\d)|[\d])[ \t\S]+\n)+"
###### where
###### "[\t]|[\>\<](?=\d)|[\d])"
###### specifies the start of every line that isn't the first line, and
###### "[ \t\S]+"
###### is the pattern for all succeeding characters