-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPost_ORF_Parser.py
More file actions
111 lines (88 loc) · 2.43 KB
/
Post_ORF_Parser.py
File metadata and controls
111 lines (88 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
def _field_value(field):
    """Return the text after the first "=" in a ``key=value`` header field.

    Surrounding whitespace is removed so that headers written with " ; "
    separators do not leak padding into the output columns.

    Raises:
        IndexError: if *field* contains no "=".
    """
    return field.split("=", 1)[1].strip()


def read_fasta(path):
    """Read the FASTA file at *path* into a list of (header, sequence) tuples.

    The header is the full ">" line without its newline; the sequence is the
    concatenation of every following non-header line. A header that is
    followed by no sequence text before the next header is replaced by that
    next header. A file with no content yields an empty list.
    """
    records = []
    header = ""
    chunks = []
    with open(path) as fh:
        for raw_line in fh:
            line = raw_line.strip("\n")
            if line.startswith(">"):
                sequence = "".join(chunks)
                if sequence:
                    records.append((header, sequence))
                header = line
                chunks = []
            else:
                chunks.append(line)
    sequence = "".join(chunks)
    # Flush the last record; skip it only when the file had no content at
    # all, which would otherwise produce an unparsable ("", "") record.
    if header or sequence:
        records.append((header, sequence))
    return records


def parse_record(header, sequence):
    """Turn one FASTA header and its sequence into an output row.

    *header* is split on ";" and read positionally: field 0 holds the ID,
    fields 2, 3 and 4 hold chromosome, start and end, and field 5 holds the
    sense followed by "|"-separated annotations. Field 1 is not used.

    Returns:
        A 10-tuple of strings: identifier, source, chromosome, start, end,
        sense, class, coding status, sequence, attribute (always "NULL").

    Raises:
        IndexError: if the header has fewer than six ";" fields or a
            required field has no "=".

    Unrecognised source prefixes, annotation counts or sense values are
    reported on stdout and the row is still produced (best effort).
    """
    fields = header.split(";")

    # The identifier is the ID value up to its first "." (drops suffixes
    # such as ".match_part1").
    identifier = _field_value(fields[0].split(".")[0])

    source = ""
    for prefix, pipeline in (
        ("ORA", "ORA_PIPELINE"),
        ("G2OR_ANNO", "G2OR_PIPELINE_ANNO"),
        ("G2OR_ITER", "G2OR_PIPELINE_ITER"),
    ):
        if identifier.startswith(prefix):
            source = pipeline
            break
    else:
        print("ERROR in source")

    chromosome = _field_value(fields[2])
    start = _field_value(fields[3])
    end = _field_value(fields[4])

    # Two "|" parts mark a coding entry, three mark a pseudogene.
    sense_parts = fields[5].split("|")
    if len(sense_parts) == 2:
        coding_status = "CODING"
    elif len(sense_parts) == 3:
        coding_status = "PSEUDOGENE"
    else:
        coding_status = ""
        print("ERROR in codingStatus")
    # A field without any "|" has no class; report it (above) and carry on
    # with an empty class instead of raising IndexError.
    or_class = sense_parts[1].strip() if len(sense_parts) > 1 else ""

    sense = _field_value(sense_parts[0])
    if sense == "-(-)":
        sense = "-"
    elif sense == "+(+)":
        sense = "+"
    else:
        print("ERROR in sense")

    return (
        identifier,
        source,
        chromosome,
        start,
        end,
        sense,
        or_class,
        coding_status,
        sequence,
        "NULL",
    )


def convert_file(path):
    """Convert the FASTA file at *path* into a tab-separated ``path + ".korff"``.

    One line is written per record, with the columns returned by
    ``parse_record``; no column-header line is written. All records are
    parsed before the output file is opened, so a malformed input does not
    leave a truncated output file behind.
    """
    rows = [parse_record(header, sequence) for header, sequence in read_fasta(path)]
    with open(path + ".korff", "w") as out:
        out.writelines("\t".join(row) + "\n" for row in rows)


# Convert every "*.clean.fasta" file in the current working directory.
for file in os.listdir():
    if file.endswith(".clean.fasta"):
        convert_file(file)
# Example of the expected input header format:
# >ID=ORA7.match_part1 ; Target=ORA7 ; Chromosome=NC_060935.1 ; Start=56170459 ; End=56171415 ; Sense=-(-)|OR5