-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPipeLine_Collider.py
More file actions
111 lines (80 loc) · 2.79 KB
/
PipeLine_Collider.py
File metadata and controls
111 lines (80 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Compare two tab-separated ".korff" gene tables (ORA vs. g2or) and write four
# reports into the current directory:
#   Near_Matching_Cords.korff - ORA rows whose coordinates lie close to a g2or row
#   Matching_Sequences.korff  - ORA rows whose sequence also occurs in g2or
#   ORA_Excess.korff          - ORA rows whose sequence does not occur in g2or
#   g2or_Excess.korff         - g2or rows whose sequence does not occur in ORA
#
# Column positions follow the header this script writes; the input tables are
# assumed to use the same layout (their "Identifier..." header line is skipped).

TAB = "\t"
COLUMNS = (
    "Identifier",
    "Source",
    "Chromosome",
    "Start",
    "End",
    "Sense",
    "Class",
    "CodingStatus",
    "Sequence",
    "Attribute",
)
HEADER = TAB.join(COLUMNS) + "\n"

CHROM_COL = 2
START_COL = 3
END_COL = 4
SEQ_COL = 8

# Maximum distance (in coordinate units) for two endpoints to count as "near".
NEAR_TOLERANCE = 150


def read_records(path):
    """Return the data rows of the table at *path* as lists of column strings.

    Lines starting with "Identifier" (the header) and empty lines are skipped.
    """
    records = []
    with open(path) as fh:
        for line in fh:
            if line.startswith("Identifier"):
                continue
            line = line.strip("\n")
            if not line:
                # A blank line carries no columns and would crash later lookups.
                continue
            records.append(line.split(TAB))
    return records


def write_records(path, records):
    """Write the header followed by *records* (one tab-joined row per line)."""
    with open(path, "w") as out:
        out.write(HEADER)
        for record in records:
            out.write(TAB.join(record))
            out.write("\n")


def is_near_match(gene1, gene2, tolerance=NEAR_TOLERANCE):
    """Return True when both endpoints of the two genes lie within *tolerance*.

    The endpoints are compared start-to-start / end-to-end and also crosswise
    (start-to-end / end-to-start); either pairing being close enough counts.

    Raises ValueError if a Start or End column is not an integer.
    """
    start1 = int(gene1[START_COL])
    end1 = int(gene1[END_COL])
    start2 = int(gene2[START_COL])
    end2 = int(gene2[END_COL])
    direct = abs(start1 - start2) <= tolerance and abs(end1 - end2) <= tolerance
    crossed = abs(start1 - end2) <= tolerance and abs(end1 - start2) <= tolerance
    return direct or crossed


def near_matches(ora_records, g2or_records, tolerance=NEAR_TOLERANCE):
    """Return one ORA row per (ORA, g2or) pair with near-matching coordinates."""
    hits = []
    for ora_gene in ora_records:
        for g2or_gene in g2or_records:
            # NOTE(review): only pairs whose Chromosome columns DIFFER are
            # compared. That is what the original script did and it is kept
            # unchanged, but for a coordinate comparison `==` looks like the
            # intended test - confirm with the author before relying on it.
            if ora_gene[CHROM_COL] != g2or_gene[CHROM_COL]:
                if is_near_match(ora_gene, g2or_gene, tolerance):
                    hits.append(ora_gene)
    return hits


def sequence_matches(ora_records, g2or_records):
    """Return one ORA row per g2or row carrying the same sequence (any case)."""
    g2or_seqs = [gene[SEQ_COL].upper() for gene in g2or_records]
    hits = []
    for ora_gene in ora_records:
        ora_seq = ora_gene[SEQ_COL].upper()
        for g2or_seq in g2or_seqs:
            if ora_seq == g2or_seq:
                hits.append(ora_gene)
    return hits


def unmatched_records(records, other_records):
    """Return the rows of *records* whose sequence is absent from *other_records*.

    A new list is built instead of removing from the list being iterated, which
    previously skipped the row following every removal and also mutated the
    input list. The comparison ignores case so that this report is the exact
    complement of the case-insensitive sequence-match report.
    """
    other_seqs = {gene[SEQ_COL].upper() for gene in other_records}
    return [gene for gene in records if gene[SEQ_COL].upper() not in other_seqs]


def main():
    """Read both input tables and write the four report files."""
    g2or_list = read_records("T2T_g2or_Iter.korff")
    print("Read in: " + str(len(g2or_list)))
    ora_list = read_records("T2T_ORA.korff")
    print("Read in: " + str(len(ora_list)))

    write_records("Near_Matching_Cords.korff", near_matches(ora_list, g2or_list))
    write_records("Matching_Sequences.korff", sequence_matches(ora_list, g2or_list))

    # Row counts before and after dropping the rows with shared sequences.
    print(str(len(ora_list)))
    print(str(len(g2or_list)))
    ora_excess = unmatched_records(ora_list, g2or_list)
    g2or_excess = unmatched_records(g2or_list, ora_list)
    print(str(len(ora_excess)))
    print(str(len(g2or_excess)))

    write_records("ORA_Excess.korff", ora_excess)
    write_records("g2or_Excess.korff", g2or_excess)


if __name__ == "__main__":
    main()