-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPGtype.py
More file actions
95 lines (84 loc) · 4.18 KB
/
PGtype.py
File metadata and controls
95 lines (84 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
'''
This script was used to analyse the completeness and type of the pseudogenes in the different pseudogene sets
(see SetEvalIdAnalysis.py comments).
Files used and origins:
PG_overlapping_kPG.id is the id of the predicted pseudogenes that overlap known pseudogenes by at least 60% of their
length. Generated by the SetEvalIdAnalysis.py.
PG_overlapping_kTE.id is the id of the predicted pseudogenes that overlap known TE by at least 60% of their length.
Generated by the SetEvalIdAnalysis.py.
ukPG.id is the id of the predicted pseudogenes that don't overlap known pseudogenes or known TE, and whose sequences
don't align with any known TE sequence. Generated by the SetEvalIdAnalysis.py.
pseudogenes.info is an output of P-GRe
Output of this script is directed to the standard output. A copy of the result is saved in the PGtype.txt
file
'''
#### FUNCTIONS
def newSet(file):
    '''
    Build a set holding every line of a text file, one element per line.

    file: path of the file to read.
    Returns a set of strings; the newline characters are removed from each line, so duplicated lines collapse
    into a single element and an empty line contributes the empty string.
    '''
    with open(file) as handle:
        return {rawLine.replace('\n', '') for rawLine in handle}
def initializeDict(dict):
    '''
    Reset the two counter tables of a result dictionary, in place.

    dict: dictionary to fill. Its 'Completness' and 'Type' entries are (re)created as dictionaries mapping each
    known category name to a count of 0. Nothing is returned.
    '''
    # Tuples keep the categories in the order in which they are later printed.
    completnessKinds = ('Copy', 'Fragment', 'Fragments', 'Fragment or degraded copy')
    typeKinds = ('Chimeric pseudogene', 'Duplicated pseudogene', '(Iso)retropseudogene', 'Retropseudogene',
                 'Unknown')
    dict['Completness'] = {kind: 0 for kind in completnessKinds}
    dict['Type'] = {kind: 0 for kind in typeKinds}
def addToDic(dic, completness, type):
    '''
    Count one pseudogene in a result dictionary.

    dic: dictionary holding the 'Completness' and 'Type' counter tables.
    completness: completness category of the pseudogene; must already be a key of dic['Completness'].
    type: type category of the pseudogene; must already be a key of dic['Type'].
    A KeyError is raised for a category that is absent from its table.
    '''
    # The completness counter is updated first, the type counter second.
    completnessCounts = dic['Completness']
    completnessCounts[completness] = completnessCounts[completness] + 1
    typeCounts = dic['Type']
    typeCounts[type] = typeCounts[type] + 1
def dictRatioCompute(dic):
    '''
    Print, for one set of pseudogenes, the percentage and the count of each category.

    dic: dictionary holding the 'Completness' and 'Type' counter tables (category name -> count).
    For each table a title line is printed, followed by one line per category made of the category name, its
    percentage of the table total rounded to 2 decimals, and its raw count. An empty line ends the report.
    Nothing is returned.
    '''
    def _percent(count, total):
        # An empty set has a total of 0: report 0.0 % rather than raising ZeroDivisionError.
        if not total:
            return 0.0
        return round(count / total * 100, 2)

    totalType = sum(dic['Type'].values())
    totalCompletness = sum(dic['Completness'].values())
    print('----- Completness:')
    for completnessKind, completnessNb in dic['Completness'].items():
        print(completnessKind, _percent(completnessNb, totalCompletness), completnessNb)
    print('----- Type:')
    for typeKind, typeNb in dic['Type'].items():
        print(typeKind, _percent(typeNb, totalType), typeNb)
    print('')
#### STEP1: import the different sets of pseudogenes
# Set of P-GRe pseudogenes overlapping TAIR10 pseudogenes by at least 60%
PGcPG = newSet('PG_overlapping_kPG.id')
# Set of P-GRe pseudogenes overlapping TAIR10 transposable elements (TEs) by at least 60%
PGcTE = newSet('PG_overlapping_kTE.id')
# Set of P-GRe pseudogenes that don't cover any TAIR10 pseudogenes or any TAIR10 transposable elements, and that
# don't align with any transposable element, aka "unknown" pseudogenes
PGdcPGdcdaTE = newSet('ukPG.id')

#### STEP2: retrieve the number of each kind of completness and type for each set
PGcPGComputation = {}
initializeDict(PGcPGComputation)
PGcTEComputation = {}
initializeDict(PGcTEComputation)
PGdcPGdcdaTEComputation = {}
initializeDict(PGdcPGdcdaTEComputation)
totalComputation = {}
initializeDict(totalComputation)
with open('pseudogenes.info') as info:
    next(info, None)  # Skip header (the default avoids StopIteration on an empty file)
    for line in info:
        if not line.strip():
            continue  # A blank line carries no record and cannot be unpacked into 4 fields
        # Each record is expected to hold 4 tab-separated fields; the trailing newline is removed so that it
        # does not stay attached to the last field. The type field is named pgType to keep the builtin
        # type() usable.
        pg, completness, pgType, parent = line.rstrip('\n').split('\t')
        # The three id sets are tested independently: a pseudogene is counted in every set that lists it.
        if pg in PGcPG:
            addToDic(PGcPGComputation, completness, pgType)
        if pg in PGcTE:
            addToDic(PGcTEComputation, completness, pgType)
        if pg in PGdcPGdcdaTE:
            addToDic(PGdcPGdcdaTEComputation, completness, pgType)
        # Every prediction is counted in the total, whatever set it belongs to.
        addToDic(totalComputation, completness, pgType)

#### STEP3: print the % for each set
print('----------------- P-GRe pseudogenes overlapping known pseudogene -----------------')
dictRatioCompute(PGcPGComputation)
print('--------------------- P-GRe pseudogenes overlapping known TE ---------------------')
dictRatioCompute(PGcTEComputation)
print('----- P-GRe predictions not overlapping known PG or TE, not aligning with TE -----')
dictRatioCompute(PGdcPGdcdaTEComputation)
print('------------------------------ Total predictions ---------------------------------')
dictRatioCompute(totalComputation)
# The backslash of the leading "/!\" marker is escaped explicitly: "\ " is not a valid escape sequence and
# triggers a warning on recent Python versions. The printed text is unchanged.
print('/!\\ Please note that P-GRe classify as "Chimeric pseudogene" pseudogenes that have more than one parent gene.\n'
      'Having more than one parent can indeed be due to true chimeric case, but also to sequence divergence. /!\\')