-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathflexibility.py
More file actions
240 lines (221 loc) · 12.1 KB
/
flexibility.py
File metadata and controls
240 lines (221 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import argparse
from PDB import request
from FASTA import read
from flex_calculus import calculus
import threading
import configparser
# Arguments for the application
parser = argparse.ArgumentParser(
description="flexibility.py is a standalone application which computes a flexibility score from a protein sequence contained in a FASTA or multi-FASTA file. It is based on experimental data from SwissProt PDB files or models proposed by AlphaFold. It retrieves a parseable text file containing a flexibility score for each residue in the input sequence/s and a graphical representation of flexibility")
parser.add_argument('-i', '-in', '--input', dest='input_file', action='store',
help='FASTA or Multi-FASTA protein sequence input file', required=True, default=None)
parser.add_argument('-o', '-out', '--output', dest='output_file', action='store',
help='Prefix of protein flexibility output files. It will generate 2 files ([prefix].out - Parseable text file) and ([prefix].png - Graphycal representation of flexibility scores). If 2 or more sequences are provided, the prefixes of the files will have the format [prefix]_[FASTA identifier].', required=False, default='flexibility_output')
parser.add_argument('-alpha', '--alpha_threshold', dest='alphafold_threshold', action='store',
help='AlphaFold identity threshold. It sets the threshold of AlphaFold identity above which a match of the query will be used as the true complete structure.', required=False, default=0.95)
options = parser.parse_args()
args, leftovers = parser.parse_known_args()
# Functions defined for the main program
def alphafold(query, local, database=None):
"""
Calls the function needed to search for a match on SwissProt database using BLASTp and download a PDB file of the best match if it exists.
The function returns a PDB object from PDB.request module.
The arguments are the following:
query - It is a Query class from FASTA.read module which contains sequence and identifier.
local - True for a local search and False for an API-mediated one
database - Path for the local database
"""
if local == False:
BLAST = request.blast_sp(query)
for seq in request.parse_xml(BLAST):
if request.download_alphafold(query, seq[0], './') == False:
continue
else:
return request.Pdb(
query.identifier, seq[0], None, './' + query.identifier + '_AlphaFold.pdb', seq[1], seq[2], seq[3], seq[4])
else:
BLAST = request.blast_commandline(query, database)
BLAST = BLAST[0].split('\n|\r')
for seq in request.parse_blast(BLAST):
if request.download_alphafold(query, seq[0].split('|')[1], './') == False:
continue
else:
return request.Pdb(
query.identifier, seq[0], None, './' + query.identifier + '_AlphaFold.pdb', seq[1], seq[2], seq[3], seq[4])
def pdb(query, local, database=None):
"""
Calls the function needed to search for a match on PDB database using BLASTp and download a PDB file of the best match if it exists.
The function returns a PDB object from PDB.request module.
The arguments are the following:
query - It is a Query class from FASTA.read module which contains sequence and identifier.
local - True for a local search and False for an API-mediated one
database - Path for the local database
"""
if local == False:
BLAST = request.blast_pdb(query)
for seq in request.parse_xml(BLAST):
if request.download_pdb(query, seq[0][0:4], './') == False:
continue
else:
return request.Pdb(
query.identifier, seq[0], seq[0][5:], './' + query.identifier + '.pdb', seq[1], seq[2], seq[3], seq[4])
else:
BLAST = request.blast_commandline(query, database)
BLAST = BLAST[0].split('\n|\r')
for seq in request.parse_blast(BLAST):
if request.download_pdb(query, seq[0][0:4], './') == False:
continue
else:
return request.Pdb(
query.identifier, seq[0], seq[0][5:], './'+ seq[0] + '_' + query.identifier + '.pdb', seq[1], seq[2], seq[3], seq[4])
def retrieving_score(pdb_prefix, chain_id=None):
"""Calls the functions needed to calculate the flexibility score of a PDB file. It returns an array containing them."""
print(
f"Computing flexibility score on {pdb_prefix}...")
matrix = calculus.general_calculation(
'./'+pdb_prefix+'.pdb', chain_id)
print("Done")
return matrix
# Read the configure archive to determine local or online BLASTp search
config = configparser.ConfigParser()
config.read('config.ini')
# Read the FASTA file provided
print(f"Reading {options.input_file} file.")
fasta_list = read.fasta_parser(options.input_file)
print(f"{len(fasta_list)} record/s was/were read.")
# Variables to store SwissProt and AlphaFold matches
pdb_list = [[]]
alphafold_list = []
# Threading for API BLASTp. Online version
if config['blast']['local'] == 'False':
threads = len(fasta_list)
jobs = []
# Loop through all sequences provided
for i in range(0, threads):
print(
f"Searching similar sequences to {fasta_list[i].identifier} in PDB and SwissProt databases using BLASTp server...")
# Thread for PDB database
thread = threading.Thread(
target=pdb_list[i].append(pdb(fasta_list[i], False)))
jobs.append(thread)
# Thread for SwissProt database
thread = threading.Thread(target=alphafold_list.append(
alphafold(fasta_list[i], False)))
jobs.append(thread)
# Start the threading
for j in jobs:
j.start()
# Check if the threading is finished
for j in jobs:
j.join()
# For each AlphaFold match, check if the identity is higher than -alpha to search for PDB linked to its IDs
i = 0
for element in alphafold_list:
if element != None and element.identity > float(options.alphafold_threshold):
pdb_list[i] = []
print(
f"The model fits with {alphafold_list[i].identifier} with {alphafold_list[i].identity} of identity \nSearching PDB files linked to the UniProt code")
request.download_uniprot_xml(
alphafold_list[i].identifier, './')
pdb_swissprotid = request.parse_uniprot_xml(
alphafold_list[i].identifier + '_uniprot.xml')
if pdb_swissprotid:
print(f'{len(pdb_swissprotid)} PDB files found')
for pdb in pdb_swissprotid:
print(f"Downloading {pdb[0]} from PDB server...")
if request.download_pdb(fasta_list[i], pdb[0], './') == False:
print("Cannot download {pdb[0]} PDB file. It was omitted")
continue
else:
pdb_list[i].append(request.Pdb(fasta_list[i].identifier, pdb[0], pdb[1], './'
+ pdb[0] + '_' + fasta_list[i].identifier + '.pdb', 1, pdb[2], pdb[3], []))
print("Done")
i = i + 1
# Local version
elif config['blast']['local'] == 'True':
# Loop through all sequences provided
for i in range(0, len(fasta_list)):
# Searching for AlphaFold matches
print(
f"Searching similar sequences to {fasta_list[i].identifier} in SwissProt databases using BLASTp...")
try:
alphafold_list.append(
alphafold(fasta_list[i], True, config['blast']['SwissProtdb_path']))
print(
f"AlphaFold match: {alphafold_list[i].identifier.split('|')[1]} ({alphafold_list[i].identity:.2f})")
except (IndexError, AttributeError):
alphafold_list.append(None)
print("No AlphaFold match was found")
# Searching for PDB files linked to the SwissProt ID if identity is higher than -alpha threshold
if alphafold_list[i] != None and alphafold_list[i].identity > float(options.alphafold_threshold):
print(
f"The model fits with {alphafold_list[i].identifier.split('|')[1]} with {alphafold_list[i].identity} of identity \nSearching PDB files linked to the UniProt code")
request.download_uniprot_xml(
alphafold_list[i].identifier.split('|')[1], './')
pdb_swissprotid = request.parse_uniprot_xml(
alphafold_list[i].identifier.split('|')[1] + '_uniprot.xml')
if pdb_swissprotid:
print(f'{len(pdb_swissprotid)} PDB files found')
for pdb_hit in pdb_swissprotid:
print(f"Downloading {pdb_hit[0]} from PDB server...")
if request.download_pdb(fasta_list[i], pdb_hit[0], './') == False:
print("Cannot download {fasta_list[i]} PDB file. It was omitted")
continue
else:
pdb_list[i].append(request.Pdb(fasta_list[i].identifier, pdb_hit[0], pdb_hit[1], './'
+ pdb_hit[0] + '_' + fasta_list[i].identifier + '.pdb', 1, pdb_hit[2], pdb_hit[3], []))
print("Done")
# If no PDB is found linked to Swissprot ID, it seeks for PDBs match using BLASTp
else:
print(f"No PDB sequences found linked to {alphafold_list[i].identifier.split('|')[1]}")
try:
pdb_list[i].append(
pdb(fasta_list[i], True, config['blast']['PDBdb_path']))
print(
f"PDB match: {pdb_list[i][0].identifier} ({pdb_list[i][0].identity:.2f})")
except (IndexError, AttributeError):
pdb_list[i].append(None)
print(f"No PDB match was found")
# If AlphaFold's identity is lower than -alpha, it seeks for PDB matches using BLASTp
else:
try:
print(
f"Searching similar sequences to {fasta_list[i].identifier} in PDB database using BLASTp...")
pdb_list[i].append(
pdb(fasta_list[i], True, config['blast']['PDBdb_path']))
print(
f"PDB match: {pdb_list[i][0].identifier} ({pdb_list[i][0].identity:.2f})")
except (IndexError, AttributeError):
pdb_list[i].append(None)
print(f"No PDB match was found")
if i < len(fasta_list):
pdb_list.append([])
else:
raise Exception("Error in config.ini file\nlocal must be \"True\" or \"False\"")
name = options.output_file
for i in range(0, len(fasta_list)):
if len(fasta_list) > 1:
options.output_file = name + "_" + fasta_list[i].identifier
plot = True
matrix_pdb = []
if alphafold_list[i] != None and pdb_list[i][0] != None:
if alphafold_list[i].identity > float(options.alphafold_threshold) and len(pdb_list[i]) > 1:
matrix = retrieving_score(fasta_list[i].identifier+"_AlphaFold")
matrix_pdb = calculus.general_calculation_multiple(
pdb_list[i], alphafold_list[i])
elif alphafold_list[i].identity > pdb_list[i][0].identity:
matrix = retrieving_score(fasta_list[i].identifier+"_AlphaFold")
else:
matrix = retrieving_score(pdb_list[i][0].identifier[0:4]+"_"+fasta_list[i].identifier,
pdb_list[i][0].chain)
elif alphafold_list[i] != None and pdb_list[i][0] == None:
matrix = retrieving_score(fasta_list[i].identifier+"_AlphaFold")
elif alphafold_list[i] == None and pdb_list[i][0] != None:
matrix = retrieving_score(pdb_list[i][0].identifier[0:4]+"_"+fasta_list[i].identifier, pdb_list[i][0].chain)
else:
print("No PDB files were obtained. Flexibility cannot be calculated")
plot = False
if plot:
print("Saving data and plot...")
calculus.represent_data(matrix, options.output_file, matrix_pdb)
print("Done")