'''
Generates input data for the pipeline, producing four files:
attributes, samples, values and coexistences.
See ./docs/documentation.md for details.
'''
import os
import re
import csv
import sys
import math
import json
import time
import glob
import timeit
import string
import requests
import itertools
import pandas as pd
from tqdm import tqdm
from operator import add
from functools import wraps
from collections import Counter
from collections.abc import Mapping
from pprint import pprint as pp
from multiprocessing import Process
def fn_timer(function):
    # Decorator that reports the wall-clock running time of the wrapped function
    @wraps(function)
    def function_timer(*args, **kwargs):
        t0 = time.time()
        result_ = function(*args, **kwargs)
        t1 = time.time()
        print("Total time running %s: %s seconds" %
              (function.__name__, str(t1 - t0)))
        return result_
    return function_timer
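# When applied (as with the @fn_timer functions below), each call prints a line
# like "Total time running api_samples: 42.0 seconds" after the wrapped
# function returns; the timing shown here is illustrative.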
# from concurrence 1
@fn_timer
def api_samples(parms):
    # BioSample API request module
    startpage = parms["start"]
    endpage = parms["end"]
    name = parms["name"]
    page_size = parms["size"]
    print("In api_samples", startpage, endpage)
    # Open a file to write into it; `with` will close the file automatically
    with open(name + '.csv', 'w') as f:
        writer = csv.writer(f)
        for counter_page in range(startpage, endpage + 1):
            keys_list = []
            query_params = {
                "page": counter_page,
                "size": page_size,
            }
            # Request the page of samples from the API
            response = requests.get('http://www.ebi.ac.uk/biosamples/api/samples/', params=query_params)
            if response.status_code != 200:
                # If the status code of the response is not 200 (OK),
                # something is wrong with the API, so stop this process
                print('Something went wrong; status code', response.status_code)
                return
            if counter_page % 100 == 0:
                print("Process " + name + " reached " + str(counter_page))
            # Looking through the JSON: get all samples on the page
            samples_in_page = response.json()['_embedded']['samples']
            # For each sample, collect its accession and its characteristics types
            for sample in samples_in_page:
                accession_no = sample['accession']  # added by MG to grab sampleIDs
                sample_characteristics = sample['characteristics']
                sample_keys = [accession_no] + list(sample_characteristics.keys())
                keys_list.append(sample_keys)
            # Write the characteristics list into the file
            writer.writerows(keys_list)
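# Illustrative call (normally the params dict is built by multithread_crawler
# below); the values here are hypothetical:
# api_samples({"start": 1, "end": 100, "name": "Thread1_results", "size": 500})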
@fn_timer
def multithread_crawler():
    # Crawler script: get a list of characteristic keys for every sample.
    # Split the BioSamples pages into equal chunks for multiprocessing.
    numberOfParallelJobs = 8
    pageSize = 500
    query_params = {
        "size": pageSize,
    }
    rel = requests.get('http://www.ebi.ac.uk/biosamples/api/samples/', params=query_params)
    reply = rel.json()
    totalPageNumber = reply['page']['totalPages']
    startpoint = 0
    init = []
    for i in range(1, numberOfParallelJobs + 1):
        params = dict()
        params['run'] = i
        params['size'] = pageSize
        params['name'] = "Thread{}_results".format(str(i))
        params['start'] = startpoint
        endpoint = math.ceil(totalPageNumber / float(numberOfParallelJobs)) * i
        if endpoint < int(totalPageNumber):
            params['end'] = int(endpoint)
        else:
            params['end'] = totalPageNumber
        init.append(params)
        startpoint = int(endpoint) + 1
    processlist = []
    for entry in init:
        p = Process(target=api_samples, args=[entry])
        p.start()
        processlist.append(p)
    print("All processes started")
    # Go through the process list, waiting for everything to finish
    for procs in processlist:
        procs.join()
    print("All finished")
    # Combine results into the samples file
    filenames = []
    for f in glob.glob('Thread*_results.csv'):
        filenames.append(f)
    with open('./data/samples.csv', 'w') as outfile:
        for fname in filenames:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
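# Example of the chunking arithmetic above (hypothetical numbers): with
# totalPageNumber = 1000 and 8 jobs, math.ceil(1000 / 8.0) = 125, so the
# processes cover pages 0-125, 126-250, ..., 876-1000.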
def stripID():
    # Make .temp files that strip the first column. This removes the sample id
    # before passing the rows to the co-occurrence counter.
    filenames = []
    for f in glob.glob('Thread*_results.csv'):
        filenames.append(f)
    for fname in filenames:
        tempname = str(fname + '.temp')
        with open(fname, 'r') as infile:
            reader = csv.reader(infile)
            reader_list = list(reader)
        with open(tempname, 'w') as outfile:
            writer = csv.writer(outfile)
            for sample in reader_list:
                writer.writerow(sample[1:])
    # Remove the 'Thread*_results.csv' files and rename the .temp files over them
    for f in glob.glob('Thread*_results.csv'):
        os.remove(f)
    for f in glob.glob('Thread*_results.csv.temp'):
        os.rename(f, f[:-5])
# from concurrence 2
@fn_timer
def create_cooccurrence_matrix(params):
    in_filename = params['filename_in']
    out_filename = params['filename_out']
    # tcd is the type check dictionary: tcd[A][B] counts how often types A and B
    # occur together in a sample (with A before B after sorting)
    tcd = {}
    with open(in_filename, 'r') as f:
        samples_type_list = f.readlines()
    line_counter = 0
    total_lines = len(samples_type_list)
    for type_list in samples_type_list:
        if line_counter % 50000 == 0:
            print('Line {} of {}'.format(line_counter, total_lines))
        types = [type_name.strip() for type_name in type_list.split(',') if type_name]
        types.sort()
        type_pairs = itertools.combinations(types, 2)
        for (A, B) in type_pairs:
            if A not in tcd:
                tcd[A] = {}
            if B not in tcd[A]:
                tcd[A][B] = 0
            tcd[A][B] += 1
        line_counter += 1
    with open(out_filename, 'w') as fout:
        json.dump(tcd, fout)
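# Illustrative sketch of the counting (hypothetical input row): for the line
# "age,organism,sex", the sorted pair combinations are ('age', 'organism'),
# ('age', 'sex') and ('organism', 'sex'), so after one row
# tcd == {'age': {'organism': 1, 'sex': 1}, 'organism': {'sex': 1}}.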
@fn_timer
def trigger_matrix():
    # Build one co-occurrence matrix per thread results file
    params = dict()
    base_filename_in = r'Thread\d_results\.csv'
    base_filename_out = 'cooccurrence_matrix{}.json'
    input_files = [f for f in os.listdir('./') if re.match(base_filename_in, f)]
    for i in range(len(input_files)):
        input_file = input_files[i]
        print('Working on {}'.format(input_file))
        params['filename_in'] = input_file
        params['filename_out'] = base_filename_out.format(i + 1)
        start_time = timeit.default_timer()
        create_cooccurrence_matrix(params)
        print(timeit.default_timer() - start_time)
    # Remove the 'Thread*_results.csv' files
    for f in glob.glob('Thread*_results.csv'):
        os.remove(f)
# from concurrence 3
@fn_timer
def flattenDict(d, join=add, lift=lambda x: x):
    # Flatten a nested dict into (joined_key, value) pairs
    _FLAG_FIRST = object()
    results = []
    def visit(subdict, results, partial_key):
        for k, v in subdict.items():
            new_key = lift(k) if partial_key is _FLAG_FIRST else join(partial_key, lift(k))
            if isinstance(v, Mapping):
                visit(v, results, new_key)
            else:
                results.append((new_key, v))
    visit(d, results, _FLAG_FIRST)
    return results
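# Illustrative sketch: with join=lambda a, b: a + '_' + b (as used in
# combine_threads below), flattenDict({'age': {'sex': 3}}) returns
# [('age_sex', 3)].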
@fn_timer
def combine_threads():
    # Merge the per-thread co-occurrence matrices into one coexistences file
    basename = r'cooccurrence_matrix\d+\.json'
    output_name = './data/coexistences.json'
    files_folder = './'
    files = [f for f in os.listdir(files_folder) if re.match(basename, f)]
    final_matrix = Counter({})
    for f in files:
        print('Combining results from {}'.format(f))
        with open(f, 'r') as fin:
            partial_matrix = json.load(fin)
        partial_matrix_flatten = Counter(dict(flattenDict(partial_matrix, join=lambda a, b: a + '_' + b)))
        final_matrix += partial_matrix_flatten
    with open(output_name, 'w') as fout:
        json.dump(final_matrix, fout, sort_keys=True, indent=4)
    for f in glob.glob('cooccurrence_matrix*.json'):
        os.remove(f)
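# Counter addition sums the counts key by key, so (hypothetical values)
# Counter({'age_sex': 2}) + Counter({'age_sex': 3}) == Counter({'age_sex': 5}).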
def get_solr_facets():
    # This module gets facets from Solr and writes them to a CSV file
    try:
        # The initial search for facets
        q = "*:*"
        facet = "crt_type_ft"
        query_params = {
            "q": q,
            "wt": "json",
            "rows": 0,
            "facet": "true",
            "facet.field": facet,
            "facet.limit": -1,
            "facet.sort": "count",
            "facet.mincount": "1"
        }
        response = requests.get('http://cocoa.ebi.ac.uk:8989/solr/merged/select', params=query_params)
        facets = response.json()['facet_counts']['facet_fields'][facet]
        # Select the data to build the dict and strip '_facet';
        # no fancy regex because Solr strips other underscores
        facets_name_raw = facets[::2]
        facets_name = [s.replace('_facet', '') for s in facets_name_raw]
        facets_count = facets[1::2]
        facets_dict = {}
        for i in range(len(facets_name)):
            facets_dict[facets_name[i]] = facets_count[i]
        facets_df = pd.DataFrame.from_dict(facets_dict, orient='index')
        facets_df.reset_index(inplace=True)
        facets_df.columns = ['facet', 'frequency']
        # facets_list = facets_dict.keys()  # list if needed
        facets_df.to_csv('./data/attributes.csv', encoding='utf-8')
        print('cocoa query successful')
    except Exception:
        print('cocoa failed to launch')
        # Fallback to the beans server, currently disabled. It repeats the
        # query and CSV writing above, but against
        # http://beans.ebi.ac.uk:8989/solr/merged/select, then prints
        # 'beans success. WARNING: not using cocoa data!!!' on success and
        # 'cocoa and beans failed to launch.' on failure.
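# Shape of the Solr facet response that the slicing above relies on
# (counts are hypothetical):
# facets == ['organism_facet', 120000, 'sex_facet', 80000, ...]
# so facets[::2] yields the names and facets[1::2] the matching counts.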
if __name__ == "__main__":
    # Generates attributes.csv
    get_solr_facets()
    # Generates 'samples.csv' and 'coexistences.json'
    # multithread_crawler()
    # stripID()
    # trigger_matrix()
    # combine_threads()
    # Generates values:
    # Trish has written a program to do this. I need her to send it to me.
    # I may truncate the results her program generates to speed up its progress.
    # We could adjust this later if we need to in v1.1