-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimanalysis_methods.py
More file actions
348 lines (294 loc) · 14.1 KB
/
simanalysis_methods.py
File metadata and controls
348 lines (294 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
"""
Created by Akshay Maheshwari
09/05/2017
Simulation analysis methods for 2D hard disk systems
"""
def loadOutputList(expt_name,outputType):
"""Loads a file containing all file names with a specified output data into a local list
Args:
expt_name (String): Name of experiment (which contains how many ever simulation output files)
outputType (String): molpos or simtime. Determines which set of output filenames to load.
Returns:
TYPE: List
"""
datalist = []
if(outputType=='molpos'):
path='data/'+expt_name+'/outputMolposList.txt'
elif(outputType=='simtime'):
path='data/'+expt_name+'/outputSimtimeList.txt'
else:
raise Exception('outputType required to be either \'molpos\' or \'simtime\'')
with open(path) as f:
for outputfile in f:
datalist.append("data/"+expt_name+"/"+outputfile.rstrip())
return datalist
def molpos_1Dbin(data,bins,diameter):
"""Creates a 1D histogram from X,Y location data of a single tracked molecule over time
Args:
data (pandas dataframe): time series 2D location data of a tracked molecule
bins (int): # of rectangular bins to discretize cell with (bins are equidistant on x-axis)
diameter (float): diameter of simulated cell
Returns:
TYPE: List
"""
import numpy as np;
pos=np.linspace(-diameter/2,diameter/2,bins+1)
speciesNum = int(data.shape[1]/2)
data_hist1D_total = np.zeros(bins)
print(speciesNum)
for i in range(speciesNum):
data_hist1D = np.histogram(data.loc[:,1],bins=pos)[0]
data_hist1D_total += data_hist1D
return data_hist1D_total
#for chunk in pd.read_csv(datapath,header=None,chunksize=10**6): #Chunk size 10^6 runtime: 72.68, chunk size 10^7 runtime: 72.29
#data_hist += np.histogram(chunk.loc[:,1],bins=pos)[0]
def molpos_2Dbin(data, bins, diameter):
"""Creates a 2D histogram from X,Y location data of a single tracked molecule over time
Args:
data (pandas dataframe): time series 2D location data of a tracked molecule
bins (int): # of bins to discretize cell with (bins x bins on x and y)
diameter (float): diameter of simulated cell
Returns:
TYPE: List
"""
import numpy as np;
pos_x = np.linspace(-diameter/2,diameter/2,bins+1)
pos_y = np.linspace(-diameter/2,diameter/2,bins+1)
#for chunk in pd.read_csv(datapath,header=None,chunksize=10**6):
speciesNum = int(data.shape[1]/2)
data_hist2D_total = np.zeros((bins,bins))
for i in range(speciesNum):
data_hist2D = np.histogram2d(np.array(data.loc[:,1]),np.array(data.loc[:,2]),bins=[pos_x,pos_y])[0]
data_hist2D=np.flip(data_hist2D.T,0)
data_hist2D_total += data_hist2D
return data_hist2D_total
def molpos2D_dispersion(data,diameter):
"""Computes the total variation distance between the X,Y distribution of a tracked molecule over time &
the uniform distribution (expected over time in a dilute system). Bins is hardcoded to 10x10 to account
for circle edges in 2D array (need to manually remove edges of 10x10 grid that don't fall in cell circle).
Args:
data (pandas dataframe): time series 2D location data of a tracked molecule
diameter (float): diameter of simulated cell
Returns:[0,1]. 0 means distribution is uniform. 1 is approached as tracked molecule stays in one place.
TYPE: float
"""
import numpy as np;
bins=10
circle_bins = 88 # of bins in a 10x10 grid that a circle would fall into.
data_hist2D=molpos_2Dbin(data,bins=bins,diameter=diameter)
normalized_data=data_hist2D/sum(sum(data_hist2D))
newdata = np.abs(normalized_data-(sum(sum(normalized_data)))/circle_bins)
return 0.5*(sum(newdata[0][2:-2])+sum(newdata[-1][2:-2])+sum(newdata[2][1:-1])+sum(newdata[-2][1:-1])+sum(sum(newdata[2:-2][:])))
#return np.sqrt(sum(data.std(0)**2))
#return sum(np.abs(data.skew()))
def covertime(datapath,diameter,molposTS):
"""Finds the earliest time point at which every 2D bin in the 2D area of the cell has been traversed
by the tracked molecule. Bins is hardcoded to 10x10 to account for circle edges in 2D array (need to
manually remove edges of 10x10 grid that don't fall in cell circle).
Args:
datapath (String): File path to datafile
diameter (float): diameter of simulated cell
molposTS (float): the data sampling rate (independent from the underlying simulation time step)
Returns:[0,molposTS*total timesteps].
TYPE: float
"""
import numpy as np;
import pandas as pd;
bins=10
data_hist = np.zeros((bins, bins))
pos_x = np.linspace(-diameter/2,diameter/2,bins+1)
pos_y = np.linspace(-diameter/2,diameter/2,bins+1)
coverTime = 10
foundCoverTime = False;
timesteps=0;
chunksize=10**3
for chunk in pd.read_csv(datapath,header=None,chunksize=chunksize):
timesteps+=1;
data_hist+=molpos_2Dbin(chunk,bins=bins,diameter=diameter) #np.histogram2d(np.array(chunk.loc[:,1]),np.array(chunk.loc[:,2]),bins=[pos_x,pos_y])[0]
if not 0 in data_hist[:,2:-2] and not 0 in data_hist[1:-1,1:-1] and not 0 in data_hist[2:-2,:] and not foundCoverTime:
coverTime = timesteps
foundCoverTime = True
return (molposTS*chunksize)*coverTime
return coverTime
def timeplot(ax, expt_name,step=1,scalefactor=1,logscale=False,start=0):
"""Creates a plot of runtime (y-axis) vs. whatever dependent variable is being experimentally swept (x-axis).
Args:
ax (TYPE): figure object to plot upon
expt_name (String): Name of experiment file
step (int, optional): Step size between sweep of dependent variable
scalefactor (int, optional): Used to scale time data
logscale (bool, optional): True if want plot to have a x logscale
start (int, optional): 1st value of x-axis to begin plot with
"""
import warnings
import numpy as np
simdata = []
outputlist= loadOutputList(expt_name,'simtime')[:] #[a:b] = a to b sub-range of experiments that need to be plotted.
for outputfile in outputlist:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
simdata.append(np.loadtxt(open(outputfile), delimiter=",")/scalefactor)
simdata = np.array(simdata)
runtimes = simdata[:,1]
simtime = simdata[-1,0]
print("plotting ", runtimes)
if(logscale):
num_varx = np.logspace(start,start+len(runtimes)-1,num=len(runtimes))
ax.set_xscale("log")
else:
num_varx = np.arange(start,start+len(runtimes)*step,step)
ax.plot(num_varx,runtimes,'-o')
ax.set_xlim(0)
#Fits a quadratic curve to time data (note in order for poly1d to fit, need at least 2 data points for runtime):
#ax.plot(np.unique(num_varx), np.poly1d(np.polyfit(num_varx, runtimes, 2))(np.unique(num_varx)))
def saveHist(outputlist,expt_name, bins, diameter,molposTS):
"""Creates a pkl dump of a list of each subexperiment filename with its associated data_hist1D, data_hist2D, covertime, and molpos2D_disperse
Args:
outputlist (list): List of all data file names for an experiment
expt_name (string): Name of experiment
bins (int): # of bins to compute 1D and 2D histograms with
diameter (float): diameter of simulated cell
molposTS (float): the data sampling rate (independent from the underlying simulation time step)
Returns: A oath to the pkl dump
TYPE: string
"""
import pickle as pkl
import pandas as pd
import os
if not os.path.exists('data/'+expt_name+'/analysis/'):
os.makedirs('data/'+expt_name+'/analysis/')
path='data/'+expt_name+'/analysis/outputMolposHistogramList.pkl'
outputFile = open(path,"wb")
histlist = []
for i in range(len(outputlist)):
molpos2D = pd.read_csv(outputlist[i],header=None).loc[:,1:]
data_hist1D=molpos_1Dbin(molpos2D,bins=bins,diameter=diameter)
data_hist2D=molpos_2Dbin(molpos2D,bins=bins,diameter=diameter)
covtime=covertime(outputlist[i],diameter=diameter,molposTS=molposTS)
molpos2D_disperse = molpos2D_dispersion(molpos2D,diameter=diameter)
print("saving",data_hist1D)
histlist.append([outputlist[i], data_hist1D,data_hist2D,covtime,molpos2D_disperse])
pkl.dump(histlist, outputFile)
return path;
def plotHist(histlistpklpath, expt_name, diameter=0.1, graphs="both",logscale=True,step=1,start=0, simtime=1, x_label="# time samples"):
"""Generates a figure with (for each subexperiment):
plots of 2D molecule position heatmaps, simulation time, cover time, and total variation distance
Args:
histlistpklpath (string): Path to pkl with a list containing a list for each subexperiment containing all analysis (hist1D, hist2D, covertime, molpos2D)
expt_name (string): Name of experiment
diameter (float): diameter of simulated cell
graphs (str, optional): 'all', 'molpos', 'simtime', 'molpossim' -- determines which graphs included in analysis figure
logscale (bool, optional): True if want x_axis to logscale for all data plots
step (int, optional): Step size between sweep of dependent variable
start (int, optional): 1st value of x-axis to begin plot with
simtime (int, optional): Total simtime to set y-axis scale for covertime
x_label (str, optional): X-label for all sub-graphs
Returns:
TYPE: matplotlib figure
"""
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pickle as pkl
from matplotlib.ticker import ScalarFormatter
import numpy as np
pkl_file=open(histlistpklpath,'rb')
histlist=pkl.load(pkl_file)
plot_dim = int(np.ceil(len(histlist)/3))
#fig = plt.figure(figsize=(12,4))
fig = plt.figure(figsize=(10,15))
outer=gridspec.GridSpec(4,1,height_ratios=[3,1,1,1])
inner = gridspec.GridSpecFromSubplotSpec(plot_dim,3,subplot_spec=outer[0],wspace=0.2,hspace=0.25)
xfmt=ScalarFormatter()
xfmt.set_powerlimits((-1,1))
if graphs=="all" or graphs=="molpos" or graphs=="molpossim":
###### Plot a heatmap of particle location for each subexperiment #######
for i in range(plot_dim):
for j in range(3):
if len(histlist)-1>=i*3+j:
ax = plt.Subplot(fig,inner[i,j])
print("plotting", histlist[i*3+j][1])
data_hist=histlist[i*3+j][1]
pos=np.linspace(-diameter/2,diameter/2,len(data_hist)+1)
ax.bar(pos[0:-1], data_hist,width=diameter/(len(histlist[i*3+j][1])),align='edge')
ax.imshow(histlist[i*3+j][2].T)
#ax.yaxis.set_major_formatter(xfmt)
fig.add_subplot(ax)
# np.set_printoptions(threshold=np.inf)
if graphs=="all" or graphs=="simtime" or graphs=="molpossim":
###### Plot of simtime for each subexperiment #######
ax2 = plt.Subplot(fig, outer[1])
timeplot(ax2,expt_name, logscale=logscale,step=step,start=start)
ax2.set_xlabel(x_label)
ax2.set_ylabel('Runtime (s.)')
fig.add_subplot(ax2)
if graphs=="all":
####### Plot of covertime for each subexperiment #######
ax3 = plt.Subplot(fig,outer[2])
covertimearr = [item[3] for item in histlist]
if(logscale):
num_varx = np.logspace(start,start+len(covertimearr)-1,num=len(covertimearr))
ax.set_xscale("log")
else:
num_varx = np.arange(start,start+len(covertimearr)*step,step)
print("covertimearr", covertimearr)
ax3.plot(num_varx,covertimearr,'-o')
ax3.set_ylim(0,simtime)
#ax3.set_yscale('log')
fig.add_subplot(ax3)
ax3.set_xlim(0)
ax3.set_xlabel(x_label)
ax3.set_ylabel('Cover time (s.)')
####### Plot of total variation distance for each subexperiment #######
ax4 = plt.Subplot(fig,outer[3])
stdarr = [item[4] for item in histlist]
if(logscale):
num_varx = np.logspace(start,start+len(stdarr)-1,num=len(stdarr))
ax.set_xscale("log")
else:
num_varx = np.arange(start,start+len(stdarr)*step,step)
print("stdarr, ", stdarr)
ax4.plot(num_varx,stdarr,'-o')
fig.add_subplot(ax4)
ax4.set_xlim(0)
ax4.set_xlabel(x_label)
ax4.set_ylabel('Diffusive irregularity')
outer.tight_layout(fig, rect=[0,0.03,1,0.90]) #rect args needed to leave space on top for title
return fig
def tryint(s):
try:
return int(s)
except:
return s
def alphanum_key(s):
import re
return [tryint(c) for c in re.split('([0-9]+)', s)]
def combinePkls(expt_name,outputlist,covertime):
"""Combines pkls for each subexperiment into a list of lists for each subexperiment with:
its associated filename, data_hist1D, data_hist2D, covertime, and molpos2D_disperse.data_hist2D
(Reduce part of MapReduce)
Args:
expt_name (TYPE): folder name of experiment
outputlist (TYPE): List with molpos output data subexperiment names
covertime (boolean): If the experiment computed covertime, include it in the pkl [used for experiments performed before cover time was implemented]
Returns: file path to combined pkl
TYPE: String
"""
import os
import pickle as pkl
histlist = []
path='data/'+expt_name+'/analysis/'
i=0;
print("i'm here")
for f in sorted(os.listdir(path),key=alphanum_key)[:]: #can put range here to only plot subset of experiments
if not f.startswith('.') and f.startswith('expt'):
data_hist_path = open(path+'/'+f,'rb')
data_hist = pkl.load(data_hist_path)
print("test",data_hist)
if(covertime):
histlist.append([outputlist[i],data_hist[0],data_hist[1],data_hist[2],data_hist[3]])
else:
histlist.append([outputlist[i],data_hist])
i+=1;
outputFile = open(path+'/outputMolposHistogramList.pkl',"wb")
pkl.dump(histlist,outputFile)
return 'data/'+expt_name+'/analysis/outputMolposHistogramList.pkl'