-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpupil_preprocessing.py
More file actions
443 lines (376 loc) · 21 KB
/
pupil_preprocessing.py
File metadata and controls
443 lines (376 loc) · 21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
"""
Complete preprocessing pipeline for analyzing Pupil Labs pupil data
- using the whole recording
- input is the pupil_data.tsv file from one recording
Steps:
1. Confidence thresholding
2. Filter by SD and median
3. Filtering "pupil dilation speed" outliers - can be done more than once
4. Interpolating (linear or cubic spline)
5. Median filter
6. Butterworth low-pass filter
"""
import glob
import os.path
import h5py
import pandas as pd
import numpy as np
import scipy
import scipy.io
from scipy.interpolate import interp1d
from scipy.signal import butter, sosfiltfilt, medfilt
def read_pupiltsv_3d(input_tsv):
    """
    Reads in a pupil labs output pupil diameter tsv file, as exported by the pl_preprocessing.py script.
    Retains only lines with pupil diameter values from the 3d estimation procedure (hence, *_3d), as
    marked in the "method" column.
    :param input_tsv: String, path to the pupil_data.tsv file.
    :return: Pandas dataframe object containing only the pupil diameter frames estimated by the
    3d method, reindexed with a fresh consecutive 0..n-1 integer index.
    """
    pl_df = pd.read_csv(input_tsv, sep='\t')
    # Keep only frames produced by the pye3d model; .copy() detaches the selection
    # from pl_df so the reindexing below cannot raise SettingWithCopyWarning.
    pl_df_3d = pl_df.loc[pl_df['method'] == 'pye3d 0.1.1 real-time'].copy()
    # pd.Int64Index was deprecated in pandas 1.4 and removed in pandas 2.0;
    # reset_index(drop=True) yields the same consecutive integer index.
    pl_df_3d.reset_index(drop=True, inplace=True)
    return pl_df_3d
def extract_times_mat(input_dir, pairNo, labName, session):
    """
    Searches for .mat files containing sharedStartTime and other timestamps
    which will be used for trimming the data.
    :param input_dir: Directory which contains behavioral data.
    :param pairNo: Pair number, substituted into the expected directory / file names.
    :param labName: Lab name, substituted into the expected directory / file names.
    :param session: Session name; 'playback' is read from a different .mat file
    holding different variable names than the other sessions.
    :return: startTime, stopTime: floats, session start / stop timestamps read
    from the session-level .mat file.
    """
    if session == 'playback':  # for the playback session we have a different .mat file!
        # NOTE(review): unlike the non-playback branch, this pattern has no
        # underscore between pairNo and labName ("pair{pairNo}{labName}_subjtimes.mat")
        # — presumably matching the on-disk naming of playback files; verify.
        times_mat = glob.glob(f'{input_dir}/**/pair{pairNo}_{labName}_behav/pair{pairNo}{labName}_subjtimes.mat',
                              recursive=True)
        # Raises IndexError if no matching file was found on disk.
        times = times_mat[0]
        videoTimes = scipy.io.loadmat(times)
        # Playback timing: start is "startAt", stop is the last entry of "flipTimes".
        startTime = float(videoTimes["startAt"].flatten())
        stopTime = float(videoTimes["flipTimes"][-1].flatten())
    else:
        # "**imes.mat" matches e.g. "*_times.mat"-style suffixes for the session.
        times_mat = glob.glob(f'{input_dir}/**/pair{pairNo}_{labName}_behav/pair{pairNo}_{labName}_{session}_**imes.mat',
                              recursive=True)
        # Raises IndexError if no matching file was found on disk.
        times = times_mat[0]
        videoTimes = scipy.io.loadmat(times)
        startTime = float(videoTimes["sharedStartTime"].flatten())
        stopTime = float(videoTimes["stopCaptureTime"].flatten())
    return startTime, stopTime
def trimDataFrame(df, startTimeStamp, stopTimeStamp, delay=5):
    """
    Trims a data frame between start and stop time stamp (keeps only rows where the time stamp
    is larger than the start time stamp and smaller than the stop time stamp).
    :param df: Pandas dataframe. Must have a "timestamp" column; assumes the default
    consecutive integer index (as produced by read_pupiltsv_3d).
    :param startTimeStamp: start time stamp.
    :param stopTimeStamp: stop time stamp.
    :param delay: start at startTimeStamp-delay, stop at stopTimeStamp+delay
    :return: Pandas dataframe object containing only rows of the original dataframe where the
    time stamp is larger than the start time stamp (minus delay) and smaller than the stop time stamp (plus delay),
    reindexed with a fresh consecutive 0..n-1 integer index.
    """
    timeStamp = df.timestamp.to_numpy()
    # First positional index after (start - delay) and last before (stop + delay).
    indicesAfterStart = np.where(timeStamp > (startTimeStamp - delay))
    firstIndex = indicesAfterStart[0][0]
    indicesBeforeEnd = np.where(timeStamp < (stopTimeStamp + delay))
    lastIndex = indicesBeforeEnd[0][-1]
    # .loc slicing is label-based and inclusive of lastIndex; .copy() detaches
    # the slice so later mutation cannot raise SettingWithCopyWarning.
    trimmedDataFrame = df.loc[firstIndex:lastIndex, :].copy()
    # pd.Int64Index was deprecated in pandas 1.4 and removed in pandas 2.0;
    # reset_index(drop=True) yields the same consecutive integer index.
    trimmedDataFrame.reset_index(drop=True, inplace=True)
    return trimmedDataFrame
def nancheck(df, column='diameter'):
    """
    Checks the amount and ratio of NaN values in the "column" column of a pandas dataframe.
    :param df: Pandas dataframe.
    :param column: Name of column where the function checks the amount / ratio of NaN values. Defaults to "diameter".
    :return: nan_no: Number of NaN values in "column" in dataframe.
    :return: nan_ratio: Ratio of NaN values (percent) in "column" in dataframe.
    :return: is_nan: Boolean series, True where "column" is NaN.
    """
    # Compute the NaN mask once and derive count / ratio from it
    # (the original evaluated np.isnan twice).
    is_nan = np.isnan(df[column])
    nan_no = int(is_nan.sum())
    nan_ratio = (nan_no / df.shape[0]) * 100
    print('\nMissing {0:.3f}% of {1} data ({2:d} frames).'.format(nan_ratio, column, nan_no))
    return nan_no, nan_ratio, is_nan
def confidence_filter(pupil_df, cutoff):
    """
    Invalidates low-confidence frames by setting their pupil diameter and gaze
    position values to NaN.
    :param pupil_df: Pandas dataframe, usually the output from read_pupiltsv_3d().
    Must have columns "confidence", "diameter", "norm_pos_x" and "norm_pos_y".
    :param cutoff: Numeric value, confidence threshold. Frames with confidence
    at or below this value have their diameter and gaze values replaced by NaN.
    :return: Pandas dataframe, a copy of the input where "diameter", "norm_pos_x"
    and "norm_pos_y" are NaN wherever confidence <= cutoff.
    """
    print('\nRunning confidence-based thresholding on the supplied dataframe.')
    print('Cutoff value is ' + str(cutoff))
    conf_df = pupil_df.copy()
    # pandas Series.where keeps values where the condition holds and fills the
    # rest with NaN by default; the mask is shared across all three columns.
    keep_mask = conf_df.confidence > cutoff
    for col in ('diameter', 'norm_pos_x', 'norm_pos_y'):
        conf_df[col] = conf_df[col].where(keep_mask)
    return conf_df
def valid_range_filter(pupil_df, cutoff_range=None):
    """
    Filters pupil diameter and gaze position based on physiological feasibility of pupil diameter values.
    Default values are 1.5 and 9 mm, based on:
    Mathôt, S. (2018). Pupillometry: Psychology, physiology, and function. Journal of Cognition, 1(1).
    :param pupil_df: Pandas dataframe, usually the output from confidence_filter().
    Must have columns "diameter", "norm_pos_x" and "norm_pos_y".
    :param cutoff_range: 2-element tuple specifying lower and higher thresholds. Defaults to (1.5, 9).
    :return: vr_df: Pandas dataframe where gaze position and diameter values outside
    the feasible range are set to NaN.
    :return: ind_below: 1D boolean numpy array, True where diameter fell below the lower threshold.
    :return: ind_above: 1D boolean numpy array, True where diameter rose above the upper threshold.
    """
    print('\nRunning "feasible range" thresholding on the supplied dataframe.')
    if cutoff_range is None:
        cutoff_range = (1.5, 9)
        # Typo fix: user message previously read "deafult".
        print('Using default range: {}'.format(cutoff_range))
    else:
        print('Using supplied range: {}'.format(cutoff_range))
    vr_df = pupil_df.copy()
    pupil_diameter = vr_df.diameter.to_numpy()
    # NaN diameters compare False on both tests and are left untouched.
    ind_below = pupil_diameter < cutoff_range[0]
    ind_above = pupil_diameter > cutoff_range[1]
    # One combined assignment instead of two identical ones.
    vr_df.loc[ind_below | ind_above, ('diameter', 'norm_pos_x', 'norm_pos_y')] = np.nan
    return vr_df, ind_below, ind_above
def sd_filter(pupil_df, sd_const=3):
    """
    Sets pupil diameter values to NaN where diameter values deviate from the median value
    with at least sd_const * sd (where sd is standard deviation and is calculated relative to the median).
    :param pupil_df: Pandas dataframe, usually the output from confidence_filter().
    Must have column "diameter".
    :param sd_const: Numeric value, the constant for the SD-based threshold. Defaults to 3.
    :return: sd_df: Pandas dataframe where "diameter" values for frames above
    or below SD-based thresholds are set to NaN.
    :return: median_thresh: Tuple with two numeric values, corresponding to the lower and upper thresholds.
    :return: indices_below: 1D boolean numpy array, True for frames that fell below the
    lower threshold and, thus, were set to NaN in sd_df.
    :return: indices_above: 1D boolean numpy array, True for frames that fell above the
    upper threshold and, thus, were set to NaN in sd_df.
    """
    print('\nRunning SD-based thresholding on the supplied dataframe.')
    sd_df = pupil_df.copy()
    pupil_diameter = sd_df.diameter.to_numpy()
    median_diameter = np.nanmedian(pupil_diameter)
    # We use the standard deviation relative to the median here, not relative to the mean,
    # so " sd_diameter = np.nanstd(pupil_diameter) " is not correct here.
    # BUG FIX: the original divided the sum of squared deviations by the number
    # of NaN samples (dividing by zero when there were none); the correct
    # denominator is the number of valid (non-NaN) samples, which nansum sums over.
    valid_no = np.sum(~np.isnan(pupil_diameter))
    sd_diameter = (np.nansum((pupil_diameter - median_diameter) ** 2) / valid_no) ** 0.5
    # Thresholds are based on the SD from the median, times the input arg constant
    median_thresh = (median_diameter - sd_diameter * sd_const, median_diameter + sd_diameter * sd_const)
    # NaN diameters compare False on both tests and are left untouched.
    indices_below = pupil_diameter < median_thresh[0]
    indices_above = pupil_diameter > median_thresh[1]
    sd_df.loc[indices_below, ['diameter']] = np.nan
    sd_df.loc[indices_above, ['diameter']] = np.nan
    # User reports
    print('Median pupil diameter: {0:.3f}'.format(median_diameter))
    print('SD of pupil diameter: {0:.3f}'.format(sd_diameter))
    print('Thresholds: {0:.3f} and {1:.3f}'.format(median_thresh[0], median_thresh[1]))
    print('Number of frames below and above the thresholds:\n\
{0} ({1:.3f}%) and {2} ({3:.3f}%)'.format(np.sum(indices_below),
                                          np.sum(indices_below)/len(pupil_diameter)*100,
                                          np.sum(indices_above),
                                          np.sum(indices_above)/len(pupil_diameter)*100))
    return sd_df, median_thresh, indices_below, indices_above
def dilation_speed_filter(pupil_df, mad_const=4):
    """
    Filters pupil diameter data based on the Median Absolute Deviation (MAD) of the speed of diameter changes.
    Based on Kret, M. E., & Sjak-Shie, E. E. (2019). Preprocessing pupil size data: Guidelines and code.
    Behavior research methods, 51(3), 1336-1342.
    Calculates the speed of diameter change at each valid frame, calculates the MAD, and derives a speed threshold:
    threshold = median_speed + mad_const * MAD
    Frames with a diameter change speed above threshold are set to NaN.
    :param pupil_df: Pandas dataframe, usually the output from confidence_filter(). Must have columns "timestamp",
    "diameter", "norm_pos_x" and "norm_pos_y".
    :param mad_const: Numeric value, the constant for deriving the MAD-based speed threshold. Defaults to 4.
    :return: mad_df: Pandas dataframe where "diameter", "norm_pos_x" and "norm_pos_y" values for frames with speed
    above threshold are set to NaN.
    :return: mad_threshold: Numeric value, MAD-based threshold for diameter change speed.
    :return: mad_indices: 1D numpy array, logical. True where a frame in the dataframe failed
    the MAD-based thresholding, and, thus, was set to NaN in output dataframe mad_df.
    """
    print('\nRunning MAD-based thresholding on the supplied dataframe.')
    print('Ignore the following warning from pandas: RuntimeWarning: All-NaN slice encountered')
    mad_df = pupil_df.copy()
    # Get timestamps relative to start, from "timestamps" column of dataframe
    # NOTE(review): this subtracts the value at index label 1 (not the first
    # sample). It is a constant offset, so the np.diff calls below are
    # unaffected, but it assumes label 1 exists in the index (true after the
    # reindexing done by trimDataFrame) — confirm for other callers.
    timestamps = mad_df.timestamp.to_numpy()-mad_df.timestamp[1]
    # Get dilation speed values in both directions for each frame
    # (from previous frame to current, and from current frame to next one).
    # prepend/append NaN keeps both diff arrays the same length as the signal;
    # the first (resp. last) speed value becomes NaN.
    diffs_forward = np.diff(mad_df.diameter, prepend=np.nan) / np.diff(timestamps, prepend=np.nan)
    diffs_backward = np.diff(mad_df.diameter, append=np.nan) / np.diff(timestamps, append=np.nan)
    # Get the maximums of the forward/backward speed values for each frame
    # (nanmax ignores the NaN direction at the edges; an all-NaN pair emits the
    # RuntimeWarning announced above and yields NaN, which fails the >= test below).
    dil_speed = np.stack((diffs_forward, diffs_backward), axis=1)
    max_dil_speed = np.nanmax(abs(dil_speed), axis=1)
    # Get the median speed value across all frames, and the median deviance from the median
    med_d = np.nanmedian(max_dil_speed)
    mad = np.nanmedian(abs(max_dil_speed - med_d))
    # MAD-based threshold per Kret & Sjak-Shie (2019)
    mad_threshold = med_d + (mad_const * mad)
    print('MAD threshold value set at: {0:5.3f}' .format(mad_threshold))
    # Get indices for frames above MAD-based threshold
    mad_indices = max_dil_speed >= mad_threshold
    # Set diameter, norm_pos_x and norm_pos_y values to NaN
    # where MAD-based thresholding indicated invalid data
    mad_df.loc[mad_indices, ('diameter', 'norm_pos_x', 'norm_pos_y')] = np.nan
    return mad_df, mad_threshold, mad_indices
def interpolate(pupil_df, max_loner_length=2):
    """
    Interpolate over missing diameter data (NaN values from previous preprocessing steps) using scipy interp1d function.
    :param pupil_df: Pandas dataframe, usually the output from confidence_filter(). Must have columns "timestamp",
    "diameter".
    :param max_loner_length: Maximum number of samples for valid (non-NaN) datapoints to be considered "loners"
    (short valid datapoints surrounded by NaN values). Loner samples are not used as
    interpolation anchors; their positions are re-interpolated from the surrounding data.
    :return: interp_df: Pandas dataframe which contains the interpolated segments for diameter data.
    Note: edges of the data are padded with NaN values.
    :return: nan_ind: 1D numpy array, positions of the NaN frames in the input (before interpolation).
    """
    print("\nRunning interpolation on the supplied dataframe with max loner length: {}" .format(max_loner_length))
    # Get pupil diameter data from dataframe
    interp_df = pupil_df.copy()
    diameter = interp_df.diameter.to_numpy()
    # Get the indices of NaN (invalid, previously filtered) and non-NaN (valid) frames in diameter data
    nan_ind = np.where(np.isnan(diameter))[0]
    valid_old = np.where(~np.isnan(diameter))[0]
    # find "loners" among valid frames (very short segments of non-NaN values between NaN values):
    # a gap of g between consecutive NaN positions means g-1 valid samples in between,
    # so "2 <= diff < max_loner_length + 2" flags runs of 1..max_loner_length valid samples.
    # NOTE(review): with prepend=0, i == 0 can be flagged, in which case
    # nan_ind[i - 1] below wraps to the LAST NaN index — looks like an edge-case
    # bug when the signal starts with a short valid run; confirm against real data.
    tmp = np.where((np.diff(nan_ind, prepend=0) < (max_loner_length + 2)) & (np.diff(nan_ind, prepend=0) >= 2))[0]
    # Get indices for frames in "loner" segments
    to_del_indices = np.empty(0)
    for i in tmp:
        tmp_array = np.arange(nan_ind[i - 1] + 1, nan_ind[i])
        to_del_indices = np.hstack((to_del_indices, tmp_array))
    # We need to define valid indices again, deleting the indices for frames in "loner" segments
    valid_ind = np.setdiff1d(valid_old, to_del_indices, assume_unique=True)
    # Indices we deleted from valid indices must also appear among NaN indices,
    # to treat them as nan indices for the sake of interpolation.
    # We do not actually replace them with nans!
    nan_ind_new = np.union1d(nan_ind, to_del_indices)
    # then interpolate it: linear, sample index as x-axis; positions outside the
    # valid range are filled with NaN (hence the NaN-padded edges noted above)
    x = valid_ind
    y = diameter[valid_ind]
    li = interp1d(x, y, kind="linear", bounds_error=False, fill_value=np.nan)
    # np.put writes the interpolated values back in place at the NaN/loner positions
    np.put(diameter, nan_ind_new.astype('int64'), li(nan_ind_new))
    # We return the pandas dataframe
    interp_df.diameter = diameter
    return interp_df, nan_ind
def median_filter(pupil_df, order=5):
    """
    Median filter for pupil diameter data.
    :param pupil_df: Pandas dataframe, usually the output from interpolate().
    Must have column "diameter".
    :param order: Order (kernel size) of the median filter, must be odd. Defaults to 5.
    :return: med_df: Pandas dataframe where "diameter" values are median filtered.
    If "diameter" is all-NaN, the copy is returned unchanged.
    """
    med_df = pupil_df.copy()
    diameter = med_df.diameter.to_numpy()
    # Check for NaN values at edges, set them to equal the first / last valid values if there are any
    if np.sum(np.isnan(diameter)) > 0:
        valid_ind = np.where(~np.isnan(diameter))[0]
        if valid_ind.size == 0:
            # Robustness fix: an all-NaN signal has nothing to filter; the
            # original raised IndexError here (valid_ind[0] on an empty array).
            return med_df
        diameter[:valid_ind[0]] = diameter[valid_ind[0]]
        diameter[valid_ind[-1]:] = diameter[valid_ind[-1]]
    # medfilt is already imported directly from scipy.signal at the top of the file
    diameter_med = medfilt(diameter, order)
    med_df.diameter = diameter_med
    return med_df
def lowpass_filter_butter_zerophase(pupil_df, cutoff=10, fs=120, order=5):
    """
    Zero-phase low-pass filtering of pupil diameter data with a Butterworth filter.
    The forward-backward double pass (scipy.signal.sosfiltfilt) cancels phase
    distortion but effectively doubles the specified filter order.
    :param pupil_df: Pandas dataframe, usually the output from interpolate().
    Must have column "diameter".
    :param cutoff: Numeric value, critical frequency in Hz for the low-pass filter.
    :param fs: Numeric value, sampling frequency of the pupil diameter signal in Hz. The default value
    only applies to the Pupil Core eye trackers and their default settings, as used at our labs at RCNS.
    :param order: Numeric value, filter order.
    :return: lp_df: Pandas dataframe where "diameter" values are low-pass filtered.
    """
    lp_df = pupil_df.copy()
    signal_arr = lp_df.diameter.to_numpy()
    # Extend the first / last valid sample over any NaN-padded edges,
    # since the filter cannot handle NaNs there.
    nan_mask = np.isnan(signal_arr)
    if nan_mask.any():
        valid_positions = np.flatnonzero(~nan_mask)
        signal_arr[:valid_positions[0]] = signal_arr[valid_positions[0]]
        signal_arr[valid_positions[-1]:] = signal_arr[valid_positions[-1]]
    # Design the low-pass Butterworth filter in second-order-sections form
    # (numerically more stable than b/a coefficients).
    sos = butter(order, cutoff, btype='low', analog=False, output='sos', fs=fs)
    # Forward-backward pass for zero phase shift
    lp_df.diameter = sosfiltfilt(sos, signal_arr)
    return lp_df
def pupil_to_hdf5(pupil_df,
                  output_dir,
                  pair_no,
                  lab_name,
                  session,
                  demogr_dict,
                  params,
                  nans,
                  file_label=''):
    """
    Create hdf5 file from preprocessed data.
    :param pupil_df: input dataframe with columns "timestamp", "diameter", "norm_pos_x",
    "norm_pos_y" (and "brightness" when file_label is set).
    :param output_dir: path to output directory
    :param pair_no: int
    :param lab_name: str
    :param session: str
    :param demogr_dict: a dictionary with the subject's demographic data (and other possible metadata)
    :param params: a dictionary with the parameters used in the preprocessing
    :param nans: numeric array with returns of the nancheck functions; the last
    element is the boolean NaN mask stored as "pupil_isnan".
    :param file_label: String, added to default output file name, defaults to '' (empty)
    :return: none
    """
    if file_label:
        h5_filename = "prepro_pl_data_pair{}_{}_{}_{}.hdf5".format(pair_no, lab_name, session, file_label)
    else:
        h5_filename = "prepro_pl_data_pair{}_{}_{}.hdf5".format(pair_no, lab_name, session)
    h5_file = os.path.join(output_dir, h5_filename)
    df_len = len(pupil_df)
    # Resource-safety fix: the original opened the file with a bare File()/close()
    # pair, leaking the handle if any dataset or attribute write raised.
    with h5py.File(h5_file, "a") as f:
        grp = f.create_group('Pair_{}'.format(pair_no))
        subgrp_lab = grp.create_group(lab_name)
        subgrp_sess = subgrp_lab.create_group(session)
        # Only ds_time and ds_isnan are referenced again below; the other
        # dataset handles were unused locals in the original.
        ds_time = subgrp_sess.create_dataset("pupil_timestamps", shape=df_len, data=pupil_df.timestamp, dtype='f8')
        subgrp_sess.create_dataset("pupil_diameter", shape=df_len, data=pupil_df.diameter, dtype='f8')
        ds_isnan = subgrp_sess.create_dataset("pupil_isnan", shape=df_len, data=nans[-1], dtype='bool')
        subgrp_sess.create_dataset("pupil_norm_pos_x", shape=df_len, data=pupil_df.norm_pos_x, dtype='f8')
        subgrp_sess.create_dataset("pupil_norm_pos_y", shape=df_len, data=pupil_df.norm_pos_y, dtype='f8')
        if file_label:
            subgrp_sess.create_dataset("pupil_brightness", shape=df_len, data=pupil_df.brightness, dtype='f8')
        # Group-level attributes
        # Mandatory attributes for pair: pair number, condition, date
        for attr in ['Pair_number', 'Condition', 'Date']:
            grp.attrs[attr] = demogr_dict[attr]
        # Subject-level attributes
        # Mandatory attributes for subject: pair number, lab name, condition, gender, age, handedness, height, date
        for attr in ['Pair_number', 'Lab_name', 'Condition', 'Gender', 'Age', 'Handedness', 'Height', 'Date']:
            subgrp_lab.attrs[attr] = demogr_dict[attr]
        # Session-level attributes
        # Mandatory attributes for session: pair number, lab name, session, condition, gender, age, handedness,
        # height, date
        for attr in ['Pair_number', 'Lab_name', 'Session', 'Condition', 'Gender', 'Age', 'Handedness', 'Height',
                     'Date']:
            subgrp_sess.attrs[attr] = demogr_dict[attr]
        # Parameters of preprocessing script (session-level)
        for key in ('conf_cutoff', 'sd_const', 'mad_const', 'max_loner_length', 'lp_cutoff'):
            subgrp_sess.attrs[key] = params[key]
        # Additional info (session-level)
        subgrp_sess.attrs['Recording start time'] = ds_time[0]
        subgrp_sess.attrs['Recording length'] = ds_time[-1] - ds_time[0]
        # Missing data attributes (session-level)
        subgrp_sess.attrs['Nans ratio (%)'] = nans[0:7]
        subgrp_sess.attrs['Nans no of frames'] = nans[7:-2]
        subgrp_sess.attrs['nans_keys'] = ['after_conf_thres', 'after_valid_range_filt', 'after_sd_filt',
                                          'after_mad_filt', 'after_mad2_filt', 'after_interp', 'after_lp_filt']
        ds_isnan.attrs['isnan_keys'] = ['True = NaN', 'False = Valid']