-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDataWrangling.py
More file actions
149 lines (117 loc) · 4.39 KB
/
DataWrangling.py
File metadata and controls
149 lines (117 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotnine import *
# colour codings:
# orange -> #FFA500
# blue -> #4F94CD
# Import the needed data and turn Block, Condition, Response and StimVar into
# categorical values (helps with distinction in plots).
data = pd.read_csv("data/data.csv")
for column in ("Block", "Condition", "Response", "StimVar"):
    data[column] = data[column].astype("category")

# Removing outliers:
# Repeatedly drop the single most extreme reaction time until no remaining
# data point lies more than 3 standard deviations away from the mean. Mean and
# standard deviation are recomputed after every removal, because each dropped
# row changes both of them.
while True:
    # calculate the mean and (population, ddof=0) standard deviation
    mean = np.mean(data["RT"])
    sd = np.std(data["RT"])
    # absolute z-values for all data points, stored in a separate column;
    # the "RT" column is addressed by name rather than by position
    data["z_value"] = (data["RT"] - mean).abs() / sd
    # nothing left to test, or z-values undefined (no spread / no valid data)
    if data.empty or not sd > 0:
        break
    # index label of the data point with the largest z-value
    z_max = data["z_value"].idxmax()
    # The threshold has to be applied to the z-value itself; idxmax() only
    # returns the row label, which says nothing about how extreme the row is.
    if data.at[z_max, "z_value"] > 3:
        # remove the whole line from the data
        data = data.drop(z_max)
    # stop if no more outliers are found
    else:
        break

# Dropping rows leaves gaps in the index; renumber the rows from 0 to n - 1.
rows, cols = data.shape
data = data.reset_index(drop=True)
def bar():
    """Draw a histogram of the "RT" column of the module-level ``data``.

    Bins have a width of 50; bars are drawn in semi-transparent orange.

    Returns:
        The object returned by the plot's ``draw()`` call.
    """
    axis_text = {
        "x": "Reaction Time",
        "y": "Number of Occurrences",
        "title": "Distribution of Reaction Times",
    }
    histogram = geom_histogram(
        binwidth=50, color="#FFA500", fill="#FFA500", alpha=0.4
    )
    plot = ggplot(data, aes(x="RT")) + histogram + labs(**axis_text)
    return plot.draw()
def violin():
    """Draw one violin of "RT" per "Block" with a summary statistic overlay.

    The violins are orange; the ``stat_summary`` layer (library defaults) is
    drawn on top in blue.

    Returns:
        The object returned by the plot's ``draw()`` call.
    """
    axis_text = {
        "x": "Trial Block",
        "y": "Reaction Time in ms",
        "title": "Learning effect between blocks - mean Reaction Times shrink with practice",
    }
    violins = geom_violin(fill="#FFA500", color="#FFA500", alpha=0.4)
    summary = stat_summary(color="#4F94CD")
    plot = ggplot(data, aes(x="Block", y="RT")) + violins + summary + labs(**axis_text)
    return plot.draw()
def scatter():
    """Scatter "RT" against "Onset", coloured by "Condition", with linear fits.

    A black linear-model smoother without confidence band is layered over the
    points.

    Returns:
        The object returned by the plot's ``draw()`` call.
    """
    mapping = aes(x="Onset", y="RT", color="Condition")
    axis_text = {
        "x": "Onset in ms",
        "y": "Reaction time in ms",
        "title": "Reaction times depending on Condition plotted against stimulus onset",
    }
    points = geom_point()
    trend = geom_smooth(method="lm", color="black", se=False)
    plot = ggplot(data, mapping) + points + trend + labs(**axis_text)
    return plot.draw()
def stacked():
    """Draw side-by-side bars of the mean RT per condition for blocks 1 and 2.

    For each of the two blocks the mean and standard deviation of "RT" are
    computed per "Condition" from the module-level ``data``. The means are
    shown as bars, the standard deviations as error bars.

    NOTE(review): the unpacking below assumes that "Condition" has exactly two
    groups and that they come out of ``groupby`` in the order
    (congruent, incongruent) -- confirm against the data file.

    Returns:
        tuple: ``(fig, ax)`` -- the matplotlib figure and axes of the plot.
    """
    data_block_one = data.loc[data["Block"] == 1]
    data_block_two = data.loc[data["Block"] == 2]

    rt_one = data_block_one.groupby("Condition")["RT"]
    rt_two = data_block_two.groupby("Condition")["RT"]

    congruent_one, incongruent_one = rt_one.mean()
    congruent_two, incongruent_two = rt_two.mean()
    std_congruent_one, std_incongruent_one = rt_one.std()
    std_congruent_two, std_incongruent_two = rt_two.std()

    dataframe = pd.DataFrame({
        "Block": [1, 2],
        "RT_congruent": [congruent_one, congruent_two],
        "std_congruent": [std_congruent_one, std_congruent_two],
        "RT_incongruent": [incongruent_one, incongruent_two],
        "std_incongruent": [std_incongruent_one, std_incongruent_two],
    })

    fig, ax = plt.subplots(figsize=(9, 6))

    # Bars are shifted by +/-0.1 around the tick positions 1 and 2 so that the
    # two conditions of one block sit next to each other.
    ax.bar([0.9, 1.9],
           dataframe["RT_congruent"],
           label='Congruent',
           yerr=dataframe["std_congruent"],
           color="#FFA500",
           width=0.2,
           alpha=0.8)
    ax.bar([1.1, 2.1],
           dataframe["RT_incongruent"],
           label='Incongruent',
           # each condition gets its own spread as error bar
           yerr=dataframe["std_incongruent"],
           color="#4F94CD",
           width=0.2,
           alpha=0.8)

    ax.set(
        xlabel="Block",
        ylabel="Mean Reaction Time",
        title="Stacked Average Reaction Times between Practice Blocks of the Experiment"
    )
    ax.set_xticks([1, 2])
    ax.set_xticklabels(["Block 1", "Block 2"])
    ax.legend()
    return fig, ax
def lrm(frame=None):
    """Fit a simple linear regression of "RT" on "Onset" and print the result.

    The fit is an ordinary least-squares straight line computed with
    ``np.polyfit``, so no additional regression library is required.

    Args:
        frame: Optional DataFrame with the columns "Onset" and "RT". When
            omitted, the module-level ``data`` is used.

    Returns:
        None. Three lines are printed: the goodness of fit, the intercept and
        the slope of the fitted line.
    """
    if frame is None:
        frame = data

    # prepare x and y:
    x = np.asarray(frame["Onset"], dtype=float)
    y = np.asarray(frame["RT"], dtype=float)

    # degree-1 polynomial fit; coefficients come back highest power first
    slope, intercept = np.polyfit(x, y, 1)

    # coefficient of determination: 1 - SS_res / SS_tot
    residual_ss = np.sum((y - (slope * x + intercept)) ** 2)
    total_ss = np.sum((y - y.mean()) ** 2)
    # R^2 is undefined when y does not vary at all
    r_squared = 1.0 - residual_ss / total_ss if total_ss > 0 else float("nan")

    # obtain the results:
    # NOTE: the value printed under this label is R^2 (coefficient of
    # determination), not Pearson's r.
    print('correlation coefficient:', float(r_squared))
    print('intercept:', float(intercept))
    print('slope:', float(slope))
    return