-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_utils.py
More file actions
135 lines (110 loc) · 5.73 KB
/
data_utils.py
File metadata and controls
135 lines (110 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
class NoisyNumbers:
    """
    Load the sklearn digits dataset and provide utilities to corrupt it with
    sparse salt-and-pepper noise (pixels forced to 0 or 16, the min/max
    intensities of the 8x8 digit images).

    Attributes:
        X (pd.DataFrame): all 64-pixel digit images, one row per sample.
        y (pd.Series): integer labels (0-9) aligned with X.
        X_train, y_train, X_test, y_test: a reproducible ~80/20 random split.
        train_mask, test_mask (np.ndarray): boolean masks defining the split.
    """

    def __init__(self):
        # Load the sklearn digits dataset as pandas structures.
        digits_raw = load_digits()
        self.X = pd.DataFrame(digits_raw.data)
        self.y = pd.Series(digits_raw.target)
        # Define a reproducible ~80/20 train/test splitting mask.
        np.random.seed(42)  # Ensures reproducibility of the split
        self.train_mask = np.random.rand(len(self.X)) < 0.8
        self.test_mask = ~self.train_mask
        self.X_train = self.X[self.train_mask]
        self.y_train = self.y[self.train_mask]
        self.X_test = self.X[self.test_mask]
        self.y_test = self.y[self.test_mask]

    def apply_noise_to_digits(self, digit_indices, p_noise=0.1, random_state=None):
        """
        Simulate sparse salt-and-pepper noise on the digits at the given index
        labels: each pixel is independently set to 0 or 16 with probability
        p_noise.

        Parameters:
            digit_indices: index labels (into self.X) of the digits to corrupt.
            p_noise (float): per-pixel corruption probability (0 <= p_noise <= 1).
            random_state (int or None): seed for reproducibility; if None the
                global NumPy RNG state is used as-is.

        Returns:
            pd.DataFrame: noisy copies of the selected rows (self.X unchanged).
        """
        if random_state is not None:
            np.random.seed(random_state)
        # BUG FIX: the original used .iloc with index *labels*, which only
        # worked because X has a RangeIndex where labels equal positions.
        # .loc is the correct label-based lookup.
        noisy_digits = self.X.loc[digit_indices].copy()
        for idx in digit_indices:
            # Boolean mask over the 64 pixel columns of this digit.
            noise_mask = np.random.rand(self.X.shape[1]) < p_noise
            noisy_digits.loc[idx, noise_mask] = np.random.choice([0, 16], size=np.sum(noise_mask))
        return noisy_digits

    def visualise_digits(self, p_noise=None, random_state=42):
        """
        Plot a random selection of 10 digits, optionally corrupted with
        salt-and-pepper noise at per-pixel probability p_noise.

        Parameters:
            p_noise (float or None): if given, corrupt each shown digit's
                pixels with this probability; if None, show clean digits.
            random_state (int): seed controlling which 10 digits are sampled
                (the noise itself uses the unseeded global RNG).
        """
        digits = self.X.sample(n=10, random_state=random_state)
        if p_noise is not None:
            noisy_digits = digits.copy()
            for idx in digits.index:
                noise_mask = np.random.rand(digits.shape[1]) < p_noise
                noisy_digits.loc[idx, noise_mask] = np.random.choice([0, 16], size=np.sum(noise_mask))
        else:
            noisy_digits = digits
        # Plot the (possibly noisy) digits with their true labels.
        fig, axs = plt.subplots(1, 10, figsize=(15, 2))
        for i, ax in enumerate(axs.flatten()):
            ax.imshow(noisy_digits.iloc[i].values.reshape(8, 8), cmap=plt.cm.gray_r, interpolation='nearest')
            # Label lookup by index label (RangeIndex, so labels == positions).
            ax.set_title(f"Label: {self.y.loc[digits.index[i]]}")
        plt.show()

    def get_noisy_data(self, p_dataset=0.1, p_image=0.1, random_state=None):
        """
        Generate noisy versions of the training and test datasets.

        Parameters:
            p_dataset (float): Proportion of samples in each split to have
                noise applied (0 <= p_dataset <= 1).
            p_image (float): Probability of noise being applied to individual
                pixels within a noisy sample (0 <= p_image <= 1).
            random_state (int or None): Seed for reproducibility of noise
                application. If None, the randomness is not seeded.

        Returns:
            tuple: (noisy_train, noisy_test, train_indices, test_indices) —
            noisy copies of X_train and X_test plus the index labels of the
            corrupted samples in each split.
        """
        if random_state is not None:
            np.random.seed(random_state)
        # Select which samples receive noise in each split.
        train_indices = self.X_train.sample(frac=p_dataset, random_state=random_state).index
        test_indices = self.X_test.sample(frac=p_dataset, random_state=random_state).index
        # BUG FIX: the original passed the same random_state to both noise
        # calls, so train and test samples were corrupted with identical
        # noise streams. Derive a distinct seed for the test split.
        test_noise_seed = None if random_state is None else random_state + 1
        # Apply noise to the selected samples (label-aligned assignment).
        noisy_train = self.X_train.copy()
        noisy_train.loc[train_indices] = self.apply_noise_to_digits(train_indices, p_noise=p_image, random_state=random_state)
        noisy_test = self.X_test.copy()
        noisy_test.loc[test_indices] = self.apply_noise_to_digits(test_indices, p_noise=p_image, random_state=test_noise_seed)
        return noisy_train, noisy_test, train_indices, test_indices
class EvalDataset:
    """
    Build a noise-corrupted train/test split of the digits dataset and
    evaluate classifiers on it.

    Attributes:
        X_train, X_test (pd.DataFrame): noisy feature splits.
        y_train, y_test (pd.Series): clean labels for each split.
        train_indices, test_indices: index labels of the corrupted samples.
    """

    def __init__(self, p_dataset=0.1, p_image=0.1, random_state=None):
        """
        Parameters:
            p_dataset (float): proportion of samples to corrupt in each split.
            p_image (float): per-pixel corruption probability within a
                corrupted sample.
            random_state (int or None): seed for reproducibility of the noise.
        """
        self.p_dataset = p_dataset
        self.p_image = p_image
        self.random_state = random_state
        self.noisy_numbers = NoisyNumbers()
        self.X_train, self.X_test, self.train_indices, self.test_indices = self.noisy_numbers.get_noisy_data(p_dataset=p_dataset, p_image=p_image, random_state=random_state)
        self.y_train = self.noisy_numbers.y_train
        self.y_test = self.noisy_numbers.y_test

    def evaluate_accuracy_full_dataset(self, model=None, verbose=True):
        """
        Fit a classifier on the (noisy) training split and report its
        accuracy on the (noisy) test split.

        Parameters:
            model: any estimator with fit(X, y) and score(X, y) methods.
                Defaults to a fresh RandomForestClassifier(n_estimators=50,
                random_state=42) built per call.
            verbose (bool): if True, print the accuracy.

        Returns:
            float: accuracy on the test split.
        """
        # BUG FIX: the original used a RandomForestClassifier *instance* as the
        # default argument. Defaults are evaluated once at class-definition
        # time, so the same estimator object was shared (and silently refitted)
        # across every call and every EvalDataset instance. Build a fresh
        # default per call instead.
        if model is None:
            model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(self.X_train, self.y_train)
        accuracy = model.score(self.X_test, self.y_test)
        if verbose:
            print(f"Accuracy: {accuracy:.2f}")
        return accuracy
if __name__ == "__main__":
    # Quick smoke run: load the digits data, report its shape, then display
    # a random sample of digits with 10% per-pixel salt-and-pepper noise.
    dataset = NoisyNumbers()
    print(dataset.X.shape)
    dataset.visualise_digits(p_noise=0.1)