-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsklearn_test.py
More file actions
97 lines (75 loc) · 3.05 KB
/
sklearn_test.py
File metadata and controls
97 lines (75 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
# Simulation parameters for the class-count sweep below.
nfeatures = 1000  # total number of simulated features
ninformative = 200  # how many of those features actually carry class signal
sep = 0.4  # class_sep passed to make_classification (smaller = harder problem)
#nclasses = 2
scale = False # This is actually redundant at the moment since make_classification produces normalized data anyway
# NOTE(review): `scale` is never consulted below -- StandardScaler is applied
# unconditionally in the loop; confirm whether it was meant to gate scaling.
classarray = [2,3,4,8]  # class counts to sweep over
# Sweep over class counts: for each, simulate a high-dimensional dataset,
# then run manual recursive feature elimination with a linear SVC --
# repeatedly dropping the feature with the smallest summed |coefficient|
# and recording the mean 5-fold cross-validation score at each size --
# and finally plot score vs. remaining-feature count to tempfigs/.
import os

# savefig below fails with FileNotFoundError if the directory is missing.
os.makedirs('tempfigs', exist_ok=True)

for nclasses in classarray:
    X, y = make_classification(
        n_samples=500,
        n_features=nfeatures,
        n_informative=ninformative,
        n_redundant=0,
        n_repeated=0,
        n_classes=nclasses,
        n_clusters_per_class=1,
        shuffle=False,
        class_sep=sep,
        random_state=0,
    )

    # Save the simulated data so the same inputs can be re-tested from R.
    pd.DataFrame(y).to_csv('test_y.csv')
    pd.DataFrame(X).to_csv('test_x.csv')

    # Standardize the features (zero mean, unit variance per column).
    # (Original had a stray no-op `X.all` here -- removed.)
    X = StandardScaler().fit_transform(X)

    # Also save the scaled version for testing purposes.
    pd.DataFrame(X).to_csv('test_x_transform.csv')

    # Fit a linear SVC on the full feature set to obtain initial coefficients.
    svc_model = svm.SVC(kernel='linear')
    svc_model.fit(X, y)

    # Stop eliminating once only this many features remain.
    min_num_features = 1

    # Results for plotting: number of features remaining after each removal,
    # and the mean 5-fold CV score at that size.
    num_remaining_list = []
    cv_score_list = []

    while X.shape[1] > min_num_features:
        # Importance of each feature = summed |coefficient| across the
        # SVC's decision functions; the smallest sum is taken as the
        # least-informative feature.
        importance = np.sum(np.abs(svc_model.coef_), axis=0)
        least_informative = np.argmin(importance)

        # Drop that feature and refit a linear SVC on the reduced data.
        # (coef0/C kwargs from the original were library defaults, and
        # coef0 is ignored by the linear kernel -- dropped for consistency
        # with the initial fit above.)
        X = np.delete(X, least_informative, axis=1)
        svc_model = svm.SVC(kernel='linear')
        svc_model.fit(X, y)

        # Record cross-validation score after the removal.
        num_remaining_list.append(X.shape[1])
        cv_score_list.append(np.mean(cross_val_score(svc_model, X, y, cv=5)))

    # Alternative x-axis (number of features removed so far), kept for the
    # commented-out plot variant below.
    num_removed_list = [nfeatures - n for n in num_remaining_list]

    figname = (
        "Python_" + str(nfeatures) + "_" + str(ninformative) + "informative_"
        + str(sep) + "sep_" + str(nclasses) + "class"
    )

    # Plot CV score against remaining-feature count and save to disk.
    plt.figure(figsize=(10, 6))
    #plt.plot(num_removed_list, cv_score_list, marker='o')
    plt.plot(num_remaining_list, cv_score_list, marker='o')
    plt.title(str(nfeatures) + " features of which " + str(ninformative)
              + " informative, with " + str(nclasses) + " classes")
    plt.xlabel('Number of Included Features')
    plt.ylabel('Cross-Validation Score')
    plt.grid(True)
    plt.savefig('tempfigs/manual_' + figname + ".png")
    # Close the figure; otherwise each sweep iteration leaks an open figure.
    plt.close()