-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMNIST_example
More file actions
57 lines (49 loc) · 1.86 KB
/
MNIST_example
File metadata and controls
57 lines (49 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library(dslabs)
library(caret)
library(Rborist)
# Load the MNIST digits data through dslabs::read_mnist().
mnist <- read_mnist()

# Quick exploration (uncomment to inspect):
# names(mnist)                 # see sets
# dim(mnist$train$images)      # see features
# class(mnist$train$labels)
# table(mnist$train$labels)

# Work with smaller random subsets of the data.
set.seed(123)

# Training subset: 10,000 randomly chosen rows of the training set.
index <- sample(nrow(mnist$train$images), 10000)
x <- mnist$train$images[index, ]
y <- factor(mnist$train$labels[index])

# Test subset: 1,000 randomly chosen rows of the held-out test set.
# Sampling these from mnist$train instead could pick rows that are also in
# the training subset, which would leak data into the evaluation.
index <- sample(nrow(mnist$test$images), 1000)
x_test <- mnist$test$images[index, ]
y_test <- factor(mnist$test$labels[index])
# Preprocessing: remove variables that don't matter and convert the data
# into a form that is useful for the model-fitting functions.

# Optional look at per-column variability:
# library(matrixStats)
# sds <- colSds(x)
# qplot(sds, bins = 256, color = I("black"))

# Indices of the columns with little to no variability.
nzv <- nearZeroVar(x)

# Columns to keep: everything that was not flagged above. seq_len() also
# behaves correctly for a zero-column matrix, unlike 1:ncol(x).
col_index <- setdiff(seq_len(ncol(x)), nzv)

# Model fitting: add column names first.
# x_test takes its names from x rather than from mnist$train$images so that
# both matrices are guaranteed to share the same column names (the raw image
# matrix is presumably unnamed -- otherwise x would not need naming here).
colnames(x) <- seq_len(ncol(x))
colnames(x_test) <- colnames(x)
#start with knn. optimize for number of neighbors.
#use k-fold cross-validation to pick k; computing distances between neighbors is slow with this many features, so keep the number of resamples small
#control <- trainControl(method="cv", number = 10, p=.9)
#train_knn <- train(x[,col_index],y,method="knn",tuneGrid=data.frame(k=c(1,3,5,7)),trControl=control)
#for speed, test code on smaller data set:
#n <- 1000
#b <- 2
#index <- sample(nrow(x), n)
#control <- trainControl(method="cv", number = b, p=.9)
#train_knn <- train(x[index,col_index],y[index],method="knn",tuneGrid=data.frame(k=c(1,3,5,7)),trControl=control)
#fit_knn <- knn3(x[ , col_index], y, k=3)
#y_hat_knn <- predict(fit_knn,x_test[, col_index], type="class")
#cm <- confusionMatrix(y_hat_knn, factor(y_test))
#cm$overall["Accuracy"]
# Random forest (Rborist), tuned through caret.

# Candidate tuning values: every combination of minNode and predFixed.
grid <- expand.grid(
  minNode = c(1, 5),
  predFixed = c(10, 15, 25, 35, 50)
)

# Resampling scheme: 5-fold cross-validation.
control <- trainControl(method = "cv", number = 5, p = 0.8)

# Fit on the retained columns only; nTree and nSamp are extra arguments
# supplied alongside the caret settings.
train_rf <- train(
  x[, col_index],
  y,
  method = "Rborist",
  nTree = 50,
  trControl = control,
  tuneGrid = grid,
  nSamp = 1000
)