-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvinothrandomforest.Rmd
More file actions
145 lines (127 loc) · 4.15 KB
/
vinothrandomforest.Rmd
File metadata and controls
145 lines (127 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
title: "SANTANDER CUSTOMER SATISFACTION"
author: "VINOTHKUMAR A"
date: "03 NOV 2018"
output: html_document
---
```{r setup, include=FALSE}
# Global knitr option: echo chunk code in the rendered HTML document.
knitr::opts_chunk$set(echo = TRUE)
```
# Include libraries
```{r}
# General-purpose packages used throughout the analysis.
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
```
# Santander Customer Satisfaction
# Comparison of unbalanced and downscaled Random Forests
```{r}
library(dplyr) # Data manipulation (mutate, etc.)
library(caret) # nearZeroVar, createDataPartition, confusionMatrix
library(randomForest) # Random forest models (strata/sampsize down-sampling)
```
# Read data
```{r}
# Load the raw Santander data sets from disk.
# BUG FIX: the original read train.csv twice; the test set must come
# from test.csv, otherwise the "test" predictions are just a copy of
# the training data.
train0 <- read.csv("D:/santander/train.csv")
test0 <- read.csv("D:/santander/test.csv")
```
# First combine all the data for cleaning
```{r}
# Combine train and test so that cleaning decisions (near-zero-variance
# removal, duplicate-column removal) are applied consistently to both.
# ROBUSTNESS: keep only the shared columns so this also works when the
# test set lacks the TARGET column; TARGET is re-attached to the
# training rows from train0 later. With identical column sets this is
# byte-for-byte equivalent to rbind(train0, test0).
common.cols <- intersect(names(train0), names(test0))
all_data <- rbind(train0[common.cols], test0[common.cols])
```
# Find predictors that have very low percentage of unique values
# zeroVar: gives vector of logicals for whether the predictor has only one distinct value
# nzv : gives vector of logicals for whether the predictor is a near zero variance predictor
```{r}
# Compute per-predictor variance diagnostics on everything except the
# first column (the row ID). saveMetrics = TRUE returns a data frame
# with one row per predictor and logical columns zeroVar and nzv.
nsv <- nearZeroVar(all_data[,-1],
saveMetrics = TRUE) # Do not test ID
```
# Subset, remove near zero variance predictors
# Note: Here we could have chosen zeroVar, which would lead to a larger set of predictors
```{r}
# Drop the near-zero-variance predictors flagged above.
# BUG FIX: nsv was computed on all_data[,-1], so nsv$nzv has one entry
# per predictor but none for the ID column. Indexing all_data directly
# with !nsv$nzv recycled a too-short logical mask and misaligned the
# kept columns. Prepend TRUE so the ID column is kept and the mask
# lines up with the predictors.
all_data.new <- all_data[, c(TRUE, !nsv$nzv)]
```
# Visualization
```{r}
library(lattice)
# Scatter-plot matrix of a small sample of predictors (columns 3:5).
# NOTE(review): the original comment said "only for integer values" --
# presumably meaning these columns are numeric; verify against the data.
splom(train0[,3:5]) #only for integer values
```
# Correlation plot: shows the amount of pairwise correlation between variables
```{r}
library(corrplot)
# Pairwise Pearson correlations for a small sample of predictors
# (columns 5:7), displayed as numbers rather than circles.
cr <- cor(train0[,5:7])
corrplot(cr, method = "number")
```
# Find duplicate columns and then remove them
```{r}
# Remove duplicated predictor columns (identical content under
# different names), keeping the first occurrence of each.
# duplicated() over the list of columns flags every column that repeats
# an earlier one, matching the original O(p^2) pairwise identical()
# loop over columns 2..p.
# BUG FIX: when no duplicates existed, the original
# all_data.new[, -dpl.cl$j] indexed with -integer(0), which drops ALL
# columns. The logical mask below keeps everything in that case.
dup.col <- duplicated(as.list(all_data.new[, -1])) # never test/drop the ID column
all_data.new <- all_data.new[, c(TRUE, !dup.col)]
```
# Now split back into training and testing sets
```{r}
# Recover the original train/test partition by row position: the first
# nrow(train0) rows of all_data are the training rows (rbind preserved
# order). Re-attach the TARGET labels from the raw training data.
train1 <- all_data.new[1:nrow(train0), ]; train1$TARGET <- train0$TARGET
test1 <- all_data.new[(nrow(train0) + 1):nrow(all_data.new), ]
rm(train0,test0) # Remove unnecessary data.frames
```
# Turn TARGET into factor (0 and 1)
```{r}
# Encode TARGET as a factor (levels "0" and "1") so randomForest
# performs classification rather than regression.
train1$TARGET <- factor(train1$TARGET)
# Class counts: "1" is the rare (minority) class.
nmin <- sum(train1$TARGET == "1")
nmax <- sum(train1$TARGET == "0")
c(nmin, nmax) # Far more 0's than 1's -- heavily imbalanced classes
```
# Create training and test data from train1
```{r}
# Reproducible 70/30 split of train1, stratified on TARGET so both
# splits keep the same class proportions.
set.seed(101)
inTrain <- createDataPartition(y = train1$TARGET, p = 0.7, list = FALSE)
training <- train1[inTrain, ]
testing <- train1[-inTrain, ]
```
# Let us try to train a simple Random Forest using down-sampling
```{r}
# Minority-class count within the training split; used below as the
# per-class sample size for down-sampled random forest trees.
nmin <- sum(training$TARGET == "1")
```
## Tell randomForest to sample by strata.
## Specify that the number of samples selected within each class should be the same
```{r}
# Down-sampled random forest: strata + sampsize = rep(nmin, 2) makes
# each tree draw nmin rows from EACH class, so every tree trains on a
# balanced sample despite the overall class imbalance.
rf0 <- randomForest(TARGET ~., data = training, ntree = 500, mtry = 10,
strata = training$TARGET, sampsize = rep(nmin,2))
# Class predictions on the held-out 30% split, and the confusion matrix.
pred.rf0 <- predict(rf0, newdata = testing)
cm0 <- confusionMatrix(pred.rf0, testing$TARGET)
```
# Let's look at the unbalanced RF
```{r}
# Baseline: the same random forest trained on the raw, unbalanced data
# (no strata/sampsize), for comparison against rf0.
rfu <- randomForest(TARGET ~. , data = training, ntree = 500, mtry = 10)
pred.rfu <- predict(rfu, newdata = testing)
cmu <- confusionMatrix(pred.rfu, testing$TARGET)
```
# Compare the confusion matrices cmu and cm0, you will see the difference
# Let's look at the ROC curves
```{r}
library(pROC)
# NOTE(review): pred.rf0 and pred.rfu are overwritten here -- they now
# hold class probabilities ([,1] is the probability of the first factor
# level, "0") rather than the class predictions used above.
pred.rf0 <- predict(rf0, newdata = testing, type = "prob")[,1]
pred.rfu <- predict(rfu, newdata = testing, type = "prob")[,1]
# rev(levels(...)) tells roc() which level is the control vs the case.
downsampledROC <- roc(response = testing$TARGET, predictor = pred.rf0,
levels = rev(levels(testing$TARGET)))
unbalancedROC <- roc(response = testing$TARGET, predictor = pred.rfu,
levels = rev(levels(testing$TARGET)))
# Overlay both ROC curves: red = down-sampled, blue = unbalanced.
plot(downsampledROC, col = rgb(1, 0, 0, .5), lwd = 2);
plot(unbalancedROC, col = rgb(0, 0, 1, .5), lwd = 2, add = TRUE)
```
# Area under the ROC curves
```{r}
# Area under each ROC curve -- the headline comparison metric.
auc(downsampledROC)
auc(unbalancedROC)
```
```{r}
# Build the Kaggle submission from predictions on the actual test set.
# Fixes vs the original:
#  - test1$id did not exist (the column is named ID), so the Id column
#    was NULL;
#  - pred.rf0 held probabilities for the VALIDATION split ("testing"),
#    not for test1 (wrong rows, wrong length), and for class "0";
#    predict on test1 and take the probability of TARGET == "1";
#  - the output path was missing a slash ("D:santander/...").
test.prob <- predict(rf0, newdata = test1, type = "prob")[, "1"]
submission <- data.frame(ID = test1$ID, TARGET = test.prob)
write.csv(submission, "D:/santander/submit3.csv", row.names = FALSE)
```
# Kaggle score was 0.500000 using the original Random Forest submission