forked from JackyLinllk/ubc_dsci100_assignment
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGroup_Project
More file actions
148 lines (108 loc) · 4.5 KB
/
Group_Project
File metadata and controls
148 lines (108 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
## Load packages: kknn (KNN engine), tidyverse/purrr (data wrangling),
## repr (Jupyter display options), tidymodels (modelling framework),
## themis (recipe steps for class imbalance, e.g. step_upsample).
library(kknn)
library(purrr)
library(tidyverse)
library(repr)
library(tidymodels)
library(themis)
options(repr.matrix.max.rows = 6)
## Fix the RNG seed so upsampling, the train/test split, and CV folds
## are reproducible across runs.
set.seed(1234)
options(repr.plot.height = 5, repr.plot.width = 6)
## Load the user-knowledge data set from the course repository, keep only
## the STG..UNS columns, and encode the class label UNS as a factor.
data_url <- "https://github.com/JackyLinllk/ubc_dsci100_assignment/raw/main/data/Data_User_Modeling_Dataset_full.csv"
knowledge_data <- data_url |>
  read_csv() |>
  select(STG:UNS) |>
  mutate(UNS = as.factor(UNS))
## Report which rows contain missing values, then drop every incomplete row.
## BUG FIX: the original removed only the LAST row
## (knowledge_data[-nrow(knowledge_data), ]), which only works by accident
## when the single NA row happens to be last; filtering on complete.cases()
## removes exactly the incomplete rows wherever they appear.
na_row <- which(!complete.cases(knowledge_data))
print(na_row)
knowledge_data <- knowledge_data[complete.cases(knowledge_data), ]
## Fix a typo in the class labels: "Very Low" and "very_low" denote the same
## category, so collapse them into the single level "very_low".
knowledge_data <- knowledge_data |>
  mutate(UNS = fct_recode(UNS, "very_low" = "Very Low"))
## Class-balance check: count each UNS level and its share of all rows so we
## can see whether the classes need rebalancing before training.
## (Idiom fix: `<-` for assignment instead of `=`, per the tidyverse style
## guide; consistent spacing.)
class_prop <- knowledge_data |>
  group_by(UNS) |>
  summarize(count = n(),
            percentage = count / nrow(knowledge_data))
class_prop
## Rebalance the data by upsampling minority classes until every UNS level
## matches the majority class (over_ratio = 1). skip = FALSE so the step is
## applied when the recipe is baked below.
## Fixes: TRUE/FALSE instead of the reassignable shorthand F, and `<-` for
## assignment instead of `=`.
ups_recipe <- recipe(UNS ~ ., data = knowledge_data) |>
  step_upsample(UNS, over_ratio = 1, skip = FALSE) |>
  prep()
upsampled_knowledge <- bake(ups_recipe, knowledge_data)
## Verify the classes are now balanced.
upsampled_knowledge |>
  group_by(UNS) |>
  summarize(n = n())
## 70/30 stratified train/test split of the balanced data; stratifying on
## UNS keeps the class mix similar in both partitions.
knowledge_data_split <- upsampled_knowledge |>
  initial_split(prop = 0.70, strata = UNS)
knowledge_data_split_test <- testing(knowledge_data_split)
knowledge_data_split_train <- training(knowledge_data_split)
knowledge_data_split_train
## Tune k for a K-nearest-neighbours classifier with 5-fold cross-validation.
## (Idiom fixes: `<-` assignment, descriptive names for the locals formerly
## called `k_value` and `K`.)
## Model spec: rectangular (unweighted) KNN with `neighbors` left to tune().
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")
## Centre and scale all predictors so no variable dominates the distance.
data_recipe <- recipe(UNS ~ ., data = knowledge_data_split_train) |>
  step_center(all_predictors()) |>
  step_scale(all_predictors())
training_vfold <- vfold_cv(knowledge_data_split_train, v = 5, strata = UNS)
## Search odd k from 1 up to sqrt(n) — a common heuristic upper bound.
k_upper <- sqrt(nrow(knowledge_data_split_train))
k_grid <- tibble(neighbors = seq(1, k_upper, 2))
knn_result <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = training_vfold, grid = k_grid) |>
  collect_metrics() |>
  filter(.metric == "accuracy")
## Plot estimated accuracy against the number of neighbours so the best k
## can be read off the curve.
## FIX: corrected the title typo "verses" -> "versus".
cross_val_plot <- ggplot(knn_result, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  ggtitle(label = "KNN Accuracy versus Number of Neighbors")
cross_val_plot
## Refit the final KNN model with the chosen k and evaluate on the test set.
## NOTE(review): neighbors = 1 is hard-coded; presumably it was read off the
## tuning plot above — confirm it still matches the current best k (ideally
## this would come from select_best() on the tuning results).
## (Idiom fix: `<-` for assignment instead of `=`.)
knn_best_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = 1) |>
  set_engine("kknn") |>
  set_mode("classification")
knn_fit <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_best_tune) |>
  fit(knowledge_data_split_train)
## Attach hold-out predictions to the test rows for metric computation.
knowledge_predictions <- knn_fit |>
  predict(knowledge_data_split_test) |>
  bind_cols(knowledge_data_split_test)
knowledge_metrics <- knowledge_predictions |>
  metrics(truth = UNS, estimate = .pred_class) |>
  filter(.metric == "accuracy")
knowledge_conf_mat <- knowledge_predictions |>
  conf_mat(truth = UNS, estimate = .pred_class)
knowledge_conf_mat
## Compute accuracy, macro-averaged precision, and macro-averaged recall from
## the confusion matrix.
## BUG FIXES vs the original:
##  * mean(a, b, c, d) averages only `a` — the extra values are matched to
##    mean()'s `trim`/`na.rm` arguments. The per-class values must be wrapped
##    in c() (here: vectorized with diag()/colSums()/rowSums() instead).
##  * two references to "nowledge_..._very_low" (missing leading "k") were
##    undefined objects and would have errored.
##  * the comment claimed a 5x5 matrix; with four classes it is 4x4.
##  * yardstick's conf_mat table has Prediction as rows and Truth as columns
##    (preserved by the column-major unlist/matrix round trip), so precision
##    uses ROW sums (all predictions of a class) and recall uses COLUMN sums
##    (all true members of a class) — the original had them swapped.
knowledge_matrix <- matrix(unlist(knowledge_conf_mat), nrow = 4)  # 4x4: rows = prediction, cols = truth
knowledge_accuracy <- sum(diag(knowledge_matrix)) / sum(knowledge_matrix)
## Per-class precision = diagonal / row totals; recall = diagonal / column
## totals. Class order follows the factor levels (high, low, middle, very_low).
knowledge_precision <- diag(knowledge_matrix) / rowSums(knowledge_matrix)
knowledge_recall <- diag(knowledge_matrix) / colSums(knowledge_matrix)
average_precision <- mean(knowledge_precision)
average_recall <- mean(knowledge_recall)
summary_prediction <- data.frame(
  accuracy = knowledge_accuracy,
  precision = average_precision,
  average_recall)
summary_prediction