-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSoybean_Dataset_Analysis.r
More file actions
75 lines (59 loc) · 3.23 KB
/
Soybean_Dataset_Analysis.r
File metadata and controls
75 lines (59 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Load required libraries
library(mlbench) # For loading the Soybean dataset
library(caret) # For data preprocessing and near-zero variance identification
library(ggplot2) # For plotting
library(VIM) # For visualizing missing data
library(dplyr) # For data manipulation
# Load the Soybean dataset
data(Soybean) # Load the dataset from the mlbench package
head(Soybean) # Display the first few rows of the dataset
# (a) Investigate the frequency distributions for the categorical predictors
# Identify categorical variables
categorical_vars <- sapply(Soybean, is.factor) # Determine which variables are factors
categorical_data <- Soybean[, categorical_vars] # Subset data to include only categorical predictors
# Save plots of categorical distributions to a PDF file
pdf("categorical_distributions.pdf", width = 12, height = 10) # Open a PDF device for plotting
# Adjust layout based on the number of categorical variables
par(mfrow = c(6, 7))
# Create bar plots for each categorical variable
for (var in names(categorical_data)) {
counts <- table(categorical_data[[var]]) # Get frequency counts for the variable
barplot(counts, main = var, col = "lightblue", las = 2, cex.names = 0.6) # Plot the frequencies
}
# Close the PDF device
dev.off()
# (b) Investigate missing data
# Identify near-zero variance predictors
zero_cols <- nearZeroVar(Soybean) # Find predictors with near-zero variance
zero_cols # Print indices of predictors with near-zero variance
colnames(Soybean)[zero_cols] # Print names of predictors with near-zero variance
# Count missing values per predictor
missing_data <- colSums(is.na(Soybean)) / nrow(Soybean) # Calculate missing proportion for each predictor
missing_data_summary <- data.frame(Predictor = names(missing_data), MissingProportion = missing_data) # Create a summary data frame
print(missing_data_summary) # Print the missing data summary
# Check if missing data is related to classes
missing_data_by_class <- Soybean %>%
mutate(nul = rowSums(is.na(Soybean))) %>% # Calculate number of missing values per row
group_by(Class) %>% # Group by class
summarize(miss = sum(nul)) %>% # Summarize total missing values per class
filter(miss != 0) # Filter to show classes with missing values
missing_data_by_class # Print the missing data by class
# Save missing data proportion plot to a PDF file
pdf("missing_data_proportion.pdf", width = 12, height = 6) # Open a PDF device for plotting
# Create a bar plot of missing data proportions
ggplot(missing_data_summary, aes(x = Predictor, y = MissingProportion)) +
geom_bar(stat = "identity", fill = "#04485f") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(title = "Missing Data Proportion by Predictor", x = "Predictor", y = "Missing Proportion")
# Close the PDF device
dev.off()
pdf("missing_data_pattern.pdf")
# Visualize the missing data pattern using VIM package
missing_data_plot <- aggr(Soybean, col = c('navyblue', 'red'),
numbers = TRUE, sortVars = TRUE,
labels = names(Soybean),
cex.axis = 0.7, gap = 3,
ylab = c("Missing data pattern", "Pattern Count"))
# Save the missing data pattern as PDF
print(missing_data_plot)
dev.off()