DECISION TREE ALGORITHM
24CSEG034
2025-04-06
# Load libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(lattice)
library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.95 loaded
library(RColorBrewer)
library(readr)
# Load the drug dataset.
# The CSV location can be overridden with the DRUG_DATA_PATH environment
# variable so the script is not tied to a single machine; when the variable is
# unset or empty, the original location is used.
data_path <- Sys.getenv("DRUG_DATA_PATH", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/prane/Downloads/drug200.csv"
}
# Fail early with an actionable message instead of a parser error.
if (!file.exists(data_path)) {
  stop(
    "Dataset not found at '", data_path,
    "'. Set the DRUG_DATA_PATH environment variable to the CSV location.",
    call. = FALSE
  )
}
df <- read_csv(data_path)
## Rows: 200 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Sex, BP, Cholesterol, Drug
## dbl (2): Age, Na_to_K
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Descriptive statistics for every column of the loaded data
summary(object = df)
## Age Sex BP Cholesterol
## Min. :15.00 Length:200 Length:200 Length:200
## 1st Qu.:31.00 Class :character Class :character Class :character
## Median :45.00 Mode :character Mode :character Mode :character
## Mean :44.31
## 3rd Qu.:58.00
## Max. :74.00
## Na_to_K Drug
## Min. : 6.269 Length:200
## 1st Qu.:10.445 Class :character
## Median :13.937 Mode :character
## Mean :16.084
## 3rd Qu.:19.380
## Max. :38.247
# Count the NA entries in each column
missing_values <- vapply(
  df,
  function(column) sum(is.na(column)),
  numeric(1)
)
print(missing_values)
## Age Sex BP Cholesterol Na_to_K Drug
## 0 0 0 0 0 0
# Detect outliers
# boxplot.stats() returns a list; its `out` element holds the points lying
# beyond the whiskers. The element name is written in full rather than relying
# on partial matching of `$`.
boxplot.stats(df$Age)$out
## numeric(0)
boxplot.stats(df$Na_to_K)$out
## [1] 33.486 38.247 35.639 33.542 32.922 37.188 34.997 34.686
# Boxplot for all numeric variables
# vapply() guarantees a logical mask (sapply() may change its return type).
numeric_df <- df[, vapply(df, is.numeric, logical(1))]
# Reshape to long form; the plot formula below uses the resulting
# `values` and `ind` columns.
long_df <- stack(numeric_df)
bwplot(
  values ~ ind,
  data = long_df,
  col = "darkgreen",
  fill = "lightyellow",
  main = "Boxplots for Outlier Detection",
  xlab = "Features",
  ylab = "Values"
)
# Replace outliers with median
#
# For every numeric column, values outside [Q1 - k * IQR, Q3 + k * IQR] are
# overwritten with that column's median (computed before any replacement).
# Non-numeric columns are returned untouched.
#
# data: a data frame / tibble.
# k:    fence multiplier applied to the inter-quartile range (default 1.5).
# Returns `data` with the outlying values replaced.
#
# Wrapping the logic in a function keeps the temporaries out of the global
# environment and avoids creating variables named `IQR` and `col`, which mask
# stats::IQR() and base::col().
replace_outliers_with_median <- function(data, k = 1.5) {
  is_numeric_col <- vapply(data, is.numeric, logical(1))
  for (col_name in names(data)[is_numeric_col]) {
    values <- data[[col_name]]
    quartiles <- quantile(values, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    fence <- k * (quartiles[2] - quartiles[1])
    col_median <- median(values, na.rm = TRUE)
    # which() drops NA comparisons, so missing values are left as they are.
    outlier_idx <- which(
      values < quartiles[1] - fence | values > quartiles[2] + fence
    )
    values[outlier_idx] <- col_median
    data[[col_name]] <- values
  }
  data
}

# `df` itself is overwritten (as before), and `df_cleaned` is the name used by
# the plotting and modelling steps.
df <- replace_outliers_with_median(df)
df_cleaned <- df
print(names(df))
## [1] "Age" "Sex" "BP" "Cholesterol" "Na_to_K"
## [6] "Drug"
# Plots
# Exploratory lattice graphics on the cleaned data.

# Age histogram, one panel per drug (3 x 2 panel layout).
# The title is a single-line string; it previously contained a stray line
# break in the middle of "Age Distribution by Drug".
histogram(
  ~ Age | Drug,
  data = df_cleaned,
  layout = c(3, 2),
  col = "darkorange",
  border = "black",
  main = "Age Distribution by Drug",
  breaks = 20
)

# Sodium-to-potassium ratio for each drug (horizontal boxes).
bwplot(
  Drug ~ Na_to_K,
  data = df_cleaned,
  col = "purple",
  fill = "lavender",
  main = "Sodium-Potassium Ratio by Drug",
  xlab = "NaToK",
  ylab = "Drug"
)

# Age against sodium-to-potassium ratio, one panel per drug.
xyplot(
  Age ~ Na_to_K | Drug,
  data = df_cleaned,
  col = "firebrick",
  pch = 16,
  cex = 1.2,
  main = "Age vs NaToK by Drug",
  xlab = "NaToK",
  ylab = "Age"
)

# Age by sex.
bwplot(
  Age ~ Sex,
  data = df_cleaned,
  col = c("lightblue", "deeppink"),
  main = "Age Distribution by Sex",
  xlab = "Sex",
  ylab = "Age"
)

# Sodium-to-potassium ratio by sex.
bwplot(
  Na_to_K ~ Sex,
  data = df_cleaned,
  col = c("lightblue", "deeppink"),
  main = "Sodium-Potassium Ratio by Sex",
  xlab = "Sex",
  ylab = "NaToK"
)
# Correlation plot
# Correlations among the numeric columns of the cleaned data.
# vapply() guarantees a logical mask (sapply() may change its return type).
num_vars <- df_cleaned[, vapply(df_cleaned, is.numeric, logical(1))]
cor_matrix <- cor(num_vars)
corrplot(
  cor_matrix,
  method = "color",
  col = brewer.pal(8, "PiYG"),
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black",  # print the coefficient inside each cell
  number.cex = 0.7,
  title = "Correlation Matrix",
  mar = c(0, 0, 2, 0)     # top margin so the title is not clipped
)
# Split data
# Seeded 70/30 train/test partition built from the Drug outcome column.
set.seed(seed = 123)
trainIndex <- createDataPartition(
  y = df_cleaned[["Drug"]],
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition form the training set; the rest are held out.
trainData <- df_cleaned[trainIndex, ]
testData <- df_cleaned[-trainIndex, ]
# Train decision tree
# Classification tree predicting Drug from the five patient attributes.
drug_model <- rpart(
  formula = Drug ~ Age + Sex + BP + Cholesterol + Na_to_K,
  data = trainData,
  method = "class"
)
# Draw the fitted tree.
rpart.plot(
  drug_model,
  main = "Decision Tree for Drug Classification",
  box.col = "lightsteelblue",
  shadow.col = "gray"
)
# Prediction
# Encode the outcome as a factor; the training labels define the level set and
# the test labels are encoded against that same set.
trainData[["Drug"]] <- factor(trainData[["Drug"]])
testData[["Drug"]] <- factor(
  testData[["Drug"]],
  levels = levels(trainData[["Drug"]])
)
# Predict class labels for the held-out rows and align their levels with the
# reference labels.
predictions <- factor(
  predict(drug_model, newdata = testData, type = "class"),
  levels = levels(testData[["Drug"]])
)
# Accuracy
# Confusion matrix and derived statistics: predicted vs. actual test labels.
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = testData$Drug
)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction drugA drugB drugC drugX drugY
## drugA 6 0 0 0 1
## drugB 0 4 0 0 0
## drugC 0 0 4 0 0
## drugX 0 0 0 16 1
## drugY 0 0 0 0 25
##
## Overall Statistics
##
## Accuracy : 0.9649
## 95% CI : (0.8789, 0.9957)
## No Information Rate : 0.4737
## P-Value [Acc > NIR] : 6.477e-16
##
## Kappa : 0.9488
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity                1.0000      1.00000      1.00000       1.0000
## Specificity                0.9804      1.00000      1.00000       0.9756
## Pos Pred Value             0.8571      1.00000      1.00000       0.9412
## Neg Pred Value             1.0000      1.00000      1.00000       1.0000
## Prevalence                 0.1053      0.07018      0.07018       0.2807
## Detection Rate             0.1053      0.07018      0.07018       0.2807
## Detection Prevalence       0.1228      0.07018      0.07018       0.2982
## Balanced Accuracy          0.9902      1.00000      1.00000       0.9878
## Class: drugY
## Sensitivity 0.9259
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9375
## Prevalence 0.4737
## Detection Rate 0.4386
## Detection Prevalence 0.4386
## Balanced Accuracy 0.9630