Thanks for visiting codestin.com
Credit goes to www.scribd.com

0% found this document useful (0 votes)
8 views10 pages

Decision Tree

The document outlines the implementation of a decision tree algorithm using R to classify drug types based on various features from a dataset. It includes data loading, preprocessing steps such as outlier detection and replacement, and visualization of data distributions. The model is trained and evaluated, achieving an accuracy of approximately 96.49% on the test set.

Uploaded by

akanaguhari
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views10 pages

Decision Tree

The document outlines the implementation of a decision tree algorithm using R to classify drug types based on various features from a dataset. It includes data loading, preprocessing steps such as outlier detection and replacement, and visualization of data distributions. The model is trained and evaluated, achieving an accuracy of approximately 96.49% on the test set.

Uploaded by

akanaguhari
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 10

DECISION TREE ALGORITHM

24CSEG034

2025-04-06
# Load libraries
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(lattice)
library(rpart)
library(rpart.plot)
library(corrplot)

## corrplot 0.95 loaded

library(RColorBrewer)

library(readr)
# Read the drug dataset with readr::read_csv().
# NOTE(review): the absolute Windows path below is machine-specific, so the
# script only runs where this exact file location exists.
df <- read_csv("C:/Users/prane/Downloads/drug200.csv")

## Rows: 200 Columns: 6


## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Sex, BP, Cholesterol, Drug
## dbl (2): Age, Na_to_K
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Summary: print per-column descriptive statistics for the loaded data
summary(df)

##       Age            Sex                 BP            Cholesterol
##  Min.   :15.00   Length:200         Length:200         Length:200
##  1st Qu.:31.00   Class :character   Class :character   Class :character
##  Median :45.00   Mode  :character   Mode  :character   Mode  :character
##  Mean   :44.31
##  3rd Qu.:58.00
##  Max.   :74.00

## Na_to_K Drug
## Min. : 6.269 Length:200
## 1st Qu.:10.445 Class :character
## Median :13.937 Mode :character
## Mean :16.084
## 3rd Qu.:19.380
## Max. :38.247

# Count the NA entries in every column and show the per-column totals
missing_values <- vapply(df, function(column) sum(is.na(column)), numeric(1))
print(missing_values)

##         Age         Sex          BP Cholesterol     Na_to_K        Drug
##           0           0           0           0           0           0

# Detect outliers: show the Age values that boxplot.stats() reports in `out`
boxplot.stats(df[["Age"]])[["out"]]

## numeric(0)

# Show the Na_to_K values that boxplot.stats() reports as outliers.
# Use the full element name `out` instead of relying on partial matching of `$`.
boxplot.stats(df$Na_to_K)$out

## [1] 33.486 38.247 35.639 33.542 32.922 37.188 34.997 34.686

# Boxplot for all numeric variables
# vapply() always yields a logical mask, unlike sapply(), whose return type
# depends on its input.
numeric_df <- df[, vapply(df, is.numeric, logical(1))]
# stack() reshapes to long form: `values` holds the data, `ind` the source
# column name, which is what the bwplot() formula below refers to.
long_df <- stack(numeric_df)
bwplot(
  values ~ ind,
  data = long_df,
  col = "darkgreen",
  fill = "lightyellow",
  main = "Boxplots for Outlier Detection",
  xlab = "Features",
  ylab = "Values"
)
# Replace outliers with median
#
# Returns `x` with every value outside the fences
# [Q1 - 1.5 * (Q3 - Q1), Q3 + 1.5 * (Q3 - Q1)] overwritten by the median of
# `x`. Quartiles and median are computed with missing values removed, and
# missing values themselves are left untouched.
replace_outliers_with_median <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named `fence_width` rather than `IQR` so stats::IQR() is not shadowed.
  fence_width <- 1.5 * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width
  # which() drops NA comparisons, so only definite outliers are replaced.
  x[which(x < lower | x > upper)] <- median(x, na.rm = TRUE)
  x
}

# vapply() guarantees a logical mask regardless of the shape of `df`.
numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
for (col_name in numeric_cols) {
  df[[col_name]] <- replace_outliers_with_median(df[[col_name]])
}

# Keep the current state of the data under its own name
df_cleaned <- df

# Show the column names
print(colnames(df))

## [1] "Age"         "Sex"         "BP"          "Cholesterol" "Na_to_K"
## [6] "Drug"

# Plots
# Age histogram with one panel per drug.
# The title is kept on a single line: a string literal broken across lines
# would embed a newline in the rendered title.
histogram(
  ~ Age | Drug,
  data = df_cleaned,
  layout = c(3, 2),
  col = "darkorange",
  border = "black",
  main = "Age Distribution by Drug",
  breaks = 20
)

# Na_to_K box-and-whisker plot for each drug
bwplot(
  Drug ~ Na_to_K,
  data = df_cleaned,
  col = "purple",
  fill = "lavender",
  main = "Sodium-Potassium Ratio by Drug",
  xlab = "NaToK",
  ylab = "Drug"
)

# Age against Na_to_K, one scatter panel per drug
xyplot(
  Age ~ Na_to_K | Drug,
  data = df_cleaned,
  col = "firebrick",
  pch = 16,
  cex = 1.2,
  main = "Age vs NaToK by Drug",
  xlab = "NaToK",
  ylab = "Age"
)

# Age box-and-whisker plot for each sex
bwplot(
  Age ~ Sex,
  data = df_cleaned,
  col = c("lightblue", "deeppink"),
  main = "Age Distribution by Sex",
  xlab = "Sex",
  ylab = "Age"
)

# Na_to_K box-and-whisker plot for each sex
bwplot(
  Na_to_K ~ Sex,
  data = df_cleaned,
  col = c("lightblue", "deeppink"),
  main = "Sodium-Potassium Ratio by Sex",
  xlab = "Sex",
  ylab = "NaToK"
)
# Correlation plot
# vapply() always yields a logical mask, unlike sapply(), whose return type
# depends on its input.
num_vars <- df_cleaned[, vapply(df_cleaned, is.numeric, logical(1))]
cor_matrix <- cor(num_vars)
corrplot(
  cor_matrix,
  method = "color",
  col = brewer.pal(8, "PiYG"),
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black",
  number.cex = 0.7,
  title = "Correlation Matrix",
  mar = c(0, 0, 2, 0)
)
# Split data
# Seed the RNG first so the random partition below is reproducible
set.seed(123)
trainIndex <- createDataPartition(
  y = df_cleaned[["Drug"]],
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition form the training set; the rest are held out
trainData <- df_cleaned[trainIndex, ]
testData <- df_cleaned[-trainIndex, ]

# Train decision tree
# Fit a classification tree for Drug on the five predictor columns
drug_model <- rpart(
  formula = Drug ~ Age + Sex + BP + Cholesterol + Na_to_K,
  data = trainData,
  method = "class"
)

# Draw the fitted tree
rpart.plot(
  drug_model,
  main = "Decision Tree for Drug Classification",
  box.col = "lightsteelblue",
  shadow.col = "gray"
)
# Prediction
# Encode the target as a factor, giving the test set the training set's levels
trainData[["Drug"]] <- factor(trainData[["Drug"]])
testData[["Drug"]] <- factor(
  testData[["Drug"]],
  levels = levels(trainData[["Drug"]])
)

# Predict a class label per test row and align its levels with the reference
predictions <- predict(drug_model, newdata = testData, type = "class")
predictions <- factor(predictions, levels = levels(testData[["Drug"]]))

# Accuracy
# Compare predicted labels with the true test labels and print the result
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = testData[["Drug"]]
)
print(conf_matrix)

## Confusion Matrix and Statistics


##
## Reference
## Prediction drugA drugB drugC drugX drugY
## drugA 6 0 0 0 1
## drugB 0 4 0 0 0
## drugC 0 0 4 0 0
## drugX 0 0 0 16 1
## drugY 0 0 0 0 25
##
## Overall Statistics
##
## Accuracy : 0.9649
## 95% CI : (0.8789, 0.9957)
## No Information Rate : 0.4737
## P-Value [Acc > NIR] : 6.477e-16
##
## Kappa : 0.9488
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity                1.0000      1.00000      1.00000       1.0000
## Specificity                0.9804      1.00000      1.00000       0.9756
## Pos Pred Value             0.8571      1.00000      1.00000       0.9412
## Neg Pred Value             1.0000      1.00000      1.00000       1.0000
## Prevalence                 0.1053      0.07018      0.07018       0.2807
## Detection Rate             0.1053      0.07018      0.07018       0.2807
## Detection Prevalence       0.1228      0.07018      0.07018       0.2982
## Balanced Accuracy          0.9902      1.00000      1.00000       0.9878
## Class: drugY
## Sensitivity 0.9259
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9375
## Prevalence 0.4737
## Detection Rate 0.4386
## Detection Prevalence 0.4386
## Balanced Accuracy 0.9630

You might also like