Recognizing a road sign with kNN
# Load the 'class' package
library(class)
# Create a vector of labels
sign_types <- signs$sign_type
# Classify the next sign observed
knn(train = signs[-1], test = next_sign, cl = sign_types)
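Because kNN votes by distance, predictors should share a common scale. The sign data's color intensities are already all on 0-255, so the exercise skips rescaling, but a min-max normalization sketch would look like this (the normalize helper is illustrative, not part of the course code):
normalize <- function(x) (x - min(x)) / (max(x) - min(x))
signs_norm <- as.data.frame(lapply(signs[-1], normalize))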
Exploring the traffic sign dataset
# Examine the structure of the signs dataset
str(signs)
# Count the number of signs of each type
table(signs$sign_type)
# Check r10's average red level by sign type
aggregate(r10 ~ sign_type, data = signs, mean)
Classifying a collection of road signs
# Use kNN to identify the test road signs
sign_types <- signs$sign_type
signs_pred <- knn(train = signs[-1], test = test_signs[-1], cl = sign_types)
# Create a confusion matrix of the predicted versus actual values
signs_actual <- test_signs$sign_type
table(signs_pred, signs_actual)
# Compute the accuracy
mean(signs_pred == signs_actual)
Testing other 'k' values
# Compute the accuracy of the baseline model (default k = 1)
k_1 <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types)
mean(signs_actual == k_1)
# Modify the above to set k = 7
k_7 <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types, k = 7)
mean(signs_actual == k_7)
# Set k = 15 and compare to the above
k_15 <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types, k = 15)
mean(signs_actual == k_15)
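Rather than fitting each k by hand, a small loop can sweep several candidates in one pass; this sketch assumes the same signs, signs_test, sign_types, and signs_actual objects used above:
for (k in c(1, 5, 7, 11, 15)) {
  pred <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types, k = k)
  cat("k =", k, "accuracy =", mean(signs_actual == pred), "\n")
}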
Seeing how the neighbors voted
# Use the prob parameter to get the proportion of votes for the winning class
sign_pred <- knn(train = signs[-1], test = signs_test[-1],
                 cl = sign_types, k = 7, prob = TRUE)
# Get the "prob" attribute from the predicted classes
sign_prob <- attr(sign_pred, "prob")
# Examine the first several predictions
head(sign_pred)
# Examine the proportion of votes for the winning class
head(sign_prob)
Computing probabilities
# Compute P(A)
p_A <- nrow(subset(where9am, location == "office")) / nrow(where9am)
# Compute P(B)
p_B <- nrow(subset(where9am, daytype == "weekday")) / nrow(where9am)
# Compute the observed P(A and B)
p_AB <- nrow(subset(where9am, location == "office" & daytype == "weekday")) /
  nrow(where9am)
# Compute P(A | B) and print its value
p_A_given_B <- p_AB / p_B
p_A_given_B
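As a sanity check, if location and daytype were independent, P(A and B) would equal P(A) * P(B); comparing the two shows how much knowing it is a weekday actually changes the picture:
# Under independence these two values would match; a gap means daytype is informative
p_A * p_B
p_AB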
A simple Naive Bayes location model
# Load the naivebayes package
library(naivebayes)
# Build the location prediction model
locmodel <- naive_bayes(location ~ daytype, data = where9am)
# Predict Thursday's 9am location
predict(locmodel, thursday9am)
# Predict Saturday's 9am location
predict(locmodel, saturday9am)
Examining "raw" probabilities
# The 'naivebayes' package is loaded into the workspace
# and the Naive Bayes 'locmodel' has been built
# Examine the location prediction model
locmodel
# Obtain the predicted probabilities for Thursday at 9am
predict(locmodel, thursday9am, type = "prob")
# Obtain the predicted probabilities for Saturday at 9am
predict(locmodel, saturday9am, type = "prob")
A more sophisticated location model
# The 'naivebayes' package is loaded into the workspace already
# Build a NB model of location
locmodel <- naive_bayes(location ~ daytype + hourtype, data = locations)
# Predict Brett's location on a weekday afternoon
predict(locmodel, weekday_afternoon)
# Predict Brett's location on a weekday evening
predict(locmodel, weekday_evening)
Preparing for unforeseen circumstances
# The 'naivebayes' package is loaded into the workspace already
# The Naive Bayes location model (locmodel) has already been built
# Observe the predicted probabilities for a weekend afternoon
predict(locmodel, weekend_afternoon, type = "prob")
# Build a new model using the Laplace correction
locmodel2 <- naive_bayes(location ~ daytype + hourtype,
                         data = locations, laplace = 1)
# Observe the new predicted probabilities for a weekend afternoon
predict(locmodel2, weekend_afternoon, type = "prob")
Building simple logistic regression models
# Examine the dataset to identify potential independent variables
str(donors)
# Explore the dependent variable
table(donors$donated)
# Build the donation model
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans,
                      data = donors, family = "binomial")
# Summarize the model results
summary(donation_model)
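glm() reports coefficients on the log-odds scale; exponentiating them gives odds ratios, which are often easier to read (values above 1 raise the odds of donating):
# Convert log-odds coefficients to odds ratios
exp(coef(donation_model))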
Making a binary prediction
# Estimate the donation probability
donors$donation_prob <- predict(donation_model, type = "response")
# Find the donation probability of the average prospect
mean(donors$donated)
# Predict a donation if probability of donation is greater than average
donors$donation_pred <- ifelse(donors$donation_prob > 0.0504, 1, 0)
# Calculate the model's accuracy
mean(donors$donated == donors$donation_pred)
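Accuracy alone is misleading here because donors are rare: a do-nothing baseline that predicts 0 for everyone scores nearly as well, which is why the ROC analysis below matters:
# Baseline accuracy of always predicting "no donation"
mean(donors$donated == 0)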
Calculating ROC Curves and AUC
# Load the pROC package
library(pROC)
# Create a ROC curve
ROC <- roc(donors$donated, donors$donation_prob)
# Plot the ROC curve
plot(ROC, col = "blue")
# Calculate the area under the curve (AUC)
auc(ROC)
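pROC can also suggest a classification threshold from the curve; coords() with "best" returns the point balancing sensitivity and specificity:
# Find the threshold at the "best" point on the ROC curve
coords(ROC, "best", ret = c("threshold", "sensitivity", "specificity"))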
Coding categorical features
# Convert the wealth rating to a factor
donors$wealth_levels <- factor(donors$wealth_rating, levels = c(0, 1, 2, 3),
                               labels = c("Unknown", "Low", "Medium", "High"))
# Use relevel() to change reference category
donors$wealth_levels <- relevel(donors$wealth_levels, ref = "Medium")
# See how our factor coding impacts the model
summary(glm(donated ~ wealth_levels, data = donors, family = "binomial"))
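To see exactly how the factor is dummy-coded (Medium is now the omitted reference level), inspect the model matrix:
# Each non-reference level gets its own 0/1 indicator column
head(model.matrix(~ wealth_levels, data = donors))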
Handling missing data
# Find the average age among non-missing values
summary(donors$age)
# Impute missing age values with the mean age
donors$imputed_age <- ifelse(is.na(donors$age),
                             round(mean(donors$age, na.rm = TRUE), 2),
                             donors$age)
# Create missing value indicator for age
donors$missing_age <- ifelse(is.na(donors$age), 1, 0)
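Both new columns can then enter the model together, so "age unknown" gets its own coefficient instead of biasing the age effect; a minimal sketch:
# Model donation using the imputed age plus the missingness indicator
summary(glm(donated ~ imputed_age + missing_age, data = donors, family = "binomial"))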
Building a more sophisticated model
# Build a recency, frequency, and money (RFM) model
rfm_model <- glm(donated ~ recency * frequency + money,
                 data = donors, family = "binomial")
# Summarize the RFM model to see how the parameters were coded
summary(rfm_model)
# Compute predicted probabilities for the RFM model
rfm_prob <- predict(rfm_model, type = "response")
# Plot the ROC curve for the new model
library(pROC)
ROC <- roc(donors$donated, rfm_prob)
plot(ROC, col = "red")
auc(ROC)
Building a stepwise regression model
# Specify a null model with no predictors
null_model <- glm(donated ~ 1, data = donors, family = "binomial")
# Specify the full model using all of the potential predictors
full_model <- glm(donated ~ ., data = donors, family = "binomial")
# Use a forward stepwise algorithm to build a parsimonious model
step_model <- step(null_model,
                   scope = list(lower = null_model, upper = full_model),
                   direction = "forward")
# Estimate the stepwise donation probability
step_prob <- predict(step_model, type = "response")
# Plot the ROC of the stepwise model
library(pROC)
ROC <- roc(donors$donated, step_prob)
plot(ROC, col = "red")
auc(ROC)
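step() supports the reverse strategy as well; a sketch of backward elimination starting from the full model (same objects as above, and potentially slow when there are many predictors):
# Drop predictors one at a time until AIC stops improving
backward_model <- step(full_model, direction = "backward")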
Building a simple decision tree
# Load the rpart package
library(rpart)
# Build a lending model predicting loan outcome from loan amount and credit score
loan_model <- rpart(outcome ~ loan_amount + credit_score, data = loans,
                    method = "class", control = rpart.control(cp = 0))
# Make a prediction for someone with good credit
predict(loan_model, good_credit, type = "class")
# Make a prediction for someone with bad credit
predict(loan_model, bad_credit, type = "class")
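rpart can also return class probabilities rather than hard labels:
# Probability of each outcome for the good-credit applicant
predict(loan_model, good_credit, type = "prob")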
Visualizing classification trees
# Examine the loan_model object
loan_model
# Load the rpart.plot package
library(rpart.plot)
# Plot the loan_model with default settings
rpart.plot(loan_model)
# Plot the loan_model with customized settings
rpart.plot(loan_model, type = 3, box.palette = c("red", "green"),
           fallen.leaves = TRUE)
Creating random test datasets
# Determine the number of rows for training
nrow(loans) * 0.75
# Create a random sample of row IDs
sample_rows <- sample(nrow(loans), nrow(loans) * 0.75)
# Create the training dataset
loans_train <- loans[sample_rows, ]
# Create the test dataset
loans_test <- loans[-sample_rows, ]
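sample() is random, so for a reproducible split, fix the RNG seed before drawing the row IDs (the seed value here is arbitrary):
set.seed(42)
sample_rows <- sample(nrow(loans), nrow(loans) * 0.75)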
Building and evaluating a larger tree
# Grow a tree using all of the available applicant data
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class",
                    control = rpart.control(cp = 0))
# Make predictions on the test dataset
loans_test$pred <- predict(loan_model, loans_test, type = "class")
# Examine the confusion matrix
table(loans_test$pred, loans_test$outcome)
# Compute the accuracy on the test dataset
mean(loans_test$pred == loans_test$outcome)
Preventing overgrown trees
# Grow a tree with maxdepth of 6
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class",
                    control = rpart.control(cp = 0, maxdepth = 6))
# Make a class prediction on the test set
loans_test$pred <- predict(loan_model, loans_test, type = "class")
# Compute the accuracy of the simpler tree
mean(loans_test$pred == loans_test$outcome)
# Swap maxdepth for a minimum split of 500
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class",
                    control = rpart.control(cp = 0, minsplit = 500))
# Run this. How does the accuracy change?
loans_test$pred <- predict(loan_model, loans_test, type = "class")
mean(loans_test$pred == loans_test$outcome)
Creating a nicely pruned tree
# Grow an overly complex tree
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class",
                    control = rpart.control(cp = 0))
# Examine the complexity plot
plotcp(loan_model)
# Prune the tree
loan_model_pruned <- prune(loan_model, cp = 0.0014)
# Compute the accuracy of the pruned tree
loans_test$pred <- predict(loan_model_pruned, loans_test, type = "class")
mean(loans_test$pred == loans_test$outcome)
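Instead of reading cp off the plot, the tree's cptable can supply it automatically; this sketch picks the value with the lowest cross-validated error:
# Choose the cp that minimizes the cross-validated error (xerror)
best_cp <- loan_model$cptable[which.min(loan_model$cptable[, "xerror"]), "CP"]
loan_model_pruned <- prune(loan_model, cp = best_cp)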
Building a random forest model
# Load the randomForest package
library(randomForest)
# Build a random forest model
loan_model <- randomForest(outcome ~ ., data = loans_train)
# Compute the accuracy of the random forest
loans_test$pred <- predict(loan_model, loans_test)
mean(loans_test$pred == loans_test$outcome)
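randomForest also tracks variable importance; a quick look shows which applicant attributes the ensemble relied on most:
# Plot mean decrease in Gini impurity per predictor
varImpPlot(loan_model)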