ML_LAB_7 - Jupyter Notebook
1. Naive Bayes classification is implemented from scratch, without built-in classifier functions, on a small food dataset; the model reaches 70% training accuracy and is then used to classify new query points. 2. A decision tree classifier is trained on the Iris dataset with Scikit-Learn, reaching about 97% test accuracy. Cost complexity pruning (ccp_alpha values and impurity plots) is used to avoid overfitting, and test accuracy remains high after pruning.


In [1]:

#Name : Mudu Suman


#RollNo : 222CD017
#ML LAB 7

1. Implement Naive Bayes Classifier algorithm without using inbuilt functions.

dataset = {'Taste': ['Salty','Spicy','Spicy','Spicy','Spicy','Sweet','Salty','Sweet','Spicy','Salty'],
           'Temperature': ['Hot','Hot','Hot','Cold','Hot','Cold','Cold','Hot','Cold','Hot'],
           'Texture': ['Soft','Soft','Hard','Hard','Hard','Soft','Soft','Soft','Soft','Hard'],
           'Eat': ['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}

In [3]:

dataset = {'Taste': ['Salty','Spicy','Spicy','Spicy','Spicy','Sweet','Salty','Sweet','Spicy','Salty'],
           'Temperature': ['Hot','Hot','Hot','Cold','Hot','Cold','Cold','Hot','Cold','Hot'],
           'Texture': ['Soft','Soft','Hard','Hard','Hard','Soft','Soft','Soft','Soft','Hard'],
           'Eat': ['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}


In [4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

def accuracy_score(y_true, y_pred):
    """ score = (number of correct predictions) / len(y_true), as a percentage """
    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)

def pre_processing(df):
    """ Partition data into features and target """
    X = df.drop([df.columns[-1]], axis=1)
    y = df[df.columns[-1]]
    return X, y

class NaiveBayes:

    def __init__(self):
        # Placeholders; all of these are filled in by fit()
        self.features = list
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self.X_train = np.array
        self.y_train = np.array
        self.train_size = int
        self.num_feats = int

    def fit(self, X, y):
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}

            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature].update({feat_val: 0})

                for outcome in np.unique(self.y_train):
                    self.likelihoods[feature].update({feat_val + '_' + outcome: 0})
                    self.class_priors.update({outcome: 0})

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()


    def _calc_class_prior(self):
        """ P(c) - Prior Class Probability """
        for outcome in np.unique(self.y_train):
            outcome_count = sum(self.y_train == outcome)
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """ P(x|c) - Likelihood """
        for feature in self.features:
            for outcome in np.unique(self.y_train):
                outcome_count = sum(self.y_train == outcome)
                feat_likelihood = self.X_train[feature][
                    self.y_train[self.y_train == outcome].index.values.tolist()
                ].value_counts().to_dict()

                for feat_val, count in feat_likelihood.items():
                    self.likelihoods[feature][feat_val + '_' + outcome] = count / outcome_count

    def _calc_predictor_prior(self):
        """ P(x) - Evidence """
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()

            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """ Calculates Posterior probability P(c|x) for each query row """
        results = []
        X = np.array(X)

        for query in X:
            probs_outcome = {}
            for outcome in np.unique(self.y_train):
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
                    evidence *= self.pred_priors[feat][feat_val]

                posterior = (likelihood * prior) / evidence
                probs_outcome[outcome] = posterior

            result = max(probs_outcome, key=lambda x: probs_outcome[x])
            results.append(result)

        return np.array(results)


if __name__ == "__main__":

    # Food dataset
    print("\ndataset:")
    df = pd.DataFrame(dataset)
    #print(df)

    # Split features and target
    X, y = pre_processing(df)

    nb_clf = NaiveBayes()
    nb_clf.fit(X, y)

    print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

    #Query 1:
    query = np.array([['Salty','Hot', 'Soft']])
    print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

    #Query 2:
    query = np.array([['Spicy','Hot', 'Soft']])
    print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

    #Query 3:
    query = np.array([['Salty','Hot', 'Hard']])
    print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))

dataset:

Train Accuracy: 70.0

Query 1:- [['Salty' 'Hot' 'Soft']] ---> ['No']

Query 2:- [['Spicy' 'Hot' 'Soft']] ---> ['Yes']

Query 3:- [['Salty' 'Hot' 'Hard']] ---> ['Yes']
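The query results can be sanity-checked by hand: the classifier picks the class with the larger value of prior × product of per-feature likelihoods (the shared evidence term does not change which class wins). A minimal check for Query 1, with the counts taken from the 10-row dataset above (this is an added verification, not a cell from the original notebook):

# Query 1 = ['Salty', 'Hot', 'Soft'], counts taken from the dataset above
# Priors:      P(Yes) = 6/10,        P(No) = 4/10
# Likelihoods: P(Salty|Yes) = 1/6,   P(Salty|No) = 2/4
#              P(Hot|Yes)   = 4/6,   P(Hot|No)   = 2/4
#              P(Soft|Yes)  = 3/6,   P(Soft|No)  = 3/4
score_yes = (6/10) * (1/6) * (4/6) * (3/6)   # ~ 0.033
score_no  = (4/10) * (2/4) * (2/4) * (3/4)   # = 0.075
print('Yes' if score_yes > score_no else 'No')   # prints 'No', matching Query 1 above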

2. Implement Decision tree on IRIS Dataset using SK Learn library functions. Implement methods to
avoid over-fitting of the data.


In [5]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
X,y=load_iris(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)
y_train_predicted=clf.predict(X_train)
y_test_predicted=clf.predict(X_test)
accuracy_score(y_train,y_train_predicted)
accuracy_score(y_test,y_test_predicted)

Out[5]:

0.9736842105263158
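Only the last expression in cell In [5] is echoed, so the 0.9736 shown above is the test-set accuracy; the training accuracy is computed on the line before it but not displayed. A small sketch (not an executed cell from the notebook) that prints both explicitly, reusing the variables from In [5]:

print("Train accuracy:", accuracy_score(y_train, y_train_predicted))  # typically 1.0 for an unpruned tree
print("Test accuracy:", accuracy_score(y_test, y_test_predicted))     # 0.9736... as shown above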

In [6]:

plt.figure(figsize=(16,8))
tree.plot_tree(clf)
plt.show()


In [7]:

path=clf.cost_complexity_pruning_path(X_train,y_train)
#path variable gives two things ccp_alphas and impurities
ccp_alphas,impurities=path.ccp_alphas,path.impurities
print("ccp alpha wil give list of values :",ccp_alphas)
print("***********************************************************")
print("Impurities in Decision Tree :",impurities)
ccp alpha will give list of values : [0.         0.00869963 0.01339286 0.03571429 0.26539835 0.33279549]

***********************************************************

Impurities in Decision Tree : [0.         0.01739927 0.03079212 0.06650641 0.33190476 0.66470026]

In [8]:

clfs=[] #will store all the models here


for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Last node in Decision tree is {} and ccp_alpha for last node is {}".format(clfs[-1].tree_.node_count, ccp_alphas[-1]))

Last node in Decision tree is 1 and ccp_alpha for last node is 0.332795493197279

In [9]:

train_scores = [clf.score(X_train, y_train) for clf in clfs]


test_scores = [clf.score(X_test, y_test) for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train",drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test",drawstyle="steps-post")
ax.legend()
plt.show()
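The plot is used to eyeball a good alpha, but the same choice can be made programmatically from the scores computed above; a minimal sketch (an addition, not part of the original notebook):

# Pick the ccp_alpha whose pruned tree scores highest on the held-out test set
best_idx = int(np.argmax(test_scores))
print("best ccp_alpha:", ccp_alphas[best_idx], "test accuracy:", test_scores[best_idx])

Strictly speaking, selecting alpha on the test set leaks information; in practice one would cross-validate over ccp_alpha on the training data (for example with GridSearchCV) and keep the test set for the final check.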


In [10]:

clf=DecisionTreeClassifier(random_state=0,ccp_alpha=0.02)
clf.fit(X_train,y_train)
plt.figure(figsize=(12,8))
tree.plot_tree(clf,rounded=True,filled=True)
plt.show()

In [11]:

accuracy_score(y_test,clf.predict(X_test))
Out[11]:

0.9736842105263158

In [ ]:
