In [1]:
#Name : Mudu Suman
#RollNo : 222CD017
#ML LAB 7
1. Implement Naive Bayes Classifier algorithm without using inbuilt functions.
In [3]:
dataset = {'Taste':['Salty','Spicy','Spicy','Spicy','Spicy','Sweet','Salty','Sweet','Spicy','Salty'],
           'Temperature':['Hot','Hot','Hot','Cold','Hot','Cold','Cold','Hot','Cold','Hot'],
           'Texture':['Soft','Soft','Hard','Hard','Hard','Soft','Soft','Soft','Soft','Hard'],
           'Eat':['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
def accuracy_score(y_true, y_pred):
    """ accuracy = correct predictions / total predictions * 100 """
    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)

def pre_processing(df):
    """ partitioning data into features and target """
    X = df.drop([df.columns[-1]], axis=1)
    y = df[df.columns[-1]]
    return X, y
class NaiveBayes:

    def __init__(self):
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    def fit(self, X, y):
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}
            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature].update({feat_val: 0})
                for outcome in np.unique(self.y_train):
                    self.likelihoods[feature].update({feat_val + '_' + outcome: 0})
                    self.class_priors.update({outcome: 0})

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()
    def _calc_class_prior(self):
        """ P(c) - Prior Class Probability """
        for outcome in np.unique(self.y_train):
            outcome_count = sum(self.y_train == outcome)
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """ P(x|c) - Likelihood """
        for feature in self.features:
            for outcome in np.unique(self.y_train):
                outcome_count = sum(self.y_train == outcome)
                feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()
                for feat_val, count in feat_likelihood.items():
                    self.likelihoods[feature][feat_val + '_' + outcome] = count / outcome_count

    def _calc_predictor_prior(self):
        """ P(x) - Evidence """
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()
            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """ Calculates Posterior probability P(c|x) """
        results = []
        X = np.array(X)
        for query in X:
            probs_outcome = {}
            for outcome in np.unique(self.y_train):
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1
                for feat, feat_val in zip(self.features, query):
                    likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
                    evidence *= self.pred_priors[feat][feat_val]
                posterior = (likelihood * prior) / evidence
                probs_outcome[outcome] = posterior
            result = max(probs_outcome, key=lambda x: probs_outcome[x])
            results.append(result)
        return np.array(results)
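
# Aside (not part of the original lab): likelihoods initialised to 0 in fit()
# stay 0 for any feature value never seen together with a class, which zeroes
# out the whole posterior (the zero-frequency problem). A common remedy would
# be Laplace (add-one) smoothing in _calc_likelihoods, i.e. using
# (count + 1) / (outcome_count + number_of_distinct_feature_values)
# instead of count / outcome_count.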
if __name__ == "__main__":
    #Food dataset (Taste/Temperature/Texture -> Eat)
    print("\ndataset:")
    df = pd.DataFrame(dataset)
    #print(df)

    #Split features and target
    X, y = pre_processing(df)

    nb_clf = NaiveBayes()
    nb_clf.fit(X, y)

    print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

    #Query 1:
    query = np.array([['Salty','Hot','Soft']])
    print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

    #Query 2:
    query = np.array([['Spicy','Hot','Soft']])
    print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

    #Query 3:
    query = np.array([['Salty','Hot','Hard']])
    print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))
dataset:
Train Accuracy: 70.0
Query 1:- [['Salty' 'Hot' 'Soft']] ---> ['No']
Query 2:- [['Spicy' 'Hot' 'Soft']] ---> ['Yes']
Query 3:- [['Salty' 'Hot' 'Hard']] ---> ['Yes']
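
As a sanity check, the posterior for Query 1 can be worked out by hand from the 10-row dataset above (a minimal sketch; the counts are read directly off the table):

In [ ]:
# Query 1 = (Salty, Hot, Soft); the dataset has 6 'Yes' rows and 4 'No' rows
p_yes, p_no = 6/10, 4/10
like_yes = (1/6) * (4/6) * (3/6)    # P(Salty|Yes) * P(Hot|Yes) * P(Soft|Yes)
like_no  = (2/4) * (2/4) * (3/4)    # P(Salty|No)  * P(Hot|No)  * P(Soft|No)
evidence = (3/10) * (6/10) * (6/10) # P(Salty) * P(Hot) * P(Soft)
print(like_yes * p_yes / evidence)  # ~0.309
print(like_no * p_no / evidence)    # ~0.694 -> 'No', matching Query 1 above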
2. Implement Decision Tree on the IRIS dataset using SK Learn library functions. Implement methods to avoid over-fitting of the data.
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_train_predicted = clf.predict(X_train)
y_test_predicted = clf.predict(X_test)
accuracy_score(y_train, y_train_predicted) # 1.0 here: the unpruned tree fits the training data perfectly (over-fitting)
accuracy_score(y_test, y_test_predicted)   # only this last expression is echoed as Out[5]
Out[5]:
0.9736842105263158
In [6]:
plt.figure(figsize=(16,8))
tree.plot_tree(clf)
plt.show()
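
Cost-complexity pruning below is one way to avoid over-fitting; pre-pruning at fit time is another. A minimal sketch, reusing the split above (the max_depth and min_samples_leaf values are illustrative, not tuned):

In [ ]:
# pre-pruning: cap tree growth up front instead of pruning afterwards
pre_pruned = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=5)
pre_pruned.fit(X_train, y_train)
pre_pruned.score(X_test, y_test)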
In [7]:
path=clf.cost_complexity_pruning_path(X_train,y_train)
#path gives two arrays: ccp_alphas and impurities
ccp_alphas,impurities=path.ccp_alphas,path.impurities
print("ccp alpha will give list of values :",ccp_alphas)
print("***********************************************************")
print("Impurities in Decision Tree :",impurities)
ccp alpha will give list of values : [0. 0.00869963 0.01339286 0.03571429 0.26539835 0.33279549]
***********************************************************
Impurities in Decision Tree : [0. 0.01739927 0.03079212 0.06650641 0.33190476 0.66470026]
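
For reference, minimal cost-complexity pruning scores a subtree T as R_alpha(T) = R(T) + alpha * |leaves(T)|, where R(T) is the total leaf impurity; each value in ccp_alphas is the threshold at which pruning one more subtree becomes worthwhile, and impurities gives the corresponding total leaf impurity.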
In [8]:
clfs=[] #will store all the models here
for ccp_alpha in ccp_alphas:
    clf=DecisionTreeClassifier(random_state=0,ccp_alpha=ccp_alpha)
    clf.fit(X_train,y_train)
    clfs.append(clf)
print("Last node in Decision tree is {} and ccp_alpha for last node is {}".format(clfs[-1].tree_.node_count, ccp_alphas[-1]))
Last node in Decision tree is 1 and ccp_alpha for last node is 0.332795493197279
In [9]:
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train",drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test",drawstyle="steps-post")
ax.legend()
plt.show()
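
Rather than eyeballing the plot, the candidate alpha with the best held-out accuracy can be picked from the scores just computed (a minimal sketch):

In [ ]:
# choose the ccp_alpha that maximizes test-set accuracy
best_idx = int(np.argmax(test_scores))
ccp_alphas[best_idx], test_scores[best_idx]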
In [10]:
clf=DecisionTreeClassifier(random_state=0,ccp_alpha=0.02)
clf.fit(X_train,y_train)
plt.figure(figsize=(12,8))
tree.plot_tree(clf,rounded=True,filled=True)
plt.show()
In [11]:
accuracy_score(y_test,clf.predict(X_test))
Out[11]:
0.9736842105263158