Naïve Bayes Classifier Program
import numpy as np
from collections import defaultdict
# Step 1: Prepare a simple dataset
data = [
    ('spam', 'buy cheap amazon products now'),
    ('ham', 'how are you doing today'),
    ('spam', 'cheap watches on sale'),
    ('ham', 'let us meet up tomorrow'),
    ('spam', 'win a million dollars now'),
    ('ham', 'can you call me back later'),
]
# Step 2: Preprocess and count word frequencies
class NaiveBayesClassifier:
    def __init__(self):
        self.word_probs = defaultdict(lambda: defaultdict(float))  # Word probabilities P(w|C)
        self.class_probs = defaultdict(float)  # Class probabilities P(C)
        self.vocab = set()  # Vocabulary

    def train(self, dataset):
        # Count occurrences
        class_word_counts = defaultdict(lambda: defaultdict(int))  # Count of words per class
        class_counts = defaultdict(int)  # Count of each class
        for label, text in dataset:
            words = text.split()
            class_counts[label] += 1
            for word in words:
                self.vocab.add(word)
                class_word_counts[label][word] += 1
        # Calculate class probabilities P(C)
        total_samples = sum(class_counts.values())
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_samples
        # Calculate word probabilities P(w|C)
        for label, words in class_word_counts.items():
            total_words = sum(words.values())
            for word in self.vocab:
                # Additive smoothing (Laplace smoothing)
                self.word_probs[label][word] = (words[word] + 1) / (total_words + len(self.vocab))

    def predict(self, text):
        words = text.split()
        # Calculate log P(C|w1, w2, ..., wn) for each class
        class_scores = {}
        for label in self.class_probs:
            class_scores[label] = np.log(self.class_probs[label])  # Log of prior P(C)
            for word in words:
                if word in self.vocab:
                    class_scores[label] += np.log(
                        self.word_probs[label].get(word, 1 / len(self.vocab)))  # Log of P(w|C)
        # Return the class with the highest score
        return max(class_scores, key=class_scores.get)
# Step 3: Train the classifier
classifier = NaiveBayesClassifier()
classifier.train(data)
# Step 4: Classify new examples
test_texts = [
    'cheap watches available',
    'how are you',
    'call me now to win',
]
for text in test_texts:
    prediction = classifier.predict(text)
    print(f'Text: "{text}" => Predicted class: {prediction}')
KNN
import matplotlib.pyplot as plt
x = [4, 5, 10, 4, 3, 11, 14, 8, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]
classes = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]
plt.scatter(x, y, c=classes)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
data = list(zip(x, y))
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(data, classes)
new_x = 8
new_y = 21
new_point = [(new_x, new_y)]
prediction = knn.predict(new_point)
plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x-1.7, y=new_y-0.7, s=f"new point, class: {prediction[0]}")
plt.show()
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(data, classes)
prediction = knn.predict(new_point)
plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x-1.7, y=new_y-0.7, s=f"new point, class: {prediction[0]}")
plt.show()
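With n_neighbors=1 the prediction follows the single closest point, while n_neighbors=5 takes a majority vote over five neighbors, which is why the two runs can disagree. One way to choose k less arbitrarily is cross-validation; a minimal sketch on the same toy data:
from sklearn.model_selection import cross_val_score
for k in (1, 3, 5):
    # 2-fold CV keeps each fold large enough for this 10-point dataset
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), data, classes, cv=2)
    print(f"k={k}: mean accuracy {scores.mean():.2f}")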
DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
# Load data in X
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.50, random_state=0)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Plot result
# Black is reserved for noise; generate one color per label.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = (0, 0, 0, 1)
    class_member_mask = (labels == k)
    # Core samples drawn larger than border samples
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
plt.title('number of clusters: %d' % n_clusters_)
plt.show()
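DBSCAN's output can also be inspected numerically. A minimal sketch using the metrics module imported above (the silhouette score here simply includes the noise points, which is a simplification):
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
if n_clusters_ > 1:
    # silhouette needs at least two distinct labels
    print('Silhouette Coefficient: %.3f' % metrics.silhouette_score(X, labels))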
Support Vector Machine
# importing scikit learn with make_blobs
from sklearn.datasets import make_blobs
# creating datasets X containing n_samples
# Y containing two classes
X, Y = make_blobs(n_samples=500, centers=2, random_state=0, cluster_std=0.40)
import numpy as np
import matplotlib.pyplot as plt
# plotting scatters
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
plt.show()
# creating a linspace between -1 and 3.5
xfit = np.linspace(-1, 3.5)
# plotting scatter
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
# plot a line between the different sets of data
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none', color='#AAAAAA', alpha=0.4)
plt.xlim(-1, 3.5)
plt.show()
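The hand-drawn margins above motivate the SVM objective: among all separating lines, pick the one with the widest margin. A minimal sketch that fits a linear SVC on the same blobs and highlights its support vectors (the large C value is an arbitrary choice to approximate a hard margin):
from sklearn.svm import SVC
model = SVC(kernel='linear', C=1e10)
model.fit(X, Y)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
# circle the support vectors found by the fit
plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
            s=200, facecolors='none', edgecolors='k')
plt.show()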
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# reading the csv file and extracting the class column into y
x = pd.read_csv(r"C:\...\cancer.csv")
a = np.array(x)
y = a[:, 30]  # classes labelled 0 and 1
# extracting two features
x = np.column_stack((x.malignant, x.benign))
print(x.shape)  # 569 samples and 2 features
print(x, y)
# import support vector classifier
# "Support Vector Classifier"
from sklearn.svm import SVC
clf = SVC(kernel='linear')
# fitting x samples and y classes
clf.fit(x, y)
print(clf.predict([[120, 990]]))
print(clf.predict([[85, 550]]))
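If the cancer.csv file is not available locally, scikit-learn ships a comparable breast-cancer dataset. A minimal sketch using its first two feature columns (the feature choice and the test measurement are for illustration only):
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X2 = cancer.data[:, :2]  # first two features, for a 2-D toy model
y2 = cancer.target       # 0 = malignant, 1 = benign
clf2 = SVC(kernel='linear')
clf2.fit(X2, y2)
print(clf2.predict([[15.0, 20.0]]))  # hypothetical measurement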
CART
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
# Define the features and target variable
features = [
    ["red", "large"],
    ["green", "small"],
    ["red", "small"],
    ["yellow", "large"],
    ["green", "large"],
    ["orange", "large"],
]
target_variable = ["apple", "lime", "strawberry", "banana", "grape", "orange"]
# Flatten the features list for encoding
flattened_features = [item for sublist in features for item in sublist]
# Use a single LabelEncoder for all features and target variable
le = LabelEncoder()
le.fit(flattened_features + target_variable)
# Encode features and target variable
encoded_features = [le.transform(item) for item in features]
encoded_target = le.transform(target_variable)
# Create a CART classifier
clf = DecisionTreeClassifier()
# Train the classifier on the training set
clf.fit(encoded_features, encoded_target)
# Predict the fruit type for a new instance
new_instance = ["red", "large"]
encoded_new_instance = le.transform(new_instance)
predicted_fruit_type = clf.predict([encoded_new_instance])
decoded_predicted_fruit_type = le.inverse_transform(predicted_fruit_type)
print("Predicted fruit type:", decoded_predicted_fruit_type[0])
KMEANS
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=500, n_features=2, centers=3, random_state=23)
fig = plt.figure(0)
plt.grid(True)
plt.scatter(X[:,0],X[:,1])
plt.show()
k=3
clusters = {}
np.random.seed(23)
for idx in range(k):
    center = 2 * (2 * np.random.random((X.shape[1],)) - 1)  # random center in [-2, 2]
    cluster = {
        'center': center,
        'points': []
    }
    clusters[idx] = cluster
print(clusters)
plt.scatter(X[:, 0], X[:, 1])
plt.grid(True)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='*', c='red')
plt.show()
def distance(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))

# Implementing the E-Step: assign each point to its nearest center
def assign_clusters(X, clusters):
    for idx in range(X.shape[0]):
        dist = []
        curr_x = X[idx]
        for i in range(k):
            dis = distance(curr_x, clusters[i]['center'])
            dist.append(dis)
        curr_cluster = np.argmin(dist)
        clusters[curr_cluster]['points'].append(curr_x)
    return clusters
# Implementing the M-Step: recompute each center as the mean of its points
def update_clusters(X, clusters):
    for i in range(k):
        points = np.array(clusters[i]['points'])
        if points.shape[0] > 0:
            new_center = points.mean(axis=0)
            clusters[i]['center'] = new_center
            clusters[i]['points'] = []
    return clusters
def pred_cluster(X, clusters):
    pred = []
    for i in range(X.shape[0]):
        dist = []
        for j in range(k):
            dist.append(distance(X[i], clusters[j]['center']))
        pred.append(np.argmin(dist))
    return pred
clusters = assign_clusters(X,clusters)
clusters = update_clusters(X,clusters)
pred = pred_cluster(X,clusters)
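The two calls above perform only a single E-step/M-step pass. Full K-Means repeats the pair until the centers stop moving; a minimal sketch of that loop (the iteration cap and tolerance are arbitrary choices here):
for _ in range(100):  # cap on the number of iterations
    old_centers = [clusters[i]['center'].copy() for i in range(k)]
    clusters = assign_clusters(X, clusters)
    clusters = update_clusters(X, clusters)
    # stop once no center moves more than a small tolerance
    if max(distance(old_centers[i], clusters[i]['center']) for i in range(k)) < 1e-6:
        break
pred = pred_cluster(X, clusters)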
plt.scatter(X[:, 0], X[:, 1], c=pred)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='^', c='red')
plt.show()