# Experiment - 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print("Importing done")
df = pd.read_csv(r"F:\SGBIT\Subjects\2024-
2025\Even_Semester\LAB\Machine_Learning_Lab\Datasets\Datasets\housing.csv")
print("Load the data")
df.info()
print(df.isnull().sum())  # count of missing values per column
numerical_features = df.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(15, 20))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[feature].dropna(), bins=30, kde=True, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
plt.figure(figsize=(10, 10))  # Creates a new figure of 10x10 inches for the box plots.
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()
print("Outliers Detection:")
for feature in numerical_features:
Q1, Q3 = df[feature].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outliers = df[(df[feature] < lower) | (df[feature] > upper)]
print(f"{feature}: {len(outliers)} outliers")
print("\nDataset Summary:")
print(df.describe())
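# Optional follow-up (not in the original): a minimal sketch of what is commonly done with the
# outliers located by the IQR rule above - clip every numeric column to its whisker bounds.
# This assumes clipping (rather than dropping rows) is acceptable for the analysis.
capped = df.copy()
for feature in numerical_features:
    Q1, Q3 = capped[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    capped[feature] = capped[feature].clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR)
print("Summary after IQR capping:")
print(capped.describe())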
# Experiment - 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print("Importing done")
df = pd.read_csv(r"F:\SGBIT\Subjects\2024-
2025\Even_Semester\LAB\Machine_Learning_Lab\Datasets\Datasets\california_housin
g.csv")
print("Load the data")
print("\nMissing Values in Each Column:")
print(df.isnull().sum())
plt.figure(figsize=(10, 8))  # sets the figure size to width = 10 and height = 8 inches
corr_matrix = df.corr() # create the correlation matrix for the dataframe
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', fmt='.2f') # cmap-colormap
plt.title("Feature Correlation Heatmap")
plt.show()
sns.pairplot(df, diag_kind='kde')
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
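# Optional (not in the original): a minimal sketch that lists the strongest pairwise correlations
# from corr_matrix numerically, which is easier to read than scanning the heatmap by eye.
corr_pairs = corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool))  # drop self-correlations
strongest = corr_pairs.abs().unstack().dropna().sort_values(ascending=False)
print(strongest.head(10))  # top correlated feature pairs (each pair appears twice)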
# Experiment - 3
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
print("Libraries Imported")
df = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-
Lab\Datasets\iris_dataset.csv")
print("Data Loaded")
print(df.head())
features = df.iloc[:, 0:4]
target = df["target"]
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)
pca_df = pd.DataFrame(data=features_pca, columns=["Principal Component 1", "Principal Component 2"])
pca_df["target"] = target
target_names = {0: "setosa", 1: "versicolor", 2: "virginica"}
pca_df["target_name"] = pca_df["target"].map(target_names)
plt.figure(figsize=(8, 6))
for label, color in zip(pca_df["target_name"].unique(), ["red", "green", "blue"]):
    plt.scatter(
        pca_df.loc[pca_df["target_name"] == label, "Principal Component 1"],
        pca_df.loc[pca_df["target_name"] == label, "Principal Component 2"],
        label=label,
        alpha=0.7,
        color=color
    )
plt.title("PCA on Iris Dataset (4 features to 2 features)", fontsize=14)
plt.xlabel("Principal Component 1", fontsize=12)
plt.ylabel("Principal Component 2", fontsize=12)
plt.legend(title="Species")
plt.grid()
plt.show()
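# Optional (not in the original): report how much of the original variance the two principal
# components retain; explained_variance_ratio_ is standard scikit-learn PCA API.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())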
# Experiment - 4
import pandas as pd
data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Exp-
4\training_data.csv")
print("Load the data")
def find_s_algorithm(data):
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    print("Attributes ", attributes)
    print("Class_Label ", class_label)
    hypothesis = ['a' for _ in attributes]  # start from the most specific hypothesis
    print("Initialize Hypothesis ", hypothesis)
    for index, row in data.iterrows():  # take the first positive example as the initial hypothesis
        if row[class_label] == 'Yes':
            hypothesis = list(row[attributes])
            print("Hypothesis after first positive instance --->", hypothesis)
            break
    for index, row in data.iterrows():  # generalize over all remaining positive examples
        if row[class_label] == 'Yes':
            for i, value in enumerate(row[attributes]):
                if hypothesis[i] != value:
                    hypothesis[i] = '?'
    return hypothesis
hypothesis = find_s_algorithm(data)
print("\nThe final hypothesis is:", hypothesis)
# Experiment - 5
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
data = np.random.rand(100)
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]
def euclidean_distance(x1, x2):
    return abs(x1 - x2)  # for 1-D points the Euclidean distance is just the absolute difference
def knn_classifier(train_data, train_labels, test_point, k):
    distances = [(euclidean_distance(test_point, train_data[i]), train_labels[i]) for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    return Counter(k_nearest_labels).most_common(1)[0][0]
train_data = data[:50]
train_labels = labels
test_data = data[50:]
k_values = [1, 2, 3, 4, 5, 20, 30]
print("--- k-Nearest Neighbors Classification ---")
print("Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x >
0.5 -> Class2)")
print("Testing dataset: Remaining 50 points to be classified\n")
results = {}
for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k) for test_point in test_data]
    results[k] = classified_labels
    for i, label in enumerate(classified_labels, start=51):
        print(f"Point x{i} (value: {test_data[i - 51]:.4f}) is classified as {label}")
    print("\n")
for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]
    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data),
                c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"k-NN Classification Results for k = {k}")
    plt.xlabel("Data Points")
    plt.ylabel("Classification Level")
    plt.legend()
    plt.grid(True)
    plt.show()
print("Classification complete.\n")
# Experiment - 6
import numpy as np
import matplotlib.pyplot as plt
def gaussian_kernel(x, xi, tau):
    return np.exp(-np.sum((x - xi) ** 2) / (2 * tau ** 2))
# np.exp() gives each training point a weight that decays exponentially with its distance from x;
# tau controls how quickly the weight falls off.
def locally_weighted_regression(x, X, y, tau):
    m = X.shape[0]
    weights = np.array([gaussian_kernel(x, X[i], tau) for i in range(m)])
    W = np.diag(weights)
    X_transpose_W = X.T @ W
    theta = np.linalg.inv(X_transpose_W @ X) @ X_transpose_W @ y
    return x @ theta
np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)
X_bias = np.c_[np.ones(X.shape), X]
x_test = np.linspace(0, 2 * np.pi, 200)
x_test_bias = np.c_[np.ones(x_test.shape), x_test]
tau = 0.5
y_pred = np.array([locally_weighted_regression(xi, X_bias, y, tau) for xi in x_test_bias])
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})', linewidth=2)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Locally Weighted Regression', fontsize=14)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.show()
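# Optional note (not in the original): for very small tau, X^T W X can become nearly singular and
# np.linalg.inv may fail or be unstable. A minimal sketch of a more robust variant using the
# pseudo-inverse; otherwise it implements the same weighted normal equation as above.
def locally_weighted_regression_pinv(x, X, y, tau):
    weights = np.array([gaussian_kernel(x, X[i], tau) for i in range(X.shape[0])])
    W = np.diag(weights)
    theta = np.linalg.pinv(X.T @ W @ X) @ X.T @ W @ y  # pseudo-inverse instead of plain inverse
    return x @ theta
# Illustrative usage:
# y_pred_stable = np.array([locally_weighted_regression_pinv(xi, X_bias, y, tau) for xi in x_test_bias])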
# Experiment - 7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
print("Import Libraries")
def linear_regression_california():
    data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Coding-WithDatasets\Datasets\california_housing.csv")
    X = data[["AveRooms"]]
    y = data["Target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    sort_idx = X_test.values.flatten().argsort()
    X_sorted = X_test.values.flatten()[sort_idx]
    y_sorted = y_pred[sort_idx]
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_sorted, y_sorted, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.show()
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 60)
def polynomial_regression_auto_mpg(degree=2):
    data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Coding-WithDatasets\Datasets\auto-mpg.csv")
    data = data.dropna(subset=["displacement", "mpg"])
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    poly_model = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)
    sort_idx = X_test.flatten().argsort()
    X_sorted = X_test.flatten()[sort_idx]
    y_sorted = y_pred[sort_idx]
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_sorted, y_sorted, color="red", label=f"Predicted (Degree {degree})")
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title(f"Polynomial Regression (Degree {degree}) - Auto MPG Dataset")
    plt.legend()
    plt.show()
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Polynomial Regression (Degree {degree}) - Auto MPG Dataset")
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 60)
if __name__ == "__main__":
print("Demonstrating Linear Regression and Polynomial Regression with Local
Datasets\n")
linear_regression_california()
polynomial_regression_auto_mpg(degree=2)
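# Optional (not in the original): comparing a few polynomial degrees side by side makes the
# bias/variance trade-off visible; the degree values below are illustrative choices.
if __name__ == "__main__":
    for d in (1, 3):
        polynomial_regression_auto_mpg(degree=d)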
# Experiment - 8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\Breast
Cancer Dataset.csv"
df = pd.read_csv(data_path)
print(df.head())
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].apply(lambda x: 1 if x == 'M' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42) # Create the classifier instance
clf.fit(X_train, y_train) # Train the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
new_sample = X_test.iloc[0:1]  # Use .iloc to select the first row, preserving the feature names
prediction = clf.predict(new_sample) # Predict using the trained model
prediction_class = "Benign" if prediction == 0 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=X.columns.tolist(),
               class_names=['Benign', 'Malignant'])
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
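# Optional (not in the original): the fitted tree exposes feature_importances_ (standard
# scikit-learn API), which shows which columns drive the splits.
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top 5 most informative features:")
print(importances.head(5))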
# Experiment - 9
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-
Lab\Datasets\olivetti_faces_dataset.csv"
df = pd.read_csv(data_path)
X = df.drop(columns=['target']).values
y = df['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.viridis)
    ax.set_title(f"True: {label}, Pred: {prediction}")
    ax.axis('off')
plt.tight_layout()
plt.show()
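# Optional fallback (not in the original): the Olivetti faces can also be fetched directly from
# scikit-learn instead of the local CSV; the first call downloads the data, so this assumes
# internet access. Uncomment to use:
# from sklearn.datasets import fetch_olivetti_faces
# faces = fetch_olivetti_faces(shuffle=True, random_state=42)
# X, y = faces.data, faces.target  # 400 images, each 64x64 = 4096 pixels, 40 subjects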
# Experiment - 10
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\Breast
Cancer Dataset.csv"
df = pd.read_csv(data_path)
X = df.drop(columns=['diagnosis']).values
y = df['diagnosis'].apply(lambda x: 0 if x == 'M' else 1).values # Convert 'M' to 0, 'B' to 1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_scaled)
y_kmeans = kmeans.labels_
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans, cmap='viridis', s=30)
plt.title('K-Means Clustering on Breast Cancer Dataset (PCA Reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()
print("Confusion Matrix (Cluster vs True Label):")
print(confusion_matrix(y, y_kmeans))
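# Optional (not in the original): K-Means cluster IDs (0/1) are arbitrary and may be flipped
# relative to the diagnosis labels, so the raw confusion matrix can look inverted. A minimal
# sketch that aligns the clusters before scoring:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y, y_kmeans)
acc_aligned = max(acc, 1 - acc)  # flip the cluster labels if that matches the diagnosis better
print(f"Cluster-to-label agreement (after alignment): {acc_aligned * 100:.2f}%")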