ML Experiments with Datasets

The document outlines a series of experiments involving data analysis and machine learning techniques using Python. It includes tasks such as data loading, visualization, outlier detection, PCA, the Find-S algorithm, k-NN classification, locally weighted regression, linear and polynomial regression, decision tree classification, Naive Bayes classification, and K-Means clustering. Each experiment demonstrates the application of a different algorithm or method on a dataset, along with visualizations and performance metrics.


# Experiment - 1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("Importing done")

# Load the housing dataset
df = pd.read_csv(r"F:\SGBIT\Subjects\2024-2025\Even_Semester\LAB\Machine_Learning_Lab\Datasets\Datasets\housing.csv")
print("Load the data")

df.info()
df.isnull().sum()

# Select only the numerical columns for plotting
numerical_features = df.select_dtypes(include=[np.number]).columns

# Histograms with KDE curves, arranged in a 3 x 3 grid (assumes at most 9 numerical features)
plt.figure(figsize=(15, 20))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[feature].dropna(), bins=30, kde=True, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Box plots for each numerical feature
plt.figure(figsize=(10, 10))  # creates a new 10x10 inch figure for the box plots
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Outlier detection using the 1.5 * IQR rule
print("Outliers Detection:")
for feature in numerical_features:
    Q1, Q3 = df[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = df[(df[feature] < lower) | (df[feature] > upper)]
    print(f"{feature}: {len(outliers)} outliers")

print("\nDataset Summary:")
print(df.describe())
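
A minimal follow-up sketch, assuming the same df and numerical_features as above, showing how the detected outliers could be filtered out before further analysis:

# Hedged sketch: drop rows that fall outside the 1.5 * IQR bounds of any numerical feature
# (rows with NaN in a feature are also dropped by the comparison).
df_clean = df.copy()
for feature in numerical_features:
    Q1, Q3 = df_clean[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[feature] >= lower) & (df_clean[feature] <= upper)]
print(f"Rows before/after outlier removal: {len(df)} / {len(df_clean)}")
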
# Experiment - 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("Importing done")

# Load the California housing dataset
df = pd.read_csv(r"F:\SGBIT\Subjects\2024-2025\Even_Semester\LAB\Machine_Learning_Lab\Datasets\Datasets\california_housing.csv")
print("Load the data")

print("\nMissing Values in Each Column:")
print(df.isnull().sum())

plt.figure(figsize=(10, 8))                                     # sets the figure size: width = 10, height = 8 inches
corr_matrix = df.corr()                                         # correlation matrix for the dataframe
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', fmt='.2f')  # cmap = colormap
plt.title("Feature Correlation Heatmap")
plt.show()

# Pair plot of all features, with KDE curves on the diagonal
sns.pairplot(df, diag_kind='kde')
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
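
As a follow-up, the correlation matrix computed above can be used to rank features against the house-value target. This is a minimal sketch that assumes the CSV's target column is named "Target" (the name used with this dataset in Experiment 7); adjust the name if the file differs:

# Hedged sketch: rank features by absolute correlation with the target column.
target_corr = corr_matrix["Target"].drop("Target").abs().sort_values(ascending=False)
print("Features ranked by |correlation| with Target:")
print(target_corr)
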
# Experiment - 3

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

print("Libraries Imported")

# Load the Iris dataset
df = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\iris_dataset.csv")
print("Data Loaded")
print(df.head())

features = df.iloc[:, 0:4]   # the four measurement columns
target = df["target"]        # the class label (0, 1, 2)

# Standardize the features before PCA
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Reduce from 4 features to 2 principal components
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)

pca_df = pd.DataFrame(data=features_pca, columns=["Principal Component 1", "Principal Component 2"])
pca_df["target"] = target
target_names = {0: "setosa", 1: "versicolor", 2: "virginica"}
pca_df["target_name"] = pca_df["target"].map(target_names)

plt.figure(figsize=(8, 6))
for label, color in zip(pca_df["target_name"].unique(), ["red", "green", "blue"]):
    plt.scatter(
        pca_df.loc[pca_df["target_name"] == label, "Principal Component 1"],
        pca_df.loc[pca_df["target_name"] == label, "Principal Component 2"],
        label=label,
        alpha=0.7,
        color=color
    )
plt.title("PCA on Iris Dataset (4 features to 2 features)", fontsize=14)
plt.xlabel("Principal Component 1", fontsize=12)
plt.ylabel("Principal Component 2", fontsize=12)
plt.legend(title="Species")
plt.grid()
plt.show()
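
A short check that can be added after the transform: PCA exposes how much of the original variance each retained component explains, which indicates how faithful the 2-D projection is.

# How much variance do the two principal components retain?
explained = pca.explained_variance_ratio_
print(f"Explained variance ratio: PC1 = {explained[0]:.3f}, PC2 = {explained[1]:.3f}")
print(f"Total variance retained: {explained.sum():.3f}")
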

# Experiment - 4

import pandas as pd

# Load the training data (last column is the class label)
data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Exp-4\training_data.csv")
print("Load the data")

def find_s_algorithm(data):
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    print("Attributes ", attributes)
    print("Class_Label ", class_label)

    # Start with the most specific hypothesis
    hypothesis = ['a' for _ in attributes]
    print("Initialize Hypothesis ", hypothesis)

    # Set the hypothesis to the first positive instance
    for index, row in data.iterrows():
        if row[class_label] == 'Yes':
            hypothesis = list(row[attributes])
            print("Hypothesis after first positive instance --->", hypothesis)
            break

    # Generalize the hypothesis over all positive instances
    for index, row in data.iterrows():
        if row[class_label] == 'Yes':
            for i, value in enumerate(row[attributes]):
                if hypothesis[i] != value:
                    hypothesis[i] = '?'
    return hypothesis

hypothesis = find_s_algorithm(data)
print("\nThe final hypothesis is:", hypothesis)


# Experiment - 5

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# 100 random values in [0, 1); the first 50 are labeled, the rest are classified
data = np.random.rand(100)
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]

def euclidean_distance(x1, x2):
    # In one dimension the Euclidean distance is just the absolute difference
    return abs(x1 - x2)

def knn_classifier(train_data, train_labels, test_point, k):
    distances = [(euclidean_distance(test_point, train_data[i]), train_labels[i])
                 for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    # Majority vote among the k nearest neighbours
    return Counter(k_nearest_labels).most_common(1)[0][0]

train_data = data[:50]
train_labels = labels
test_data = data[50:]
k_values = [1, 2, 3, 4, 5, 20, 30]

print("--- k-Nearest Neighbors Classification ---")
print("Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x > 0.5 -> Class2)")
print("Testing dataset: Remaining 50 points to be classified\n")

results = {}
for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k)
                         for test_point in test_data]
    results[k] = classified_labels
    for i, label in enumerate(classified_labels, start=51):
        print(f"Point x{i} (value: {test_data[i - 51]:.4f}) is classified as {label}")
    print("\n")

for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]

    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data),
                c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"k-NN Classification Results for k = {k}")
    plt.xlabel("Data Points")
    plt.ylabel("Classification Level")
    plt.legend()
    plt.grid(True)
    plt.show()

print("Classification complete.\n")

# Experiment - 6

import numpy as np
import matplotlib.pyplot as plt

def gaussian_kernel(x, xi, tau):
    # np.exp() gives the Gaussian weight; tau is the bandwidth controlling how local the fit is
    return np.exp(-np.sum((x - xi) ** 2) / (2 * tau ** 2))

def locally_weighted_regression(x, X, y, tau):
    m = X.shape[0]
    weights = np.array([gaussian_kernel(x, X[i], tau) for i in range(m)])
    W = np.diag(weights)
    X_transpose_W = X.T @ W
    # Weighted normal equation: theta = (X^T W X)^-1 X^T W y
    theta = np.linalg.inv(X_transpose_W @ X) @ X_transpose_W @ y
    return x @ theta

np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)
X_bias = np.c_[np.ones(X.shape), X]                # add a bias (intercept) column

x_test = np.linspace(0, 2 * np.pi, 200)
x_test_bias = np.c_[np.ones(x_test.shape), x_test]

tau = 0.5
y_pred = np.array([locally_weighted_regression(xi, X_bias, y, tau) for xi in x_test_bias])

plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})', linewidth=2)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Locally Weighted Regression', fontsize=14)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.show()
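
The bandwidth tau controls how local the fit is: small values follow the noise, large values approach a global straight line. A minimal sketch, reusing the functions and data above with a few illustrative tau values:

# Compare LWR fits for several bandwidths on the same data.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', alpha=0.4, label='Training Data')
for t in [0.1, 0.5, 2.0]:
    preds = np.array([locally_weighted_regression(xi, X_bias, y, t) for xi in x_test_bias])
    plt.plot(x_test, preds, linewidth=2, label=f'tau = {t}')
plt.title('LWR Fits for Different Bandwidths')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
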
# Experiment - 7

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

print("Import Libraries")

def linear_regression_california():
    data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Coding-WithDatasets\Datasets\california_housing.csv")
    X = data[["AveRooms"]]   # single feature: average number of rooms
    y = data["Target"]       # median house value

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Sort the test points so the prediction line plots cleanly
    sort_idx = X_test.values.flatten().argsort()
    X_sorted = X_test.values.flatten()[sort_idx]
    y_sorted = y_pred[sort_idx]

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_sorted, y_sorted, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.show()

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 60)

def polynomial_regression_auto_mpg(degree=2):
    data = pd.read_csv(r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Coding-WithDatasets\Datasets\auto-mpg.csv")
    data = data.dropna(subset=["displacement", "mpg"])
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Polynomial features -> standardization -> linear regression, chained in a pipeline
    poly_model = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)

    sort_idx = X_test.flatten().argsort()
    X_sorted = X_test.flatten()[sort_idx]
    y_sorted = y_pred[sort_idx]

    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_sorted, y_sorted, color="red", label=f"Predicted (Degree {degree})")
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title(f"Polynomial Regression (Degree {degree}) - Auto MPG Dataset")
    plt.legend()
    plt.show()

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Polynomial Regression (Degree {degree}) - Auto MPG Dataset")
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 60)

if __name__ == "__main__":
    print("Demonstrating Linear Regression and Polynomial Regression with Local Datasets\n")
    linear_regression_california()
    polynomial_regression_auto_mpg(degree=2)
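
If a quick comparison across polynomial degrees is wanted, a short loop such as the sketch below (the degree values are only illustrative) can be added to the __main__ block above:

# Re-run the Auto MPG model for a few degrees to compare the fits and metrics.
for d in [1, 2, 3]:
    polynomial_regression_auto_mpg(degree=d)
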
# Experiment - 8

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\Breast Cancer Dataset.csv"
df = pd.read_csv(data_path)
print(df.head())

X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].apply(lambda x: 1 if x == 'M' else 0)   # Malignant = 1, Benign = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42)   # create the classifier instance
clf.fit(X_train, y_train)                       # train the model
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

new_sample = X_test.iloc[0:1]   # .iloc keeps the feature names when selecting the first row
prediction = clf.predict(new_sample)   # predict using the trained model
prediction_class = "Benign" if prediction[0] == 0 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")

plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=X.columns.tolist(), class_names=['Benign', 'Malignant'])
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
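
An unpruned decision tree can overfit the training split. A minimal sketch, using an arbitrary illustrative depth rather than a tuned value, to compare against the full tree:

# Depth-limited tree for comparison; max_depth=3 is illustrative, not tuned.
clf_small = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_small.fit(X_train, y_train)
small_accuracy = accuracy_score(y_test, clf_small.predict(X_test))
print(f"Depth-3 Tree Accuracy: {small_accuracy * 100:.2f}%")
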
# Experiment - 9

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\olivetti_faces_dataset.csv"
df = pd.read_csv(data_path)

X = df.drop(columns=['target']).values   # pixel values for each 64 x 64 face image
y = df['target'].values                  # person identifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')

# Show the first 15 test faces with their true and predicted labels
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.viridis)
    ax.set_title(f"True: {label}, Pred: {prediction}")
    ax.axis('off')
plt.tight_layout()
plt.show()
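
With this many classes the printed confusion matrix is hard to read; a small sketch that renders it as an image instead (one row and column per person present in the test split):

# Visualize the confusion matrix as an image.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 8))
plt.imshow(cm, cmap='Blues')
plt.title('Confusion Matrix - GaussianNB on Olivetti Faces')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar()
plt.show()
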
# Experiment - 10

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

data_path = r"C:\Users\rp038\OneDrive\Documents\BCSL606-ML-Lab\Datasets\Breast Cancer Dataset.csv"
df = pd.read_csv(data_path)

X = df.drop(columns=['diagnosis']).values
y = df['diagnosis'].apply(lambda x: 0 if x == 'M' else 1).values   # convert 'M' to 0, 'B' to 1

# Standardize features so no single feature dominates the distance calculation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Cluster into two groups (k = 2)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_scaled)
y_kmeans = kmeans.labels_

# Project to 2 dimensions with PCA purely for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans, cmap='viridis', s=30)
plt.title('K-Means Clustering on Breast Cancer Dataset (PCA Reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()

print("Confusion Matrix (Cluster vs True Label):")
print(confusion_matrix(y, y_kmeans))
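
K-Means cluster ids are arbitrary (cluster 0 does not have to correspond to 'M'), so the confusion matrix above should be read up to a relabeling. A minimal sketch that tries both assignments and reports the better agreement:

from sklearn.metrics import accuracy_score

# Cluster ids are arbitrary, so check both possible mappings to the diagnosis labels.
agreement = accuracy_score(y, y_kmeans)
agreement_flipped = accuracy_score(y, 1 - y_kmeans)
print(f"Cluster/label agreement: {max(agreement, agreement_flipped) * 100:.2f}%")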
