February 18, 2025
1 AP22110010183 - Sanat Kulkarni - Assignment 5
[4]: from sklearn.datasets import make_classification
import pandas as pd
X, y = make_classification(n_samples=500, n_features=10, n_informative=5,
n_redundant=2, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=[f'feature_{i+1}' for i in range(10)])
df['target'] = y
df.to_csv('classification_dataset.csv', index=False)
print("Dataset saved as 'classification_dataset.csv'")
Dataset saved as 'classification_dataset.csv'
[5]: from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_df = pd.DataFrame(X_train, columns=[f'feature_{i+1}' for i in range(10)])
train_df['target'] = y_train
train_df.to_csv('train_dataset.csv', index=False)
test_df = pd.DataFrame(X_test, columns=[f'feature_{i+1}' for i in range(10)])
test_df['target'] = y_test
test_df.to_csv('test_dataset.csv', index=False)
print("Training and testing datasets saved as 'train_dataset.csv' and␣
↪'test_dataset.csv'")
Training and testing datasets saved as 'train_dataset.csv' and
'test_dataset.csv'
[7]: from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
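The scaler is fit on the training split only and then reused to transform the test split, which keeps test-set statistics out of preprocessing. As a minimal sketch (assuming the unscaled splits straight from train_test_split), a scikit-learn Pipeline can bundle the scaling and the classifier so the same discipline is applied automatically:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# fit() learns the scaling statistics from the training data only;
# predict()/score() reuse those statistics on new data.
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))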
[14]: import joblib
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
joblib.dump(knn, 'knn_model.pkl')
print("Model saved as 'knn_model.pkl'")
Model saved as 'knn_model.pkl'
[15]: import joblib
from sklearn.metrics import accuracy_score
knn = joblib.load('knn_model.pkl')
for k in range(1, 21):
    knn.n_neighbors = k
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for k={k}: {accuracy:.4f}")
Accuracy for k=1: 0.8500
Accuracy for k=2: 0.8100
Accuracy for k=3: 0.8600
Accuracy for k=4: 0.8300
Accuracy for k=5: 0.8600
Accuracy for k=6: 0.8500
Accuracy for k=7: 0.8500
Accuracy for k=8: 0.8400
Accuracy for k=9: 0.8500
Accuracy for k=10: 0.8500
Accuracy for k=11: 0.8600
Accuracy for k=12: 0.8600
Accuracy for k=13: 0.8700
Accuracy for k=14: 0.8700
Accuracy for k=15: 0.8800
Accuracy for k=16: 0.8500
Accuracy for k=17: 0.8600
Accuracy for k=18: 0.8500
Accuracy for k=19: 0.8700
Accuracy for k=20: 0.8600
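Reassigning knn.n_neighbors on an already-fitted model happens to work because KNeighborsClassifier is a lazy learner: fit() only stores the training data, and the neighbor count is read at prediction time. The documented route is set_params() followed by a refit; a small sketch of the same sweep (reusing the variables above):

for k in range(1, 21):
    knn.set_params(n_neighbors=k)  # documented way to change a hyperparameter
    knn.fit(X_train, y_train)      # refitting is cheap for KNN; it just stores the data
    print(f"Accuracy for k={k}: {knn.score(X_test, y_test):.4f}")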
[16]: import matplotlib.pyplot as plt
import joblib
knn = joblib.load('knn_model.pkl')
accuracies = []
for k in range(1, 21):
    knn.n_neighbors = k
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
plt.figure(figsize=(10, 6))
plt.plot(range(1, 21), accuracies, marker='o', color='b', linestyle='-', markersize=6)
plt.title('K vs Accuracy for KNN Classifier')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Accuracy')
plt.xticks(range(1, 21))
plt.grid(True)
plt.show()
[21]: import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import pandas as pd
y_pred = knn.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
print(report)
metrics = {
'accuracy': report['accuracy'],
'precision': report['weighted avg']['precision'],
'recall': report['weighted avg']['recall'],
'f1-score': report['weighted avg']['f1-score']
}
metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Score'])
plt.figure(figsize=(8, 6))
plt.bar(metrics_df['Metric'], metrics_df['Score'], color='grey')
plt.title('Classification Metrics')
plt.xlabel('Metric')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
{'0': {'precision': 0.8103448275862069, 'recall': 0.94, 'f1-score':
0.8703703703703703, 'support': 50.0}, '1': {'precision': 0.9285714285714286,
'recall': 0.78, 'f1-score': 0.8478260869565217, 'support': 50.0}, 'accuracy':
0.86, 'macro avg': {'precision': 0.8694581280788177, 'recall': 0.86, 'f1-score':
0.8590982286634461, 'support': 100.0}, 'weighted avg': {'precision':
0.8694581280788176, 'recall': 0.86, 'f1-score': 0.859098228663446, 'support':
100.0}}
[23]: from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
knn_best = KNeighborsClassifier(n_neighbors=15)
knn_best.fit(X_train, y_train)
y_pred_best = knn_best.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best)
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.title('Confusion Matrix for K=15')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()
True Negatives (TN): 46
False Positives (FP): 4
False Negatives (FN): 8
True Positives (TP): 42
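As a quick consistency check, accuracy can be recomputed from these counts (reusing the tn, fp, fn, tp variables above):

# (TN + TP) / total = (46 + 42) / 100 = 0.88, matching the K=15 accuracy above
print((tn + tp) / (tn + fp + fn + tp))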
[24]: from sklearn.model_selection import cross_val_score
import numpy as np
k_values = range(1, 21)
cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(np.mean(scores))
best_k = k_values[np.argmax(cv_scores)]
print(f"Best K value from cross-validation: {best_k}")
plt.figure(figsize=(10, 6))
plt.plot(k_values, cv_scores, marker='o', color='b', linestyle='-', markersize=6)
plt.title('K vs Cross-Validation Accuracy')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Cross-Validation Accuracy')
plt.xticks(k_values)
plt.grid(True)
plt.show()
Best K value from cross-validation: 7
From the K vs Accuracy graph, the best K on the held-out test set is K=15, whereas 5-fold cross-validation on the training data selects K=7.
The two disagree because the test-set curve reflects a single train/test split, while cross-validation averages accuracy over multiple folds of the training data, so it favors a K that generalizes across splits rather than one tuned to this particular test set.
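This fold-averaged selection can also be automated; a minimal sketch using scikit-learn's GridSearchCV (assuming the scaled X_train and y_train from above):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search K = 1..20 with 5-fold cross-validation, mirroring the manual loop above.
param_grid = {'n_neighbors': range(1, 21)}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
search.fit(X_train, y_train)
print(search.best_params_)           # expected to agree with the manual result, K=7
print(f"{search.best_score_:.4f}")   # mean cross-validated accuracy at the best K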