Set 1 – Titanic Dataset (Seaborn)
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Load dataset
df = sns.load_dataset('titanic')
# Preprocessing
df.drop(['deck', 'embark_town', 'alive'], axis=1, inplace=True)
df.dropna(inplace=True)
label_cols = df.select_dtypes(include=['object', 'category']).columns
df[label_cols] = df[label_cols].apply(LabelEncoder().fit_transform)
# Feature selection
X = df.drop('survived', axis=1)
y = df['survived']
X = StandardScaler().fit_transform(X)
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# SVM
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
# Evaluation
print("Random Forest:\n", classification_report(y_test, y_pred_rf))
print("SVM:\n", classification_report(y_test, y_pred_svm))
# Plot
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, ax=ax[0], cmap='Blues')
ax[0].title.set_text('Random Forest')
ConfusionMatrixDisplay.from_estimator(svm, X_test, y_test, ax=ax[1], cmap='Oranges')
ax[1].title.set_text('SVM')
plt.tight_layout()
plt.show()
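As a quick optional follow-up to Set 1 (a sketch, not part of the original exercise), the random forest's feature importances can be printed against the original column names; note that X is a NumPy array after scaling, so the names are taken from the unscaled DataFrame.
# Optional sketch: inspect which features the random forest relied on
# (X is a NumPy array after StandardScaler, so feature names come from the DataFrame)
feature_names = df.drop('survived', axis=1).columns
for name, importance in sorted(zip(feature_names, rf.feature_importances_), key=lambda t: -t[1]):
    print(f"{name}: {importance:.3f}")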
Set 2 – Iris Dataset
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, f1_score
import matplotlib.pyplot as plt
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target
# Step A – Decision Tree on full features
clf = DecisionTreeClassifier()
clf.fit(X, y)
y_pred = clf.predict(X)
print("Full Features:\nPrecision:", precision_score(y, y_pred, average='macro'), "F1:", f1_score(y,
y_pred, average='macro'))
# Step B – PCA reduced features
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)
clf_pca = DecisionTreeClassifier()
clf_pca.fit(X_reduced, y)
y_pred_pca = clf_pca.predict(X_reduced)
print("Reduced Features:\nPrecision:", precision_score(y, y_pred_pca, average='macro'), "F1:",
f1_score(y, y_pred_pca, average='macro'))
# Visualization
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title("Original Features")
plt.subplot(1, 2, 2)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
plt.title("PCA Reduced Features")
plt.tight_layout()
plt.show()
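A small optional addition to Set 2 (a sketch using the pca object fitted in Step B): printing the explained variance ratio shows how much of the original variance the two retained components keep, which puts the full-feature and reduced-feature scores in context.
# Optional sketch: how much variance the two PCA components retain
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())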
Set 3 – Pima Indians Diabetes Dataset
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-
diabetes.data.csv"
col_names = ['Preg', 'Glucose', 'BP', 'SkinThick', 'Insulin', 'BMI', 'DPF', 'Age', 'Outcome']
df = pd.read_csv(url, names=col_names)
# Preprocessing
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Models
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("Random Forest:\n", classification_report(y_test, rf.predict(X_test)))
svm = SVC()
svm.fit(X_train, y_train)
print("SVM:\n", classification_report(y_test, svm.predict(X_test)))
Set 4 – Stock Price Dataset (Linear Regression & Random Forest)
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Load a stock dataset (using Yahoo Finance via pandas_datareader or a CSV file)
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/a10.csv") # Replace
with actual stock data
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
X = df[['month', 'year']]
y = df['value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# Random Forest
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Evaluation
print("Linear Regression R2:", r2_score(y_test, y_pred_lr), "MSE:", mean_squared_error(y_test, y_pred_lr))
print("Random Forest R2:", r2_score(y_test, y_pred_rf), "MSE:", mean_squared_error(y_test, y_pred_rf))
# Plot
plt.plot(y_test.values, label='Actual')
plt.plot(y_pred_lr, label='Linear Regression')
plt.plot(y_pred_rf, label='Random Forest')
plt.legend()
plt.title("Stock Price Prediction")
plt.show()
Set 5 – MNIST Neural Network & KNN
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import mnist
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train_nn, x_test_nn = x_train / 255.0, x_test / 255.0
# Neural Network
model = Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x_train_nn, y_train, epochs=100, validation_split=0.2, verbose=0)
# Accuracy
val_acc = history.history['val_accuracy'][-1]
print("Validation Accuracy after 100 epochs:", val_acc)
# KNN
x_train_flat = x_train.reshape((x_train.shape[0], -1)) / 255.0
x_test_flat = x_test.reshape((x_test.shape[0], -1)) / 255.0
knn = KNeighborsClassifier()
knn.fit(x_train_flat, y_train)
y_pred_knn = knn.predict(x_test_flat)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
Set 6 – Bayesian Belief Network & EM Algorithm
# Bayesian Belief Network using pgmpy
from pgmpy.models import BayesianModel  # note: newer pgmpy releases name this class BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
import pandas as pd
data = pd.DataFrame([
    ['sunny', 'hot', 'high', 'FALSE', 'no'],
    ['sunny', 'hot', 'high', 'TRUE', 'no'],
    ['overcast', 'hot', 'high', 'FALSE', 'yes'],
    ['rainy', 'mild', 'high', 'FALSE', 'yes'],
], columns=['outlook', 'temperature', 'humidity', 'windy', 'play'])
model = BayesianModel([('outlook', 'play'), ('humidity', 'play'), ('windy', 'play')])
model.fit(data, estimator=MaximumLikelihoodEstimator)
infer = VariableElimination(model)
print(infer.map_query(['play'], evidence={'outlook': 'sunny'}))
# EM Clustering
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.60)
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title("EM Clustering")
plt.show()
Set 7 – House Price Dataset (Multiple Algorithms)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
X = df.drop('medv', axis=1)
y = df['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = {
    "Linear": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "SVM": SVR(),
    "RandomForest": RandomForestRegressor()
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results[name] = r2_score(y_test, pred)
plt.bar(results.keys(), results.values())
plt.ylabel("R2 Score")
plt.title("Model Comparison - House Prices")
plt.show()
Set 8 – Flights Dataset from Seaborn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
df = sns.load_dataset("flights")
df['month'] = LabelEncoder().fit_transform(df['month'])  # note: label codes follow alphabetical month order, not calendar order
X = df[['month', 'year']]
y = df['passengers']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
plt.scatter(y_test, y_pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Flights Prediction (Linear Regression)")
plt.show()
Viva Questions – Answer Summary
1. Machine Learning: Field of AI that enables systems to learn from data.
2. Supervised vs Unsupervised: Labeled data vs unlabeled data learning.
3. Classification vs Regression: Discrete labels vs continuous output.
4. Underfitting/Overfitting: Too simple vs too complex models.
5. Feature Selection/Extraction: Choosing vs transforming features.
6. Dimensionality Reduction: Reducing feature space (e.g., PCA).
7. Bias-Variance Tradeoff: Balance between model simplicity and complexity.
8. Entropy/Information Gain: Impurity measures used to choose splits in decision trees (see the sketch after this list).
9. Bagging/Boosting: Ensemble methods to improve performance.
10. Support Vectors: Key data points used in SVM.
11. K-Means Disadvantages: Need for k, poor with noise, non-convex clusters.
12. Core/Border/Noise in DBSCAN: Density-based clustering points.
13. Lazy Learning: Delays learning until prediction (e.g., KNN).
14. Distance-Based ML: KNN, K-means.
15. Activation Functions: ReLU, Sigmoid, Tanh, Leaky ReLU, Softmax (see the sketch after this list).
16. Naive Bayes Applications: Spam filtering, sentiment analysis, text classification.
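Viva item 8 is easier to remember with a small worked example. The following is an illustrative NumPy sketch (the 9-positive/5-negative counts and the split point are made up for the example): it computes the entropy of a toy label array and the information gain of one hypothetical split.
import numpy as np

def entropy(labels):
    # H = -sum(p * log2(p)) over the class proportions
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

y = np.array([1] * 9 + [0] * 5)               # toy node: 9 positive, 5 negative samples
print("Entropy of parent node:", entropy(y))  # about 0.940 bits

left, right = y[:8], y[8:]                    # hypothetical split, chosen only for illustration
gain = entropy(y) - (len(left) / len(y)) * entropy(left) - (len(right) / len(y)) * entropy(right)
print("Information gain of the split:", gain)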
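Likewise, viva item 15 lists the usual activation functions; below is a minimal NumPy sketch of their definitions (framework-independent, with 0.01 assumed as the Leaky ReLU slope).
import numpy as np

def relu(x):
    return np.maximum(0, x)

def leaky_relu(x, alpha=0.01):     # alpha is the assumed negative slope
    return np.where(x > 0, x, alpha * x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    e = np.exp(x - np.max(x))      # subtract max for numerical stability
    return e / e.sum()

x = np.array([-2.0, 0.0, 3.0])
print(relu(x), leaky_relu(x), sigmoid(x), np.tanh(x), softmax(x), sep="\n")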