# 🧠 Full Bank Term Deposit Classification Project (with Explanations)
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Explanation:
# - pandas, numpy: Data handling
# - matplotlib, seaborn: Visualization
# - sklearn: Machine Learning tools
# - xgboost: Advanced ensemble model
# 2. Load the dataset
# Explanation:
# - Read the CSV file into a DataFrame.
# - Print the first five rows to get a feel for the columns and their values.
df = pd.read_csv(filepath_or_buffer='bankmarketing.csv')  # Change the path if needed
print(df.head(n=5))
# 3. Preprocessing the data
# Step 3.1: Encode categorical variables
# Explanation:
# - LabelEncoder replaces each text category with an integer code.
# - One encoder per column is kept in `label_encoders` so the integer codes
#   can be inverse-transformed back to the original labels later.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
]
# Create all encoders up front, then fit each one on its own column.
label_encoders = {column: LabelEncoder() for column in categorical_cols}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])
# Step 3.2: Encode the target column ('y')
# Learn the label set first, then overwrite the column with the integer codes.
target_encoder = LabelEncoder()
target_encoder.fit(df['y'])
df['y'] = target_encoder.transform(df['y'])  # 'yes' -> 1, 'no' -> 0
# 4. Split the data into train and test sets
# Explanation:
# - 80% data for training, 20% for testing.
# - stratify=y ensures the same proportion of classes in train and test sets.
# - The split happens BEFORE scaling so that the scaler never sees test rows.
X = df.drop('y', axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3.3: Scale numerical features
# Explanation:
# - StandardScaler standardizes each column to mean = 0, standard deviation = 1.
# - Helps algorithms that are sensitive to feature scaling.
# - The scaler is fitted on the training features only and then applied to
#   both partitions. Fitting it on the full dataset would leak test-set
#   statistics (mean/variance) into training and bias the evaluation.
numerical_cols = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
    'nr.employed',
]
scaler = StandardScaler()
# Work on explicit copies so the column assignments below never write through
# to (or warn about) the frame the partitions were sliced from.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
# 5. Build different classification models
# Explanation:
# - Logistic Regression: Simple baseline model.
# - Random Forest: Ensemble method using decision trees.
# - XGBoost: Gradient boosting model.
# - Every estimator is created with random_state=42.
# The dict maps a display name to an unfitted estimator; insertion order is
# the order in which the models are later trained and reported.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): `use_label_encoder` looks deprecated in newer XGBoost
# releases — confirm against the installed version before relying on it.
models["XGBoost"] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
# 6. Train models and evaluate performance
# Explanation:
# - classification_report prints precision, recall, f1-score, and support.
# - The confusion matrix is drawn as a heatmap of actual vs predicted classes.
for model_name, estimator in models.items():
    print(f"\n==== {model_name} ====")

    # Fit on the training split, then score on the held-out test split.
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print(classification_report(y_test, predictions))

    # One figure per model: rows are actual classes, columns are predictions.
    _, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
    )
    ax.set_title(f'{model_name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
# 📚 Interview Questions on Classification Projects:
"""
1. What is the difference between Logistic Regression and Linear Regression?
2. Why do we need to scale features before training certain models?
3. What is Stratified Sampling? Why do we use it in classification?
4. What are Precision, Recall, and F1-score?
5. What is the importance of a Confusion Matrix?
6. What is Overfitting and how can you prevent it?
7. Why would you choose Random Forest over a simple Decision Tree?
8. What is Gradient Boosting? How is it different from Random Forest?
9. How does XGBoost improve model performance?
10. How would you handle an imbalanced dataset?
11. What metrics would you monitor for a classification model?
12. Explain why feature encoding is needed.
13. What is Label Encoding vs One Hot Encoding?
14. Why would longer call duration affect subscription likelihood?
15. How would you improve the performance of this classification model?
"""
# 🏁 End of Project - Great Job! 🚀