1 – Load Dataset
We start by loading the UCI Bank Marketing dataset. The head() method gives us a quick look at the first few rows, while isnull().sum() counts missing values for each column. This initial inspection tells us how complete the data is and whether imputation will be needed.
import pandas as pd

# The UCI bank dataset uses ';' as its field separator
df = pd.read_csv("bank.csv", sep=";")
df.head()
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no
1   33     services  married  secondary      no     4789     yes  yes
2   35   management   single   tertiary      no     1350     yes   no
3   30   management  married   tertiary      no     1476     yes  yes
4   59  blue-collar  married  secondary      no        0     yes   no

    contact  day month  duration  campaign  pdays  previous poutcome   y
0  cellular   19   oct        79         1     -1         0  unknown  no
1  cellular   11   may       220         1    339         4  failure  no
2  cellular   16   apr       185         1    330         1  failure  no
3   unknown    3   jun       199         4     -1         0  unknown  no
4   unknown    5   may       226         1     -1         0  unknown  no
df.isnull().sum()
age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
y 0
dtype: int64
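Beyond the null counts, df.info() reports row counts and column dtypes in one call; a quick complementary check on the same DataFrame:

# Structural overview: number of rows/columns and the dtype of each column
print(df.shape)
df.info()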
df.columns
Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_married', 'marital_single',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'contact_telephone', 'contact_unknown', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'y_num'],
      dtype='object')
2 – Handle Missing Values
Missing values can bias our analysis and break machine learning models.
Numeric columns: we fill with the median, which is robust to outliers (a short sketch at the end of this step illustrates why).
Categorical columns: we fill with the mode (most frequent value).
After this step, all missing values should be eliminated.
import numpy as np

# Fill numeric with median
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical with mode
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode()[0])
df.isnull().sum()
age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
y 0
dtype: int64
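To see why the median is the safer fill here, a minimal sketch with made-up numbers (not taken from the dataset) comparing mean and median on a series containing one extreme value:

import pandas as pd
import numpy as np

# Hypothetical balances: one extreme value plus a missing entry
s = pd.Series([100, 120, 110, 105, np.nan, 50000])
print(s.mean())    # 10087.0 -- dragged upward by the outlier
print(s.median())  # 110.0 -- unaffected by the outlier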
3 – Boxplots Before Outlier Treatment
Boxplots are used to visually detect outliers.
Points lying far outside the "whiskers" are likely extreme values.
Outliers can distort the mean and variance, which impacts models such as linear regression; a quick numeric illustration follows the plotting code.
import matplotlib.pyplot as plt
import seaborn as sns

numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

fig, axes = plt.subplots(nrows=len(numeric_cols)//3 + 1, ncols=3, figsize=(15, 8))
axes = axes.flatten()
for i, col in enumerate(numeric_cols):
    sns.boxplot(x=df[col], ax=axes[i])
    axes[i].set_title(col)
# Hide any leftover empty subplots in the grid
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()
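As a quick numeric illustration of the distortion (toy values, not from the dataset), note how a single extreme point shifts both the mean and the variance:

import numpy as np

x = np.array([10, 11, 12, 13, 14])
x_out = np.append(x, 1000)  # the same data plus one extreme value

print(x.mean(), x.var())          # 12.0 2.0
print(x_out.mean(), x_out.var())  # ~176.7 ~135577 -- both explode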
4 – Treat Outliers
We apply the IQR method to detect outliers:
Calculate Q1 (25th percentile) and Q3 (75th percentile), and let IQR = Q3 - Q1.
Any value below Q1 - 1.5 × IQR or above Q3 + 1.5 × IQR is considered an outlier.
Outliers are replaced with the median to minimize skewness. A worked example on toy values follows the code.
for col in numeric_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    median_val = df[col].median()
    df[col] = np.where((df[col] < lower) | (df[col] > upper), median_val, df[col])
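A worked example of the fences on toy values (quartiles computed with NumPy's default linear interpolation):

import numpy as np

vals = np.array([1, 2, 3, 4, 100])
q1, q3 = np.percentile(vals, [25, 75])          # 2.0 and 4.0
iqr = q3 - q1                                   # 2.0
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # -1.0 and 7.0
print((vals < lower) | (vals > upper))          # only 100 is flagged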
5 – Encode Categorical Variables
Machine learning models require numerical input:
Binary categorical features: label encoding (0/1).
Multi-class features: one-hot encoding, which creates dummy variables.
drop_first=True prevents multicollinearity by removing one redundant category; a small demo follows the code.
from sklearn.preprocessing import LabelEncoder

categorical_cols = df.select_dtypes(include='object').columns.tolist()
label_enc = LabelEncoder()
for col in categorical_cols:
    if df[col].nunique() == 2:
        df[col] = label_enc.fit_transform(df[col])
    else:
        df = pd.get_dummies(df, columns=[col], drop_first=True)
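A minimal sketch of what drop_first=True changes, using a hypothetical three-category column:

import pandas as pd

toy = pd.DataFrame({'marital': ['married', 'single', 'divorced']})

# Without drop_first: three dummies that always sum to 1 (redundant)
print(pd.get_dummies(toy, columns=['marital']))

# With drop_first: 'divorced' becomes the implicit baseline category
print(pd.get_dummies(toy, columns=['marital'], drop_first=True))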
6 – Plot Distributions
Histograms with KDE curves show:
Data spread (range, central tendency)
Skewness (positive or negative)
Whether transformations may be needed to normalize the data; a skewness check follows the plotting code.
fig, axes = plt.subplots(nrows=len(numeric_cols)//3 + 1, ncols=3, figsize=(15, 8))
axes = axes.flatten()
for i, col in enumerate(numeric_cols):
    sns.histplot(df[col], kde=True, ax=axes[i])
    axes[i].set_title(f"{col} Distribution")
# Hide any leftover empty subplots in the grid
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()
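To quantify skewness rather than judging it by eye, a sketch using pandas' skew() on the balance column; since balances can be negative, the values are clipped at zero before applying log1p, one common transform for right-skewed data:

import numpy as np

print(df['balance'].skew())  # > 0 indicates a right-skewed tail

# log1p requires values > -1, so clip negatives to 0 for this check
print(np.log1p(df['balance'].clip(lower=0)).skew())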
# 1. Target Variable Distribution
import seaborn as sns
import matplotlib.pyplot as plt

# Assign x to hue and set legend=False to avoid seaborn's
# palette-without-hue deprecation warning
sns.countplot(x='y', hue='y', data=df, palette='viridis', legend=False)
plt.title('Target Variable Distribution (y)')
plt.xlabel('Subscribed to Term Deposit')
plt.ylabel('Count')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Age distribution of subscribers vs non-subscribers
plt.figure(figsize=(8, 5))
sns.kdeplot(data=df, x='age', hue='y', fill=True)
plt.title('Age Distribution by Subscription Status')
plt.show()

plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='age', hue='y', kde=True, palette='coolwarm', element='step')
plt.title('Age Distribution by Subscription Status')
plt.show()
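To put numbers on the age plots, a hedged sketch of the subscription rate per age band (the bin edges are arbitrary choices; y is assumed to be 0/1-encoded after Step 5, so its mean is the subscription rate):

# Subscription rate per age band; y assumed 0/1-encoded (Step 5)
age_bands = pd.cut(df['age'], bins=[18, 30, 40, 50, 60, 100])
print(df.groupby(age_bands, observed=True)['y'].mean())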
7 – Correlation Analysis
Correlation measures how two variables move together:
Values near +1 → strong positive correlation.
Values near -1 → strong negative correlation.
It helps detect redundant features, which may be dropped; a sketch for flagging such pairs follows the heatmap.
import matplotlib.pyplot as plt
import seaborn as sns

# Select a random subset of 8 numeric columns
num_cols = df.select_dtypes(include='number').columns
sample_cols = num_cols.to_series().sample(8, random_state=42)

# Compute the correlation matrix for these columns only
corr_matrix = df[sample_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix (Random 8 Features)")
plt.show()
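Building on the heatmap, a minimal sketch for flagging highly correlated feature pairs across all numeric columns; the 0.9 cutoff is an arbitrary choice:

# Absolute correlations across all numeric columns
corr = df[num_cols].corr().abs()

# Keep only the upper triangle to skip self- and duplicate pairs
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
redundant = [c for c in upper.columns if (upper[c] > 0.9).any()]
print("Candidates to drop:", redundant)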
8 – Feature Selection
Variance Threshold: removes features with no variation (constant values).
SelectKBest: selects the top k features using the ANOVA F-test for classification.
This step simplifies the model and reduces overfitting; a sketch for inspecting the F-scores follows the output.
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

target_col = 'y' if 'y' in df.columns else df.columns[-1]
X = df.drop(columns=[target_col])
y = df[target_col]

# Drop zero-variance (constant) features
vt = VarianceThreshold(threshold=0.0)
X_vt = vt.fit_transform(X)
selected_vt = X.columns[vt.get_support()]

# Keep the top features by ANOVA F-score
selector = SelectKBest(score_func=f_classif, k=min(5, X_vt.shape[1]))
X_kbest = selector.fit_transform(X_vt, y)
selected_kbest = selected_vt[selector.get_support()]

print("After Variance Threshold:", selected_vt.tolist())
print("Top features by SelectKBest:", selected_kbest.tolist())
After Variance Threshold: ['age', 'default', 'balance', 'housing',
'loan', 'day', 'duration', 'campaign', 'job_blue-collar',
'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
'job_self-employed', 'job_services', 'job_student', 'job_technician',
'job_unemployed', 'job_unknown', 'marital_married', 'marital_single',
'education_secondary', 'education_tertiary', 'education_unknown',
'contact_telephone', 'contact_unknown', 'month_aug', 'month_dec',
'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_other',
'poutcome_success', 'poutcome_unknown']
Top features by SelectKBest: ['duration', 'contact_unknown',
'month_oct', 'poutcome_success', 'poutcome_unknown']
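To see why those five features won, a short sketch ranking the ANOVA F-scores that SelectKBest computed (reusing selector and selected_vt from the cell above):

# Rank the surviving features by their ANOVA F-score
scores = pd.Series(selector.scores_, index=selected_vt)
print(scores.sort_values(ascending=False).head(10))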