import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# NOTE(review): this import was cut off after "roc_au" in the exported source; the last
# three names are restored because evaluate_classifier() calls them.
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)
from sklearn.metrics import roc_curve, auc
# Load the dataset. Every later step reads `df`, so a missing file aborts the run.
try:
    # Only the read can raise FileNotFoundError, so it is the only guarded statement.
    df = pd.read_csv('heart.csv')
except FileNotFoundError:
    # SystemExit with a message writes it to stderr and exits with status 1.
    # The previous `exit()` is a helper injected by the `site` module and
    # reported success (status 0) even though loading had failed.
    # NOTE(review): the tail of this message was cut off in the exported source;
    # the wording after "as" is reconstructed.
    raise SystemExit(
        "Error: 'heart.csv' not found. Please ensure the file is in the same "
        "directory as this script."
    ) from None

print("heart.csv loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows of the dataset:")
print(df.head())
heart.csv loaded successfully!
Dataset shape: (303, 14)
First 5 rows of the dataset:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \
0 63 1 3 145 233 1 0 150 0 2.3 0
1 37 1 2 130 250 0 1 187 0 3.5 0
2 41 0 1 130 204 0 0 172 0 1.4 2
3 56 1 1 120 236 0 1 178 0 0.8 2
4 57 0 0 120 354 0 1 163 1 0.6 2
ca thal target
0 0 1 1
1 0 2 1
2 0 2 1
3 0 2 1
4 0 2 1
# Structural overview of the frame: dtypes, non-null counts, memory usage.
print("\n--- Initial Data Info ---")
df.info()

# Per-column null counts, keeping only columns that have gaps, largest first.
print("\n--- Missing Values ---")
null_counts = df.isna().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)
print(
    "No missing values found in the dataset."
    if missing_values.empty
    else missing_values
)
--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 303 non-null int64
1 sex 303 non-null int64
2 cp 303 non-null int64
3 trestbps 303 non-null int64
4 chol 303 non-null int64
5 fbs 303 non-null int64
6 restecg 303 non-null int64
7 thalach 303 non-null int64
8 exang 303 non-null int64
9 oldpeak 303 non-null float64
10 slope 303 non-null int64
11 ca 303 non-null int64
12 thal 303 non-null int64
13 target 303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
--- Missing Values ---
No missing values found in the dataset.
# Bar chart of how many rows fall into each value of the `target` column.
print("\n--- Target Variable Distribution (target) ---")
_, target_ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='target', data=df, ax=target_ax)
target_ax.set_title('Distribution of Target Variable (Heart Disease)')
target_ax.set_xlabel('Target (0: No Disease, 1: Disease)')
target_ax.set_ylabel('Count')
# Swap the raw 0/1 tick labels for readable class names.
target_ax.set_xticks([0, 1])
target_ax.set_xticklabels(['No Disease', 'Disease'])
plt.show()
--- Target Variable Distribution (target) ---
# Feature matrix (everything except the label) and the label column itself.
X = df.drop('target', axis=1)
y = df['target']

# Separate numerical and categorical columns.
# dtype alone is not enough: many numeric-looking columns (like 'cp', 'thal')
# hold category codes, so a numeric column with fewer than
# MAX_CATEGORY_LEVELS distinct values is treated as categorical, alongside any
# object-typed column.
MAX_CATEGORY_LEVELS = 20
numeric_cols_dynamic = X.select_dtypes(include=np.number).columns.tolist()
object_cols_dynamic = X.select_dtypes(include='object').columns.tolist()

# NOTE(review): the original comprehension had a second condition ("and X[...")
# that is cut off in the exported source; only the cardinality test is kept
# here. Confirm against the original notebook.
low_cardinality_cols = [
    col for col in numeric_cols_dynamic
    if X[col].nunique() < MAX_CATEGORY_LEVELS
]

# Filter X.columns rather than building list(set(...)): the iteration order of
# a set of strings can change between interpreter runs (hash randomisation),
# which made the categorical column order, and therefore the encoded feature
# order handed to the models, differ from run to run.
categorical_lookup = set(object_cols_dynamic) | set(low_cardinality_cols)
categorical_cols = [col for col in X.columns if col in categorical_lookup]
numerical_cols = [col for col in numeric_cols_dynamic if col not in categorical_lookup]

print(f"\nNumber of numerical features identified: {len(numerical_cols)}")
print(f"Numerical features: {numerical_cols}")
print(f"Number of categorical features identified: {len(categorical_cols)}")
print(f"Categorical features: {categorical_cols}")
# --- Preprocessing Pipelines ---
# Numeric branch: fill any gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill any gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' is set for categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Send each column group through its own branch; remainder='passthrough' keeps
# any column that appears in neither group.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')
# --- 5. Split the Data into Training and Testing Sets ---
# 80/20 hold-out split with a fixed seed. stratify=y is important for
# classification to maintain the class distribution in both partitions.
# NOTE(review): this call was cut off after "st" in the exported source; the
# final argument is restored as stratify=y, following the comment that
# accompanied it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTraining set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")
# --- 6. Model Training and Evaluation (Classification Models) ---
# Evaluation function for classification models
def evaluate_classifier(model_name, y_true, y_pred, y_prob=None):
    """Print an evaluation summary for one classifier.

    Prints accuracy, precision, recall and F1 computed from the predicted
    labels, ROC AUC when scores are supplied, then the confusion matrix and
    the classification report. Nothing is returned.

    Args:
        model_name: Display name used in the section header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional scores handed to ``roc_auc_score``; the ROC AUC
            line is skipped when this is ``None``.
    """
    print(f"\n--- {model_name} Evaluation ---")

    # Metrics that only need the predicted labels, in print order.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for caption, metric in label_metrics:
        print(f"{caption}: {metric(y_true, y_pred):.4f}")

    if y_prob is not None:
        print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")

    # Tabular summaries, each under its own heading.
    summaries = (
        ("Confusion Matrix", confusion_matrix),
        ("Classification Report", classification_report),
    )
    for heading, build_summary in summaries:
        print(f"\n{heading}:")
        print(build_summary(y_true, y_pred))
# --- Model 1: Logistic Regression ---
print("\n--- Training Logistic Regression Model ---")
# Preprocessing and classifier are chained so the transformers are fitted on
# the training split only.
# NOTE(review): the classifier arguments were cut off after "solver='l" in the
# exported source; 'liblinear' is assumed here. Confirm against the original
# notebook.
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])
pipeline_lr.fit(X_train, y_train)

y_pred_lr = pipeline_lr.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (1).
y_prob_lr = pipeline_lr.predict_proba(X_test)[:, 1]
evaluate_classifier("Logistic Regression", y_test, y_pred_lr, y_prob_lr)
# --- Model 2: Random Forest Classifier ---
print("\n--- Training Random Forest Classifier Model ---")
# NOTE(review): the classifier arguments were cut off after "rando" in the
# exported source; random_state=42 is assumed (matching the seed used
# elsewhere in this script) and any further arguments are unknown. Confirm
# against the original notebook.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipeline_rf.fit(X_train, y_train)

y_pred_rf = pipeline_rf.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (1).
y_prob_rf = pipeline_rf.predict_proba(X_test)[:, 1]
evaluate_classifier("Random Forest Classifier", y_test, y_pred_rf, y_prob_rf)
# --- Model 3: K-Nearest Neighbors Classifier ---
print("\n--- Training K-Nearest Neighbors Classifier Model ---")
knn_steps = [
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),  # k = 5
]
pipeline_knn = Pipeline(steps=knn_steps)
pipeline_knn.fit(X_train, y_train)

y_pred_knn = pipeline_knn.predict(X_test)
# KNN's predict_proba is not always as reliable as that of other models, but
# it is still passed on so the ROC AUC line can be reported.
knn_class_probs = pipeline_knn.predict_proba(X_test)
y_prob_knn = knn_class_probs[:, 1]
evaluate_classifier("K-Nearest Neighbors Classifier", y_test, y_pred_knn, y_prob_knn)
# --- 7. Visualize ROC Curve (Example for Random Forest) ---
print("\n--- Visualizing ROC Curve for Random Forest Classifier ---")
# Points of the ROC curve for the untuned random forest, and the area under it.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

_, roc_ax = plt.subplots(figsize=(8, 6))
# NOTE(review): the label f-string was cut off after ":.2f" in the exported
# source; the closing "})" is restored.
roc_ax.plot(
    fpr_rf,
    tpr_rf,
    color='darkorange',
    lw=2,
    label=f'ROC curve (area = {roc_auc_rf:.2f})',
)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve - Random Forest')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()
# --- 8. Hyperparameter Tuning (Example with Random Forest using GridSearchCV) ---
print("\n--- Performing Hyperparameter Tuning for Random Forest (GridSearchCV) ---")
# Define a smaller parameter grid for quicker demonstration. The
# "classifier__" prefix addresses the pipeline step named 'classifier'.
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 150],    # Number of trees
    'classifier__max_features': ['sqrt', 'log2'],  # Features considered at each split
    'classifier__max_depth': [5, 10, None],        # Max depth of trees (None means unlimited)
}

# Stratified folds keep the class ratio in every fold, consistent with the
# stratified hold-out split above. Plain KFold ignores the labels, so fold
# class balance (and the per-fold ROC AUC) varied by chance.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): this call was cut off after scoring='roc_auc' in the exported
# source; any further arguments (e.g. n_jobs, verbose) are unknown and omitted.
grid_search_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    cv=cv_strategy,
    scoring='roc_auc',
)
grid_search_rf.fit(X_train, y_train)

print(f"\nBest parameters for Tuned Random Forest: {grid_search_rf.best_params_}")
# NOTE(review): the format spec was cut off in the exported source; ".4f" is
# assumed to match the other metric lines.
print(f"Best cross-validation ROC AUC for Tuned Random Forest: {grid_search_rf.best_score_:.4f}")

# Evaluate on test set with the best model from GridSearchCV
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
y_prob_best_rf = best_rf_model.predict_proba(X_test)[:, 1]
print("\n--- Evaluation of Tuned Random Forest Model ---")
evaluate_classifier("Tuned Random Forest Classifier", y_test, y_pred_best_rf, y_prob_best_rf)
Number of numerical features identified: 5
Numerical features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Number of categorical features identified: 8
Categorical features: ['sex', 'fbs', 'restecg', 'thal', 'exang', 'cp', 'slope',
Training set shape: (242, 13), (242,)
Testing set shape: (61, 13), (61,)
--- Training Logistic Regression Model ---
--- Logistic Regression Evaluation ---
Accuracy: 0.8689
Precision: 0.8571
Recall: 0.9091
F1-Score: 0.8824
ROC AUC: 0.9102
Confusion Matrix:
[[23 5]
[ 3 30]]
Classification Report:
precision recall f1-score support
0 0.88 0.82 0.85 28
1 0.86 0.91 0.88 33
accuracy 0.87 61
macro avg 0.87 0.87 0.87 61
weighted avg 0.87 0.87 0.87 61
--- Training Random Forest Classifier Model ---
--- Random Forest Classifier Evaluation ---
Accuracy: 0.7869
Precision: 0.7381
Recall: 0.9394
F1-Score: 0.8267
ROC AUC: 0.9161
Confusion Matrix:
[[17 11]
[ 2 31]]
Classification Report:
precision recall f1-score support
0 0.89 0.61 0.72 28
1 0.74 0.94 0.83 33
accuracy 0.79 61
macro avg 0.82 0.77 0.78 61
weighted avg 0.81 0.79 0.78 61
--- Training K-Nearest Neighbors Classifier Model ---
--- K-Nearest Neighbors Classifier Evaluation ---
Accuracy: 0.7705
Precision: 0.7568
Recall: 0.8485
F1-Score: 0.8000
ROC AUC: 0.8690