9w3itlede
January 3, 2025
0.1 Apply PCA to heart_disease.csv to implement binary classification.
Please refer to the metadata of the heart_disease dataset before implementation.
[1]: import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the dataset
# The URL is written as two adjacent string literals so it stays a single
# valid string without one over-long line.
url = (
    "https://itv-contentbucket.s3.ap-south-1.amazonaws.com/Exams/ML/PCA/"
    "heart_disease.csv"
)
data = pd.read_csv(url)
# Display the first few rows
print(data.head())
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \
0 63 1 3 145 233 1 0 150 0 2.3 0
1 37 1 2 130 250 0 1 187 0 3.5 0
2 41 0 1 130 204 0 0 172 0 1.4 2
3 56 1 1 120 236 0 1 178 0 0.8 2
4 57 0 0 120 354 0 1 163 1 0.6 2
ca thal target
0 0 1 1
1 0 2 1
2 0 2 1
3 0 2 1
4 0 2 1
[2]: # Check for missing values
print(data.isnull().sum())
# Drop or fill missing values as required
data = data.dropna() # Example of dropping missing values
1
age 0
sex 0
cp 0
trestbps 0
chol 0
fbs 0
restecg 0
thalach 0
exang 0
oldpeak 0
slope 0
ca 0
thal 0
target 0
dtype: int64
[3]: # Example assuming 'target' is the target column based on typical naming
X = data.drop('target', axis=1) # Replace 'target' with the actual target␣
↪column name
y = data['target'] # Replace 'target' with the actual target column name
[4]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,␣
↪random_state=42)
[5]: scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
[6]: # Choose the number of principal components to keep (e.g., 2 components)
# PCA is fitted on the training features only; the test features are then
# projected with the same fitted components.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Print the fraction of total variance captured by each retained component.
# NOTE(review): the recorded output shows the two components together cover
# only about a third of the variance -- confirm that 2 components is intended.
print(f'Explained Variance Ratio: {pca.explained_variance_ratio_}')
Explained Variance Ratio: [0.2072575 0.12434085]
[7]: # Fit a logistic-regression classifier (default settings) on the PCA-projected training data
model = LogisticRegression()
model.fit(X_train_pca, y_train)
[7]: LogisticRegression()
[8]: # Predict labels for the held-out rows and report the fraction predicted correctly
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.8524590163934426