# Stroke-prediction notebook: load and explore the Kaggle healthcare stroke dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): blanket-suppressing ALL warnings also hides pandas/sklearn
# deprecation and convergence warnings relevant below — consider narrowing.
warnings.filterwarnings('ignore')
# Load the raw data; the CSV is expected in the working directory.
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
# Notebook-style echo of the full DataFrame (5110 rows x 12 columns).
data
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked
Self-
1 51676 Female 61.0 0 0 Yes Rural 202.21 NaN never smoked
employed
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes
Self-
4 1665 Female 79.0 1 0 Yes Rural 174.12 24.0 never smoked
employed
... ... ... ... ... ... ... ... ... ... ... ...
5105 18234 Female 80.0 1 0 Yes Private Urban 83.75 NaN never smoked
Self-
5106 44873 Female 81.0 0 0 Yes Urban 125.20 40.0 never smoked
employed
Self-
5107 19723 Female 35.0 0 0 Yes Rural 82.99 30.6 never smoked
employed
5108 37544 Male 51.0 0 0 Yes Private Rural 166.29 25.6 formerly smoked
5109 44679 Female 44.0 0 0 Yes Govt_job Urban 85.28 26.2 Unknown
5110 rows × 12 columns
Data Preprocessing
# Column dtypes and non-null counts: 'bmi' is the only column with missing values.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 5110 non-null int64
1 gender 5110 non-null object
2 age 5110 non-null float64
3 hypertension 5110 non-null int64
4 heart_disease 5110 non-null int64
5 ever_married 5110 non-null object
6 work_type 5110 non-null object
7 Residence_type 5110 non-null object
8 avg_glucose_level 5110 non-null float64
9 bmi 4909 non-null float64
10 smoking_status 5110 non-null object
11 stroke 5110 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
data.describe()
id age hypertension heart_disease avg_glucose_level bmi stroke
count 5110.000000 5110.000000 5110.000000 5110.000000 5110.000000 4909.000000 5110.000000
mean 36517.829354 43.226614 0.097456 0.054012 106.147677 28.893237 0.048728
std 21161.721625 22.612647 0.296607 0.226063 45.283560 7.854067 0.215320
min 67.000000 0.080000 0.000000 0.000000 55.120000 10.300000 0.000000
25% 17741.250000 25.000000 0.000000 0.000000 77.245000 23.500000 0.000000
50% 36932.000000 45.000000 0.000000 0.000000 91.885000 28.100000 0.000000
75% 54682.000000 61.000000 0.000000 0.000000 114.090000 33.100000 0.000000
max 72940.000000 82.000000 1.000000 1.000000 271.740000 97.600000 1.000000
# Count missing values per column (201 NaNs in 'bmi', none elsewhere).
data.isnull().sum()
id 0
gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 201
smoking_status 0
stroke 0
dtype: int64
# Checking the distribution of the missing data column.
# KDE of 'bmi' to decide how to impute its missing values; if the
# distribution is strongly skewed, the median would be an alternative
# to the mean used below.
plt.figure(figsize=(8,5))
data['bmi'].plot(kind='kde')
plt.show()
Checking the distribution of the column with missing data, i.e. bmi.
Missing value Treatment
# Impute missing 'bmi' values with the column mean.
# Fix: assign the result back instead of calling inplace=True on a column
# selection — that is chained-assignment inplace, which emits a
# FutureWarning and stops working under pandas copy-on-write (pandas 3.x).
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
# re-checking missing value
data.isnull().sum()
id 0
gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 0
smoking_status 0
stroke 0
dtype: int64
Dropping unnecessary columns
# The 'id' column is a row identifier with no predictive value — remove it.
data.drop(columns='id', inplace=True)
# Peek at the first rows after the drop.
data.head()
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.600000 formerly smoked 1
Self-
1 Female 61.0 0 0 Yes Rural 202.21 28.893237 never smoked 1
employed
2 Male 80.0 0 1 Yes Private Rural 105.92 32.500000 never smoked 1
3 Female 49.0 0 0 Yes Private Urban 171.23 34.400000 smokes 1
Self-
4 Female 79.0 1 0 Yes Rural 174.12 24.000000 never smoked 1
employed
EDA
Target variable (Stroke)
# Bar chart of the target distribution: strokes (~5%) are heavily outnumbered
# by non-strokes, i.e. the classes are strongly imbalanced.
data['stroke'].value_counts().plot(kind='bar')
plt.show()
Checking outliers in our dataset (Numerical columns)
# Boxplot every numeric (non-object) column to eyeball outliers.
# Fix: the loop body lost its indentation in the notebook export, which is a
# SyntaxError in plain Python — restored here.
num = data.select_dtypes(exclude='object')
for i in num.columns:
    sns.boxplot(data=num, x=i)
    plt.show()
Gender
# Frequency of each gender; note the single 'Other' row.
data['gender'].value_counts()
Female 2994
Male 2115
Other 1
Name: gender, dtype: int64
# Overall gender distribution.
sns.countplot(data=data,x='gender')
plt.show()
# Gender split within each stroke outcome.
sns.countplot(data=data,x='gender',hue='stroke')
plt.show()
# Pie chart of the target classes (same information as the earlier bar
# chart, shown as percentages).
data['stroke'].value_counts().plot(kind='pie',autopct='%0.2f%%')
plt.show()
Age
# Mean age and stroke rate per gender: men show a slightly higher stroke
# rate (5.1%) than women (4.7%).
# Fix: select the numeric columns BEFORE aggregating — calling .mean() on
# the whole frame raises TypeError for the object columns in pandas >= 2.0
# (older pandas silently dropped them).
data.groupby('gender')[['age', 'stroke']].mean()
age stroke
gender
Female 43.757395 0.047094
Male 42.483385 0.051064
Other 26.000000 0.000000
Men had a slightly higher stroke rate than women.
Ever married
# Marriage-status counts.
data['ever_married'].value_counts()
Yes 3353
No 1757
Name: ever_married, dtype: int64
# Stroke outcome split by marriage status.
sns.countplot(data=data,x='ever_married',hue='stroke')
plt.show()
Work Type
# Distinct work-type categories.
# Fix: the identical .unique() call was accidentally duplicated — run it once.
data['work_type'].unique()
array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
dtype=object)
# Frequency of each work type.
data['work_type'].value_counts()
Private 2925
Self-employed 819
children 687
Govt_job 657
Never_worked 22
Name: work_type, dtype: int64
# Stroke outcome split by work type.
sns.countplot(data=data,x='work_type',hue='stroke')
plt.show()
Residence Type
# Distinct residence categories.
data['Residence_type'].unique()
array(['Urban', 'Rural'], dtype=object)
# Urban/rural counts are nearly balanced.
data['Residence_type'].value_counts()
Urban 2596
Rural 2514
Name: Residence_type, dtype: int64
# Stroke outcome split by residence type.
sns.countplot(data=data,x='Residence_type',hue='stroke')
plt.show()
Smoking Features
# Smoking-status counts; 'Unknown' covers roughly a third of the rows.
data['smoking_status'].value_counts()
never smoked 1892
Unknown 1544
formerly smoked 885
smokes 789
Name: smoking_status, dtype: int64
# Stroke outcome split by smoking status.
sns.countplot(data=data,x='smoking_status',hue='stroke')
plt.show()
Heatmap
# Correlation heatmap of the numeric features.
# Fix: pass numeric_only=True — at this point the categorical columns are
# still object dtype, and DataFrame.corr() raises TypeError on them in
# pandas >= 2.0 (older pandas silently dropped them).
sns.heatmap(data.corr(numeric_only=True),annot=True,fmt='.2f')
plt.show()
Encoding the categorical variables
# List column dtypes to see which object columns still need encoding.
data.dtypes
gender object
age float64
hypertension int64
heart_disease int64
ever_married object
work_type object
Residence_type object
avg_glucose_level float64
bmi float64
smoking_status object
stroke int64
dtype: object
from sklearn.preprocessing import LabelEncoder

# Integer-encode every remaining categorical column. fit_transform refits
# the encoder on each column, so one encoder instance can be reused safely.
encoder = LabelEncoder()
for column in ('gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'):
    data[column] = encoder.fit_transform(data[column])
Splitting data into independent and dependent variables
# Feature matrix: every column except the target, as a NumPy array.
X = data.drop(columns='stroke').values
# Notebook-style echo of the array.
X
array([[ 1. , 67. , 0. , ..., 228.69 ,
36.6 , 1. ],
[ 0. , 61. , 0. , ..., 202.21 ,
28.89323691, 2. ],
[ 1. , 80. , 0. , ..., 105.92 ,
32.5 , 2. ],
...,
[ 0. , 35. , 0. , ..., 82.99 ,
30.6 , 2. ],
[ 1. , 51. , 0. , ..., 166.29 ,
25.6 , 1. ],
[ 0. , 44. , 0. , ..., 85.28 ,
26.2 , 0. ]])
# Target vector as a NumPy array.
Y=data['stroke'].values
Y
array([1, 1, 1, ..., 0, 0, 0], dtype=int64)
# splitting
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): the classes are ~95/5 imbalanced; stratify=Y would keep the
# class ratio identical in both splits — confirm before changing, since it
# alters the reported scores below.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)
Logistic Regression
# Baseline logistic-regression classifier with default hyperparameters.
# NOTE(review): the features are unscaled, and the warning filter above
# would hide a ConvergenceWarning if the solver hits its iteration limit —
# verify convergence.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
LogisticRegression()
# Predictions on the held-out test set.
predict = classifier.predict(X_test)
predict
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Ground-truth labels for comparison.
Y_test
array([1, 0, 0, ..., 0, 1, 0], dtype=int64)
Evaluation for Logistic Regression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Confusion matrix: the model predicts only the majority class — all 54
# true strokes in the test set are missed.
print(confusion_matrix(Y_test, predict))
[[968 0]
[ 54 0]]
# Per-class precision/recall/F1: recall for class 1 is 0.00, so the 95%
# accuracy below just reflects the class imbalance.
print(classification_report(Y_test, predict))
precision recall f1-score support
0 0.95 1.00 0.97 968
1 0.00 0.00 0.00 54
accuracy 0.95 1022
macro avg 0.47 0.50 0.49 1022
weighted avg 0.90 0.95 0.92 1022
print('Accuracy score :',accuracy_score(Y_test, predict))
Accuracy score : 0.9471624266144814
KNN Classifier
# K-nearest-neighbours classifier with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
KNeighborsClassifier()
# Test-set predictions.
pred = knn.predict(X_test)
pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Ground-truth labels for comparison.
Y_test
array([1, 0, 0, ..., 0, 1, 0], dtype=int64)
Evaluation for KNN Classifier
# Accuracy only — this stays near the ~95% majority-class baseline, so the
# per-class recall should be checked as well before comparing models.
print('Accuracy:',accuracy_score(Y_test, pred))
Accuracy: 0.9422700587084148
Decision Tree Classifier
# Shallow decision tree (max_depth=3) to keep the model interpretable.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=3)
classifier.fit(X_train, Y_train)
DecisionTreeClassifier(max_depth=3)
# Test-set predictions.
Y_pred = classifier.predict(X_test)
Y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Ground-truth labels for comparison.
Y_test
array([1, 0, 0, ..., 0, 1, 0], dtype=int64)
Evaluation for Decision Tree Classifier
# Test accuracy of the decision tree.
print('Accuracy:',accuracy_score(Y_test, Y_pred))
Accuracy: 0.9461839530332681
Plotting the tree with plot_tree
# Visualize the fitted tree; filled=True colors nodes by majority class and
# node_ids=True labels each node with its index.
from sklearn import tree
fig = plt.figure(figsize=(15,10))
tree.plot_tree(classifier,filled=True,class_names=True,node_ids=True)
plt.show()
Random Forest Classifier
# Random-forest classifier with default hyperparameters.
# Note: this rebinds `classifier`, replacing the decision tree above.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train,Y_train)
RandomForestClassifier()
# Test-set predictions.
Y_pred1 = classifier.predict(X_test)
Y_pred1
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Ground-truth labels for comparison.
Y_test
array([1, 0, 0, ..., 0, 1, 0], dtype=int64)
Evaluation for Random Forest Classifier
# Test accuracy of the random forest.
# Fix: sklearn's convention is accuracy_score(y_true, y_pred) — the
# arguments were swapped. Accuracy is symmetric so the value is unchanged,
# but the swapped order is inconsistent with the other evaluations and
# would be wrong for any asymmetric metric.
print('Accuracy:', accuracy_score(Y_test, Y_pred1))
Accuracy: 0.9461839530332681
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js