In
[1]: # importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
In [2]: # import dataset
df = pd.read_csv('framingham.csv')
Data Inspection
In [3]: df.head()
Out[3]: male age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol
0 1 39 4.0 0 0.0 0.0 0 0 0 195.0
1 0 46 2.0 0 0.0 0.0 0 0 0 250.0
2 1 48 1.0 1 20.0 0.0 0 0 0 245.0
3 0 61 3.0 1 30.0 0.0 0 1 0 225.0
4 0 46 3.0 1 23.0 0.0 0 0 0 285.0
In [4]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 male 4238 non-null int64
1 age 4238 non-null int64
2 education 4133 non-null float64
3 currentSmoker 4238 non-null int64
4 cigsPerDay 4209 non-null float64
5 BPMeds 4185 non-null float64
6 prevalentStroke 4238 non-null int64
7 prevalentHyp 4238 non-null int64
8 diabetes 4238 non-null int64
9 totChol 4188 non-null float64
10 sysBP 4238 non-null float64
11 diaBP 4238 non-null float64
12 BMI 4219 non-null float64
13 heartRate 4237 non-null float64
14 glucose 3850 non-null float64
15 TenYearCHD 4238 non-null int64
dtypes: float64(9), int64(7)
memory usage: 529.9 KB
In [5]: df.describe()
Loading [MathJax]/extensions/Safe.js
Out[5]: male age education currentSmoker cigsPerDay BPMeds prevalentStroke preva
count 4238.000000 4238.000000 4133.000000 4238.000000 4209.000000 4185.000000 4238.000000 4238
mean 0.429212 49.584946 1.978950 0.494101 9.003089 0.029630 0.005899 0
std 0.495022 8.572160 1.019791 0.500024 11.920094 0.169584 0.076587 0
min 0.000000 32.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0
25% 0.000000 42.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0
50% 0.000000 49.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0
75% 1.000000 56.000000 3.000000 1.000000 20.000000 0.000000 0.000000 1
max 1.000000 70.000000 4.000000 1.000000 70.000000 1.000000 1.000000 1
In [6]: df.isnull().sum()
male 0
Out[6]:
age 0
education 105
currentSmoker 0
cigsPerDay 29
BPMeds 53
prevalentStroke 0
prevalentHyp 0
diabetes 0
totChol 50
sysBP 0
diaBP 0
BMI 19
heartRate 1
glucose 388
TenYearCHD 0
dtype: int64
Exploratory Data Analysis (EDA)
In [7]: plt.figure(figsize =(15,10))
sns.heatmap(df,yticklabels=False,cbar=False,cmap='viridis')
<AxesSubplot:>
Out[7]:
Loading [MathJax]/extensions/Safe.js
In [8]: plt.figure(figsize=(15,15))
sns.heatmap(df.corr(),annot=True)
<AxesSubplot:>
Out[8]:
Loading [MathJax]/extensions/Safe.js
In [9]: df['prevalentStroke'].value_counts()
0 4213
Out[9]:
1 25
Name: prevalentStroke, dtype: int64
In [10]: sns.set_style('whitegrid')
sns.countplot(x='TenYearCHD',data=df)
<AxesSubplot:xlabel='TenYearCHD', ylabel='count'>
Out[10]:
Loading [MathJax]/extensions/Safe.js
In [ ]:
In [11]: sns.set_style('whitegrid')
sns.countplot(x='TenYearCHD',hue='male',data=df)
<AxesSubplot:xlabel='TenYearCHD', ylabel='count'>
Out[11]:
In [12]: sns.set_style('whitegrid')
sns.countplot(x='TenYearCHD',hue='diabetes',data=df)
<AxesSubplot:xlabel='TenYearCHD', ylabel='count'>
Out[12]:
In [13]: df['education'].value_counts()
Loading [MathJax]/extensions/Safe.js
1.0 1720
Out[13]:
2.0 1253
3.0 687
4.0 473
Name: education, dtype: int64
In [14]: sns.countplot(x= 'education', data = df)
<AxesSubplot:xlabel='education', ylabel='count'>
Out[14]:
In [15]: sns.countplot(x= 'prevalentStroke', data = df)
<AxesSubplot:xlabel='prevalentStroke', ylabel='count'>
Out[15]:
In [16]: sns.distplot(df['totChol'].dropna())
C:\Users\Admin\AppData\Local\Temp\ipykernel_20640\2569219756.py:1: UserWarning:
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
sns.distplot(df['totChol'].dropna())
<AxesSubplot:xlabel='totChol', ylabel='Density'>
Out[16]:
Loading [MathJax]/extensions/Safe.js
In [17]: sns.distplot(df['glucose'].dropna())
C:\Users\Admin\AppData\Local\Temp\ipykernel_20640\2989925030.py:1: UserWarning:
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
sns.distplot(df['glucose'].dropna())
<AxesSubplot:xlabel='glucose', ylabel='Density'>
Out[17]:
In [18]: plt.figure(figsize=(12,7))
sns.boxplot(y='glucose',x='diabetes',data=df)
<AxesSubplot:xlabel='diabetes', ylabel='glucose'>
Out[18]:
Loading [MathJax]/extensions/Safe.js
In [19]: sns.boxplot(x = 'education',y= 'age',data = df)
<AxesSubplot:xlabel='education', ylabel='age'>
Out[19]:
In [20]: sns.set_style('whitegrid')
sns.countplot(x='prevalentStroke',hue='BPMeds',data=df)
<AxesSubplot:xlabel='prevalentStroke', ylabel='count'>
Out[20]:
Loading [MathJax]/extensions/Safe.js
In [21]: sns.boxplot(x = 'currentSmoker',y= 'cigsPerDay',data = df)
<AxesSubplot:xlabel='currentSmoker', ylabel='cigsPerDay'>
Out[21]:
Filling Null Values
In [22]: def input_glucose(cols):
diabetes=cols[0]
glucose =cols[1]
if pd.isnull(glucose):
if diabetes==0:
return 75
if diabetes==1:
return 149
else:
return glucose
In [23]: df['glucose']=df[['diabetes','glucose']].apply(input_glucose,axis=1)
In [24]: df['heartRate']=df['heartRate'].fillna(df['heartRate'].mean())
In [25]: df['BMI']=df['BMI'].fillna(df['BMI'].mean())
Loading [MathJax]/extensions/Safe.js
In [26]: df['totChol']=df['totChol'].fillna(df['totChol'].mean())
In [27]: df['BPMeds']=df['BPMeds'].fillna(1)
In [28]: cigs_mean=df['cigsPerDay'].mean()
In [29]: def fill_cigsPerDay(cols):
smoker=cols[0]
cigsPerDay=cols[1]
if pd.isnull(smoker):
return 0
else :
return cigs_mean
In [30]: df['cigsPerDay']=df[['currentSmoker','cigsPerDay']].apply(input_glucose,axis=1)
In [31]: def fill_education(cols):
age=cols[0]
education=cols[1]
if pd.isnull(education):
if age>53:
return 1
if age<46:
return 2
else:
return 3
else :
return education
In [32]: df['education']=df[['age','education']].apply(fill_education,axis=1)
In [33]: plt.figure(figsize =(15,10))
sns.heatmap(df,yticklabels=False,cbar=False,cmap='viridis')
<AxesSubplot:>
Out[33]:
Loading [MathJax]/extensions/Safe.js
In [34]: df.isnull().sum()
male 0
Out[34]:
age 0
education 0
currentSmoker 0
cigsPerDay 0
BPMeds 0
prevalentStroke 0
prevalentHyp 0
diabetes 0
totChol 0
sysBP 0
diaBP 0
BMI 0
heartRate 0
glucose 0
TenYearCHD 0
dtype: int64
Data Preparation
In [35]: list1=list(df.columns)
list1.remove('TenYearCHD')
In [36]: X = df[list1]
y=df['TenYearCHD']
Loading [MathJax]/extensions/Safe.js
In [37]: from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
In [38]: y_train.value_counts(normalize=True)
0 0.846608
Out[38]:
1 0.153392
Name: TenYearCHD, dtype: float64
Implementing Logistic Regression
In [39]: from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
In [40]: model.fit(X_train,y_train)
C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_
model\_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[40]: ▾ LogisticRegression
LogisticRegression()
In [41]: model.score(X_train,y_train)
0.848377581120944
Out[41]:
In [42]: y_pred = model.predict(X_test)
Accuracy
In [43]: from sklearn.metrics import confusion_matrix,accuracy_score
In [44]: accuracy = confusion_matrix(y_test,y_pred)
In [45]: accuracy
array([[719, 5],
Out[45]:
[117, 7]], dtype=int64)
In [46]: accuracy = accuracy_score(y_test,y_pred)
In [47]: accuracy
0.8561320754716981
Out[47]:
In [ ]:
Loading [MathJax]/extensions/Safe.js