Exploratory Data Analysis with Titanic
dataset
In [1]: #Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
In [2]: #Load the data set
train_data=pd.read_csv('titanic_train_dataset.csv')
In [3]: #Display first 10 rows of dataset.
train_data.head(10)
Out[3]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Far
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.250
Harris
Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.283
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.925
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.100
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.050
Henry
Moran,
5 6 0 3 male NaN 0 0 330877 8.458
Mr. James
McCarthy,
6 7 0 1 Mr. male 54.0 0 0 17463 51.862
Timothy J
Palsson,
Master.
7 8 0 3 male 2.0 3 1 349909 21.075
Gosta
Leonard
Johnson,
Mrs.
Oscar W
8 9 1 3 female 27.0 0 2 347742 11.133
(Elisabeth
Vilhelmina
Berg)
Nasser,
Mrs.
9 10 1 2 Nicholas female 14.0 1 0 237736 30.070
(Adele
Achem)
In [6]: #Display last 5 rows of dataset.
train_data.tail()
Out[6]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
Montvila,
886 887 0 2 Rev. male 27.0 0 0 211536 13.00
Juozas
Graham,
Miss.
887 888 1 1 female 19.0 0 0 112053 30.00
Margaret
Edith
Johnston,
Miss.
W./C.
888 889 0 3 Catherine female NaN 1 2 23.45
6607
Helen
"Carrie"
Behr, Mr.
889 890 1 1 Karl male 26.0 0 0 111369 30.00
Howell
Dooley,
890 891 0 3 Mr. male 32.0 0 0 370376 7.75
Patrick
In [5]: #Get the features of data set.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Survived - Survival (0 = No; 1 = Yes) Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 =
3rd) Name - Name of the passenger Sex - Sex Age - Age
SibSp - Sibsp: The dataset defines family relations... Sibling = brother, sister,
stepbrother, stepsister Spouse = husband, wife
Parch: The dataset defines family relations in this way... Parent = mother, father Child =
daughter, son, stepdaughter, stepson Some children travelled only with a nanny,
therefore parch=0 for them.
Ticket - Ticket Number Fare - Passenger Fare Cabin - Cabin Embarked - Port of
Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
In [6]: #4.Find the total number of rows and columns in the dataset.
train_data.shape
(891, 12)
Out[6]:
In [21]: #Find the mean,std,count,min,max and percentiles of dataset.
train_data.describe()
Out[21]: PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.856532 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.292134 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 30.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 36.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [7]: #Check the null values in the data set.
train_data.isnull().sum()
PassengerId 0
Out[7]:
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [23]: #Draw the count plot to show the passengers survived or not survived.
#A countplot counts the categories and returns a count of their occurrenc
#As it only returns the count based of a categorical column,
#we need to specify only the x parameter.
sb.countplot('Survived',hue='Survived',data=train_data)
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: Fut
ureWarning: Pass the following variable as a keyword arg: x. From version
0.12, the only valid positional argument will be `data`, and passing othe
r arguments without an explicit keyword will result in an error or misint
erpretation.
warnings.warn(
In [26]: #Identify the number of male and female survived and not survived.
train_data.groupby(['Sex', 'Survived'])['Survived'].count()
Sex Survived
Out[26]:
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
In [12]: #Plot to show a passenger class has any impact on survived vs dead.
#train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
sb.countplot('Sex',hue='Survived',data=train_data)
plt.show()
In [9]: sb.countplot('Pclass', hue='Survived', data=train_data)
plt.title('Pclass: Survived vs Dead')
plt.show()
In [ ]: #it is clearly seen that pasangers of Class 1 are given high priority whi
#Rescue. There are greater number of passangers in Class 3 than Class 1 a
#Class 2 but very few, almost 25% in Class 3 survived. In Class 2, surviv
#and non-survival rate is 49% and 51% approx. While in Class 1 almost 68%
#people survived.
In [10]: #Identify the number of male and female survived or died based on the pas
#class.
# Compute a simple cross-tabulation of two (or more) factors. By default
#computes a frequency table of the factors unless an array of values and
#aggregation function are passed.
pd.crosstab([train_data.Sex,train_data.Survived],train_data.Pclass)
Out[10]: Pclass 1 2 3
Sex Survived
female 0 3 6 72
1 91 70 72
male 0 77 91 300
1 45 17 47
In [11]: #Find the age of oldest,youngest and average age of person travelled.
print('Age of oldest person travelled :',train_data['Age'].max())
print('Age of youngest person travelled :',train_data['Age'].min())
print('Average Age of person travelled :',train_data['Age'].mean())
Age of oldest person travelled : 80.0
Age of youngest person travelled : 0.42
Average Age of person travelled : 29.69911764705882
In [27]: train_data['Initial']=0
for i in train_data:
train_data['Initial']=train_data.Name.str.extract('([A-Za-z]+)\.') #e
In [28]: pd.crosstab(train_data.Initial,train_data.Sex)
Out[28]: Sex female male
Initial
Capt 0 1
Col 0 2
Countess 1 0
Don 0 1
Dr 1 6
Jonkheer 0 1
Lady 1 0
Major 0 2
Master 0 40
Miss 182 0
Mlle 2 0
Mme 1 0
Mr 0 517
Mrs 125 0
Ms 1 0
Rev 0 6
Sir 0 1
In [14]: train_data.groupby('Initial')['Age'].mean()
Initial
Out[14]:
Capt 70.000000
Col 58.000000
Countess 33.000000
Don 40.000000
Dr 42.000000
Jonkheer 38.000000
Lady 48.000000
Major 48.500000
Master 4.574167
Miss 21.773973
Mlle 24.000000
Mme 24.000000
Mr 32.368090
Mrs 35.898148
Ms 28.000000
Rev 43.166667
Sir 49.000000
Name: Age, dtype: float64
In [15]: train_data['Initial'].replace(['Capt','Col','Countess','Don','Dr','Jonkhe
'Mr','Miss','Mr','Other','Mr','Mrs','Mr','
In [ ]:
In [16]: train_data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Cou
'Miss','Miss','Mr','Mr','Mrs','Mrs','Othe
In [17]: #Average age based on initials
train_data.groupby('Initial')['Age'].mean()
Initial
Out[17]:
Master 4.574167
Miss 21.879195
Mr 32.891990
Mrs 35.828829
Other 42.000000
Name: Age, dtype: float64
In [19]: #Fill the null values of age with average age based on initial
train_data.loc[(train_data.Age.isnull()) & (train_data.Initial=='Mr'),'Ag
train_data.loc[(train_data.Age.isnull()) & (train_data.Initial=='Mrs'),'A
train_data.loc[(train_data.Age.isnull()) & (train_data.Initial=='Master')
train_data.loc[(train_data.Age.isnull()) & (train_data.Initial=='Miss'),'
train_data.loc[(train_data.Age.isnull()) & (train_data.Initial=='Other'),
In [27]: train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
Initial 891 non-null object
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB
In [28]: train_data.Age.isnull().any()
False
Out[28]:
In [20]: f,ax=plt.subplots(1,2,figsize=(20,20))
train_data[train_data['Survived']==0].Age.plot.hist(ax=ax[0],bins=20,edge
ax[0].set_title('Survived = 0')
x1=list(range(0,85,5))
ax[0].set_xticks(x1)
train_data[train_data['Survived']==1].Age.plot.hist(ax=ax[1],bins=20,edge
x2=list(range(0,85,5))
ax[1].set_xticks(x2)
ax[1].set_title('Survived = 1')
plt.show()
In [ ]: #Observations: (1) First priority during Rescue is given to children and
#as the persons<5 are save by large numbers (2) The oldest saved passange
# of age 80 (3) The most deaths were between 30-40
In [31]: #Identify the number of passenger died based on size of the family
#(using SibSp feature) and also draw the plot..
pd.crosstab([train_data.SibSp],train_data.Survived)
Out[31]: Survived 0 1
SibSp
0 398 210
1 97 112
2 15 13
3 12 4
4 15 3
5 5 0
8 7 0
In [29]: #Identify the number of passenger died based on size of the family
#(using SibSp feature) and also draw the plot..
pd.crosstab([train_data.SibSp],train_data.Survived).style.background_grad
Out[29]: Survived 0 1
SibSp
0 398 210
1 97 112
2 15 13
3 12 4
4 15 3
5 5 0
8 7 0
In [41]: sb.barplot('SibSp','Survived', data=train_data)
<matplotlib.axes._subplots.AxesSubplot at 0x24c61b50748>
Out[41]:
In [ ]: #The graph shows that if a passanger is alone in ship with no siblings, s
#The graph decreases as no of siblings increase. This is interesting beca
#I will save them instead of saving myself.
#But there's something wrong, the survival rate for families with 5-8 mem
#Is this because of PClass? Yes this is PClass, The crosstab shows that P
#were all in Pclass3. It is imminent that all the large families in Pclas
In [39]: f,ax=plt.subplots(1,2,figsize=(20,8))
sb.barplot('SibSp','Survived', data=train_data,ax=ax[0])
ax[0].set_title('SipSp vs Survived in BarPlot')
plt.show()
In [40]: f,ax=plt.subplots(1,2,figsize=(20,8))
sb.barplot('SibSp','Survived', data=train_data,ax=ax[0])
ax[0].set_title('SipSp vs Survived in BarPlot')
sb.factorplot('SibSp','Survived', data=train_data,ax=ax[1])
ax[1].set_title('SibSp vs Survived in FactorPlot')
plt.close(2)
plt.show()
In [33]: pd.crosstab(train_data.SibSp,train_data.Pclass).style.background_gradient
Out[33]: Pclass 1 2 3
SibSp
0 137 120 351
1 71 55 83
2 5 8 15
3 3 1 12
4 0 0 18
5 0 0 5
8 0 0 7
In [ ]: #Barplot and Crosstab data shows that if a passanger is alone in ship wit
#siblings, survival rate is 34.5%. The graph decreases as no of siblings
#increase. This is interesting because, If I have a family onboard, I wil
#them instead of saving myself. But there's something wrong, the survival
#for families with 5-8 members is 0%. Is this because of PClass?
#Yes this is PClass, The crosstab shows that Person with SibSp>3 were all
#Pclass3. It is imminent that all the large families in Pclass3(>3) died.
In [64]: train_data.corr(method='pearson')
Out[64]: PassengerId Survived Pclass Age SibSp Parch Fare
PassengerId 1.000000 -0.005007 -0.035144 0.041709 -0.057527 -0.001652 0.012658
Survived -0.005007 1.000000 -0.338481 -0.091497 -0.035322 0.081629 0.257307
Pclass -0.035144 -0.338481 1.000000 -0.339582 0.083081 0.018443 -0.549500
Age 0.041709 -0.091497 -0.339582 1.000000 -0.267734 -0.198712 0.089087
SibSp -0.057527 -0.035322 0.083081 -0.267734 1.000000 0.414838 0.159651
Parch -0.001652 0.081629 0.018443 -0.198712 0.414838 1.000000 0.216225
Fare 0.012658 0.257307 -0.549500 0.089087 0.159651 0.216225 1.000000
In [ ]: #From above correlation table we can see that Survival is inversly correl
#Pclass value. In this case since Class 1 has lower numerical value, it h
#better survival rate compared to other classes.
#We also see that Age and Survival are slighltly correlated.
In [70]: train_data.groupby(['Survived']).hist()
sb.factorplot('Survived', data=train_data, kind='count')
<seaborn.axisgrid.FacetGrid at 0x27a34bf3c50>
Out[70]:
In [78]: #Plot Agewise distribution of the passenger aboard.
sb.distplot(train_data['Age'].dropna(), bins=15,kde=False)
<matplotlib.axes._subplots.AxesSubplot at 0x27a35629b70>
Out[78]:
In [ ]: #Note: Many passensgers are of age 15-40 yrs. But this is not complete da
In [83]: #Age wise Distribution of Male and Female passengers
plt.hist(train_data['Age'][(train_data['Sex'] == 'female')].dropna(), bin
plt.hist(train_data['Age'][(train_data['Sex'] == 'male')].dropna(), bins=
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age wise Distribution of Male and Female passengers')
plt.show()
In [ ]: #There were many male passengers aboared compared to female passengers.
In [85]: #survival based on passenger’s class for both genders.
grouped_by_pclass = train_data.groupby(['Pclass', 'Survived', 'Sex'])
grouped_by_pclass.size()
Pclass Survived Sex
Out[85]:
1 0 female 3
male 77
1 female 91
male 45
2 0 female 6
male 91
1 female 70
male 17
3 0 female 72
male 300
1 female 72
male 47
dtype: int64
In [ ]: