In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [2]: df=pd.read_csv('titan.csv')
df.head()
Out[2]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare ... Embarked WikiId
Braund,
A/5
0 1 0.0 3 Mr. Owen male 22.0 1 0 7.2500 ... S 691.0
21171
Harris
Cumings,
Mrs. John
Bradley
1 2 1.0 1 female 38.0 1 0 PC 17599 71.2833 ... C 90.0
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1.0 3 Miss. female 26.0 0 0 7.9250 ... S 865.0
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1.0 1 female 35.0 1 0 113803 53.1000 ... S 127.0
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0.0 3 William male 35.0 0 0 373450 8.0500 ... S 627.0
Henry
5 rows × 21 columns
In [3]: df.describe()  # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
Out[3]: PassengerId Survived Pclass Age SibSp Parch Fare WikiId
count 1309.000000 891.000000 1309.000000 1046.000000 1309.000000 1309.000000 1308.000000 1304.000000 13
mean 655.000000 0.383838 2.294882 29.881138 0.498854 0.385027 33.295479 658.534509
std 378.020061 0.486592 0.837836 14.413493 1.041658 0.865560 51.758668 380.377373
min 1.000000 0.000000 1.000000 0.170000 0.000000 0.000000 0.000000 1.000000
25% 328.000000 0.000000 2.000000 21.000000 0.000000 0.000000 7.895800 326.750000
50% 655.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 661.500000
75% 982.000000 1.000000 3.000000 39.000000 1.000000 0.000000 31.275000 987.250000
max 1309.000000 1.000000 3.000000 80.000000 8.000000 9.000000 512.329200 1314.000000
In [4]: df.info()  # Per-column dtype and non-null count, plus overall memory usage.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 1309 non-null int64
1 Survived 891 non-null float64
2 Pclass 1309 non-null int64
3 Name 1309 non-null object
4 Sex 1309 non-null object
5 Age 1046 non-null float64
6 SibSp 1309 non-null int64
7 Parch 1309 non-null int64
8 Ticket 1309 non-null object
9 Fare 1308 non-null float64
10 Cabin 295 non-null object
11 Embarked 1307 non-null object
12 WikiId 1304 non-null float64
13 Name_wiki 1304 non-null object
14 Age_wiki 1302 non-null float64
15 Hometown 1304 non-null object
16 Boarded 1304 non-null object
17 Destination 1304 non-null object
18 Lifeboat 502 non-null object
19 Body 130 non-null object
20 Class 1304 non-null float64
dtypes: float64(6), int64(4), object(11)
memory usage: 214.9+ KB
In [5]: df.shape  # (number of rows, number of columns) of the loaded frame.
(1309, 21)
Out[5]:
In [6]: df.isnull().sum()  # Number of missing values in each column.
PassengerId 0
Out[6]:
Survived 418
Pclass 0
Name 0
Sex 0
Age 263
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 1014
Embarked 2
WikiId 5
Name_wiki 5
Age_wiki 7
Hometown 5
Boarded 5
Destination 5
Lifeboat 807
Body 1179
Class 5
dtype: int64
In [7]: df.sample(10)
Out[7]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare ... Embarked W
797 798 1.0 3 Osman, Mrs. female 31.00 0 0 349244 8.6833 ... S
Mara
Hanna, Mr.
296 297 0.0 3 male 23.50 0 0 2693 7.2292 ... C
Mansour
Baclini,
644 645 1.0 3 Miss. female 0.75 2 1 2666 19.2583 ... C
Eugenie
Candee,
Mrs. Edward
PC
1115 1116 NaN 1 (Helen female 53.00 0 0 27.4458 ... C
17606
Churchill
Hungerford)
Denkoff, Mr.
335 336 0.0 3 male NaN 0 0 349225 7.8958 ... S
Mitto
Ling, Mr.
169 170 0.0 3 male 28.00 0 0 1601 56.4958 ... S
Lee
Barry, Miss.
977 978 NaN 3 female 27.00 0 0 330844 7.8792 ... Q
Julia
Warren,
Mrs. Frank
Manley
366 367 1.0 1 female 60.00 1 0 110813 75.2500 ... C
(Anna
Sophia
Atkinson)
Elias, Mr.
532 533 0.0 3 male 17.00 1 1 2690 7.2292 ... C
Joseph Jr
Moran, Mr.
5 6 0.0 3 male NaN 0 0 330877 8.4583 ... Q
James
10 rows × 21 columns
UNIVARIATE ANALYSIS
KDE PLOT
In [8]: plt.figure(figsize=(4,3))
sns.kdeplot(data=df.PassengerId)
plt.show()
In [9]: plt.figure(figsize=(4,3))
sns.kdeplot(data=df.Age)
plt.show()
In [10]: plt.figure(figsize=(4,3))
sns.kdeplot(data=df.Fare)
plt.show()
HISTPLOT
In [11]: sns.histplot(df.Fare)
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
Out[11]:
BOX PLOT
In [12]: sns.boxplot(df.Age)
plt.show()
In [13]: sns.boxplot(x='Embarked', y='Age', data=df)
plt.title("Age distribution as function of Embarked Port")
plt.show()
In [14]: sns.boxplot(x='Embarked', y='Fare', data=df)
plt.title("Fare distribution as function of Embarked Port")
plt.show()
MULTIVARIATE ANALYSIS
LINE PLOT
In [15]: sns.lineplot(x='Age', y='Fare', data=df)
plt.title('Age vs Fare')
plt.show()
PIE CHART
In [16]: df.columns  # List the column labels available for grouping and plotting.
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
Out[16]:
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'WikiId', 'Name_wiki',
'Age_wiki', 'Hometown', 'Boarded', 'Destination', 'Lifeboat', 'Body',
'Class'],
dtype='object')
In [17]: pclass_survived=df.groupby(['Pclass'])['Survived'].sum()
In [18]: pclass_survived  # Display the per-class sums of the Survived column.
Pclass
Out[18]:
1 136.0
2 87.0
3 119.0
Name: Survived, dtype: float64
In [19]: sns.set_style('ticks')
pclass_survived.plot.pie()
plt.legend()
plt.show()
In [20]: pclass_sex_Survived=df.groupby(['Pclass','Sex'])['Survived'].sum()
In [21]: pclass_sex_Survived  # Display the Survived sums for each (Pclass, Sex) group.
Pclass Sex
Out[21]:
1 female 91.0
male 45.0
2 female 70.0
male 17.0
3 female 72.0
male 47.0
Name: Survived, dtype: float64
In [22]: pclass_sex_Survived.plot.pie(autopct = '%1.2f%%')
plt.legend(bbox_to_anchor=(1.5,1),loc='upper left',borderaxespad=0)
plt.show()
BAR CHART
In [23]: sns.countplot(x='Sex',data=df)
<Axes: xlabel='Sex', ylabel='count'>
Out[23]:
In [24]: sns.catplot(x ="Sex", hue ="Survived",
kind ="count", data = df);
COUNT PLOT
In [25]: sns.countplot(x='Embarked', hue='Pclass', data=df)
plt.title("Count of Passengers as function of Embarked Port")
plt.show()
In [26]: plt.figure(figsize=(4,3))
sns.set_style('darkgrid')
sns.countplot(x='Pclass',hue='Survived',data=df)
plt.title('Pclass:Survived vs Dead')
plt.show()
In [27]: plt.figure(figsize=(4,3))
sns.set_style('darkgrid')
sns.countplot(x='Pclass',hue='Sex',data=df)
plt.title('Pclass:Sex vs Dead')
plt.show()
violin plot
In [28]: # Violinplot Displays distribution of data
# across all levels of a category.
sns.violinplot(x ="Sex", y ="Age", hue ="Survived",
data = df, split = True)
<Axes: xlabel='Sex', ylabel='Age'>
Out[28]:
This graph gives a summary of the age range of men, women and children who were saved. The survival rate
is –
Good for children.
High for women in the age range 20-50.
Lower for men as age increases.
Since the Age column is important, the missing values need to be filled, either by using the Name
column (ascertaining age based on salutation – Mr, Mrs, etc.) or by using a regressor. After this step, another
column – Age_Range (based on the Age column) – can be created and the data can be analyzed again.
BAR PLOT
In [29]: plt.figure(figsize=(8,4))
sns.barplot(x='SibSp',y='Survived',data=df)
plt.title('SibSp & Survived')
plt.show()
In [30]: # Divide Fare into 4 bins
df['Fare_Range'] = pd.qcut(df['Fare'], 4)
# Barplot - Shows approximate values based
# on the height of bars.
sns.barplot(x ='Fare_Range', y ='Survived',
data = df)
<Axes: xlabel='Fare_Range', ylabel='Survived'>
Out[30]:
Fare denotes the fare paid by a passenger. As the values in this column are continuous, they need to be put
in separate bins (as suggested above for the Age feature) to get a clear idea. The plot suggests that passengers
who paid a higher fare had a higher survival rate.
Pair plot
In [31]: sns.pairplot(data=df)
plt.show()
Heat map
In [32]: heat_map=df.corr()
sns.heatmap(heat_map)
plt.show()
In [33]: plt.scatter(df.Fare,df.Age);
strip plot
In [34]: sns.stripplot(x='Fare',y='Age',data=df)
plt.show()
In [35]: sns.stripplot(x='Fare',y='Age',data=df,size=4)
plt.show()
In [ ]:
In [ ]: