Data Science Practical Notebook
T.Y.B.Sc(Computer Science)
CS 358 : Data Science Practicals
Assignment 1 : The Data Science Environment
Assignment 2 : Statistical Data Analysis
Assignment 3 : Data Preprocessing
Assignment 4 : Data Visualization
ASSIGNMENT 1 : THE DATA SCIENCE ENVIRONMENT
SET A
1.
Create and view a data frame
#import the library
import pandas as pd
import numpy as np
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'
],
'Age' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Percentage' : [56,62,42,74,32,63,74,84,96,21]
}
#Create a dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame
index Name Age Percentage
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
2.
#print shape >> number of rows - columns
print("Size={}\n Shape={}\nNumber of rows={}\nNumber of Columns={}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
print("\n Feature Names = {}, {}, {}".
format(data.columns[0], data.columns[1], data.columns[2]))
Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3
Feature Names = Name, Age, Percentage
3.
Adding 5 rows and 1 column
data.loc[10] = ['K',21,56 ]
data.loc[11] = ['L',21,None]
data.loc[12] = ['M',None, 45]
data.loc[13] = ['K',21,56]
data.loc[14] = ['O',25,84]
data["Remarks"] = None
data
index Name Age Percentage Remarks
0 A 26 56 null
1 B 28 62 null
2 C 20 42 null
3 D 15 74 null
4 E 20 32 null
5 F 16 63 null
6 G 18 74 null
7 H 17 84 null
8 I 22 96 null
9 J 21 21 null
10 K 21 56 null
11 L 21 null null
12 M null 45 null
13 K 21 56 null
14 O 25 84 null
4.
# Report the number of rows, the total count of missing cells, and how many
# rows duplicate an earlier row.
print("Number of Observations = ", len(data.index))
# isnull().sum() counts missing cells per column; the second sum() totals them.
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
# duplicated() flags rows that repeat an earlier row; value_counts() tallies
# the True/False flags. The closing parenthesis of print() was missing.
print(data.duplicated().value_counts())  # number of duplicate values
Number of Observations = 15
Total missing values in a DataFrame :
17
Total missing values in a DataFrame :
17
False 14
True 1
dtype: int64
5.
Removing a column and missing values
data2=data.drop(columns='Remarks')
data2=data2.dropna(axis=0)
#print modified data
data2
index Name Age Percentage
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
10 K 21 56
13 K 21 56
14 O 25 84
6.
Scatterplot
# plt.show() below needs matplotlib, which is not imported before this point.
import matplotlib.pyplot as plt

# Scatter plot of the Percentage column against the Name column.
data2.plot.scatter(x='Name', y='Percentage',
                   title="Scatterplot")
plt.show()
SET B
1.
import pandas as pd
data=pd.read_csv('SOCR-HeightWeight.csv')
data.tail(10) #print last 10 rows
data.sample(20) #print 20 random rows
data.head(10) #print first 10 rows
index Height(Inches) Weight(Pounds)
0 65.78331 112.9925
1 71.51521 136.4873
2 69.39874 153.0269
3 68.2166 142.3354
4 67.78781 144.2971
5 68.69784 123.3024
6 69.80204 141.4947
7 70.01472 136.4623
8 67.90265 112.3723
9 66.78236 120.6672
2.
Add column "BMI"
# With weight in pounds and height in inches, BMI = 703 * weight / height**2.
# The 703 factor converts lb/in^2 to the kg/m^2 scale; without it the values
# come out near 0.02-0.04 instead of the usual 13-26 range.
# NOTE: the Maximum/Minimum BMI output recorded after the next step was
# produced without this factor; re-run that step to refresh it.
data2 = data.assign(
    BMI=703 * data['Weight(Pounds)'] / (data['Height(Inches)'] *
                                        data['Height(Inches)'])
)
3.
print("\n Maximum BMI = ",max(data2['BMI']))
print("\n Minimum BMI = ",min(data2['BMI']))
Maximum BMI = 0.03701443692089851
Minimum BMI = 0.018591137267932455
ASSIGNMENT 2 : STATISTICAL DATA ANALYSIS
SET A :
1.
import numpy as np
#Inserting the two data points
a=np.array((2,3))
b=np.array((4,5))
#Euclidean Distance
print("Euclidean Distance = ", np.linalg.norm(a-b))
Euclidean Distance = 2.8284271247461903
2.
Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'
],
'Scores' : [56,62,42,74,32,63,74,84,96,21]
}
#Create a dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
print(data) #To view the data frame
print("\n Mean Score = ",s.tmean(data["Scores"]) )
print("\n Maximum = ",max(data["Scores"]))
print("\n Minimum = ",min(data["Scores"]))
print("\n Range = ",
max(data["Scores"]) - min(data["Scores"]) )
q3,q1 = np.percentile(data["Scores"],[75,25])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)
Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21
Mean Score = 60.4
Maximum = 96
Minimum = 21
Range = 75
Q3 = 74.0
Q1 = 45.5
IQR = 28.5
3.
Program to find Manhattan distance between two points
import math
def manhattan(a, b, n=None):
    """Return the Manhattan (L1) distance between points a and b.

    a, b -- indexable sequences of numeric coordinates.
    n    -- number of leading coordinates to compare. When omitted, every
            coordinate is used and a and b must have the same length.

    Raises ValueError when n is omitted and the lengths of a and b differ.
    """
    if n is None:
        if len(a) != len(b):
            raise ValueError("a and b must have the same number of coordinates")
        n = len(a)
    # Summing a generator with the built-in sum() replaces the manual
    # accumulator, which used to shadow that built-in.
    return sum(abs(a[i] - b[i]) for i in range(n))
a=[3,5,5,6,5,4,3]
b=[-2,3,2,-5,2,3,-1]
n=len(a) #or len(b)
print("Manhattan Distance = ", manhattan(a,b,n))
Manhattan Distance = 29
SET B
1.
data=pd.read_csv('iris.csv')
print("Number of records for different variety/class attribute \n")
data['variety'].value_counts()
Number of records for different variety/class attribute
Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64
2.
import pandas as pd
from pandas.api.types import is_numeric_dtype
# Print the mean and median of every numeric column of the iris data.
# The loop and if bodies are re-indented; the flattened form does not parse.
print("Iris Dataset : Column wise Mean and Median \n")
for col in data.columns:
    if not is_numeric_dtype(data[col]):
        continue  # skip non-numeric columns
    print('%s:' % (col))
    print('\t Mean = %.2f' % data[col].mean())
    print('\t Median = %.2f' % data[col].median())
Iris Dataset : Column wise Mean and Median
sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30
SET C :
1.
Program to find Minkowskii Distance between two points
from math import *
from decimal import Decimal
def nth_root(value, root):
    """Return the root-th root of value as a Decimal rounded to 3 places.

    value -- number whose root is taken (anything Decimal() accepts).
    root  -- degree of the root; it is converted with float(), so 0 raises
             ZeroDivisionError.
    """
    # The exponent 1/root is computed in float first and then converted.
    exponent = Decimal(1 / float(root))
    return round(Decimal(value) ** exponent, 3)
def minkowski(a, b, n):
    """Return the Minkowski distance of order n between points a and b.

    a, b -- iterables of numeric coordinates; they are paired with zip(), so
            surplus coordinates of the longer one are ignored.
    n    -- order of the distance. It is used both as the power applied to
            each coordinate difference and as the degree of the final root.
    """
    powered_differences = (pow(abs(i - j), n) for i, j in zip(a, b))
    return nth_root(sum(powered_differences), n)
a=[-1,5]
b=[2,4]
n=len(a) #OR root value
print("\n Minkowski Distance = ",minkowski(a,b,n))
Minkowski Distance = 3.162
2.
import numpy as np
x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
# The second label used to repeat "array1".
print("\nOriginal array2:")
print(y)
# np.cov(x, y) returns the 2x2 covariance matrix of x and y, not a
# cross-correlation, so the label now names what is actually printed.
print("\nCovariance matrix of the said arrays:\n", np.cov(x, y))
Original array1:
[0 1 3]
Original array1:
[2 4 5]
Cross-correlation of the said arrays:
[[2.33333333 2.16666667]
[2.16666667 2.33333333]]
3.
Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Student' : ["1","2","3","4","5","6","7","8","9","10"],
'Subject 1':[41,62,35,15,21,65,84,75,42,95],
'Subject 2' : [56,62,42,74,32,63,74,84,96,21],
'Subject 3' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Subject 4' : [41,75,84,62,13,56,42,84,95,23],
'Subject 5' : [45,74,62,31,21,54,45,86,95,32]
}
#Create a dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame
Index Student Subject 1 Subject 2 Subject 3 Subject 4 Subject 5
0 1 41 56 26 41 45
1 2 62 62 28 75 74
2 3 35 42 20 84 62
3 4 15 74 15 62 31
4 5 21 32 20 13 21
5 6 65 63 16 56 54
6 7 84 74 18 42 45
7 8 75 84 17 84 86
8 9 42 96 22 95 95
9 10 95 21 21 23 32
from pandas.api.types import is_numeric_dtype
from scipy.stats.mstats import gmean
import statistics as stat
# Print the arithmetic, geometric and harmonic mean of each numeric column.
# The loop and if bodies are re-indented; the flattened form does not parse.
print("Subject wise Mean \n")
for col in data.columns:
    if not is_numeric_dtype(data[col]):
        continue  # 'Student' holds string ids, so it is skipped
    print('%s:' % (col))
    print('\t Arithmetic Mean = %.2f' % data[col].mean())
    print('\t Geometric Mean = %.2f' % gmean(data[col]))
    print('\t Harmonic Mean = %.2f' % stat.harmonic_mean(data[col]))
Subject wise Mean
Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
ASSIGNMENT 3 : DATA PREPROCESSING
SET A
1.
import pandas as pd
import io
data = pd.read_csv('Data.csv',sep = ',')
data
index Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
a.
data.describe()
index Age Salary
count 9.0 9.0
mean 38.77777777777778 63777.77777777778
std 7.693792591722527 12265.579661982732
min 27.0 48000.0
25% 35.0 54000.0
50% 38.0 61000.0
75% 44.0 72000.0
max 50.0 83000.0
b.
# Summarise the DataFrame's dimensions. The format string was split across
# two lines inside the quotes, which left the literal unterminated; it is
# rejoined here on a single line.
print("Size = {} \n Shape of DataFrame Object = {}\n Number of rows = {} \n Number of Columns = {}".
      format(data.size, data.shape, data.shape[0], data.shape[1]))
Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4
c.
print("\n first 3 rows from Dataset")
data.head(3)
First 3 rows from dataset
index Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
2. a.
Applying OneHot Encoding on Country Column
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc_data= pd.DataFrame(enc.fit_transform(data[['Country']]).toarray()
)
enc_data
index 0 1 2 3
0 1.0 0.0 0.0 0.0
1 0.0 0.0 0.0 1.0
2 0.0 0.0 1.0 0.0
3 0.0 0.0 0.0 1.0
4 0.0 0.0 1.0 0.0
5 1.0 0.0 0.0 0.0
6 0.0 0.0 0.0 1.0
7 0.0 1.0 0.0 0.0
8 0.0 0.0 1.0 0.0
9 1.0 0.0 0.0 0.0
data_merge= data.join(enc_data)
data_merge
index Country Age Salary Purchased 0 1 2 3
0 France 44.0 72000.0 No 1.0 0.0 0.0 0.0
1 Spain 27.0 48000.0 Yes 0.0 0.0 0.0 1.0
2 Germany 30.0 54000.0 No 0.0 0.0 1.0 0.0
3 Spain 38.0 61000.0 No 0.0 0.0 0.0 1.0
4 Germany 40.0 NaN Yes 0.0 0.0 1.0 0.0
5 France 35.0 58000.0 Yes 1.0 0.0 0.0 0.0
6 Spain NaN 52000.0 No 0.0 0.0 0.0 1.0
7 France 48.0 79000.0 Yes 0.0 1.0 0.0 0.0
8 Germany 50.0 83000.0 No 0.0 0.0 1.0 0.0
9 France 37.0 67000.0 Yes 1.0 0.0 0.0 0.0
b.
Applying label encoding on purchased column
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
data['Purchased'] = labelencoder.fit_transform(data['Purchased'])
data
index Country Age Salary Purchased
0 France 44.0 72000.0 0
1 Spain 27.0 48000.0 1
2 Germany 30.0 54000.0 0
3 Spain 38.0 61000.0 0
4 Germany 40.0 NaN 1
5 France 35.0 58000.0 1
6 Spain NaN 52000.0 0
7 France 48.0 79000.0 1
8 Germany 50.0 83000.0 0
9 France 37.0 67000.0 1
#The purchased labels are replaced by numbers 0 and 1, where 'No' is
#assigned 0, and 'Yes' is assigned 1.
SET B
1.
# Rescaling Data
import pandas, scipy, numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
array=data.values
#Separating data into input and output components
data_scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
print(data_scaled.round(3))
Min Max Scaled Data
[[0.248 ………………………………, 0.4]
………………………………………………,
[0.124 ………………………, 0.4 0.6]]
2.
# Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s
scaler=StandardScaler().fit(data)
std_data=scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
print("\n Standardized Mean : ",s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ",round(std_data.std(),2))
Standardized Data
[[-0.528 ………………………… ]
[………………………………,
[………………………………………, 0.45084835]]
Standardized Mean : 0.0
Standardized Standard Deviation : 1.0
3.
# Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
norm_data
Normalized Data
array([[0.099…………………………………….],
[………………………………………….., 0.06487013]])
4.
# Binarizing Data
# Binarizer maps values greater than the threshold (0.0) to 1 and all others
# to 0. The method name `transform` was split across two lines and is rejoined.
binarized_data = preprocessing.Binarizer(threshold=0.0).fit(data).transform(data)
print("\n Binarized Data \n ")
binarized_data
Binarized Data
array([[1., 1., 0., ..., 1., 1., 1.],
[1., …………......, 1., 1., 1.],
[1., 1., 1., ..., 1., 1., 1.]])
SET C
1.
import pandas as pd
import io
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed.
data = pd.read_csv('Student_bucketing.csv')
# pd.cut with an integer bin count splits the range of 'marks' into 5
# equal-width intervals and names them in ascending order. The label
# 'Above_average' was split across two lines inside its quotes.
bucket_labels = ['Poor', 'Below_average', 'Average', 'Above_average',
                 'Excellent']
data['bucket'] = pd.cut(data['marks'], 5, labels=bucket_labels)
data.head(10)
index Student_id Age Grade Employed marks bucket
0 1 19 1st Class yes 29 Poor
1 2 20 2nd Class no 41 Below_average
2 3 18 1st Class no 57 Average
3 4 21 2nd Class no 29 Poor
4 5 19 1st Class no 57 Average
5 6 20 2nd Class yes 53 Average
6 7 19 3rd Class yes 78 Above_average
7 8 21 3rd Class yes 70 Above_average
8 9 22 3rd Class yes 97 Excellent
9 10 21 1st Class no 58 Average
ASSIGNMENT 4 : DATA VISUALIZATION
SET A
1.
from matplotlib import pyplot as plt
import numpy as np
# generate random array using NumPy
a1 = np.random.randn(50)
a2 = np.random.randn(50)

# Line chart of a1 against its index.
plt.plot(a1, color="k", linewidth=1, linestyle=':')
plt.title("Line Chart")
plt.show()

# Scatter plot of a1 against a2, coloured by a third random sample.
plt.scatter(a1, a2, c=np.random.randn(50), marker='*', alpha=0.9)
plt.title("Scatter Plot")
plt.show()

# Histogram of a2. It is now titled and shown like the two plots above, so
# later drawing calls do not land on the same axes.
plt.hist(a2, bins=15, facecolor='lawngreen', edgecolor="k", alpha=0.7)
plt.title("Histogram")
print("Histogram")
plt.show()
Histogram
# Start a fresh figure so the box plot is not drawn over the histogram axes.
plt.figure()
# Horizontal box plot of a2; patch_artist=True draws the box as a filled patch.
box = plt.boxplot(a2, vert=False, patch_artist=True)
plt.title("Boxplot")
print("Boxplot")
plt.show()
2.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
sns.countplot(x='variety',data = data)
plt.title("Iris Species Count")
plt.show()
3.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
# plt.subplots returns a (figure, axes) pair. Unpack it and hand the axes to
# pandas explicitly; the tuple used to be bound to `ax` and never used.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Pie chart of the share of each variety, with every slice slightly exploded.
data['variety'].value_counts().plot.pie(explode=[0.1, 0.1, 0.1],
                                        autopct='%1.1f%%', shadow=True, ax=ax)
plt.title("Iris Species %")
plt.show()
4.
import seaborn as sns
# Per-variety subsets of the iris data.
iris_setosa = data.loc[data["variety"] == "Setosa"]
iris_virginica = data.loc[data["variety"] == "Virginica"]
iris_versicolor = data.loc[data["variety"] == "Versicolor"]

# One histogram grid per feature, coloured by variety. The four near-identical
# calls are folded into a loop; `add_legend` was split across lines in each.
for feature in ("petal.length", "petal.width", "sepal.length", "sepal.width"):
    sns.FacetGrid(data, hue="variety").map(sns.histplot, feature).add_legend()
plt.show()
SET B
1.
import seaborn as sns
import matplotlib.pyplot as plt
def graph(a):
    """Draw a box plot of column a grouped by 'variety' on the current axes.

    a -- name of the column of the module-level `data` frame for the y axis.
    """
    sns.boxplot(x="variety", y=a, data=data)
plt.figure(figsize=(10,10))
plt.subplot(221)
graph('sepal.length')
plt.subplot(222)
graph('sepal.width')
plt.subplot(223)
graph('petal.length')
plt.subplot(224)
graph('petal.width')
plt.show()
SET C
1.
#Plot to compare all features of iris dataset
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(data,hue='variety', height=2)
plt.show()
s
2.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
# Joint KDE of sepal length against sepal width. `fill=True` replaces the
# deprecated `shade=True`; the file already relies on sns.histplot, which
# arrived in the same seaborn release as `fill`.
g = sns.jointplot(x="sepal.length", y="sepal.width", fill=True,
                  data=data, kind="kde", color="b")
# Overlay the raw observations as gold stars on the joint axes.
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()