# Pg 24, Table 2.
# Import required packages
import pandas as pd
# Load data
housing_df = pd.read_csv('WestRoxbury.csv')
housing_df.shape  # find the dimension of data frame
housing_df.head()  # show the first five rows
print(housing_df)  # print the data frame

# Rename columns: replace spaces with '_' to allow dot notation
# NOTE(review): assumes the CSV header contains 'Total Value' -- confirm
housing_df = housing_df.rename(columns={'Total Value': 'Total_Value'})  # explicit
housing_df.columns = [s.strip().replace(' ', '_') for s in housing_df.columns]  # all columns

# Practice showing the first four rows of the data
housing_df.loc[0:3]  # loc[a:b] gives rows a to b, inclusive
housing_df.iloc[0:4]  # iloc[a:b] gives rows a to b-1

# Show the first 10 values in column Total_Value
housing_df['Total_Value'].iloc[0:10]

# Show the row at position 4 across the first 10 columns
housing_df.iloc[4, 0:10]  # scalar row index returns a Series
housing_df.iloc[4:5, 0:10]  # use a slice to return a data frame

# Use pd.concat to combine non-consecutive columns into a new data frame
# The axis argument specifies the dimension along which the
# concatenation happens, 0=rows, 1=columns
pd.concat([housing_df.iloc[4:6, 0:2], housing_df.iloc[4:6, 4:6]], axis=1)

# To specify a full column, use:
housing_df.iloc[:, 0:1]  # column slice keeps the result a data frame
housing_df.Total_Value  # dot notation works because the name has no spaces
housing_df['Total_Value'][0:10]  # show the first 10 rows of Total_Value

# Descriptive statistics
print('Number of rows ', len(housing_df['Total_Value']))  # length of the column
print('Mean of Total_Value ', housing_df['Total_Value'].mean())  # show mean of column
housing_df.describe()  # show summary statistics for each column
# Table 4.3
# Load the cereals data
cereals_df = pd.read_csv('Cereals.csv')
# NOTE(review): DataFrame.rename ignores keys that match no column by default,
# so this is a silent no-op if 'CAT, MEDV' is not a column -- confirm the
# intended column name for this data set.
cereals_df = cereals_df.rename(columns={'CAT, MEDV': 'CAT_MEDV'})
cereals_df.head(9)  # show the first nine rows
cereals_df.describe()  # show summary statistics for each column

# Compute mean, standard deviation, min, max, median
# cereals
# NOTE(review): assumes the data frame has a column named 'cereals' -- confirm
print('Mean : ', cereals_df.cereals.mean())
print('Std. dev : ', cereals_df.cereals.std())
print('Min : ', cereals_df.cereals.min())
print('Max : ', cereals_df.cereals.max())
print('Median : ', cereals_df.cereals.median())

# Compute mean, standard dev., min, max, median for all columns at once.
# numeric_only=True restricts every statistic to the same numeric columns, so
# the table stays aligned and no statistic raises on text columns.
pd.DataFrame({
    'mean': cereals_df.mean(numeric_only=True),
    'sd': cereals_df.std(numeric_only=True),
    'min': cereals_df.min(numeric_only=True),
    'max': cereals_df.max(numeric_only=True),
    'median': cereals_df.median(numeric_only=True),
})
# Python code in practice
# Data handling and plotting libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Equivalent of the `%matplotlib inline` magic, written as plain Python so the
# file also parses outside IPython/Jupyter. `get_ipython` only exists inside an
# IPython session; elsewhere the NameError is deliberately ignored.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Load the cereals data and preview the first five rows
df = pd.read_csv("Cereals.csv")
df.head()