Pandas
Pandas is a high-level data manipulation tool developed by Wes McKinney.
It is built on top of the NumPy package.
Pandas' key data structures are the Series and the DataFrame.
DataFrames let you store and manipulate tabular data in rows of observations and
columns of variables.
Pandas is an open-source Python package that is widely used for data
science/data analysis and machine learning tasks.
What Can You Do With DataFrames Using Pandas?
Data cleansing
Data fill
Data normalization
Merges and joins
Data visualization
Statistical analysis
Data inspection
Loading and saving data
Series: a Series is similar to a NumPy array, except that we can give it a named or
datetime index instead of a numerical index.
import numpy as np
import pandas as pd

# A Series can be built from a list, a NumPy array, or a dict.
# (renamed the misspelled `lable` -> labels and `dis` -> data_dict)
labels = ['a', 'b', 'c']
lst = [10, 20, 30]
arr = np.array([10, 20, 30])
data_dict = {'a': 10, 'b': 20, 'c': 30}

pd.Series(lst)                 # default integer index 0..2
pd.Series(lst, labels)         # named index 'a'..'c'
pd.Series(arr, labels)         # a NumPy array works the same way
pd.Series(data_dict)           # dict keys become the index
pd.Series([sum, print, len])   # a Series can even hold functions

# Arithmetic between Series aligns on the index; a label present in only
# one operand produces NaN in the result.
ser1 = pd.Series([1, 2, 3, 4], ['USA', 'CHAINA', 'FRANCE', 'GERMANY'])
ser2 = pd.Series([1, 2, 3, 4], ['USA', 'CHAINA', 'INDIA', 'SINGAPOOR'])
ser1
ser2
ser1['USA']
ser1 + ser2
A DataFrame is built directly on top of Series; it is widely used for financial data.
The numpy.random.randn() function creates an array of specified shape and fills it with random
values as per standard normal distribution.
import numpy as np
import pandas as pd
from numpy.random import randn

np.random.seed(101)  # fixed seed so the random values are reproducible
df = pd.DataFrame(randn(5, 4), ['A', 'B', 'C', 'D', 'E'], ['W', 'X', 'Y', 'Z'])

# Columns are Series all sharing a common index.
df['W']
type(df['W'])        # pandas Series
type(df)             # pandas DataFrame
df.W                 # attribute access is case-sensitive: was `df.w` (AttributeError)
df[['W', 'X']]       # a list of column names returns a DataFrame

# Adding a new column to the DataFrame.
df['new'] = df['Y'] + df['Z']
# df.drop('new')     # KeyError: drop defaults to axis=0 (rows); 'new' is a column
df.drop('new', axis=1)                # returns a copy; df still has 'new'
df
df.drop('new', axis=1, inplace=True)  # now df itself is modified
df.drop('E', inplace=True)            # axis=0 is the default: drops row 'E'
# Selecting rows in two ways: .loc uses labels, .iloc uses integer position.
df.loc['A']
df.iloc[2]
df.loc[['A', 'B']]

# Subset of rows and columns.
# Select rows A, B and columns W, Y:
df.loc[['A', 'B'], ['W', 'Y']]
df.iloc[2:, :]
df.iloc[2:, 2:]
df.iloc[2:, :2]
df.iloc[:2, :2]
df.iloc[1:3, 1:3]   # was `f.iloc` — NameError typo, fixed to df
df.iloc[-2:, -2:]
df.iloc[0:2, 0:2]
# Conditional (boolean) selection.
df > 0
booldf = df > 0
df[booldf]          # same as df[df > 0]; False positions become NaN
df[df > 0]
df['W'] > 0         # boolean Series
df[df['W'] > 0]     # keeps only rows where column W is positive

resultdf = df[df['W'] < 0]
resultdf
resultdf[['X', 'Z']]

# Instead of doing it in two steps:
df[df['W'] < 0][['X', 'Z']]

# Combine conditions with & (and) / | (or) — not Python's `and`/`or`.
df[(df['W'] < 0) & (df['Y'] > 0)]
df[(df['W'] < 0) | (df['Y'] > 0)]

df.reset_index()    # returns a copy with the index moved into a column

# df has only 4 rows here (row 'E' was dropped earlier with inplace=True),
# so the new column must also have 4 values — the original 5-element list
# raised ValueError on assignment.
lst = ['TN', 'AP', 'KA', 'MH']
df['STATE'] = lst
# Multi-level (hierarchical) index DataFrame.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

df = pd.DataFrame(randn(6, 2), hier_index, ['A', 'B'])
df.loc['G1']             # index on the outer level first
df.loc['G1']['A']
df.index.names           # unnamed by default: FrozenList([None, None])
df.index.names = ['Groups', 'Num']
df.loc['G2'].loc[2]['B']

# Cross section: grab a slice at any level of the MultiIndex.
df.xs('G1')
df.xs(1, level='Num')
df.xs(('G1', 2))
# Missing data.
d = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]}
df = pd.DataFrame(d)
df.dropna()            # drop any row containing a NaN
df.dropna(axis=1)      # drop any column containing a NaN
df.dropna(thresh=2)    # keep rows with at least 2 non-NaN values

# Fill values.
df.fillna(value=0)
df['A'].fillna(df['A'].mean())
# Avoid inplace=True on a column selection (chained assignment, deprecated
# in modern pandas) — assign the result back instead:
df['A'] = df['A'].fillna(df['A'].mean())
# Grouping.
d = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
     'Person': ['RAM', 'SHAM', 'SUNIL', 'SUDEEP', 'RAHEEM', 'SHEETAL'],
     'Sales': [250, 400, 200, 150, 350, 100]}
df = pd.DataFrame(d)

# The original used `bycomp` without ever creating it — define it first.
bycomp = df.groupby('Company')

# mean/std need numeric_only=True in pandas >= 2.0 because the
# 'Person' column is non-numeric.
bycomp.mean(numeric_only=True)
bycomp.max()
bycomp.std(numeric_only=True)
bycomp.min()
bycomp.sum()
bycomp.sum().loc['FB']
bycomp.describe()
bycomp.describe().transpose()
df.groupby('Company').describe().transpose()['FB']
Merging, joining, and concatenation
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7']},
                   index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                    'B': ['B8', 'B9', 'B10', 'B11'],
                    'C': ['C8', 'C9', 'C10', 'C11']},
                   index=[8, 9, 10, 11])

# Concatenate — glue DataFrames together along an axis.
# The original passed df2 twice and never used df3; fixed to [df1, df2, df3].
pd.concat([df1, df2, df3])           # stack rows (axis=0)
pd.concat([df1, df2, df3], axis=1)   # side by side, aligned on the index
# SQL-style merge of two DataFrames on a shared key column.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
pd.merge(left, right, how='inner', on='key')
# Employee / department example: merge on a column, then join on the index.
emp = pd.DataFrame({
    'EMPNO': ['E001', 'E0002', 'E003', 'E004'],
    'ENAME': ['BABJEE', 'RAM', 'SUNIL', 'SHAM'],
    'DEPTNO': [10, 10, 20, 30],
})
dept = pd.DataFrame({'Dname': ['Accounts', 'Admin', 'It'], 'DEPTNO': [10, 20, 50]})
pd.merge(emp, dept, how='inner', on='DEPTNO')   # only DEPTNO 10 and 20 match

# join() aligns on the index instead of a key column.
emp = pd.DataFrame({
    'EMPNO': ['E001', 'E0002', 'E003', 'E004'],
    'ENAME': ['BABJEE', 'RAM', 'SUNIL', 'SHAM'],
}, index=[10, 10, 20, 30])
dept = pd.DataFrame({
    'DNAME': ['Accounts', 'Admin', 'It'],
    'LOCATION': ['CHENNAI', 'MUMBAI', 'PUNE'],
}, index=[10, 20, 50])
emp.join(dept, how='inner')
emp.join(dept, how='outer')
df = pd.DataFrame({'Col1': [1, 2, 3, 4],
                   'Col2': [444, 555, 666, 444],
                   'Col3': ['abc', 'def', 'ghi', 'xyz']})
df.head(2)                   # first 2 rows
df.tail(2)                   # last 2 rows
df['Col2'].unique()          # array of the distinct values
len(df['Col2'].unique())
df['Col2'].nunique()         # same count, directly
df['Col2'].value_counts()    # frequency of each value
df[df['Col1'] > 2]
df[(df['Col1'] > 2) & (df['Col2'] == 444)]
df['Col1'].sum()

# Custom function applied element-wise with .apply().
def times2(x):
    # Original read `retrun x*x` — a SyntaxError, and x*x squares rather
    # than doubling; fixed to match the function's name.
    return x * 2

df['Col1'].apply(times2)

# Calling built-in functions.
df['Col3'].apply(len)
df['Col2'].apply(lambda x: x * x)

df.drop('Col1', axis=1)      # returns a copy; df is unchanged
df.columns
df.index
df.sort_values(by='Col2', ascending=False)
df.isnull()
input and output
# `pwd` is an IPython magic, not Python — comment it out in a plain script.
# pwd

# Use raw strings for Windows paths so backslashes are never treated as
# escape sequences (the resulting strings are unchanged).
pd.read_csv(r'd:\demo\example.csv')
pd.read_excel(r'd:\demo\example.xlsx')
df.to_csv("d://demo/myoutput.csv", index=False)
pd.read_excel(r'd:\demo\example.xlsx', sheet_name='Sheet1')
df.to_excel(r'd:\demo\example1.xlsx', sheet_name='Sheet2', index=False)

# read_html scrapes every <table> on the page; match= filters by caption text.
# (The original split this string literal across two lines — a SyntaxError.)
table_MN = pd.read_html('https://en.wikipedia.org/wiki/Minnesota',
                        match='Election results from statewide races')

# Reading from a SQL database via SQLAlchemy.
import pandas as pd
from sqlalchemy import create_engine
cnx = create_engine('mysql+pymysql://root:admin123@localhost:3306/demo').connect()
sql = 'select * from customers'
df = pd.read_sql(sql, cnx)
The pandas-datareader is a sub-package that allows one to create a DataFrame from
various internet data sources, currently including:
Yahoo! Finance
Google Finance
St.Louis FED (FRED)
Kenneth French’s data library
World Bank
Google Analytics
# Install with: pip install pandas-datareader   (shell command, not Python)
import pandas_datareader.data as web
import datetime as dt

start = dt.datetime(2015, 1, 1)
end = dt.datetime(2015, 12, 31)
# The original line was missing its closing parenthesis — a SyntaxError.
# NOTE(review): the free Yahoo! endpoint has since been retired; this call
# may need yfinance or another source today — confirm before relying on it.
facebook = web.DataReader('FB', 'yahoo', start, end)
Pandas time series
The majority of data in financial analysis is time-series data.
Datetime index
import pandas as pd
import numpy as np
from datetime import datetime

# Two explicit timestamps to serve as the row index.
first_two = [datetime(2017, 1, 1), datetime(2017, 1, 2)]
dt_ind = pd.DatetimeIndex(first_two)

# Random 2x2 values, one row per timestamp.
data = np.random.randn(2, 2)
df = pd.DataFrame(data=data, index=dt_ind, columns=['a', 'b'])

# Position of the newest / oldest timestamp, and the newest timestamp itself.
df.index.argmax()
df.index.argmin()
df.index.max()
# Time resampling.
df = pd.read_csv("d://demo//walmart_stock.csv")
df.head()
df.info()                 # 'Date' is read in as a plain object/str column
df['Date'] = pd.to_datetime(df['Date'])
df.info()                 # 'Date' is now datetime64
df.set_index('Date', inplace=True)
# index is an attribute, not a method — `df.index()` raised TypeError.
df.index