Data Wrangling in Pandas Cheat Sheet
Python For Data Science
Learn Data Wrangling online at www.DataCamp.com
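The snippets on this sheet reuse a handful of sample objects (df3-df6 are built by the Reshaping and MultiIndexing snippets below). A minimal setup sketch, reconstructed from how the objects are used; the actual values are illustrative assumptions:
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({'Country': ['Belgium', 'India', 'Brazil'], #Assumed sample data
                       'Capital': ['Brussels', 'New Delhi', 'Brasília'],
                       'Population': [11190846, 1303171035, 207847528]})
>>> s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
>>> df2 = pd.DataFrame({'Date': pd.date_range('2000-1-1', periods=6, freq='M'),
                        'Type': ['a', 'b', 'c', 'a', 'b', 'c'],
                        'Value': np.random.rand(6)})
>>> data1 = pd.DataFrame({'X1': ['a', 'b', 'c'], 'X2': [11.432, 1.303, 99.906]})
>>> data2 = pd.DataFrame({'X1': ['a', 'b', 'd'], 'X3': [20.784, np.nan, 20.784]})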
> Reshaping Data

Pivot
>>> df3 = df2.pivot(index='Date', #Spread rows into columns
                    columns='Type',
                    values='Value')

Pivot Table
>>> df4 = pd.pivot_table(df2, #Spread rows into columns
                         values='Value',
                         index='Date',
                         columns='Type')

Stack / Unstack
>>> stacked = df5.stack() #Pivot a level of column labels
>>> stacked.unstack() #Pivot a level of index labels

Melt
>>> pd.melt(df2, #Gather columns into rows
            id_vars=["Date"],
            value_vars=["Type", "Value"],
            value_name="Observations")
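Note that pivot() raises an error when an index/column pair repeats, while pivot_table() aggregates the duplicates (mean by default). A quick round-trip sketch on a small, made-up frame:
>>> tiny = pd.DataFrame({'Date': ['2000-01', '2000-01', '2000-02'],
                         'Type': ['a', 'b', 'a'],
                         'Value': [1.0, 2.0, 3.0]})
>>> wide = tiny.pivot(index='Date', columns='Type', values='Value') #One column per Type
>>> wide.reset_index().melt(id_vars='Date', value_name='Value') #Back to long form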
> Advanced Indexing Also see NumPy Arrays

Selecting
>>> df3.loc[:,(df3>1).any()] #Select cols with any vals >1
>>> df3.loc[:,(df3>1).all()] #Select cols with all vals >1
>>> df3.loc[:,df3.isnull().any()] #Select cols with NaN
>>> df3.loc[:,df3.notnull().all()] #Select cols without NaN

Indexing With isin()
>>> df[(df.Country.isin(df2.Type))] #Find same elements
>>> df3.filter(items=["a","b"]) #Filter on values
>>> df.select(lambda x: not x%5) #Select specific elements (deprecated; use .loc with a boolean mask)

Where
>>> s.where(s > 0) #Subset the data

Query
>>> df6.query('second > first') #Query DataFrame
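Note that boolean indexing drops the rows that fail the condition, while where() keeps the shape and fills NaN instead; compare on the sample Series s:
>>> s[s > 0] #Drops non-matching rows: a, c, d remain
>>> s.where(s > 0) #Same shape; b becomes NaN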
Setting/Resetting Index
>>> df.set_index('Country') #Set the index
>>> df4 = df.reset_index() #Reset the index
>>> df = df.rename(index=str, #Rename DataFrame
                   columns={"Country":"cntry",
                            "Capital":"cptl",
                            "Population":"ppltn"})
Reindexing
>>> s2 = s.reindex(['a','c','d','e','b'])
Forward Filling
>>> df.reindex(range(4),
               method='ffill')
   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasília   207847528
3   Brazil   Brasília   207847528

Backward Filling
>>> s3 = s.reindex(range(5),
                   method='bfill')
0    3
1    3
2    3
3    3
4    3
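The fill methods walk an ordered index, so gaps inherit neighboring values; a tiny self-contained sketch with made-up prices:
>>> prices = pd.Series([10.0, 12.0], index=[0, 3]) #Known at positions 0 and 3
>>> prices.reindex(range(5), method='ffill') #10, 10, 10, 12, 12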
MultiIndexing
>>> arrays = [np.array([1,2,3]),
              np.array([5,4,3])]
>>> df5 = pd.DataFrame(np.random.rand(3, 2), index=arrays)
>>> tuples = list(zip(*arrays))
>>> index = pd.MultiIndex.from_tuples(tuples,
                                      names=['first', 'second'])
>>> df6 = pd.DataFrame(np.random.rand(3, 2), index=index)
>>> df2.set_index(["Date", "Type"])
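Rows of a MultiIndexed frame can then be selected level by level; a short sketch on the df6 built above:
>>> df6.loc[1] #Rows whose 'first' level equals 1
>>> df6.xs(3, level='second') #Cross-section on the 'second' level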
> Combining Data

Merge
>>> pd.merge(data1,
             data2,
             how='left',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='right',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='inner',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='outer',
             on='X1')

Join
>>> data1.join(data2, how='right')

Concatenate
Vertical
>>> s.append(s2) #Removed in pandas 2.0; use pd.concat([s, s2])
Horizontal/Vertical
>>> pd.concat([s, s2], axis=1, keys=['One','Two'])
>>> pd.concat([data1, data2], axis=1, join='inner')
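On the illustrative data1/data2 from the setup at the top (X1 holds a, b, c versus a, b, d), the join types differ as follows:
>>> pd.merge(data1, data2, how='inner', on='X1') #Only a and b match
>>> pd.merge(data1, data2, how='outer', on='X1') #a, b, c, d, with NaN where a side is absent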
> Duplicate Data
>>> s3.unique() #Return unique values
>>> df2.duplicated('Type') #Check duplicates
>>> df2.drop_duplicates('Type', keep='last') #Drop duplicates
>>> df.index.duplicated() #Check index duplicates
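By default duplicated() flags every repeat after the first occurrence; keep='last' flips which one survives. A minimal illustration:
>>> pd.Series(['a', 'b', 'a']).duplicated() #False, False, True
>>> pd.Series(['a', 'b', 'a']).duplicated(keep='last') #True, False, False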
> Dates
>>> df2['Date'] = pd.to_datetime(df2['Date'])
>>> df2['Date'] = pd.date_range('2000-1-1',
                                periods=6,
                                freq='M')
>>> from datetime import datetime
>>> dates = [datetime(2012,5,1), datetime(2012,5,2)]
>>> index = pd.DatetimeIndex(dates)
>>> end = datetime(2012,6,1) #Assumed end date for illustration
>>> index = pd.date_range(datetime(2012,2,1), end, freq='BM')
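Once 'Date' is a datetime column it can serve as a sliceable index; partial date strings select whole periods:
>>> ts = df2.set_index('Date')
>>> ts.loc['2000-03'] #All rows falling in March 2000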
> Grouping Data

Aggregation
>>> df2.groupby(by=['Date','Type']).mean()
>>> df4.groupby(level=0).sum()
>>> df4.groupby(level=0).agg({'a': lambda x: sum(x)/len(x),
                              'b': np.sum})

Transformation
>>> customSum = lambda x: (x+x%2)
>>> df4.groupby(level=0).transform(customSum)
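Unlike agg(), transform() returns a result aligned row-for-row with its input; a minimal sketch on made-up data:
>>> g = pd.DataFrame({'key': ['x', 'x', 'y'], 'val': [1, 2, 3]})
>>> g.groupby('key')['val'].transform('mean') #1.5, 1.5, 3.0, one value per row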
> Missing Data
>>> df.dropna() #Drop NaN values
>>> df3.fillna(df3.mean()) #Fill NaN values with a predetermined value
>>> df2.replace("a", "f") #Replace values with others

> Visualization Also see Matplotlib
>>> import matplotlib.pyplot as plt
>>> s.plot()
>>> plt.show()
>>> df2.plot()
>>> plt.show()

> Iteration
>>> df.iteritems() #(Column-index, Series) pairs
>>> df.iterrows() #(Row-index, Series) pairs
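In pandas 2.0 iteritems() was renamed items(); itertuples() is the usual faster alternative for row loops. An illustrative one-liner on df:
>>> [row.Country for row in df.itertuples()] #One namedtuple per row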
Learn Data Skills Online at www.DataCamp.com