DATA MANIPULATION WITH PANDAS

1) Inspecting a DataFrame

# edited/added
import pandas as pd
homelessness = pd.read_csv('homelessness.csv', index_col=0)
# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness
print(homelessness.shape)
# Print a description of homelessness
print(homelessness.describe())

2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd
# Print the values of homelessness
print(homelessness.values)
# Print the column index of homelessness
print(homelessness.columns)
# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")
# Print the top few rows
print(homelessness_ind.head())
# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())
# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
# Print the head of the result
print(ind_state.head())
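Worth remembering here: single square brackets return a Series, while double square brackets return a DataFrame. A minimal sketch on the same homelessness data:

# Single brackets give a pandas Series
print(type(homelessness["individuals"]))
# Double brackets give a one-column DataFrame
print(type(homelessness[["individuals"]]))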
Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]
# See the result
print(ind_gt_10k)
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
# See the result
print(mountain_reg)
# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) &
                             (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)
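As an aside, the same combined filter can be written with .query(), which some find more readable; a sketch assuming the same homelessness columns (the variable name is just illustrative):

# Equivalent filter using .query()
fam_lt_1k_pac_alt = homelessness.query('family_members < 1000 and region == "Pacific"')
print(fam_lt_1k_pac_alt)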
Subsetting rows by categorical variables
# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") |
                                  (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)
Adding new columns
# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
# See the result
print(homelessness)
Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)
# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

Summarizing dates

# Print the maximum of the date column
print(sales["date"].max())
# Print the minimum of the date column
print(sales["date"].min())
Efficient summaries

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))

# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

# Import NumPy and create custom IQR function
import numpy as np
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
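Note: recent pandas releases emit a FutureWarning when NumPy functions such as np.median are passed to .agg(); string names are the forward-compatible spelling. A sketch of the last summary rewritten that way:

# String aggregation names avoid the deprecation warning in newer pandas
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))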
Cumulative statistics

# edited/added
sales_1_1 = sales[(sales["department"] == 1) & (sales["store"] == 1)]
# Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
# Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
# See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

Dropping duplicates

Remove rows of sales with duplicate pairs of store and type and save as store_types and print the head.
Remove rows of sales with duplicate pairs of store and department and save as store_depts and print the head.

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates["date"])
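drop_duplicates() keeps the first occurrence of each combination by default; the keep parameter controls this. A quick sketch (variable name is illustrative):

# Keep the last occurrence of each store/type pair instead of the first
store_types_last = sales.drop_duplicates(subset=["store", "type"], keep="last")
print(store_types_last.head())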
Counting categorical variables
# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort=True, normalize=True)
print(dept_props_sorted)

What percent of sales occurred at each store type?

# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()
# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

Calculations with .groupby()

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)
Multiple grouped summaries

# Import numpy with the alias np
import numpy as np
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
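Grouped .agg() also accepts named aggregations, which label the output columns directly; a minimal sketch on the same sales data (the output names are just illustrative):

# Named aggregation: the keyword names become the output column names
sales_stats_named = sales.groupby("type").agg(
    min_sales=("weekly_sales", "min"),
    max_sales=("weekly_sales", "max"),
)
print(sales_stats_named)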
Pivoting on one variable
# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values="weekly_sales", index="type")
# Print mean_sales_by_type
print(mean_sales_by_type)
# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table(values="weekly_sales", index="type", aggfunc=[np.mean, np.median])
# Print mean_med_sales_by_type
print(mean_med_sales_by_type)
# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns="is_holiday")
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)
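Note that pivot_table() aggregates with the mean by default, so the first pivot above is just a grouped mean in disguise; a sketch of the equivalence:

# The default aggfunc is "mean", so these two results agree
print(sales.pivot_table(values="weekly_sales", index="type"))
print(sales.groupby("type")[["weekly_sales"]].mean())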
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
Fill in missing values and sum values with pivot tables
print(temperatures[temperatures["city"].isin(cities)])
# Print mean weekly_sales by department and type; fill missing values with 0
# Subset temperatures_ind using .loc[]
print(sales.pivot_table(values="weekly_sales", index="department",
columns="type", fill_value=0)) print(temperatures_ind.loc[cities])
# Print the mean weekly_sales by department and type; fill missing values with 0s;
Setting multi-level indexes
sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", # Index temperatures by country & city
columns="type", fill_value=0, margins =True)) temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
Setting and removing indexes
# edited/added
temperatures = pd.read_csv('temperatures.csv', index_col=0)
temperatures['date'] = pd.to_datetime(temperatures['date'], infer_datetime_format=True)
# Look at temperatures
print(temperatures)
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Look at temperatures_ind
print(temperatures_ind)
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

Subsetting with .loc[]

# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

Setting multi-level indexes

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

Sorting by index values

# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))
Slicing index values

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])
Slicing in both directions

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])
# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])
Slicing time series

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") &
                                 (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)
# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])
Subsetting by row/column number

# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22, 1])
# Use slicing to get the first 5 rows
print(temperatures.iloc[:5])
# Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])
# Use slicing in both directions at once
print(temperatures.iloc[:5, 2:4])
Pivot temperature by city and year

# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year
# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index=["country", "city"], columns="year")
# See the result
print(temp_by_country_city_vs_year)

Subsetting pivot tables

# Subset for Egypt to India
temp_by_country_city_vs_year.loc["Egypt":"India"]
# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]
# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010
# (the year columns are integers from .dt.year, so slice with ints, not strings)
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010]
Calculating on a pivot table

# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])
# Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
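The Boolean-mask pattern above works, but .idxmax() and .idxmin() return the label of the extreme value directly; a quick sketch:

# Same answers via idxmax/idxmin
print(mean_temp_by_year.idxmax())   # year with the highest mean temp
print(mean_temp_by_city.idxmin())   # city with the lowest mean temp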
avocados sold vs. average price")
# Show the plot
Which avocado size is most popular?
plt.show() plt.show()
Changes in sales over time

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()
# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind="line")
# Show the plot
plt.show()

Avocado supply and demand

# Scatter plot of avg_price vs. nb_sold with title
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of avocados sold vs. average price")
# Show the plot
plt.show()

Price of conventional vs. organic avocados

# Histogram of conventional avg_price
avocados[avocados["type"] == "conventional"]["avg_price"].hist()
# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist()
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5)
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(bins=20, alpha=0.5)
avocados[avocados["type"] == "organic"]["avg_price"].hist(bins=20, alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
Finding missing values

# edited/added
avocados_2016 = pd.read_csv('avocados_2016.csv')
cols_with_missing = ['small_sold', 'large_sold', 'xl_sold']
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Check individual values for missing values
print(avocados_2016.isna())
# Check each column for missing values
print(avocados_2016.isna().any())
# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")
# Show plot
plt.show()

Removing missing values

# Remove rows with missing values
avocados_complete = avocados_2016.dropna()
# Check if any columns contain missing values
print(avocados_complete.isna().any())

Replacing missing values

# List the columns with missing values
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
# Create histograms showing the distributions of cols_with_missing
avocados_2016[cols_with_missing].hist()
# Show the plot
plt.show()
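The section stops before actually replacing anything; a minimal continuation using fillna(), assuming zeros are an acceptable fill for these sales counts (the variable name is illustrative):

# Fill missing sales counts with 0 and re-plot the distributions
avocados_filled = avocados_2016.fillna(0)
avocados_filled[cols_with_missing].hist()
plt.show()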
List of dictionaries

# Create a list of dictionaries with new data
avocados_list = [
    {"date": "2019-11-03", "small_sold": 10376832, "large_sold": 7835071},
    {"date": "2019-11-10", "small_sold": 10717154, "large_sold": 8561348},
]
# Convert list into DataFrame
avocados_2019 = pd.DataFrame(avocados_list)
# Print the new DataFrame
print(avocados_2019)
Dictionary of lists
# Create a dictionary of lists with new data
avocados_dict = {
    "date": ["2019-11-17", "2019-12-01"],
    "small_sold": [10859987, 9291631],
    "large_sold": [7674135, 6238096]
}
# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)
# Print the new DataFrame
print(avocados_2019)
CSV to DataFrame

# Read CSV as DataFrame called airline_bumping
airline_bumping = pd.read_csv('airline_bumping.csv') # edited/added
# Take a look at the DataFrame
print(airline_bumping.head())
# For each airline, select nb_bumped and total_passengers and sum
airline_totals = airline_bumping.groupby("airline")[["nb_bumped", "total_passengers"]].sum()
# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] / airline_totals["total_passengers"] * 10000
# Print airline_totals
print(airline_totals)

DataFrame to CSV

# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=False)
# Print airline_totals_sorted
print(airline_totals_sorted)
# Save as airline_totals_sorted.csv
airline_totals_sorted.to_csv("airline_totals_sorted.csv")
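to_csv() writes the index (here the airline names) as the first column by default, so reading the file back with index_col=0 restores the same DataFrame; a quick round-trip check:

# Round-trip check: the saved CSV reloads with the airline index intact
print(pd.read_csv("airline_totals_sorted.csv", index_col=0).head())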