
DATA MANIPULATION WITH PANDAS

1) Inspecting a DataFrame

# edited/added
import pandas as pd
homelessness = pd.read_csv('homelessness.csv', index_col=0)
# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness
print(homelessness.shape)
# Print a description of homelessness
print(homelessness.describe())

2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd
# Print the values of homelessness
print(homelessness.values)
# Print the column index of homelessness
print(homelessness.columns)
# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")
# Print the top few rows
print(homelessness_ind.head())
# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())
# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
# Print the head of the result
print(ind_state.head())
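The exercises above assume the course's homelessness.csv. As a self-contained sketch on a made-up DataFrame (toy values, not course data), the same inspect/sort/select pattern looks like this:

import pandas as pd
# Toy stand-in for the homelessness data
df = pd.DataFrame({
    "state": ["Alaska", "Alabama", "Arizona"],
    "individuals": [1434, 2570, 7259],
    "family_members": [582, 864, 2606],
})
print(df.head())       # first rows
df.info()              # dtypes and non-null counts (prints directly)
print(df.shape)        # (3, 3)
print(df.describe())   # summary stats for numeric columns
print(df.sort_values("individuals", ascending=False))
print(df[["state", "individuals"]])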
Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]
# See the result
print(ind_gt_10k)
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
# See the result
print(mountain_reg)
# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)

Subsetting rows by categorical variables

# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") | (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)
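A minimal, runnable sketch of the same row-filtering idioms on made-up data:

import pandas as pd
df = pd.DataFrame({
    "state": ["California", "Texas", "Nevada", "Ohio"],
    "region": ["Pacific", "South Central", "Mountain", "East North Central"],
    "individuals": [109008, 19199, 7058, 6929],
})
# Single condition
print(df[df["individuals"] > 10000])
# Combined conditions need parentheses around each clause
print(df[(df["individuals"] > 5000) & (df["region"] == "Mountain")])
# .isin() replaces a chain of == comparisons joined with |
print(df[df["state"].isin(["California", "Nevada"])])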

Adding new columns

# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
# See the result
print(homelessness)

Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
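A compact sketch of the same mutate, filter, sort, select chain on made-up numbers:

import pandas as pd
df = pd.DataFrame({
    "state": ["California", "Nevada", "Ohio"],
    "individuals": [109008, 7058, 6929],
    "state_pop": [39512223, 3080156, 11689100],
})
# Derive a rate column, filter on it, sort, then select
df["indiv_per_10k"] = 10000 * df["individuals"] / df["state_pop"]
result = (
    df[df["indiv_per_10k"] > 10]
    .sort_values("indiv_per_10k", ascending=False)
    [["state", "indiv_per_10k"]]
)
print(result)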
Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)
# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

Summarizing dates

# Print the maximum of the date column
print(sales["date"].max())
# Print the minimum of the date column
print(sales["date"].min())

Efficient summaries

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))
# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
# Import NumPy to also aggregate with np.median
import numpy as np
# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
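A quick sketch of the same .agg() pattern on toy data (values made up here); note that newer pandas versions suggest string names such as "median" in place of NumPy callables:

import pandas as pd
df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# A single custom aggregation on one column
print(df["a"].agg(iqr))                     # 2.0
# Mix a custom function with a built-in aggregation by name
print(df[["a", "b"]].agg([iqr, "median"]))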
Cumulative statistics

# edited/added
sales_1_1 = sales[(sales["department"] == 1) & (sales["store"] == 1)]
# Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
# Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
# See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])
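How the running statistics behave, on a toy series (made-up numbers):

import pandas as pd
df = pd.DataFrame({"weekly_sales": [3, 1, 4, 1, 5]})
df["cum_weekly_sales"] = df["weekly_sales"].cumsum()   # 3, 4, 8, 9, 14
df["cum_max_sales"] = df["weekly_sales"].cummax()      # 3, 3, 4, 4, 5
print(df)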
Dropping duplicates

- Remove rows of sales with duplicate pairs of store and type, save as store_types, and print the head.
- Remove rows of sales with duplicate pairs of store and department, save as store_depts, and print the head.

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates["date"])

Counting categorical variables

# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort=True, normalize=True)
print(dept_props_sorted)
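The same two calls on a toy frame (made-up data) make the behavior easy to see:

import pandas as pd
df = pd.DataFrame({
    "store": [1, 1, 2, 2, 3],
    "type": ["A", "A", "B", "B", "A"],
})
# One row per store/type pair
unique_pairs = df.drop_duplicates(subset=["store", "type"])
print(unique_pairs)
# Counts and proportions of each type among the unique pairs
print(unique_pairs["type"].value_counts())
print(unique_pairs["type"].value_counts(normalize=True))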
What percent of sales occurred at each store type?

# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()
# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()
# Get proportion for each type
# (this division works because sales_all is a NumPy float, so the list is coerced to an array)
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

Calculations with .groupby()

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)

Multiple grouped summaries

# Import numpy with the alias np
import numpy as np
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
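A self-contained groupby sketch on toy data; string aggregation names are equivalent to the NumPy callables above and avoid deprecation warnings on newer pandas:

import pandas as pd
df = pd.DataFrame({
    "type": ["A", "A", "B", "B"],
    "weekly_sales": [100.0, 300.0, 50.0, 150.0],
})
# Grouped sum, then proportions
by_type = df.groupby("type")["weekly_sales"].sum()
print(by_type / by_type.sum())   # A: 0.666..., B: 0.333...
# Several statistics at once
print(df.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"]))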
Pivoting on one variable

# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values="weekly_sales", index="type")
# Print mean_sales_by_type
print(mean_sales_by_type)
# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table("weekly_sales", "type", aggfunc=[np.mean, np.median])
# Print mean_med_sales_by_type
print(mean_med_sales_by_type)
# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table("weekly_sales", "type", "is_holiday")
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)

Fill in missing values and sum values with pivot tables

# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0))
# Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True))
Setting and removing indexes

# edited/added
temperatures = pd.read_csv('temperatures.csv', index_col=0)
temperatures['date'] = pd.to_datetime(temperatures['date'], infer_datetime_format=True)
# Look at temperatures
print(temperatures)
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Look at temperatures_ind
print(temperatures_ind)
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

Subsetting with .loc[]

# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

Setting multi-level indexes

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

Sorting by index values

# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))
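A multi-level index sketch on toy rows (made-up temperatures), showing tuple lookups and level-wise sorting:

import pandas as pd
df = pd.DataFrame({
    "country": ["Brazil", "Brazil", "Pakistan"],
    "city": ["Rio De Janeiro", "Sao Paulo", "Lahore"],
    "avg_temp_c": [25.5, 21.9, 24.2],
})
# Multi-level index: .loc[] takes a list of (country, city) tuples
df_ind = df.set_index(["country", "city"])
print(df_ind.loc[[("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]])
# Control sort level and direction explicitly
print(df_ind.sort_index(level=["country", "city"], ascending=[True, False]))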
Slicing index values

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])

Slicing in both directions

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])
# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])

Slicing time series

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)
# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])

Subsetting by row/column number

# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22, 1])
# Use slicing to get the first 5 rows
print(temperatures.iloc[:5])
# Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])
# Use slicing in both directions at once
print(temperatures.iloc[:5, 2:4])
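A compact sketch of date slicing and positional indexing on made-up rows; partial-string slicing needs a sorted DatetimeIndex:

import pandas as pd
df = pd.DataFrame({
    "date": pd.to_datetime(["2010-03-01", "2010-09-15", "2011-06-30", "2012-01-05"]),
    "avg_temp_c": [10.0, 18.5, 22.1, 4.3],
})
df_ind = df.set_index("date").sort_index()
print(df_ind.loc["2010":"2011"])        # both endpoints inclusive, whole years
print(df_ind.loc["2010-08":"2011-02"])  # month granularity also works
# .iloc[] is purely positional: first two rows, first column
print(df.iloc[:2, 0:1])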


Pivot temperature by city and year

# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year
# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index=["country", "city"], columns="year")
# See the result
print(temp_by_country_city_vs_year)

Subsetting pivot tables

# Subset for Egypt to India
temp_by_country_city_vs_year.loc["Egypt":"India"]
# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]
# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010
# (the year columns come from .dt.year, so they are ints: slice with ints, not strings)
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010]

Calculating on a pivot table

# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])
# Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
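The axis logic and the boolean-filter idiom, on a toy pivot (made-up temperatures):

import pandas as pd
df = pd.DataFrame({
    "city": ["Cairo", "Cairo", "Delhi", "Delhi"],
    "year": [2005, 2006, 2005, 2006],
    "avg_temp_c": [21.4, 21.9, 25.7, 26.0],
})
pivot = df.pivot_table("avg_temp_c", index="city", columns="year")
# .mean() averages down each column (per year);
# axis="columns" averages across each row (per city)
print(pivot.mean())
print(pivot.mean(axis="columns"))
# Boolean filtering keeps the extreme value together with its label
row_means = pivot.mean(axis="columns")
print(row_means[row_means == row_means.max()])   # Delhi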

# Get the mean temp by city


Avocado supply and demand
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
# Scatter plot of avg_price vs. nb_sold with title
# Filter for the city that had the lowest mean temp
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
avocados sold vs. average price")
# Show the plot
Which avocado size is most popular?
plt.show() plt.show()
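The same plotting calls work without the course's pickle file; a sketch on made-up rows:

import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({
    "size": ["small", "small", "large"],
    "nb_sold": [10.0, 15.0, 7.0],
    "avg_price": [1.1, 1.0, 1.6],
})
# Grouped bar plot, then a scatter straight from the DataFrame
df.groupby("size")["nb_sold"].sum().plot(kind="bar")
plt.show()
df.plot(x="nb_sold", y="avg_price", kind="scatter", title="Price vs. volume")
plt.show()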

Price of conventional vs. organic avocados

# Histogram of conventional avg_price
avocados[avocados["type"] == "conventional"]["avg_price"].hist()
# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist()
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5)
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
# Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(bins=20, alpha=0.5)
avocados[avocados["type"] == "organic"]["avg_price"].hist(bins=20, alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

Finding missing values

# edited/added
avocados_2016 = pd.read_csv('avocados_2016.csv')
cols_with_missing = ['small_sold', 'large_sold', 'xl_sold']
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Check individual values for missing values
print(avocados_2016.isna())
# Check each column for missing values
print(avocados_2016.isna().any())
# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")
# Show plot
plt.show()

Removing missing values

# Remove rows with missing values
avocados_complete = avocados_2016.dropna()
# Check if any columns contain missing values
print(avocados_complete.isna().any())

Replacing missing values

# List the columns with missing values
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
# Create histograms showing the distributions of cols_with_missing
avocados_2016[cols_with_missing].hist()
# Show the plot
plt.show()
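A minimal, self-contained sketch of the three core missing-value calls on made-up data, including the fillna() replacement the section title refers to:

import numpy as np
import pandas as pd
df = pd.DataFrame({"small_sold": [1.0, np.nan, 3.0], "large_sold": [np.nan, 5.0, 6.0]})
print(df.isna().any())   # which columns contain NaN
print(df.dropna())       # keep only complete rows (just the last row here)
print(df.fillna(0))      # replace every NaN with 0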
List of dictionaries

# Create a list of dictionaries with new data
avocados_list = [
    {"date": "2019-11-03", "small_sold": 10376832, "large_sold": 7835071},
    {"date": "2019-11-10", "small_sold": 10717154, "large_sold": 8561348},
]
# Convert list into DataFrame
avocados_2019 = pd.DataFrame(avocados_list)
# Print the new DataFrame
print(avocados_2019)

Dictionary of lists

# Create a dictionary of lists with new data
avocados_dict = {
    "date": ["2019-11-17", "2019-12-01"],
    "small_sold": [10859987, 9291631],
    "large_sold": [7674135, 6238096]
}
# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)
# Print the new DataFrame
print(avocados_2019)

CSV to DataFrame

# Read CSV as DataFrame called airline_bumping
airline_bumping = pd.read_csv('airline_bumping.csv') # edited/added
# Take a look at the DataFrame
print(airline_bumping.head())
# For each airline, select nb_bumped and total_passengers and sum
airline_totals = airline_bumping.groupby("airline")[["nb_bumped", "total_passengers"]].sum()
# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] / airline_totals["total_passengers"] * 10000
# Print airline_totals
print(airline_totals)

DataFrame to CSV

# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=False)
# Print airline_totals_sorted
print(airline_totals_sorted)
# Save as airline_totals_sorted.csv
airline_totals_sorted.to_csv("airline_totals_sorted.csv")
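A round-trip sketch that needs no files on disk, using an in-memory buffer instead of the course's CSVs:

import io
import pandas as pd
df = pd.DataFrame({"airline": ["DELTA", "UNITED"], "bumps_per_10k": [0.35, 0.62]})
# Write to a StringIO buffer, rewind, and read it back
buffer = io.StringIO()
df.to_csv(buffer, index=False)
buffer.seek(0)
print(pd.read_csv(buffer))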
