3
SQL
DATA
ANALYTICS
PROJECT
1
Advanced Data Analytics
.
Answer Business Problems:
Complex Queries
Window Functions
Subqueries
CTE
Reports
2
Change Over Time, Trends
∑[ 𝑚𝑒𝑎𝑠𝑢𝑟𝑒] 𝑏𝑦 [𝐷𝑎𝑡𝑒 𝐷𝑖𝑚𝑒𝑛𝑠𝑖𝑜𝑛]
• Total sales by year
• Average cost by year
Year Sales
2023 300
2024 100
2025 200
300
200
100
2023 2024 2025
3
Problem: Analyze sales performance over time.
-- Environment setup: create the working schema if it does not exist yet,
-- switch to it, then preview every table this project reads from.
CREATE DATABASE IF NOT EXISTS data_warehouse; -- IF NOT EXISTS keeps the script re-runnable
USE data_warehouse;

-- Quick previews of the base tables.
SELECT * FROM customers;
SELECT * FROM products;
SELECT * FROM sales;

-- Quick previews of the report objects.
-- NOTE(review): the views created later in this file are named
-- customers_report / products_report; confirm that report_customers and
-- report_products exist as separate objects in this schema.
SELECT * FROM report_customers;
SELECT * FROM report_products;
-- Sales performance over time, one row per calendar month.
-- Returns total sales, number of distinct customers and total quantity.
-- Rows without an order date are excluded because they cannot be placed
-- on the timeline.
-- No LIMIT/OFFSET is applied: every month is reported, including the
-- earliest one.
SELECT
    YEAR(order_date)             AS order_year,
    MONTH(order_date)            AS order_month,
    SUM(sales_amount)            AS total_sales,
    COUNT(DISTINCT customer_key) AS total_customers, -- no space before "(": required for built-in functions under default sql_mode
    SUM(quantity)                AS total_quantity
FROM sales
WHERE order_date IS NOT NULL
GROUP BY
    YEAR(order_date),
    MONTH(order_date)
ORDER BY
    YEAR(order_date),
    MONTH(order_date);
4
5
-- Same monthly trend as a single date-like column: each order date is
-- collapsed to the first day of its month ('YYYY-MM-01').
-- The formatted value is a string, but its layout sorts chronologically.
-- No LIMIT/OFFSET is applied: every month is reported, including the
-- earliest one.
SELECT
    DATE_FORMAT(order_date, '%Y-%m-01') AS order_date,
    SUM(sales_amount)                   AS total_sales,
    COUNT(DISTINCT customer_key)        AS total_customers,
    SUM(quantity)                       AS total_quantity
FROM sales
WHERE order_date IS NOT NULL
GROUP BY DATE_FORMAT(order_date, '%Y-%m-01')
ORDER BY DATE_FORMAT(order_date, '%Y-%m-01');
6
Cumulative Analysis
∑[ 𝐶𝑢𝑚𝑢𝑙𝑎𝑡𝑖𝑣𝑒 𝑀𝑒𝑎𝑠𝑢𝑟𝑒] 𝑏𝑦 [𝐷𝑎𝑡𝑒 𝐷𝑖𝑚𝑒𝑛𝑠𝑖𝑜𝑛]
• Running total sales by year
• Moving average of sales by month
Cumulative value
2024 300 300
2025 100 400
2026 200 600
7
Problem: Calculate the total sales per month and the running
total of sales over time.
-- Total sales per month plus the running (cumulative) total over time.
-- The derived table aggregates sales to one row per month; the outer query
-- accumulates those monthly totals in chronological order.
SELECT
    order_date,
    total_sales,
    -- Window function. There is deliberately no PARTITION BY: partitioning by
    -- order_date would put every month in its own partition and the
    -- "running" total would just repeat total_sales.
    SUM(total_sales) OVER (
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total_sales
FROM (
    SELECT
        DATE_FORMAT(order_date, '%Y-%m-01') AS order_date,
        SUM(sales_amount)                   AS total_sales
    FROM sales
    WHERE order_date IS NOT NULL
    GROUP BY DATE_FORMAT(order_date, '%Y-%m-01')
) AS monthly_sales
ORDER BY order_date;
8
-- Yearly sales with a running total of sales and a moving average of price.
-- The derived table aggregates to one row per year; the outer query applies
-- the window functions across years in ascending order.
SELECT
    -- Present the year as a date (January 1st of that year).
    STR_TO_DATE(CONCAT(order_year, '-01-01'), '%Y-%m-%d') AS order_date,
    total_sales,
    SUM(total_sales) OVER (ORDER BY order_year)           AS running_total_sales,
    -- With ORDER BY and no explicit frame, the window runs from the first
    -- year up to the current one, so this is the cumulative average of the
    -- yearly average prices.
    ROUND(AVG(avg_price) OVER (ORDER BY order_year), 2)   AS moving_average_price
FROM (
    SELECT
        YEAR(order_date)  AS order_year,
        SUM(sales_amount) AS total_sales,
        AVG(price)        AS avg_price
    FROM sales
    WHERE order_date IS NOT NULL
    GROUP BY YEAR(order_date)
) AS yearly_sales
ORDER BY order_year;
10
Performance Analysis
𝐶𝑢𝑟𝑟𝑒𝑛𝑡 [𝑀𝑒𝑎𝑠𝑢𝑟𝑒] − 𝑇𝑎𝑟𝑔𝑒𝑡 [𝑚𝑒𝑎𝑠𝑢𝑟𝑒]
• Current sales – Average sales
• Current year sales – Previous year sales
• Current sales – Lowest sales
Current Target Performance
A 400 400 0
B 600 400 200
C 200 400 -200
11
Problem: Analyze the yearly performance of each product by
comparing its sales to both its average annual sales and the previous
year's sales.
-- Yearly performance of each product, compared against
--   (a) the product's own average annual sales, and
--   (b) the product's sales in the previous year (year-over-year).
WITH yearly_product_sales AS (
    -- One row per (year, product) with that year's sales.
    -- sales is joined to products on product_key with a LEFT JOIN so that
    -- all sales rows are retained even when no product row matches.
    SELECT
        YEAR(s.order_date)  AS order_year,
        p.product_name,
        SUM(s.sales_amount) AS current_sales
    FROM sales AS s
    LEFT JOIN products AS p
        ON s.product_key = p.product_key
    WHERE s.order_date IS NOT NULL
    GROUP BY
        YEAR(s.order_date),
        p.product_name
)
SELECT
    order_year,
    product_name,
    current_sales,

    -- Comparison with the product's average yearly sales.
    ROUND(AVG(current_sales) OVER w_product)                 AS avg_sales,
    current_sales - ROUND(AVG(current_sales) OVER w_product) AS diff_avg,
    CASE
        WHEN current_sales - ROUND(AVG(current_sales) OVER w_product) > 0 THEN 'above Avg'
        WHEN current_sales - ROUND(AVG(current_sales) OVER w_product) < 0 THEN 'below Avg'
        ELSE 'Avg'
    END                                                      AS avg_change,

    -- Year-over-year analysis. LAG() is NULL for a product's first year, so
    -- diff_prev is NULL and prev_change falls through to 'No change' there.
    LAG(current_sales) OVER w_product_year                   AS prev_sales,
    current_sales - LAG(current_sales) OVER w_product_year   AS diff_prev,
    CASE
        -- Compared on the same unrounded difference that diff_prev reports,
        -- so the label can never contradict the number next to it.
        WHEN current_sales - LAG(current_sales) OVER w_product_year > 0 THEN 'Increase'
        WHEN current_sales - LAG(current_sales) OVER w_product_year < 0 THEN 'Decrease'
        ELSE 'No change'
    END                                                      AS prev_change
FROM yearly_product_sales
WHERE order_year IS NOT NULL
WINDOW
    w_product      AS (PARTITION BY product_name),
    w_product_year AS (PARTITION BY product_name ORDER BY order_year)
ORDER BY
    product_name,
    order_year;
13
Proportional Analysis or Part to whole
Assess the performance of individual components in relation to the overall
business, enabling us to identify which category contributes most significantly
to overall impact.
([𝑀𝑒𝑎𝑠𝑢𝑟𝑒]/𝑇𝑜𝑡𝑎𝑙 [𝑀𝑒𝑎𝑠𝑢𝑟𝑒]) ∗ 100 𝑏𝑦 [𝐷𝑖𝑚𝑒𝑛𝑠𝑖𝑜𝑛]
(𝑆𝑎𝑙𝑒𝑠/𝑇𝑜𝑡𝑎𝑙 𝑠𝑎𝑙𝑒𝑠) ∗ 100 𝑏𝑦 [𝐷𝑖𝑚𝑒𝑛𝑠𝑖𝑜𝑛]
(𝑄𝑢𝑎𝑛𝑡𝑖𝑡𝑦/𝑇𝑜𝑡𝑎𝑙 𝑄𝑢𝑎𝑛𝑡𝑖𝑡𝑦) ∗ 100 𝑏𝑦 [𝐶𝑜𝑢𝑛𝑡𝑟𝑦]
A 200 33%
B 300 50%
C 100 17%
17%
33% A
B
C
50%
14
Problem: Which categories contribute the most to overall sales?
-- Part-to-whole analysis: each category's share of overall sales.
WITH category_sales AS (
    -- Total sales per product category. The LEFT JOIN keeps sales rows that
    -- have no matching product; they are grouped under a NULL category.
    SELECT
        p.category,
        SUM(s.sales_amount) AS total_sales
    FROM sales AS s
    LEFT JOIN products AS p
        ON p.product_key = s.product_key
    GROUP BY p.category
)
SELECT
    category,
    total_sales,
    SUM(total_sales) OVER ()                                 AS overall_sales,
    ROUND((total_sales / SUM(total_sales) OVER ()) * 100, 2) AS `total(%)`
FROM category_sales
-- Sorting belongs on the final SELECT: an ORDER BY inside the CTE does not
-- guarantee the order of the rows returned here.
ORDER BY total_sales DESC;
15
Data Segmentation
[𝑀𝑒𝑎𝑠𝑢𝑟𝑒] 𝑏𝑦 [𝑀𝑒𝑎𝑠𝑢𝑟𝑒]
[𝑇𝑜𝑡𝑎𝑙 𝑝𝑟𝑜𝑑𝑢𝑐𝑡𝑠 𝑏𝑦 𝑠𝑎𝑙𝑒𝑠 𝑅𝑎𝑛𝑔𝑒]
[𝑇𝑜𝑡𝑎𝑙 𝑐𝑢𝑠𝑡𝑜𝑚𝑒𝑟𝑠 𝑏𝑦 𝐴𝑔𝑒]
Sum Categorize
3 50
Low 7
4 100
5 150
Medium 6
1 200
10 250 High 15
5 300
16
Problem: Segment products into cost ranges and count how many
products fall into each segment.
USE data_warehouse;

-- Bucket every product into a cost range, then count products per bucket.
WITH product_segment AS (
    SELECT
        product_key,
        product_name,
        cost,
        -- Branches are evaluated top to bottom, so each upper bound alone is
        -- enough: 500 lands in '100-500' and 1000 lands in '500-1000'.
        -- A NULL cost matches no branch and ends up in 'above 1000'.
        CASE
            WHEN cost < 100   THEN 'below 100'
            WHEN cost <= 500  THEN '100-500'
            WHEN cost <= 1000 THEN '500-1000'
            ELSE 'above 1000'
        END AS cost_range
    FROM products
)
SELECT
    cost_range,
    COUNT(product_key) AS total_products
FROM product_segment
GROUP BY cost_range
ORDER BY total_products DESC;
17
Problem: Group customers into three segments based on their
spending behavior:
- VIP: Customers with at least 12 months of history and spending
more than €5,000.
- Regular: Customers with at least 12 months of history but spending
€5,000 or less.
- New: Customers with a lifespan less than 12 months.
Then find the total number of customers in each group.
-- Segment customers by spending behaviour and count customers per segment.
--   VIP     : lifespan >= 12 months and total spending >  5000
--   Regular : lifespan >= 12 months and total spending <= 5000
--   New     : everything else (lifespan below 12 months, or NULL)
WITH customer_spending AS (
    -- Per-customer spending, first/last order date, and lifespan in whole
    -- months between those two dates.
    SELECT
        c.customer_key,
        SUM(s.sales_amount)                                        AS total_spending,
        MIN(s.order_date)                                          AS first_order,
        MAX(s.order_date)                                          AS last_order,
        TIMESTAMPDIFF(MONTH, MIN(s.order_date), MAX(s.order_date)) AS lifespan
    FROM sales AS s
    LEFT JOIN customers AS c
        ON s.customer_key = c.customer_key
    GROUP BY c.customer_key
),
customer_segments AS (
    -- Attach the segment label to every customer.
    SELECT
        customer_key,
        CASE
            WHEN lifespan >= 12 AND total_spending > 5000  THEN 'VIP'
            WHEN lifespan >= 12 AND total_spending <= 5000 THEN 'Regular'
            ELSE 'New'
        END AS customer_segment
    FROM customer_spending
)
SELECT
    customer_segment,
    -- COUNT(customer_key) skips a NULL key, i.e. sales rows whose customer
    -- had no match in customers.
    COUNT(customer_key) AS total_customers
FROM customer_segments
GROUP BY customer_segment
ORDER BY total_customers DESC;
19
Customer Report
Purpose: This report consolidates key customer metrics and behaviors.
Highlights:
1. Gathers essential fields such as names, ages, and transaction details.
2. Segments customers into categories (VIP, Regular, New) and age groups.
3. Aggregates customer-level metrics:
➢ total orders
➢ total sales
➢ total quantity purchased
➢ total products
➢ lifespan (in months)
4. Calculates valuable KPIs:
➢ recency (months since last order)
➢ average order value
➢ average monthly expenses
20
Task:
1. Gathers essential fields such as names, ages, and transaction details (Base
Query)
-- Step 1 (base query): one row per sales line, enriched with the customer's
-- identifiers, full name and age. Implemented as a CTE.
-- In MySQL a "--" comment must be followed by whitespace.
WITH base_query AS (
    SELECT
        s.order_number,
        s.product_key,
        s.order_date,
        s.sales_amount,
        s.quantity,
        c.customer_key,
        c.customer_number,
        CONCAT(c.first_name, ' ', c.last_name)      AS customer_name,
        TIMESTAMPDIFF(YEAR, c.birthdate, CURDATE()) AS age -- completed years as of today
    FROM sales AS s
    LEFT JOIN customers AS c
        ON c.customer_key = s.customer_key
    WHERE s.order_date IS NOT NULL
)
SELECT *
FROM base_query;
21
2) Customer Aggregations: Summarizes key metrics at the customer level.
-- Step 2 (customer aggregations): summarise the base query to one row per
-- customer with order, sales, quantity and product counts.
WITH base_query AS (
    -- One row per sales line with customer attributes attached.
    SELECT
        s.order_number,
        s.product_key,
        s.order_date,
        s.sales_amount,
        s.quantity,
        c.customer_key,
        c.customer_number,
        CONCAT(c.first_name, ' ', c.last_name)      AS customer_name,
        TIMESTAMPDIFF(YEAR, c.birthdate, CURDATE()) AS age
    FROM sales AS s
    LEFT JOIN customers AS c
        ON c.customer_key = s.customer_key
    WHERE s.order_date IS NOT NULL
)
SELECT
    customer_key,
    customer_number,
    customer_name,
    age,
    COUNT(DISTINCT order_number)                           AS total_orders,
    SUM(sales_amount)                                      AS total_sales,
    SUM(quantity)                                          AS total_quantity,
    COUNT(DISTINCT product_key)                            AS total_products,
    MAX(order_date)                                        AS last_order_date,
    -- Whole months between the customer's first and last order.
    TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan
FROM base_query
GROUP BY
    customer_key,
    customer_number,
    customer_name,
    age;
23
3. Segments customers into categories (VIP, Regular, New) and age groups.
-- Step 3 (segmentation): classify each customer into an age group and a
-- spending segment (VIP / Regular / New) on top of the customer aggregates.
WITH base_query AS (
    -- One row per sales line with customer attributes attached.
    SELECT
        s.order_number,
        s.product_key,
        s.order_date,
        s.sales_amount,
        s.quantity,
        c.customer_key,
        c.customer_number,
        CONCAT(c.first_name, ' ', c.last_name)      AS customer_name,
        TIMESTAMPDIFF(YEAR, c.birthdate, CURDATE()) AS age
    FROM sales AS s
    LEFT JOIN customers AS c
        ON c.customer_key = s.customer_key
    WHERE s.order_date IS NOT NULL
), -- the comma is required between consecutive CTEs
customer_aggregation AS (
    -- One row per customer with the aggregated metrics.
    SELECT
        customer_key,
        customer_number,
        customer_name,
        age,
        COUNT(DISTINCT order_number)                           AS total_orders,
        SUM(sales_amount)                                      AS total_sales,
        SUM(quantity)                                          AS total_quantity,
        COUNT(DISTINCT product_key)                            AS total_products,
        MAX(order_date)                                        AS last_order_date,
        TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan
    FROM base_query
    GROUP BY
        customer_key,
        customer_number,
        customer_name,
        age
)
SELECT
    customer_key,
    customer_number,
    customer_name,
    age,
    -- Age group classification.
    -- NOTE(review): a NULL age matches no WHEN branch and is therefore
    -- labelled '50 and above'; confirm that this is intended.
    CASE
        WHEN age < 20 THEN 'Under 20'
        WHEN age BETWEEN 20 AND 29 THEN '20-29'
        WHEN age BETWEEN 30 AND 39 THEN '30-39'
        WHEN age BETWEEN 40 AND 49 THEN '40-49'
        ELSE '50 and above'
    END AS age_group,
    -- Customer segment classification.
    CASE
        WHEN lifespan >= 12 AND total_sales > 5000  THEN 'VIP'
        WHEN lifespan >= 12 AND total_sales <= 5000 THEN 'Regular'
        ELSE 'New'
    END AS customer_segment,
    last_order_date,
    total_orders,
    total_sales,
    total_quantity,
    total_products,
    lifespan
FROM customer_aggregation;
26
4. Calculates valuable KPIs:
-- Step 4 (KPIs): the complete customer report, stored as a view.
-- One row per customer with identifiers, age group, spending segment,
-- aggregated metrics and the KPIs recency, average order value and
-- average monthly spend.
-- OR REPLACE keeps the script re-runnable.
CREATE OR REPLACE VIEW customers_report AS
WITH base_query AS (
    -- One row per sales line with customer attributes attached.
    SELECT
        s.order_number,
        s.product_key,
        s.order_date,
        s.sales_amount,
        s.quantity,
        c.customer_key,
        c.customer_number,
        CONCAT(c.first_name, ' ', c.last_name)      AS customer_name,
        TIMESTAMPDIFF(YEAR, c.birthdate, CURDATE()) AS age
    FROM sales AS s
    LEFT JOIN customers AS c
        ON c.customer_key = s.customer_key
    WHERE s.order_date IS NOT NULL
), -- the comma is required between consecutive CTEs
customer_aggregation AS (
    -- One row per customer with the aggregated metrics.
    SELECT
        customer_key,
        customer_number,
        customer_name,
        age,
        COUNT(DISTINCT order_number)                           AS total_orders,
        SUM(sales_amount)                                      AS total_sales,
        SUM(quantity)                                          AS total_quantity,
        COUNT(DISTINCT product_key)                            AS total_products,
        MAX(order_date)                                        AS last_order_date,
        TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan
    FROM base_query
    GROUP BY
        customer_key,
        customer_number,
        customer_name,
        age
)
SELECT
    customer_key,
    customer_number,
    customer_name,
    age,
    -- Age group classification.
    -- NOTE(review): a NULL age matches no WHEN branch and is therefore
    -- labelled '50 and above'; confirm that this is intended.
    CASE
        WHEN age < 20 THEN 'Under 20'
        WHEN age BETWEEN 20 AND 29 THEN '20-29'
        WHEN age BETWEEN 30 AND 39 THEN '30-39'
        WHEN age BETWEEN 40 AND 49 THEN '40-49'
        ELSE '50 and above'
    END AS age_group,
    -- Customer segment classification.
    CASE
        WHEN lifespan >= 12 AND total_sales > 5000  THEN 'VIP'
        WHEN lifespan >= 12 AND total_sales <= 5000 THEN 'Regular'
        ELSE 'New'
    END AS customer_segment,
    last_order_date,
    -- Recency: whole months from the last order to today.
    TIMESTAMPDIFF(MONTH, last_order_date, CURDATE()) AS recency,
    total_orders,
    total_sales,
    total_quantity,
    total_products,
    lifespan,
    -- Average order value (AOV); guarded against division by zero.
    CASE
        WHEN total_orders = 0 THEN 0
        ELSE ROUND(total_sales / total_orders, 2)
    END AS avg_order_value,
    -- Average monthly spend. When first and last order fall within the same
    -- month (lifespan = 0) the total itself is reported.
    CASE
        WHEN lifespan = 0 THEN ROUND(total_sales, 2)
        ELSE ROUND(total_sales / lifespan, 2)
    END AS avg_monthly_spend
FROM customer_aggregation;
29
-- Using the view.

-- Show the whole customer report.
SELECT *
FROM data_warehouse.customers_report;

-- Customers and sales per age group.
SELECT
    age_group,
    COUNT(customer_number) AS total_customers,
    SUM(total_sales)       AS total_sales
FROM customers_report
GROUP BY age_group;

-- Customers and sales per customer segment.
SELECT
    customer_segment,
    COUNT(customer_number) AS total_customers,
    SUM(total_sales)       AS total_sales
FROM customers_report
GROUP BY customer_segment;
31
Products Report
Purpose: This report consolidates key product metrics and behaviors.
Highlights:
1. Gathers essential fields such as product name, category, subcategory, and cost.
2. Segments Products by revenue to identify High-Performers, Mid-Range, or
Low-Performers.
3. Aggregates product-level metrics:
➢ total orders
➢ total sales
➢ total quantity sold
➢ total customers (unique)
➢ lifespan (in months)
4. Calculates valuable KPIs:
➢ recency (months since last sale)
➢ average order revenue (AOR)
➢ average monthly revenue
32
Task:
(1) Base Query: Retrieves core columns from sales and products.
-- Step 1 (base query): one row per sales line, enriched with the product's
-- name, category, subcategory and cost.
WITH base_query AS (
    SELECT
        s.order_number,
        s.order_date,
        s.customer_key,
        s.sales_amount,
        s.quantity,
        p.product_key,
        p.product_name,
        p.category,
        p.subcategory,
        p.cost
    FROM sales AS s
    LEFT JOIN products AS p
        ON s.product_key = p.product_key
    WHERE s.order_date IS NOT NULL -- only consider valid sales dates
)
SELECT *
FROM base_query;
33
2. Segments Products by revenue to identify High-Performers, Mid-Range, or
Low-Performers.
-- Step 2 (segmentation): label every product as High-Performer, Mid-Range
-- or Low-Performer based on its total sales.
WITH base_query AS (
    -- One row per sales line with product attributes attached.
    SELECT
        s.order_number,
        s.order_date,
        s.customer_key,
        s.sales_amount,
        s.quantity,
        p.product_key,
        p.product_name,
        p.category,
        p.subcategory,
        p.cost
    FROM sales AS s
    LEFT JOIN products AS p
        ON s.product_key = p.product_key
    WHERE s.order_date IS NOT NULL -- only consider valid sales dates
),
product_aggregations AS (
    -- Product aggregations: summarises key metrics at the product level.
    SELECT
        product_key,
        product_name,
        category,
        subcategory,
        cost,
        TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan,
        MAX(order_date)                                        AS last_sale_date,
        COUNT(DISTINCT order_number)                           AS total_orders,
        COUNT(DISTINCT customer_key)                           AS total_customers,
        SUM(sales_amount)                                      AS total_sales,
        SUM(quantity)                                          AS total_quantity,
        -- Average unit price; rows with quantity 0 are left out of the
        -- average instead of dividing by zero.
        ROUND(AVG(CASE WHEN quantity != 0 THEN sales_amount / quantity
                       ELSE NULL END), 1)                      AS avg_selling_price
    FROM base_query
    GROUP BY
        product_key,
        product_name,
        category,
        subcategory,
        cost
)
SELECT
    product_key,
    product_name,
    category,
    subcategory,
    cost,
    last_sale_date,
    TIMESTAMPDIFF(MONTH, last_sale_date, CURDATE()) AS recency_in_months,
    -- Product segment based on total sales.
    CASE
        WHEN total_sales > 50000  THEN 'High-Performer'
        WHEN total_sales >= 10000 THEN 'Mid-Range'
        ELSE 'Low-Performer'
    END AS product_segment
FROM product_aggregations;
36
3. Aggregates product-level metrics: total orders, total sales , total quantity sold,
total customers (unique),lifespan (in months).
-- Step 3 (product-level metrics): segment label plus lifespan, total orders,
-- total sales, total quantity, unique customers and average selling price.
WITH base_query AS (
    -- One row per sales line with product attributes attached.
    SELECT
        s.order_number,
        s.order_date,
        s.customer_key,
        s.sales_amount,
        s.quantity,
        p.product_key,
        p.product_name,
        p.category,
        p.subcategory,
        p.cost
    FROM sales AS s
    LEFT JOIN products AS p
        ON s.product_key = p.product_key
    WHERE s.order_date IS NOT NULL -- only consider valid sales dates
),
product_aggregations AS (
    -- One row per product with the aggregated metrics.
    SELECT
        product_key,
        product_name,
        category,
        subcategory,
        cost,
        TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan,
        MAX(order_date)                                        AS last_sale_date,
        COUNT(DISTINCT order_number)                           AS total_orders,
        COUNT(DISTINCT customer_key)                           AS total_customers,
        SUM(sales_amount)                                      AS total_sales,
        SUM(quantity)                                          AS total_quantity,
        -- Average unit price; rows with quantity 0 are left out of the
        -- average instead of dividing by zero.
        ROUND(AVG(CASE WHEN quantity != 0 THEN sales_amount / quantity
                       ELSE NULL END), 1)                      AS avg_selling_price
    FROM base_query
    GROUP BY
        product_key,
        product_name,
        category,
        subcategory,
        cost
)
SELECT
    product_key,
    product_name,
    category,
    subcategory,
    cost,
    last_sale_date,
    TIMESTAMPDIFF(MONTH, last_sale_date, CURDATE()) AS recency_in_months,
    -- Product segment based on total sales.
    CASE
        WHEN total_sales > 50000  THEN 'High-Performer'
        WHEN total_sales >= 10000 THEN 'Mid-Range'
        ELSE 'Low-Performer'
    END AS product_segment,
    lifespan,
    total_orders,
    total_sales,
    total_quantity,
    total_customers,
    avg_selling_price
FROM product_aggregations;
4. Calculates valuable KPIs: recency (months since last sale), average order
revenue (AOR), average monthly revenue.
-- Step 4 (KPIs): the complete product report, stored as a view.
-- One row per product with descriptive fields, segment label, the KPIs
-- recency, average order revenue and average monthly revenue, and the
-- aggregated product-level metrics.
-- OR REPLACE keeps the script re-runnable.
CREATE OR REPLACE VIEW products_report AS
WITH base_query AS (
    -- One row per sales line with product attributes attached.
    SELECT
        s.order_number,
        s.order_date,
        s.customer_key,
        s.sales_amount,
        s.quantity,
        p.product_key,
        p.product_name,
        p.category,
        p.subcategory,
        p.cost
    FROM sales AS s
    LEFT JOIN products AS p
        ON s.product_key = p.product_key
    WHERE s.order_date IS NOT NULL
),
product_aggregations AS (
    -- One row per product with the aggregated metrics.
    SELECT
        product_key,
        product_name,
        category,
        subcategory,
        cost,
        TIMESTAMPDIFF(MONTH, MIN(order_date), MAX(order_date)) AS lifespan,
        MAX(order_date)                                        AS last_sale_date,
        COUNT(DISTINCT order_number)                           AS total_orders,
        COUNT(DISTINCT customer_key)                           AS total_customers,
        SUM(sales_amount)                                      AS total_sales,
        SUM(quantity)                                          AS total_quantity,
        -- Average unit price; rows with quantity 0 are left out of the
        -- average instead of dividing by zero.
        ROUND(AVG(CASE WHEN quantity != 0 THEN sales_amount / quantity
                       ELSE NULL END), 1)                      AS avg_selling_price
    FROM base_query
    GROUP BY
        product_key,
        product_name,
        category,
        subcategory,
        cost
)
SELECT
    product_key,
    product_name,
    category,
    subcategory,
    cost,
    last_sale_date,
    -- Recency: whole months from the last sale to today.
    TIMESTAMPDIFF(MONTH, last_sale_date, CURDATE()) AS recency_in_months,
    -- Product segment based on total sales.
    CASE
        WHEN total_sales > 50000  THEN 'High-Performer'
        WHEN total_sales >= 10000 THEN 'Mid-Range'
        ELSE 'Low-Performer'
    END AS product_segment,
    -- Average order revenue (AOR); guarded against division by zero.
    CASE
        WHEN total_orders = 0 THEN 0
        ELSE ROUND(total_sales / total_orders, 2)
    END AS avg_order_revenue,
    -- Average monthly revenue. When first and last sale fall within the same
    -- month (lifespan = 0) the total itself is reported.
    CASE
        WHEN lifespan = 0 THEN ROUND(total_sales, 2)
        ELSE ROUND(total_sales / lifespan, 2)
    END AS avg_monthly_revenue,
    -- Aggregated product-level metrics. They are appended after the existing
    -- columns so the positions of those columns stay unchanged.
    lifespan,
    total_orders,
    total_sales,
    total_quantity,
    total_customers,
    avg_selling_price
FROM product_aggregations;

-- Show the whole product report.
SELECT *
FROM data_warehouse.products_report;
Thanks!
43