Subject: Data Warehouse and Mining Lab
Sr. No.: 1
Name of experiment: Data Warehouse Construction
a) Real-life problem to be defined for warehouse design
b) Construction of star schema and snowflake schema
c) ETL operations
import sqlite3
import pandas as pd
# --- Step 1: Create sample CSV files ---
def create_sample_csvs():
    """Write the five sample source CSV files into the current directory.

    Creates (or overwrites) ``products.csv``, ``stores.csv``,
    ``customers.csv``, ``dates.csv`` and ``sales.csv``. Every file has a
    header row followed by three data rows and ends with a newline.

    ``Total_Amount`` in ``sales.csv`` is written as 0 for every row; it is a
    placeholder column in the sample data.
    """
    # Each entry maps a file name to its rows (header first). Keeping each
    # row as one string guarantees a header can never be split across lines.
    sample_files = {
        "products.csv": (
            "Product_ID,Product_Name,Category,Brand,Price",
            "101,Laptop X1,Electronics,TechBrand,1200.00",
            "102,Headphones A2,Electronics,SoundMax,150.00",
            "103,Smartphone Z5,Electronics,MobilePro,850.00",
        ),
        "stores.csv": (
            "Store_ID,Store_Name,City,Region",
            "201,Downtown Store,New York,East",
            "202,Uptown Store,Chicago,Midwest",
            "203,Suburban Store,San Francisco,West",
        ),
        "customers.csv": (
            "Customer_ID,Name,Gender,Age,Membership_Level",
            "301,Alice Johnson,Female,34,Gold",
            "302,Bob Smith,Male,28,Silver",
            "303,Carol Lee,Female,41,Platinum",
        ),
        "dates.csv": (
            "Date_ID,Date,Month,Quarter,Year",
            "1,2025-07-01,July,Q3,2025",
            "2,2025-07-02,July,Q3,2025",
            "3,2025-07-03,July,Q3,2025",
        ),
        "sales.csv": (
            "Sale_ID,Date_ID,Product_ID,Store_ID,Customer_ID,"
            "Quantity_Sold,Total_Amount",
            "1,1,101,201,301,1,0",
            "2,2,102,202,302,0,0",
            "3,3,103,203,303,1,0",
        ),
    }

    # Write CSV contents to files
    for filename, rows in sample_files.items():
        with open(filename, "w", encoding="utf-8") as f:
            f.write("\n".join(rows) + "\n")

    print("Sample CSV files created.")
# --- Step 2: Create Star Schema tables ---
def create_star_schema(conn):
    """Create the star-schema tables if they do not already exist.

    Four dimension tables (``DimDate``, ``DimProduct``, ``DimStore``,
    ``DimCustomer``) and one fact table (``FactSales``) are created. The
    fact table references each dimension through a foreign key. The call is
    safe to repeat because every statement uses ``IF NOT EXISTS``.

    Args:
        conn: An open ``sqlite3`` connection. The DDL is committed on it
            before the function returns.

    Note:
        SQLite only enforces the declared foreign keys on connections where
        ``PRAGMA foreign_keys = ON`` has been issued; this function does not
        change that setting.
    """
    # Dimensions come first so the fact table's references read top-down.
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS DimDate (
            Date_ID INTEGER PRIMARY KEY,
            Date TEXT,
            Month TEXT,
            Quarter TEXT,
            Year INTEGER
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS DimProduct (
            Product_ID INTEGER PRIMARY KEY,
            Product_Name TEXT,
            Category TEXT,
            Brand TEXT,
            Price REAL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS DimStore (
            Store_ID INTEGER PRIMARY KEY,
            Store_Name TEXT,
            City TEXT,
            Region TEXT
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS DimCustomer (
            Customer_ID INTEGER PRIMARY KEY,
            Name TEXT,
            Gender TEXT,
            Age INTEGER,
            Membership_Level TEXT
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS FactSales (
            Sale_ID INTEGER PRIMARY KEY,
            Date_ID INTEGER,
            Product_ID INTEGER,
            Store_ID INTEGER,
            Customer_ID INTEGER,
            Quantity_Sold INTEGER,
            Total_Amount REAL,
            FOREIGN KEY (Date_ID) REFERENCES DimDate(Date_ID),
            FOREIGN KEY (Product_ID) REFERENCES DimProduct(Product_ID),
            FOREIGN KEY (Store_ID) REFERENCES DimStore(Store_ID),
            FOREIGN KEY (Customer_ID) REFERENCES DimCustomer(Customer_ID)
        );
        """,
    )

    cursor = conn.cursor()
    try:
        for ddl in ddl_statements:
            cursor.execute(ddl)
        conn.commit()
    finally:
        # Release the cursor even when a statement fails.
        cursor.close()

    print("Star schema tables created.")
# --- Step 3: ETL process ---
def _clear_table_if_exists(conn, table_name):
    """Delete every row of *table_name* if the table exists; else do nothing."""
    found = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
        (table_name,),
    ).fetchone()
    if found is not None:
        # table_name only ever comes from the fixed load plan below.
        conn.execute(f'DELETE FROM "{table_name}"')


def etl_load_from_csv(conn):
    """Extract the sample CSVs, clean them, and load them into SQLite.

    Extract: reads ``products.csv``, ``stores.csv``, ``customers.csv``,
    ``dates.csv`` and ``sales.csv`` from the current directory.

    Transform: drops fully duplicated rows from every frame, strips
    surrounding whitespace from ``Product_Name``, and recomputes
    ``Total_Amount`` as ``Quantity_Sold * Price`` (price 0 when the
    product ID is not present in the products file).

    Load: replaces the *rows* of ``DimProduct``, ``DimStore``,
    ``DimCustomer``, ``DimDate`` and ``FactSales``. Existing tables are
    emptied and appended to rather than dropped, so the keys and foreign
    keys declared when the tables were created are preserved. Tables that
    do not exist yet are created by pandas.

    Args:
        conn: An open ``sqlite3`` connection. Changes are committed on it.
    """
    # Load CSV files
    df_products = pd.read_csv("products.csv")
    df_stores = pd.read_csv("stores.csv")
    df_customers = pd.read_csv("customers.csv")
    df_dates = pd.read_csv("dates.csv")
    df_sales = pd.read_csv("sales.csv")

    # Clean data
    for frame in (df_products, df_stores, df_customers, df_dates, df_sales):
        frame.drop_duplicates(inplace=True)
    df_products['Product_Name'] = df_products['Product_Name'].str.strip()

    # Calculate Total_Amount in sales. A vectorised lookup replaces the
    # row-wise apply; unknown product IDs map to NaN and are priced at 0.
    product_prices = df_products.set_index('Product_ID')['Price']
    unit_price = df_sales['Product_ID'].map(product_prices).fillna(0)
    df_sales['Total_Amount'] = df_sales['Quantity_Sold'] * unit_price

    # Dimensions first, fact table last.
    load_plan = (
        ('DimProduct', df_products),
        ('DimStore', df_stores),
        ('DimCustomer', df_customers),
        ('DimDate', df_dates),
        ('FactSales', df_sales),
    )

    # Remove old data in reverse order so the fact rows go before the
    # dimension rows they reference.
    for table_name, _ in reversed(load_plan):
        _clear_table_if_exists(conn, table_name)

    # Append keeps the existing table definitions; if_exists='replace'
    # would drop each table and recreate it without its constraints.
    for table_name, frame in load_plan:
        frame.to_sql(table_name, conn, if_exists='append', index=False)

    conn.commit()
    print("ETL completed and data loaded.")
# --- Step 4: Main ---
def main():
    """Run the full demo: write sample CSVs, build the schema, load, preview.

    Opens (or creates) ``retail_warehouse.db`` in the current directory,
    runs the schema creation and ETL steps against it, and prints up to
    five rows of ``FactSales``. The connection is closed even if one of the
    steps raises.
    """
    create_sample_csvs()
    conn = sqlite3.connect("retail_warehouse.db")
    try:
        create_star_schema(conn)
        etl_load_from_csv(conn)

        # Show some sample data from FactSales
        df = pd.read_sql_query("SELECT * FROM FactSales LIMIT 5;", conn)
        print("\nSample data from FactSales:")
        print(df)
    finally:
        conn.close()
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
OUTPUT
Sample CSV files created.
Star schema tables created.
ETL completed and data loaded.

Sample data from FactSales:
   Sale_ID  Date_ID  Product_ID  Store_ID  Customer_ID  Quantity_Sold  \
0        1        1         101       201          301              1
1        2        2         102       202          302              0
2        3        3         103       203          303              1

   Total_Amount
0        1200.0
1           0.0
2         850.0