from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, date_format, concat_ws, sum as spark_sum, avg, stddev, count,
    lag, lead, when, lit, lower, trim,
    array, to_date, isnan, min, max, row_number, greatest, array_contains,
    exists, percent_rank
)
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType
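# Assumes an already-active SparkSession named `spark` (e.g. a notebook or
# pyspark-shell session). When run standalone, it could be created explicitly:
# spark = SparkSession.builder.appName("invoice_demand_features").getOrCreate()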
df_inv = spark.sql("select * from sharepoint_db.inv_new")
df_mas = spark.sql("select * from sharepoint_db.product_master_data")
# STEP 1: Select and rename relevant columns from raw invoice table
inv_selected = df_inv.select(
col("Invoice No").alias("inv_no"),
col("Order Line No").alias("order_line_no"),
lower(trim(col("Customer Code"))).alias("cust_code"),
col("Material Group").alias("product"),
col("Quantity").alias("quantity"),
col("Unit Price").alias("unit_price"),
col("Net Sales").alias("net_sales"),
col("Original Document").alias("return_inv"),
col("Invoice Date").alias("invoice_date"),
col("Payment Days").alias("payment_days")
)
# STEP 2: Separate original and return invoices
original_inv = inv_selected.filter(
    trim(col("return_inv")).isNull() | (trim(col("return_inv")) == "")
)
return_inv = inv_selected.filter(
    trim(col("return_inv")).isNotNull() & (trim(col("return_inv")) != "")
)\
.withColumnRenamed("inv_no", "ret_inv_no")\
.withColumnRenamed("order_line_no", "ret_order_line_no")\
.withColumnRenamed("cust_code", "ret_cust_code")\
.withColumnRenamed("product", "ret_product")\
.withColumnRenamed("quantity", "ret_quantity")\
.withColumnRenamed("unit_price", "ret_unit_price")\
.withColumnRenamed("net_sales", "ret_net_sales")\
.withColumnRenamed("return_inv", "ret_return_inv")\
.withColumnRenamed("invoice_date", "ret_invoice_date")\
.withColumnRenamed("payment_days", "ret_payment_days")
# STEP 3: Filter valid return invoices (those whose return reference exists in the originals)
orig_inv_keys = original_inv.select("inv_no").distinct()
valid_return_inv = return_inv.join(
    orig_inv_keys,
    return_inv.ret_return_inv == orig_inv_keys.inv_no,
    how="inner"
).drop(orig_inv_keys.inv_no)
# STEP 4: Join original and return invoices
adjusted_inv = original_inv.alias("orig").join(
valid_return_inv.alias("ret"),
(col("orig.inv_no") == col("ret.ret_return_inv")) &
(col("orig.order_line_no") == col("ret.ret_order_line_no")) &
(col("orig.cust_code") == col("ret.ret_cust_code")),
how="left"
)
# STEP 5: Adjust quantity and net sales
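# If a return fully offsets the original quantity but at a different unit price,
# it is treated as a value correction: the quantity is kept and only net sales is
# adjusted. Otherwise the (negative) returned quantity is added to net out the
# physical return; lines without a matching return pass through unchanged.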
adjusted_inv = adjusted_inv.withColumn(
"adjusted_quantity",
when(
col("ret.ret_unit_price").isNotNull() &
(col("orig.unit_price") != -col("ret.ret_unit_price")) &
(col("orig.quantity") == -col("ret.ret_quantity")),
col("orig.quantity")
).when(
col("ret.ret_quantity").isNotNull(),
col("orig.quantity") + col("ret.ret_quantity")
).otherwise(col("orig.quantity"))
)
adjusted_inv = adjusted_inv.withColumn(
"adjusted_net_sales",
when(
col("ret.ret_net_sales").isNotNull(),
col("orig.net_sales") + col("ret.ret_net_sales")
).otherwise(col("orig.net_sales"))
)
# STEP 6: Remove fully returned items
final_inv = adjusted_inv.filter(col("adjusted_net_sales") != 0)
# STEP 7: Final clean DataFrame with proper naming
inv_df = final_inv.select(
col("orig.inv_no").alias("Invoice No"),
col("orig.order_line_no").alias("Order Line No"),
col("orig.cust_code").alias("Customer Code"),
col("orig.product").alias("Material Group"),
col("adjusted_quantity").alias("Quantity"),
col("orig.unit_price").alias("Unit Price"),
col("adjusted_net_sales").alias("Net Sales"),
col("orig.invoice_date").alias("Invoice Date"),
col("orig.payment_days").alias("Payment Days"),
col("ret.ret_inv_no").alias("Returned By")
)
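# Enrich invoices with product master data; the join assumes df_mas carries the
# "Material Group" key plus the "Material" and "SBU Code" columns used below.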
joined_df = inv_df.join(df_mas, on="Material Group", how="inner")
df = joined_df.withColumn("invoice_date_parsed", to_date(col("Invoice Date"), "dd.MM.yyyy"))
df = df.withColumn("invoice_month", date_format(col("invoice_date_parsed"), "yyyy-MM"))
df = df.withColumn("product_bu_id", concat_ws("_", col("Material"), col("SBU Code")))
df = df.withColumn(
    "Actual_Quantity",
    when(col("Quantity") > 0, col("Quantity")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Return_Quantity",
    when(col("Quantity") < 0, -col("Quantity")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Net_Sales_Positive",
    when(col("Net Sales") > 0, col("Net Sales")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Net_Sales_Negative",
    when(col("Net Sales") < 0, -col("Net Sales")).otherwise(lit(0).cast(DoubleType()))
)
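# Aggregate to one row per product-BU per month: total sold quantity and total
# positive net sales value.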
monthly_agg_df = df.groupBy("invoice_month", "product_bu_id").agg(
spark_sum("Actual_Quantity").alias("monthly_sales_qty"),
spark_sum("Net_Sales_Positive").alias("monthly_sales_value")
).orderBy("product_bu_id", "invoice_month")
# Convert Month to Date Type
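# (a "yyyy-MM" pattern in to_date resolves to the first day of that month)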
monthly_agg_df = monthly_agg_df.withColumn(
"invoice_month_date",
to_date(col("invoice_month"), "yyyy-MM")
)
# Window per product-BU ordered by month (lags / rolling features)
window_1 = Window.partitionBy("product_bu_id").orderBy("invoice_month_date")
# Unordered window per product-BU (static statistics)
window_2 = Window.partitionBy("product_bu_id")
# Lag features: sales 1, 2 and 3 months ago
monthly_agg_df = monthly_agg_df.withColumn("lag_1", lag("monthly_sales_qty", 1).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("lag_2", lag("monthly_sales_qty", 2).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("lag_3", lag("monthly_sales_qty", 3).over(window_1))
# Rolling average: mean sales over the last 3 months (current month included)
monthly_agg_df = monthly_agg_df.withColumn(
    "rolling_3m_avg",
    avg("monthly_sales_qty").over(window_1.rowsBetween(-2, 0))
)
# Growth rate compared to the previous month
monthly_agg_df = monthly_agg_df.withColumn(
    "growth_rate",
    when(col("lag_1") > 0,
         (col("monthly_sales_qty") - col("lag_1")) / col("lag_1")).otherwise(lit(None))
)
# Static Statistics per Product-BU
monthly_agg_df = monthly_agg_df.withColumn(
"avg_monthly_sales_qty",
avg("monthly_sales_qty").over(window_2)
).withColumn(
"std_monthly_sales_qty",
stddev("monthly_sales_qty").over(window_2)
)
# Zero-Sales Metrics
monthly_agg_df = monthly_agg_df.withColumn(
"zero_sales",
when(col("monthly_sales_qty") == 0, 1).otherwise(0)
).withColumn(
"zero_sales_months",
spark_sum("zero_sales").over(window_2)
).withColumn(
"total_months",
count("monthly_sales_qty").over(window_2)
).withColumn(
"zero_sales_percent",
col("zero_sales_months") / col("total_months")
)
# Average Growth Rate per Product-BU
monthly_agg_df = monthly_agg_df.withColumn(
"avg_growth_rate",
avg("growth_rate").over(window_2)
)
monthly_agg_df = monthly_agg_df.withColumn(
    "monthly_avg_price",
    when(col("monthly_sales_qty") != 0,
         col("monthly_sales_value") / col("monthly_sales_qty")).otherwise(lit(0))
)
# Lag Price Features
monthly_agg_df = monthly_agg_df.withColumn("price_lag_1", lag("monthly_avg_price", 1).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("price_lag_2", lag("monthly_avg_price", 2).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("price_lag_3", lag("monthly_avg_price", 3).over(window_1))
# Replace Null Lag Values
for col_name in ["price_lag_1", "price_lag_2", "price_lag_3"]:
    monthly_agg_df = monthly_agg_df.withColumn(
        col_name,
        when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
    )
# Rolling Average of Price (Last 3 Months)
monthly_agg_df = monthly_agg_df.withColumn(
"rolling_price_3m_avg",
avg("monthly_avg_price").over(window_1.rowsBetween(-2, 0))
)
monthly_agg_df = monthly_agg_df.withColumn(
"price_growth_rate",
when(col("price_lag_1") > 0,
(col("monthly_avg_price") - col("price_lag_1")) / col("price_lag_1"))
.otherwise(lit(0))
)
monthly_agg_df = monthly_agg_df.withColumn(
"avg_monthly_price",
avg("monthly_avg_price").over(window_2)
).withColumn(
"std_monthly_price",
stddev("monthly_avg_price").over(window_2)
)
col_name="growth_rate"
monthly_agg_df = monthly_agg_df.withColumn(
col_name,
when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
)
col_name="avg_growth_rate"
monthly_agg_df = monthly_agg_df.withColumn(
col_name,
when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
)
# Target Variables (Next 3 Months)
for i in [1, 2, 3]:
    monthly_agg_df = monthly_agg_df.withColumn(
        f"y_{i}",
        lead("monthly_sales_qty", i).over(window_1)
    )
# Drop Rows with Missing Targets
monthly_agg_df = monthly_agg_df.na.drop(subset=["y_1", "y_2", "y_3"])
# Combine Targets into Array
monthly_agg_df = monthly_agg_df.withColumn(
"target",
array("y_1", "y_2", "y_3")
).drop("y_1", "y_2", "y_3")
from pyspark.sql.functions import avg, stddev, max, min, col
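# Per-product clustering profile: sales level, volatility, growth, intermittency
# and price behaviour. cv_* are coefficients of variation (std / mean); fillna(0)
# covers nulls such as a standard deviation computed over a single month.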
cluster_features_df = monthly_agg_df.groupBy("product_bu_id").agg(
avg("monthly_sales_qty").alias("avg_sales"),
stddev("monthly_sales_qty").alias("std_sales"),
avg("growth_rate").alias("avg_growth_rate"),
avg("zero_sales_percent").alias("zero_sales_ratio"),
max("monthly_sales_qty").alias("peak_sales"),
min("monthly_sales_qty").alias("min_sales"),
avg("monthly_avg_price").alias("avg_price"),
stddev("monthly_avg_price").alias("std_price"),
avg("price_growth_rate").alias("avg_price_growth")
).withColumn("cv_sales", col("std_sales") / col("avg_sales")) \
.withColumn("cv_price", col("std_price") / col("avg_price")) \
.fillna({
"avg_growth_rate": 0,
"std_sales": 0,
"cv_sales": 0,
"zero_sales_ratio": 0,
"avg_price": 0,
"std_price": 0,
"avg_price_growth": 0,
"cv_price": 0
})
features = [
"avg_sales", "std_sales", "avg_growth_rate", "zero_sales_ratio",
"peak_sales", "min_sales", "cv_sales",
"avg_price", "std_price", "avg_price_growth", "cv_price"
]
# Apply percent_rank for each feature
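# Note: Window.orderBy without partitionBy moves all rows to a single partition;
# that is acceptable here because there is only one row per product_bu_id.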
for feature in features:
    rank_col = f"{feature}_rank"
    w = Window.orderBy(col(feature))
    cluster_features_df = cluster_features_df.withColumn(rank_col, percent_rank().over(w))
# Select product_bu_id and all rank columns
rank_cols = [f"{f}_rank" for f in features]
cluster_input_df = cluster_features_df.select(["product_bu_id"] + rank_cols)
from pyspark.ml.feature import VectorAssembler, StandardScaler
# Step 1: Assemble the rank features into a single vector
assembler = VectorAssembler(
    inputCols=rank_cols,
    outputCol="features_unscaled"
)
assembled_data = assembler.transform(cluster_input_df)
# Step 2: Scale features (mean=0, std=1); withMean=True is required for mean centering
scaler = StandardScaler(
    inputCol="features_unscaled", outputCol="scaled_features",
    withMean=True, withStd=True
)
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)
from pyspark.ml.clustering import KMeans
# Define range of K values to test
k_values = range(2, 10)
wcss = []
# Compute WCSS for each K
for k in k_values:
    kmeans = KMeans(k=k, seed=42).setFeaturesCol("scaled_features")
    model = kmeans.fit(scaled_data)
    wcss.append(model.summary.trainingCost)
# Plot Elbow Chart
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Within-Cluster Sum of Squares (WCSS)")
plt.title("Elbow Method for Optimal K")
plt.grid(True)
plt.show()
from pyspark.ml.clustering import KMeans
# Apply K-means clustering; k=6 chosen from the elbow chart above
kmeans = KMeans(k=6, seed=42).setFeaturesCol("scaled_features")
model = kmeans.fit(scaled_data)
clustered_products = model.transform(scaled_data).select("product_bu_id", "prediction")
from pyspark.ml.evaluation import ClusteringEvaluator
# Apply the model to the full scaled_data, not just select columns
clustered_data = model.transform(scaled_data)
# Create evaluator
evaluator = ClusteringEvaluator(
    featuresCol='scaled_features', metricName='silhouette', distanceMeasure='squaredEuclidean'
)
# Evaluate silhouette score
silhouette_score = evaluator.evaluate(clustered_data)
print(f"Silhouette Score: {silhouette_score}")