Unsupervised Learning Analysis for E-Commerce Orders Segmentation¶
Building on the cleaned e-commerce dataset established in our data preprocessing phase, this report applies unsupervised learning techniques to uncover latent structure within order behavior. Specifically, we pursue two complementary objectives:
- Customer Order Segmentation via K-Means Clustering — grouping orders by their numerical characteristics (basket value, quantity, freight proportion, and product diversity) to identify distinct purchasing profiles that can inform targeted marketing and logistics strategies.
- Market Basket Analysis via Association Rule Mining — discovering statistically significant co-occurrence patterns between product categories, payment methods, and order sizes to surface actionable cross-sell and promotional opportunities.
Together, these analyses translate raw transactional data into strategic business intelligence.
Setup¶
The following libraries underpin the full analytical pipeline. pandas and numpy handle data manipulation; matplotlib and seaborn drive visualization; sklearn provides preprocessing and clustering utilities; and mlxtend supports the Apriori-based association rule mining workflow.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
plt.style.use('seaborn-v0_8')
Part 1: Data Ingestion¶
We load the consolidated, cleaned dataset produced during the data preprocessing phase — a single order-level file (ecommerce_orders_cleaned.csv) representing the merged and validated output of the original orders, order_items, order_shipping, and payments tables. This serves as the authoritative source of truth for all downstream analysis.
# Load the cleaned dataset
from pathlib import Path
# Resolve the CSV location: next to this file when run as a script,
# otherwise fall back to the notebook-relative path
if "__file__" in dir():
    csv_path = Path(__file__).parent / "ecommerce_orders_cleaned.csv"
else:
    csv_path = Path("../AS3/ecommerce_orders_cleaned.csv")
df = pd.read_csv(csv_path)
print(df.shape)
df.head()
(46287, 17)
| order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category | customer_state | payment_type | order_value_per_item | order_size_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction | Massachusetts | voucher | 352.420029 | Small |
| 1 | sdv-id-dbopoJ | delivered | 19 | 2 | 3 | 1556.667902 | 1 | 1 | 1 | 289.242639 | 1354.621410 | 15.394619 | health_beauty | Vermont | credit_card | 289.242639 | Small |
| 2 | sdv-id-FSEOvM | delivered | 15 | 4 | 8 | 62.060506 | 1 | 1 | 1 | 26.893468 | 48.485654 | 18.751282 | luggage_accessories | South Carolina | debit_card | 26.893468 | Small |
| 3 | sdv-id-bQcBUR | delivered | 21 | 0 | 8 | 73.873470 | 1 | 1 | 1 | 37.790896 | 75.704909 | 8.670875 | computers_accessories | Kentucky | credit_card | 37.790896 | Small |
| 4 | sdv-id-MPxIXB | delivered | 13 | 5 | 5 | 361.961537 | 3 | 3 | 3 | 169.528323 | 50.132979 | 34.731146 | pet_shop | Missouri | voucher | 56.509441 | Medium |
Part 2: Unsupervised Learning¶
Objective¶
With a clean order-level dataset in hand, we apply unsupervised learning to surface hidden structure in purchasing behavior. The goal is to group orders by shared numerical characteristics — order value, item quantity, freight proportion, and product variety — into distinct customer segments. These segments form the basis for targeted promotions, differentiated logistics planning, and personalized recommendation strategies.
Data Preprocessing for Unsupervised Learning¶
Constructing the Order-Level Feature Matrix (all_orders_df)¶
Before clustering can be applied, the dataset must be structured so that each row represents exactly one order. From the cleaned dataset, we extract and derive the following features:
- basket_size — total monetary value of the order
- basket_qty — total number of items
- freight_ratio — the share of the order value attributable to shipping (total_freight_value / order_total_value), a proxy for logistics intensity
- product_diversity — the breadth of distinct product types relative to basket size (num_unique_products / num_items), capturing whether customers shop narrowly or broadly
- top_product_category, payment_type, order_size_category — categorical dimensions used later in association mining
Division-by-zero cases (zero-value or empty orders) are handled explicitly. Remaining missing values in categorical columns are replaced with "Unknown", and numerical columns are zero-filled.
# Construct all_orders_df — one row per order with derived clustering features
# one row per order
base = df.drop_duplicates(subset=["order_id"]).copy()
# required columns + renamed fields
all_orders_df = pd.DataFrame({
"order_id": base["order_id"],
"basket_size": base["order_total_value"],
"basket_qty": base["num_items"],
"top_product_category": base["top_product_category"],
"payment_type": base["payment_type"],
"order_size_category": base["order_size_category"],
})
# derived features (safe division)
all_orders_df["freight_ratio"] = np.where(
base["order_total_value"] > 0,
base["total_freight_value"] / base["order_total_value"],
0.0
)
all_orders_df["product_diversity"] = np.where(
base["num_items"] > 0,
base["num_unique_products"] / base["num_items"],
0.0
)
# handle missing values
cat_cols = ["top_product_category", "payment_type", "order_size_category"]
num_cols = ["basket_size", "basket_qty", "freight_ratio", "product_diversity"]
all_orders_df[cat_cols] = all_orders_df[cat_cols].fillna("Unknown")
all_orders_df[num_cols] = all_orders_df[num_cols].fillna(0)
# optional checks
print("Unique order_id:", all_orders_df["order_id"].nunique())
print("Rows:", len(all_orders_df))
print("Missing values:\n", all_orders_df.isna().sum())
print(all_orders_df.head(10))
print("Shape:", all_orders_df.shape)
print(all_orders_df.dtypes)
Unique order_id: 46287
Rows: 46287
Missing values:
order_id 0
basket_size 0
basket_qty 0
top_product_category 0
payment_type 0
order_size_category 0
freight_ratio 0
product_diversity 0
dtype: int64
order_id basket_size basket_qty top_product_category \
0 sdv-id-whzjUX 744.312535 1 construction_tools_construction
1 sdv-id-dbopoJ 1556.667902 1 health_beauty
2 sdv-id-FSEOvM 62.060506 1 luggage_accessories
3 sdv-id-bQcBUR 73.873470 1 computers_accessories
4 sdv-id-MPxIXB 361.961537 3 pet_shop
5 sdv-id-ySodbB 867.750230 7 sports_leisure
6 sdv-id-wdwBrk 55.965822 2 toys
7 sdv-id-maBYtk 170.295452 3 computers_accessories
8 sdv-id-oodUXj 83.065603 1 furniture_decor
9 sdv-id-zroJIL 98.250248 1 fashion_bags_accessories
payment_type order_size_category freight_ratio product_diversity
0 voucher Small 0.092421 1.000000
1 credit_card Small 0.009889 1.000000
2 debit_card Small 0.302145 1.000000
3 credit_card Small 0.117375 1.000000
4 voucher Medium 0.095953 1.000000
5 voucher Large 0.204442 0.857143
6 points Small 0.285101 0.500000
7 voucher Medium 0.089290 1.000000
8 voucher Small 0.236471 1.000000
9 points Small 0.369709 1.000000
Shape: (46287, 8)
order_id str
basket_size float64
basket_qty int64
top_product_category str
payment_type str
order_size_category str
freight_ratio float64
product_diversity float64
dtype: object
Sampling for Cluster Exploration (orders_analysis_df)¶
To tune and validate the clustering configuration (specifically, selecting the optimal number of clusters) on a representative but computationally efficient subset, we hold out 30% of all_orders_df as an analysis sample via a simple random split with random_state=37 for reproducibility. The final cluster model is subsequently re-fitted on the full dataset.
# Sample 30% of all_orders_df for cluster-count selection (analysis subset)
_, orders_analysis_df = train_test_split(
all_orders_df, test_size=0.3, random_state=37
)
print("Shape of orders_analysis_df:", orders_analysis_df.shape)
print("\nFirst 10 rows of orders_analysis_df:")
print(orders_analysis_df.head(10))
Shape of orders_analysis_df: (13887, 8)
First 10 rows of orders_analysis_df:
order_id basket_size basket_qty top_product_category \
1882 sdv-id-ISddRn 132.611695 1 christmas_supplies
40995 sdv-id-GmlErp 730.510053 6 furniture_decor
40104 sdv-id-BWpket 632.486142 1 bed_bath_table
25808 sdv-id-krmbPT 47.102393 1 telephony
23294 sdv-id-vhfMmG 133.285008 1 health_beauty
7160 sdv-id-kcTYfM 403.696187 2 auto
7180 sdv-id-tFFFCg 211.465148 2 furniture_decor
20801 sdv-id-KgDKAA 57.520738 1 bed_bath_table
19879 sdv-id-cfCjEB 576.212699 3 bed_bath_table
39874 sdv-id-uxRZME 141.772602 2 fashion_bags_accessories
payment_type order_size_category freight_ratio product_diversity
1882 voucher Small 0.126285 1.000000
40995 credit_card Large 0.075288 0.666667
40104 credit_card Small 0.051262 1.000000
25808 credit_card Small 0.248932 1.000000
23294 debit_card Small 0.135860 1.000000
7160 credit_card Small 0.079065 0.500000
7180 voucher Small 0.162263 1.000000
20801 credit_card Small 0.228879 1.000000
19879 credit_card Medium 0.056350 0.666667
39874 credit_card Small 0.200719 1.000000
Determining the Optimal Number of Clusters¶
K-Means Inertia Sweep (k = 3 to 9)¶
Before committing to a cluster count, we systematically evaluate K-Means across a range of values (k = 3 to 9) on the analysis sample. Only the four numerical features — basket_size, basket_qty, freight_ratio, and product_diversity — are used. Because K-Means is distance-based, clustering on raw scales would bias results toward high-magnitude features such as basket_size; all four features are therefore standardized using StandardScaler prior to fitting.
For each k, we record the inertia — the sum of squared distances between each order and its assigned cluster centroid. Inertia decreases monotonically as k increases; the objective is to identify the point at which additional clusters yield diminishing returns in compactness.
# K-Means inertia sweep across k = 3 to 9 on standardized numerical features
num_cols = ["basket_size", "basket_qty", "freight_ratio", "product_diversity"]
X = orders_analysis_df[num_cols].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
k_values = list(range(3, 10))
inertia_values = []
for k in k_values:
km = KMeans(n_clusters=k, random_state=37, n_init=10)
km.fit(X_scaled)
inertia_values.append(km.inertia_)
inertia_table = pd.DataFrame({"k": k_values, "inertia": inertia_values})
inertia_table
| k | inertia | |
|---|---|---|
| 0 | 3 | 28884.287460 |
| 1 | 4 | 21464.393895 |
| 2 | 5 | 17303.035640 |
| 3 | 6 | 15013.968444 |
| 4 | 7 | 13219.438102 |
| 5 | 8 | 11762.794068 |
| 6 | 9 | 10516.159393 |
Elbow Method — Selecting the Optimal k¶
The Elbow Method plots inertia against k to visually identify the inflection point at which the marginal gain in cluster compactness begins to plateau. This inflection — the "elbow" — represents the most parsimonious cluster count: enough structure to be meaningful, without overfitting spurious micro-segments.
Based on the plot below, we select k = 4 as the optimal configuration, as this marks a clear change in the rate of inertia reduction before the curve smooths out further.
# Elbow Method — plot inertia vs k to identify the optimal cluster count
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia_values, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.show()
optimal_k = 4
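As a numeric complement to the visual elbow inspection, the marginal inertia reduction at each step can be tabulated directly. A minimal sketch using the inertia values from the sweep above (transcribed from the table; exact figures depend on the sampled data):

```python
# Percentage drop in inertia at each successive k, from the sweep results above
inertia = {3: 28884.29, 4: 21464.39, 5: 17303.04, 6: 15013.97,
           7: 13219.44, 8: 11762.79, 9: 10516.16}

ks = sorted(inertia)
drops = {
    k: 100 * (inertia[prev] - inertia[k]) / inertia[prev]
    for prev, k in zip(ks, ks[1:])
}
for k, pct in drops.items():
    print(f"k={k}: inertia falls {pct:.1f}% vs k={k - 1}")
# k=4: inertia falls 25.7% vs k=3
# k=5: inertia falls 19.4% vs k=4
# ...
```

The marginal gain declines steadily after k = 4 and flattens toward 10–12% per step, consistent with the elbow chosen above.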
Cluster Analysis and Visualization¶
Assigning Clusters to the Full Dataset¶
With the optimal cluster count established at k = 4, we re-fit K-Means on the full all_orders_df using the same four standardized numerical features. Each order is assigned a cluster label, appended as a cluster column. Fitting on the complete dataset ensures that all orders — not just the analysis sample — carry a meaningful segment assignment for downstream profiling and recommendation logic.
# Fit K-Means with optimal k on the full dataset and assign cluster labels
X_all = all_orders_df[num_cols].copy()
scaler_all = StandardScaler()
X_all_scaled = scaler_all.fit_transform(X_all)
kmeans_final = KMeans(n_clusters=optimal_k, random_state=37, n_init=10)
all_orders_df["cluster"] = kmeans_final.fit_predict(X_all_scaled)
all_orders_df.head(10)
| order_id | basket_size | basket_qty | top_product_category | payment_type | order_size_category | freight_ratio | product_diversity | cluster | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | 744.312535 | 1 | construction_tools_construction | voucher | Small | 0.092421 | 1.000000 | 2 |
| 1 | sdv-id-dbopoJ | 1556.667902 | 1 | health_beauty | credit_card | Small | 0.009889 | 1.000000 | 3 |
| 2 | sdv-id-FSEOvM | 62.060506 | 1 | luggage_accessories | debit_card | Small | 0.302145 | 1.000000 | 0 |
| 3 | sdv-id-bQcBUR | 73.873470 | 1 | computers_accessories | credit_card | Small | 0.117375 | 1.000000 | 2 |
| 4 | sdv-id-MPxIXB | 361.961537 | 3 | pet_shop | voucher | Medium | 0.095953 | 1.000000 | 2 |
| 5 | sdv-id-ySodbB | 867.750230 | 7 | sports_leisure | voucher | Large | 0.204442 | 0.857143 | 3 |
| 6 | sdv-id-wdwBrk | 55.965822 | 2 | toys | points | Small | 0.285101 | 0.500000 | 1 |
| 7 | sdv-id-maBYtk | 170.295452 | 3 | computers_accessories | voucher | Medium | 0.089290 | 1.000000 | 2 |
| 8 | sdv-id-oodUXj | 83.065603 | 1 | furniture_decor | voucher | Small | 0.236471 | 1.000000 | 0 |
| 9 | sdv-id-zroJIL | 98.250248 | 1 | fashion_bags_accessories | points | Small | 0.369709 | 1.000000 | 0 |
Cluster Profiling and Visualization¶
Two complementary visualizations are produced to characterize and communicate the resulting segments:
Cluster Profile Heatmap — Computes the mean of each numerical feature per cluster, then re-standardizes these means as Z-scores so that features measured in different units (currency vs. counts vs. ratios) can be directly compared. A diverging colormap distinguishes clusters that score above average (positive Z-score, red) from those below average (negative Z-score, blue) on each dimension, providing an at-a-glance segment fingerprint.
Cluster Size Distribution — A bar chart showing order volume per cluster. This reveals whether the segmentation produces balanced groups or whether certain segments dominate, which has direct implications for how broadly any cluster-specific strategy can be deployed.
# Cluster Profile Heatmap — Z-scored mean features per cluster
cluster_means = all_orders_df.groupby("cluster")[num_cols].mean()
z_scaler = StandardScaler()
cluster_means_z = pd.DataFrame(
z_scaler.fit_transform(cluster_means),
index=cluster_means.index,
columns=cluster_means.columns
)
plt.figure(figsize=(9, 4))
sns.heatmap(
cluster_means_z.T,
cmap="RdBu_r",
annot=True,
fmt=".2f",
center=0
)
plt.title("Cluster Profile Heatmap (Z-scores)")
plt.xlabel("Cluster")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
# Cluster Size Distribution — order volume per segment
cluster_counts = all_orders_df["cluster"].value_counts().sort_index()
plt.figure(figsize=(8, 5))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, color="steelblue")
plt.title("Distribution of Orders per Cluster", fontsize=16)
plt.xlabel("Cluster Number", fontsize=12)
plt.ylabel("Number of Orders", fontsize=12)
plt.xticks(rotation=0)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
Cluster Interpretation¶
- The heatmap shows moderate-to-strong separation: Cluster 3 is clearly high in basket_size and basket_qty with low freight_ratio, while Cluster 0 shows the opposite pattern (low value/quantity but high freight burden). Cluster 2 is less differentiated, with only moderate Z-scores relative to the other clusters.
- basket_size and freight_ratio are the strongest defining features, showing the largest absolute Z-score contrasts across clusters; basket_qty is a secondary differentiator, and product_diversity mainly distinguishes Cluster 1. This pattern is economically plausible: monetary value and shipping cost represent the primary trade-offs in e-commerce; for example, high-value items often require specialized logistics, naturally separating "Premium/Bulky" orders from "Standard" ones.
- The clusters represent distinct purchasing behaviors: high-value buyers, low-diversity shoppers, bulk purchasers, and customers bearing disproportionate freight costs. These differences allow the e-commerce business to design targeted promotions and operational strategies tailored to each segment.
Market Basket Analysis (Association Rule Mining)¶
Methodology Overview¶
Market Basket Analysis is an unsupervised technique that quantifies co-occurrence patterns across discrete transaction attributes. Rather than predicting an outcome, it surfaces rules of the form "when X is present, Y is also present with high probability" — invaluable for understanding multi-dimensional customer behavior.
In the e-commerce context, this helps answer questions such as:
- Which payment methods are preferred for which product categories?
- Do certain order sizes correlate with specific product types?
- What multi-attribute patterns exist across category, payment channel, and basket size?
Association rules are evaluated along three key metrics:
| Metric | Definition |
|---|---|
| Support | How frequently a pattern appears across all transactions (0–1) |
| Confidence | Probability of the consequent given the antecedent (0–1) |
| Lift | How much more likely the rule is versus random chance (> 1 = positive association) |
We apply the Apriori algorithm to the categorical features of all_orders_df to discover frequent itemsets and generate high-lift rules.
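The three metrics can be computed directly from transaction counts. A small self-contained illustration on a toy basket list (not the real dataset), for the hypothetical rule pay_credit_card → size_Small:

```python
# Hand-computing support, confidence, and lift for one rule on toy baskets
baskets = [
    {"pay_credit_card", "size_Small"},
    {"pay_credit_card", "size_Small"},
    {"pay_credit_card", "size_Large"},
    {"pay_voucher",     "size_Small"},
    {"pay_voucher",     "size_Medium"},
]
n = len(baskets)

antecedent, consequent = {"pay_credit_card"}, {"size_Small"}

support_a  = sum(antecedent <= b for b in baskets) / n                 # P(X)
support_c  = sum(consequent <= b for b in baskets) / n                 # P(Y)
support_ac = sum((antecedent | consequent) <= b for b in baskets) / n  # P(X and Y)

confidence = support_ac / support_a   # P(Y | X)
lift = confidence / support_c         # P(Y | X) / P(Y)

print(f"support={support_ac:.2f}, confidence={confidence:.2f}, lift={lift:.2f}")
# support=0.40, confidence=0.67, lift=1.11
```

A lift of 1.11 means credit card orders in this toy set are 11% more likely than average to be Small — a positive but weak association, matching the reading of the metric table above.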
Transaction Encoding¶
To apply the Apriori algorithm, each order must be represented as a "basket" of discrete items. We construct a transaction list where each order contributes exactly three tagged items—one per categorical dimension:
- cat_{top_product_category} — the dominant product category
- pay_{payment_type} — the payment method used
- size_{order_size_category} — the order size class (Small / Medium / Large)
The prefixes ensure items from different dimensions remain distinguishable after encoding.
# Build the transaction list — each order as a basket of 3 tagged categorical items
# Create the transactions list using a list comprehension for efficiency
transactions = [
[
f"cat_{row['top_product_category']}",
f"pay_{row['payment_type']}",
f"size_{row['order_size_category']}"
]
for _, row in all_orders_df.iterrows()
]
# Display the first 5 transactions to verify the format
print(f"Total transactions: {len(transactions)}")
print("\nFirst 5 transactions (Baskets):")
for i in range(5):
print(f"Order {i+1}: {transactions[i]}")
Total transactions: 46287

First 5 transactions (Baskets):
Order 1: ['cat_construction_tools_construction', 'pay_voucher', 'size_Small']
Order 2: ['cat_health_beauty', 'pay_credit_card', 'size_Small']
Order 3: ['cat_luggage_accessories', 'pay_debit_card', 'size_Small']
Order 4: ['cat_computers_accessories', 'pay_credit_card', 'size_Small']
Order 5: ['cat_pet_shop', 'pay_voucher', 'size_Medium']
Apriori Algorithm — Frequent Itemsets and Rule Generation¶
Using the mlxtend library, the workflow proceeds in three steps:
- One-hot encoding via TransactionEncoder — converts the transaction list into a Boolean DataFrame where each column represents a unique item.
- Frequent itemset mining via apriori() with min_support = 0.01 — retains only item combinations appearing in at least 1% of all orders, balancing pattern coverage against noise.
- Rule generation via association_rules() with metric = 'lift' and min_threshold = 1.0 — produces all rules where co-occurrence is at least as likely as under independence.
# Step 1: One-hot encode the transaction list using TransactionEncoder
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
trans_df = pd.DataFrame(te_array, columns=te.columns_)
print(f"Shape of encoded DataFrame: {trans_df.shape}")
trans_df.head()
Shape of encoded DataFrame: (46287, 79)
| cat_agro_industry_and_commerce | cat_air_conditioning | cat_art | cat_arts_and_craftmanship | cat_audio | cat_auto | cat_baby | cat_bed_bath_table | cat_books_general_interest | cat_books_imported | ... | cat_toys | cat_watches_gifts | pay_credit_card | pay_debit_card | pay_not_defined | pay_points | pay_voucher | size_Large | size_Medium | size_Small | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | True | False | False | True |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | True | False | False | False | False | False | False | True |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | True | False | False | False | False | False | True |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | True | False | False | False | False | False | False | True |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | True | False | True | False |
5 rows × 79 columns
# Step 2: Apply Apriori to mine frequent itemsets (min_support = 0.01)
frequent_itemsets = apriori(trans_df, min_support=0.01, use_colnames=True)
print("Frequent itemsets:", len(frequent_itemsets))
frequent_itemsets.head()
Frequent itemsets: 102
| support | itemsets | |
|---|---|---|
| 0 | 0.030592 | frozenset({cat_auto}) |
| 1 | 0.023592 | frozenset({cat_baby}) |
| 2 | 0.138678 | frozenset({cat_bed_bath_table}) |
| 3 | 0.073152 | frozenset({cat_computers_accessories}) |
| 4 | 0.016311 | frozenset({cat_consoles_games}) |
# Step 3: Generate association rules filtered by lift >= 1.0
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
print("Rules generated:", len(rules))
rules.head()
Rules generated: 138
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | representativity | leverage | conviction | zhangs_metric | jaccard | certainty | kulczynski | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | frozenset({pay_credit_card}) | frozenset({cat_auto}) | 0.604641 | 0.030592 | 0.020503 | 0.033909 | 1.108423 | 1.0 | 0.002006 | 1.003433 | 0.247414 | 0.033352 | 0.003422 | 0.352053 |
| 1 | frozenset({cat_auto}) | frozenset({pay_credit_card}) | 0.030592 | 0.604641 | 0.020503 | 0.670198 | 1.108423 | 1.0 | 0.002006 | 1.198777 | 0.100904 | 0.033352 | 0.165817 | 0.352053 |
| 2 | frozenset({pay_credit_card}) | frozenset({cat_baby}) | 0.604641 | 0.023592 | 0.015015 | 0.024833 | 1.052604 | 1.0 | 0.000750 | 1.001273 | 0.126403 | 0.024486 | 0.001271 | 0.330640 |
| 3 | frozenset({cat_baby}) | frozenset({pay_credit_card}) | 0.023592 | 0.604641 | 0.015015 | 0.636447 | 1.052604 | 1.0 | 0.000750 | 1.087487 | 0.051182 | 0.024486 | 0.080449 | 0.330640 |
| 4 | frozenset({size_Small}) | frozenset({cat_baby}) | 0.864735 | 0.023592 | 0.020913 | 0.024184 | 1.025108 | 1.0 | 0.000512 | 1.000607 | 0.181074 | 0.024110 | 0.000607 | 0.455316 |
Top 10 Rules by Lift¶
Rules are ranked by lift in descending order to surface the most statistically surprising — and therefore most actionable — associations. For reference, rules with lift > 1.5 are considered strong associations; lift > 2.0 indicates a very strong co-occurrence.
Example interpretation: A rule (cat_furniture_decor, size_Large) → (pay_credit_card) with support = 0.042, confidence = 0.756, lift = 1.84 reads: "Customers who purchase furniture in large orders have a 75.6% probability of paying by credit card — 1.84× more likely than the baseline credit card usage rate."
# Rank rules by lift (descending) and display the top 10 most surprising associations
# Step 1. Sort rules by Lift in descending order
top_rules = rules.sort_values(by='lift', ascending=False).head(10)
# Step 2. Select only the required columns for clarity
# We convert frozensets to strings/lists for better readability in the table
top_rules_display = top_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
top_rules_display['antecedents'] = top_rules_display['antecedents'].apply(lambda x: list(x))
top_rules_display['consequents'] = top_rules_display['consequents'].apply(lambda x: list(x))
# Step 3. Display the table
print("Top 10 Association Rules (Sorted by Lift):")
display(top_rules_display)
Top 10 Association Rules (Sorted by Lift):
| antecedents | consequents | support | confidence | lift | |
|---|---|---|---|---|---|
| 24 | [size_Large] | [cat_furniture_decor] | 0.014583 | 0.392899 | 5.328480 |
| 25 | [cat_furniture_decor] | [size_Large] | 0.014583 | 0.197773 | 5.328480 |
| 73 | [size_Medium, pay_credit_card] | [cat_bed_bath_table] | 0.016851 | 0.280576 | 2.023212 |
| 76 | [cat_bed_bath_table] | [size_Medium, pay_credit_card] | 0.016851 | 0.121514 | 2.023212 |
| 75 | [size_Medium] | [pay_credit_card, cat_bed_bath_table] | 0.016851 | 0.171693 | 1.757439 |
| 74 | [pay_credit_card, cat_bed_bath_table] | [size_Medium] | 0.016851 | 0.172490 | 1.757439 |
| 27 | [cat_furniture_decor] | [size_Medium] | 0.012660 | 0.171696 | 1.749354 |
| 26 | [size_Medium] | [cat_furniture_decor] | 0.012660 | 0.128990 | 1.749354 |
| 8 | [size_Medium] | [cat_bed_bath_table] | 0.022749 | 0.231785 | 1.671388 |
| 9 | [cat_bed_bath_table] | [size_Medium] | 0.022749 | 0.164044 | 1.671388 |
Rule Interpretation and Sensitivity Analysis¶
The top three rules suggest that large orders are strongly linked to furniture/decor, while medium-sized baskets (often with credit card payment) are more associated with bed/bath/table purchases, indicating distinct shopping missions by basket size.
- The strongest association is the size_Large ↔ cat_furniture_decor combination (highest lift), meaning these items co-occur far more often than expected by chance. This likely reflects bulk home purchases: furniture is typically bought in large baskets and often paid by credit card, possibly via installments or card rebates.
- size_Medium + bed/bath/table items look like routine household replenishment purchases, where customers buy in moderate quantities and pay by credit card for convenience.
- When min_support is reduced to 0.005, more niche, low-frequency rules appear, often with higher lift but less generalizable. Lowering support captures the "long tail": rules emerge for rare categories (such as musical instruments or art) that carry high lift but represent such a small share of total sales that they are of limited use for broad strategy. Increasing min_support to 0.3 keeps only very common patterns and discards many specific, useful associations. The right threshold therefore depends on business needs: lower values surface more granular, insightful patterns at the cost of generality.
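The support-threshold trade-off can be illustrated without re-running the full pipeline: counting item frequencies on a toy transaction list shows how many candidates survive at each threshold (toy counts, not from the dataset; mlxtend's apriori applies the same pruning to itemsets of every size):

```python
from collections import Counter

# Toy category frequencies (illustrative only): one dominant, one mid, three rare
transactions = (
    [["cat_bed_bath_table"]] * 600
    + [["cat_furniture_decor"]] * 350
    + [["cat_musical_instruments"]] * 30
    + [["cat_cds_dvds_musicals"]] * 12
    + [["cat_art"]] * 8
)
n = len(transactions)
support = {item: c / n
           for item, c in Counter(i for t in transactions for i in t).items()}

survivors = {}
for threshold in (0.005, 0.01, 0.3):
    survivors[threshold] = sorted(i for i, s in support.items() if s >= threshold)
    print(f"min_support={threshold}: {len(survivors[threshold])} items survive")
# min_support=0.005: 5 items survive
# min_support=0.01: 4 items survive
# min_support=0.3: 2 items survive
```

At 0.005 every toy category survives, including the long tail; at 0.01 the rarest drops out; at 0.3 only the dominant patterns remain — mirroring the sensitivity behaviour described above.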
Business Recommendations¶
- Prioritize large furniture baskets with payment support: For customers in cat_furniture_decor with size_Large, offer installment/BNPL options and targeted checkout promotions (e.g., credit card rebates) to reduce abandonment on high-ticket orders.
- Run medium-basket card promotions: Since medium baskets are strongly linked with card payment patterns, launch category-specific credit card cashback bundles (especially home/bed-bath-related items).
- Deploy rule-based cross-sell recommendations: Use high-lift pairs in cart and checkout ("frequently bought together" by category + size) to increase average order value and conversion.