Supervised Learning Techniques for E-Commerce Orders¶
This phase applies supervised learning techniques to the cleaned e-commerce dataset to predict the payment type of each order.
The primary objectives are:
- Preprocess the dataset by appropriately handling numerical and categorical features to prepare it for model training.
- Compare the performance of three supervised learning techniques (Logistic Regression, Decision Tree, and Random Forest) based on standard evaluation metrics, class imbalance considerations, model complexity, and practical deployability.
- Recommend, based on these comparisons, the most suitable model for the e-commerce firm to support its business objectives.
Setup¶
The environment set-up uses pandas and numpy for data manipulation, matplotlib and seaborn for visualisation, and scikit-learn for preprocessing, model training, and evaluation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# A random state is introduced to ensure reproducibility and stability in the machine learning workflows
RANDOM_STATE = 8
print(f"Random state set to: {RANDOM_STATE}")
Random state set to: 8
Part 1: Data Preparation¶
We will first prepare the transformed dataset so that it can be used for supervised classification. Proper data preparation is a critical step in the machine learning pipeline, as model performance and validity depend heavily on the quality, structure, and representation of the input data.
Data Loading and Splitting¶
In this section, we use the processed dataset ecommerce_orders_cleaned.csv from Assignment 1. To predict the payment type of an order, we set payment_type as the target variable, while all remaining columns are treated as input features. We then partition the dataset into training and test sets using train_test_split() with a test size of 0.2 and the pre-defined RANDOM_STATE to ensure reproducibility.
A test set is essential because it evaluates how well the trained model performs on previously unseen data. This helps to detect overfitting and provides a fair comparison of model performance.
df = pd.read_csv("ecommerce_orders_cleaned.csv")
print(f"Dataset shape: {df.shape}")
df.head()
Dataset shape: (46076, 17)
| order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category | customer_state | payment_type | order_value_per_item | order_size_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction | Massachusetts | voucher | 352.420029 | Small |
| 1 | sdv-id-dbopoJ | delivered | 19 | 2 | 3 | 1556.667902 | 1 | 1 | 1 | 289.242639 | 1354.621410 | 15.394619 | health_beauty | Vermont | credit_card | 289.242639 | Small |
| 2 | sdv-id-FSEOvM | delivered | 15 | 4 | 8 | 62.060506 | 1 | 1 | 1 | 26.893468 | 48.485654 | 18.751282 | luggage_accessories | South Carolina | debit_card | 26.893468 | Small |
| 3 | sdv-id-bQcBUR | delivered | 21 | 0 | 8 | 73.873470 | 1 | 1 | 1 | 37.790896 | 75.704909 | 8.670875 | computers_accessories | Kentucky | credit_card | 37.790896 | Small |
| 4 | sdv-id-MPxIXB | delivered | 13 | 5 | 5 | 361.961537 | 3 | 3 | 3 | 169.528323 | 50.132979 | 34.731146 | pet_shop | Missouri | voucher | 56.509441 | Medium |
# Define target variable
y = df['payment_type']
# Define features (all columns except payment_type)
cols_to_select = [col for col in df.columns if col not in ('order_id', 'payment_type')]
X = df[cols_to_select]
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
# Print shapes to verify
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
X_train shape: (36860, 15)
X_test shape: (9216, 15)
y_train shape: (36860,)
y_test shape: (9216,)
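One caveat: the split above does not stratify by class, even though the payment types are imbalanced. Passing `stratify=y` to `train_test_split()` keeps class proportions consistent across train and test sets. A minimal sketch on synthetic labels (the 80/20 split and variable names here are illustrative, not from the dataset):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic imbalanced labels: 80% class "a", 20% class "b"
y_toy = np.array(["a"] * 80 + ["b"] * 20)
X_toy = np.arange(100).reshape(-1, 1)

# stratify preserves the 80/20 class ratio in both partitions
X_tr, X_te, y_tr, y_te = train_test_split(
    X_toy, y_toy, test_size=0.2, random_state=8, stratify=y_toy
)
print((y_te == "b").mean())  # 0.2: same minority share as the full data
```

Without `stratify`, a rare class can end up under-represented (or absent) in the test set purely by chance.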
Handling Categorical Variables¶
Categorical data has to be processed before it can be used for machine learning because the models work with numbers, not string objects. We do so by first identifying all categorical columns in the training data and then applying label encoding to convert category labels into numerical representations that the models can interpret.
Encoders are fitted on the training data only and then used to transform both the training and test sets. This prevents data leakage and ensures consistency throughout the workflow. Any unseen categories in the test set are mapped to a placeholder label, unknown_category, to prevent transformation errors and improve robustness.
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")
Categorical columns: ['order_status', 'top_product_category', 'customer_state', 'order_size_category']
# Create a copy of X_train to store encoded values
X_train_encoded = X_train.copy()
# Initialise LabelEncoder
le = LabelEncoder()
# Fit on training data
le.fit(X_train_encoded['order_status'].astype(str))
le.classes_
array(['approved', 'canceled', 'created', 'delivered', 'invoiced',
'processing', 'shipped', 'unavailable'], dtype=object)
# Apply Label Encoding to categorical columns
# Create copies to avoid modifying original data
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()
# Dictionary to store label encoders for each column
label_encoders = {}
for col in categorical_cols:
le = LabelEncoder()
# Fit on training data
le.fit(X_train_encoded[col].astype(str))
# Transform train and test
X_train_encoded[col] = le.transform(X_train_encoded[col].astype(str))
# Handle unseen categories in test set
le.classes_ = np.append(le.classes_, "unknown_category")
test_col = X_test_encoded[col].astype(str)
X_test_encoded[col] = test_col.where(test_col.isin(le.classes_), "unknown_category")
X_test_encoded[col] = le.transform(X_test_encoded[col])
# Store encoder
label_encoders[col] = le
# Print the shape after encoding
print(f"X_train_encoded shape: {X_train_encoded.shape}")
print(f"X_test_encoded shape: {X_test_encoded.shape}")
X_train_encoded shape: (36860, 15)
X_test_encoded shape: (9216, 15)
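Worth noting: LabelEncoder imposes an arbitrary integer ordering on categories, which tree models tolerate but linear models like Logistic Regression can misread as a magnitude. A hedged alternative sketch using OneHotEncoder inside a ColumnTransformer (the toy frame and column names below are illustrative stand-ins, not the real dataset):

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Toy frame standing in for one categorical and one numerical column
toy = pd.DataFrame({
    "order_status": ["delivered", "shipped", "delivered"],
    "num_items": [1, 3, 2],
})

# handle_unknown="ignore" maps unseen test categories to all-zero rows,
# playing the same role as the "unknown_category" placeholder above
ct = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), ["order_status"])],
    remainder="passthrough",
)
encoded = ct.fit_transform(toy)
print(encoded.shape)  # (3, 3): two one-hot columns + num_items passed through
```

One-hot encoding avoids the spurious ordering at the cost of wider feature matrices, which matters for high-cardinality columns like top_product_category.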
Handling Numerical Variables¶
We handle the numerical variables by performing feature scaling. StandardScaler from scikit-learn standardises each numerical feature to zero mean and unit standard deviation, placing all numerical features on a common scale.
This is important because it prevents feature dominance by removing the effect of each feature's unit of measurement. For example, from X.describe() below, order_total_value ranges from 6.12 to 2890.55 while num_items only ranges from 1 to 8. In distance-based or gradient-based models, order_total_value would otherwise dominate num_items, biasing the model towards the former.
X.describe()
| order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | order_value_per_item | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 | 46076.000000 |
| mean | 14.968856 | 2.765583 | 6.235567 | 215.071101 | 1.589526 | 1.402965 | 1.163686 | 128.680045 | 104.606243 | 25.924525 | 87.919473 |
| std | 5.240562 | 2.036003 | 3.121741 | 268.827491 | 1.245700 | 1.071458 | 0.536977 | 183.724414 | 135.327151 | 23.721633 | 123.504255 |
| min | 0.000000 | 0.000000 | 1.000000 | 6.116141 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.850000 | 0.000000 | 0.000000 |
| 25% | 12.000000 | 1.000000 | 3.000000 | 77.835386 | 1.000000 | 1.000000 | 1.000000 | 39.647803 | 39.443877 | 13.721818 | 32.069555 |
| 50% | 15.000000 | 2.000000 | 7.000000 | 134.222752 | 1.000000 | 1.000000 | 1.000000 | 75.259779 | 73.906002 | 18.199461 | 56.677959 |
| 75% | 19.000000 | 5.000000 | 8.000000 | 215.567663 | 2.000000 | 1.000000 | 1.000000 | 139.397564 | 120.253773 | 30.271361 | 99.938720 |
| max | 23.000000 | 6.000000 | 12.000000 | 2890.551922 | 8.000000 | 8.000000 | 5.000000 | 2606.843945 | 1809.335277 | 277.552685 | 2606.843945 |
# After label encoding, all feature columns are numeric
numerical_cols = X_train_encoded.columns.tolist()
print(f"Numerical columns: {numerical_cols}")
print(f"Number of numerical columns: {len(numerical_cols)}")
Numerical columns: ['order_status', 'order_purchase_hour', 'order_purchase_dayofweek', 'order_purchase_month', 'order_total_value', 'num_items', 'num_unique_products', 'num_unique_sellers', 'total_item_price', 'avg_item_price', 'total_freight_value', 'top_product_category', 'customer_state', 'order_value_per_item', 'order_size_category']
Number of numerical columns: 15
# Initialise StandardScaler
scaler = StandardScaler()
# Fit on training data only, then transform both train and test
X_train_scaled = scaler.fit_transform(X_train_encoded[numerical_cols])
X_test_scaled = scaler.transform(X_test_encoded[numerical_cols])
# Print the shape after scaling
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")
X_train_scaled shape: (36860, 15)
X_test_scaled shape: (9216, 15)
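As a quick self-contained sanity check on synthetic data (not the notebook's dataset): fitting StandardScaler on the training data yields exactly zero mean and unit standard deviation on that data, while the test set is transformed with the training statistics and only approximately standardised.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(8)
train = rng.normal(loc=50, scale=10, size=(1000, 3))
test = rng.normal(loc=50, scale=10, size=(200, 3))

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)  # fit on train only
test_scaled = scaler.transform(test)        # reuse train statistics

print(train_scaled.mean(axis=0).round(6))  # [0. 0. 0.]
print(train_scaled.std(axis=0).round(6))   # [1. 1. 1.]
```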
Part 2: Applying Machine Learning Models¶
In this section, we train and evaluate three supervised classification models to predict the payment type of an order. We then conduct a systematic comparison of their predictive performance and practical suitability for deployment.
Logistic Regression¶
Logistic Regression is a linear classification model that models the log-odds of the target class as a linear combination of the input features.
# Train Logistic Regression model
lr_model = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
# Generate predictions
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression training complete.")
Logistic Regression training complete.
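The log-odds formulation above can be verified numerically. The sketch below uses a synthetic binary problem for clarity (the notebook's payment_type target is multiclass, where a softmax generalisation applies): the predicted probability is exactly the sigmoid of the linear score returned by decision_function.

```python
import numpy as np
from scipy.special import expit
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Small synthetic binary problem, independent of the notebook's data
X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=8)
clf = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)

p = clf.predict_proba(X_demo)[:, 1]       # P(class 1)
log_odds = clf.decision_function(X_demo)  # X @ coef_.T + intercept_ (the log-odds)

# The predicted probability is the sigmoid of the linear score
print(np.allclose(p, expit(log_odds)))  # True
```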
Decision Tree¶
A Decision Tree is a tree-structured model that makes predictions by recursively splitting the feature space based on feature values.
# Train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_model.fit(X_train_scaled, y_train)
# Generate predictions
y_pred_dt = dt_model.predict(X_test_scaled)
print("Decision Tree training complete.")
Decision Tree training complete.
Random Forest¶
Random Forest is an ensemble learning model that makes predictions by combining multiple decision trees built on different subsets of data and features. By using aggregated voting, it reduces overfitting and improves model performance.
# Train Random Forest
rf = RandomForestClassifier(
n_estimators=200,
max_depth=None,
random_state=RANDOM_STATE
)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest training complete.")
Random Forest training complete.
Part 3: Evaluation & Visualization¶
We evaluate and compare the performance of all three models using standard evaluation metrics and confusion-matrix visualisations for a comprehensive and robust analysis.
Evaluation metrics:
- Accuracy
- Measures the overall proportion of correct predictions which provides a general performance overview
- Accuracy = (TP + TN)/(TP + TN + FP + FN)
- Precision
- Measures how many predicted positives are actually correct
- A high precision implies low false positive errors.
- Precision = TP/(TP+ FP)
- Recall
- Measures how many actual positives are correctly identified
- A high recall implies low false negative errors
- Recall = TP/(TP + FN)
- F1-score
- Harmonic mean of precision and recall
- F1 = (2 x Precision x Recall)/(Precision + Recall)
Confusion matrix:
- The confusion matrices are presented as heatmaps to illustrate the distribution of correct and incorrect predictions across classes.
- This helps us to identify misclassification patterns and class-level performance differences.
# Compute evaluation metrics for all three models
def evaluate_model(y_true, y_pred, model_name):
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
return {
'Model': model_name,
'Accuracy': accuracy,
'Precision': precision,
'Recall': recall,
'F1-Score': f1
}
# Evaluate all models
results = [
evaluate_model(y_test, y_pred_lr, 'Logistic Regression'),
evaluate_model(y_test, y_pred_dt, 'Decision Tree'),
evaluate_model(y_test, y_pred_rf, 'Random Forest')
]
# Create results DataFrame
results_df = pd.DataFrame(results)
results_df
| Model | Accuracy | Precision | Recall | F1-Score | |
|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.606988 | 0.553678 | 0.606988 | 0.483541 |
| 1 | Decision Tree | 0.503798 | 0.511951 | 0.503798 | 0.507755 |
| 2 | Random Forest | 0.630534 | 0.582534 | 0.630534 | 0.569791 |
# Plot confusion matrices for all three models
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
def plot_confusion_matrix(y_true, y_pred, ax, title):
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(title)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plot_confusion_matrix(y_test, y_pred_lr, axes[0], 'Logistic Regression')
plot_confusion_matrix(y_test, y_pred_dt, axes[1], 'Decision Tree')
plot_confusion_matrix(y_test, y_pred_rf, axes[2], 'Random Forest')
plt.tight_layout()
plt.show()
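The metric formulas listed earlier can be checked directly against a confusion matrix. A minimal binary sketch on hand-made labels (not the notebook's data):

```python
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])

# For binary labels {0, 1}, confusion_matrix is laid out as [[TN, FP], [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

precision = tp / (tp + fp)  # TP / (TP + FP)
recall = tp / (tp + fn)     # TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

print(precision == precision_score(y_true, y_pred))  # True
print(recall == recall_score(y_true, y_pred))        # True
```

For the multiclass payment-type problem, the weighted averages used above compute these per class and weight by class support.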
Based on the results above, we discuss the strengths and weaknesses of each model, as well as how each handles class imbalance.
Logistic Regression
- Strengths:
- Easy to implement
- Highly interpretable - the exponentiated coefficient of each feature gives its odds ratio, providing an intuitive understanding of which features drive the model's predictions
- Weaknesses:
- Prone to poor performance when there are non-linear relationships between the features and the target variable, which is common in real-world data with complex patterns
- Class imbalance handling:
- Can lower the decision threshold so that smaller predicted probabilities are treated as actionable, with consideration given to business risk tolerance
- Can apply resampling first (downsampling the majority class or oversampling the minority class)
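Both imbalance-handling bullets above can be sketched on synthetic data (the 90/10 split and the 0.3 threshold below are illustrative choices, not tuned values): `class_weight="balanced"` reweights the loss inversely to class frequency, and lowering the probability threshold flags more minority-class cases.

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Imbalanced synthetic data: roughly 90% majority class
X_imb, y_imb = make_classification(
    n_samples=1000, weights=[0.9, 0.1], random_state=8
)

# class_weight="balanced" scales errors inversely to class frequency
clf = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_imb, y_imb)

# Threshold adjustment: flag the minority class at 0.3 instead of 0.5
proba = clf.predict_proba(X_imb)[:, 1]
y_pred_default = (proba >= 0.5).astype(int)
y_pred_lower = (proba >= 0.3).astype(int)
print(y_pred_lower.sum() >= y_pred_default.sum())  # True: lower threshold flags more positives
```

In practice, the threshold would be chosen from a validation set against the business's tolerance for false positives versus false negatives.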
Decision Tree
- Strengths:
- Able to capture non-linear relationships
- Easy to visualise (for smaller trees)
- Does not require scaling
- Weaknesses:
- Difficult to interpret large and complex trees with many leaves
- Highly prone to overfitting. A deep tree might "memorize" specific orders in the training set rather than learning general patterns, leading to poor performance on the test set.
- High variance, whereby small changes in the training data can lead to very different tree structures and predictions, making it less suitable for real-life production pipelines
- Class imbalance handling:
- Can apply the class_weight parameter to penalise minority class errors more heavily
- Can apply resampling first as well
Random Forest
- Strengths:
- Able to capture non-linear relationships
- Lower variance and overfitting compared to decision trees as the predictions are averaged across multiple trees
- Weaknesses:
- Difficult to interpret decision logic since there could be hundreds of trees
- High computational cost due to the training of multiple trees
- Class imbalance handling:
- Similar to the Decision Tree, the class_weight parameter can be applied; the "balanced_subsample" option recomputes weights within each bootstrap sample (the subset of data used by each tree)
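The class_weight mechanism mentioned for both tree models can be illustrated on a toy 90/10 split (synthetic data, not the notebook's): with "balanced", each class receives weight n_samples / (n_classes × class_count), so minority errors cost proportionally more.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

y_imb = np.array([0] * 90 + [1] * 10)  # 90/10 imbalance
X_imb = np.random.default_rng(8).normal(size=(100, 3))

# "balanced" weight per class: n_samples / (n_classes * class_count)
weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=y_imb)
print(weights)  # approx [0.556, 5.0]: minority errors cost ~9x more

dt = DecisionTreeClassifier(class_weight="balanced", random_state=8).fit(X_imb, y_imb)
# "balanced_subsample" recomputes the weights per bootstrap sample
rf = RandomForestClassifier(
    n_estimators=50, class_weight="balanced_subsample", random_state=8
).fit(X_imb, y_imb)
```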
Part 4: Model Selection & Final Recommendation¶
Based on the evaluation and comparison discussed above, we provide our recommendation for the model to be selected.
# Display the results table
print("Model Performance Comparison:")
print(results_df.to_string(index=False))
# Identify the best model based on your criteria
best_model = results_df.loc[results_df['F1-Score'].idxmax(), 'Model']
print(f"\nBest performing model: {best_model}")
Model Performance Comparison:
Model Accuracy Precision Recall F1-Score
Logistic Regression 0.606988 0.553678 0.606988 0.483541
Decision Tree 0.503798 0.511951 0.503798 0.507755
Random Forest 0.630534 0.582534 0.630534 0.569791
Best performing model: Random Forest
Justifications¶
The model selection is based on the following justifications.
Evaluation Metrics:
- Random Forest outperformed the other models on all the metrics shown above; in particular, it achieved the best F1-score (0.57).
- Due to the class imbalance (credit_card: 60%, voucher: 30%, points and debit card <= 5%), accuracy is not a suitable evaluation metric. Meanwhile, there is no strong preference between false positive and false negative errors given that we are simply predicting payment types. Hence, the F1-score is the most suitable metric as it balances both Precision and Recall.
Class Imbalance Considerations:
- Random forest handles class imbalance in a more robust manner due to bootstrap sampling and the ensemble diversity of multiple trees.
- Logistic regression only learns a single global decision boundary, which fails to capture small local pockets of the minority class. A decision tree has high variance and may overfit to the majority class.
- On the other hand, the bootstrap sampling in random forest allows minority signals to be learned repeatedly and the ensemble helps to stabilise minority detection.
Model Complexity and Interpretability:
- Logistic regression is easy to implement and intuitive to understand via its feature coefficients. A decision tree can still have interpretable decision logic if it is shallow.
- A random forest is the most complex and has the lowest interpretability among the 3 models. However, alternative methods such as SHAP can be used to understand how each feature drives the prediction.
Practical Deployment Considerations:
- Logistic regression is the simplest to implement and runs quickly. A decision tree also runs quickly, but its instability can cause maintenance issues in a production pipeline.
- Random forest incurs a higher computational cost and memory usage due to its ensemble of many trees.
Why this model is preferred over others: Random Forest is preferred over Logistic Regression and Decision Trees because it provides stronger predictive performance, greater stability, and better robustness to class imbalance. By combining many decision trees trained on different bootstrap samples, the model captures complex nonlinear patterns while reducing the variance and overfitting risk of a single tree. Although more complex, interpretability can be maintained using SHAP values, which explain how each feature contributes to predictions. With modern production systems capable of handling large-scale computation, the additional training cost is generally outweighed by the model's improved accuracy and reliability, making Random Forest a strong overall modeling choice.
Model Explainability/Interpretability¶
Lastly, we compare the explainability and interpretability of the models.
Logistic Regression:
- Logistic Regression has the highest interpretability because it is the easiest to explain mathematically: each feature's coefficient provides a magnitude and direction that can be used to assess its impact on the prediction.
Decision Tree:
- A Decision Tree can have high interpretability if it is shallow, because its decision logic is essentially a sequence of if-else rules that can be traced. However, interpretability decreases if the tree becomes too deep or has too many leaves.
Random Forest:
- Being an ensemble of multiple decision trees, Random Forest is not directly interpretable. Instead, tools such as SHAP can be applied to help us understand feature importance.
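Short of a full SHAP analysis, Random Forest's built-in feature_importances_ offers a quick (though impurity-based and somewhat biased toward high-cardinality features) view of which features influence predictions. A minimal sketch on synthetic data with one informative feature:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# One informative feature among noise, so its importance should dominate
X_demo, y_demo = make_classification(
    n_samples=500, n_features=5, n_informative=1, n_redundant=0,
    n_clusters_per_class=1, random_state=8,
)
rf = RandomForestClassifier(n_estimators=100, random_state=8).fit(X_demo, y_demo)

importances = rf.feature_importances_
print(importances.sum().round(6))  # 1.0: importances are normalised to sum to 1
```

For the notebook's trained model, pairing these values with the feature names in X_train_encoded.columns would rank the drivers of payment-type predictions.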
Part 5: Conclusion¶
- This assignment provided us with hands-on experience of processing a dataset and subsequently training models on the processed data to evaluate which one performs best.
- Among the models tested, we found that Random Forest achieved the strongest predictive performance and demonstrated greater model stability, largely due to its ensemble structure that reduces variance compared to a single decision tree. It also captures nonlinear relationships that logistic regression may not model effectively. Although Random Forest is more complex, its interpretability can still be supported through model explanation tools such as SHAP, which help quantify the contribution of individual features to predictions.
- Key takeaways:
- Firstly, model selection should not rely solely on predictive performance. Other factors such as interpretability, computational complexity and deployment feasibility must also be considered depending on the application context. This is particularly important in a business setting where resources and timeline are important considerations to factor into the development of machine learning pipelines.
- Another area to highlight would be how class imbalances are being handled by the different models. In many real-life scenarios (customer churn, fraud detection, medical diagnosis), class imbalance scenarios can be common. Appropriate techniques such as class weighting, resampling, or threshold adjustment should therefore be incorporated into the modeling process to ensure fair and meaningful evaluation.
- Lastly, we have to be mindful of how we pre-process the data before using them as features in a model. In this assignment, we touched upon Label Encoder to encode categorical columns, and Standard Scaler to normalise numerical columns. However, there are other techniques such as One-hot encoding or ordinal encoding, min-max scaling (for distance-based models) and robust scaling (to handle outliers), which may be more suitable depending on feature characteristics and the choice of model.
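The scaling alternatives mentioned above can be contrasted in a few lines (the toy column below, with one large outlier, is illustrative of a long-tailed feature like order_total_value): MinMaxScaler maps values to [0, 1] using the min and max, so a single outlier squashes the inliers, while RobustScaler centres on the median and scales by the interquartile range.

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Toy column with one large outlier
x = np.array([[10.0], [12.0], [11.0], [13.0], [500.0]])

minmax = MinMaxScaler().fit_transform(x)
robust = RobustScaler().fit_transform(x)

# MinMax squashes the inliers near 0 because the outlier defines the max
print(minmax.ravel().round(3))
# RobustScaler centres on the median (12) and scales by the IQR (13 - 11 = 2)
print(robust.ravel().round(3))
```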
Overall, this assignment showcased the development of an end-to-end machine learning workflow, from preprocessing and model training to evaluation and practical deployment considerations. It prompted us to consider how an effective modelling workflow requires balancing predictive accuracy with interpretability and operational feasibility.