Data Preprocessing and EDA for E-Commerce Orders
The objectives of this section are to:
- Load four tables (orders, order_items, order_shipping, payments).
- Join them into a single order-level dataset.
- Clean and transform the data.
- Perform basic EDA and visualization.
This report outlines the end-to-end pipeline for consolidating fragmented e-commerce tables to analyze order behavior and logistics performance.
Setup
Run the cell below to import the required Python packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')
Load the CSV files
In this initial phase, the data foundation is established by ingesting the raw relational datasets: df_orders, df_order_items, df_shipping, and df_payments. This step ensures that all core entities are correctly loaded into the environment before moving to the integration and transformation stages.
# Load the CSV files
df_order_items = pd.read_csv('order_items.csv')
df_shipping = pd.read_csv('order_shipping.csv')
df_orders = pd.read_csv('orders.csv')
df_payments = pd.read_csv('payments.csv')
Inspect each table
By examining the initial records and dimensional shapes, the underlying relational schema is validated, ensuring a clear understanding of the data grain before proceeding to integration.
# Execute structural audit for all primary dataframes
display(df_order_items.head())
print(f"Shape of order_items: {df_order_items.shape}")
display(df_shipping.head())
print(f"Shape of order_shipping: {df_shipping.shape}")
display(df_orders.head())
print(f"Shape of orders: {df_orders.shape}")
display(df_payments.head())
print(f"Shape of payments: {df_payments.shape}")
| | order_id | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category |
|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction |
| 1 | sdv-id-gqShVM | 1 | 1 | 1 | 1580.187928 | 1089.284136 | 84.603289 | auto |
| 2 | sdv-id-vtaqcY | 1 | 1 | 1 | 45.969263 | 36.969754 | 24.022637 | furniture_decor |
| 3 | sdv-id-xkqwdo | 1 | 1 | 1 | 146.023435 | 126.755360 | 16.945349 | consoles_games |
| 4 | sdv-id-sGyHvQ | 1 | 1 | 1 | 30.525812 | 22.797409 | 17.785846 | air_conditioning |
Shape of order_items: (100000, 8)
| | order_id | customer_state |
|---|---|---|
| 0 | sdv-id-whzjUX | Massachusetts |
| 1 | sdv-id-gqShVM | Ohio |
| 2 | sdv-id-vtaqcY | Wisconsin |
| 3 | sdv-id-xkqwdo | Michigan |
| 4 | sdv-id-sGyHvQ | Nevada |
Shape of order_shipping: (100000, 2)
| | order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value |
|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 |
| 1 | sdv-id-gqShVM | delivered | 19 | 2 | 4 | 1030.521912 |
| 2 | sdv-id-vtaqcY | delivered | 18 | 4 | 8 | 28.472994 |
| 3 | sdv-id-xkqwdo | invoiced | 23 | 1 | 8 | 143.914263 |
| 4 | sdv-id-sGyHvQ | delivered | 19 | 0 | 2 | 16.944537 |
Shape of orders: (100000, 6)
| | order_id | payment_type |
|---|---|---|
| 0 | sdv-id-whzjUX | voucher |
| 1 | sdv-id-gqShVM | voucher |
| 2 | sdv-id-vtaqcY | voucher |
| 3 | sdv-id-xkqwdo | credit_card |
| 4 | sdv-id-sGyHvQ | credit_card |
Shape of payments: (100000, 2)
Join the tables on order_id to create a single order-level DataFrame
A unified "Order Analytical Record" (df_orders_full) is constructed by executing a series of inner joins on the order_id primary key. This systematic integration of df_orders, df_order_items, df_shipping, and df_payments ensures that the final dataset contains only synchronized records present across all functional domains, providing a consistent foundation for downstream analysis.
# Consolidate core order metadata with itemized transaction data
step1 = pd.merge(df_orders, df_order_items, on='order_id', how='inner')
# Enrich integrated records with logistics/shipping dimensions
step2 = pd.merge(step1, df_shipping, on='order_id', how='inner')
# Finalize the master analytical record with payment data
df_orders_full = pd.merge(step2, df_payments, on='order_id', how='inner')
# Validation of the integrated master dataset
display(df_orders_full.head())
print(f"Final result dataframe shape: {df_orders_full.shape}")
| | order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category | customer_state | payment_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction | Massachusetts | voucher |
| 1 | sdv-id-gqShVM | delivered | 19 | 2 | 4 | 1030.521912 | 1 | 1 | 1 | 1580.187928 | 1089.284136 | 84.603289 | auto | Ohio | voucher |
| 2 | sdv-id-vtaqcY | delivered | 18 | 4 | 8 | 28.472994 | 1 | 1 | 1 | 45.969263 | 36.969754 | 24.022637 | furniture_decor | Wisconsin | voucher |
| 3 | sdv-id-xkqwdo | invoiced | 23 | 1 | 8 | 143.914263 | 1 | 1 | 1 | 146.023435 | 126.755360 | 16.945349 | consoles_games | Michigan | credit_card |
| 4 | sdv-id-sGyHvQ | delivered | 19 | 0 | 2 | 16.944537 | 1 | 1 | 1 | 30.525812 | 22.797409 | 17.785846 | air_conditioning | Nevada | credit_card |
Final result dataframe shape: (100000, 15)
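Because all three joins assume order_id is a unique key in every table, it can be worth asserting that assumption explicitly rather than trusting the row counts. A minimal sketch using pandas' `validate` and `indicator` merge options; the toy frames below are invented stand-ins for the real tables, but the key column matches the notebook:

```python
import pandas as pd

# Toy stand-ins for two of the real tables (invented data, same key column)
df_orders = pd.DataFrame({'order_id': ['a', 'b', 'c'],
                          'order_status': ['shipped', 'delivered', 'invoiced']})
df_payments = pd.DataFrame({'order_id': ['a', 'b', 'c'],
                            'payment_type': ['voucher', 'credit_card', 'voucher']})

# validate='one_to_one' raises MergeError if order_id is duplicated on either side
merged = pd.merge(df_orders, df_payments, on='order_id', how='inner',
                  validate='one_to_one')

# indicator=True on an outer join reveals keys present in only one table
audit = pd.merge(df_orders, df_payments, on='order_id', how='outer', indicator=True)
unmatched = audit[audit['_merge'] != 'both']

print(len(merged), len(unmatched))
```

Applying the same pattern to each of the three merges above would surface duplicated keys or dropped orders at the point where they occur, instead of only in the final shape.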
Data Cleaning
Handle missing values
In this stage, a data cleaning pipeline is implemented to ensure the integrity of the df_orders_full dataset. Statistical imputation is applied: medians for numerical features (to mitigate outlier influence) and modes for categorical attributes. The phase concludes with a structural filter that removes records missing primary identifiers or critical payment metadata.
# Handle missing values
for col in df_orders_full.columns:
    if df_orders_full[col].dtype in ['float64', 'int64']:
        median_val = df_orders_full[col].median()
        df_orders_full[col] = df_orders_full[col].fillna(median_val)
    elif df_orders_full[col].dtype == 'object':
        mode_val = df_orders_full[col].mode()[0]
        df_orders_full[col] = df_orders_full[col].fillna(mode_val)
# Enforce record-level integrity by removing rows with missing primary identifiers
df_orders_full.dropna(subset=['order_id','payment_type'], inplace=True)
# Validation of cleaned data state
print(f"Dataframe df_orders_full shape: {df_orders_full.shape}")
Dataframe df_orders_full shape: (99678, 15)
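An equivalent, more vectorized version of the imputation loop can use `select_dtypes`. One caveat worth noting: checking `dtype == 'object'` can miss columns stored with pandas' newer string dtype, so selecting every non-numeric column is a safer way to catch all categorical attributes. A sketch on a small invented frame (the notebook itself operates on df_orders_full):

```python
import numpy as np
import pandas as pd

# Invented toy frame with one numeric and one categorical column
df = pd.DataFrame({
    'price': [10.0, np.nan, 30.0, 40.0],
    'category': ['toys', None, 'toys', 'books'],
})

# Median-impute every numeric column in one shot
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Mode-impute everything else (catches object *and* string dtypes)
for col in df.columns.difference(num_cols):
    df[col] = df[col].fillna(df[col].mode()[0])

print(df['price'].tolist())     # median of [10, 30, 40] is 30
print(df['category'].tolist())  # mode is 'toys'
```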
Remove clearly invalid numeric records
The dataset is refined by removing records that violate fundamental business logic or physical constraints. A validation layer is applied to ensure that all transactional quantities and monetary values (specifically num_items, order_total_value, total_item_price, and total_freight_value) align with real-world operational standards. This step is critical for maintaining the integrity of downstream financial modeling and statistical distributions.
# Filter invalid numeric records
df_orders_full = df_orders_full[
(df_orders_full['num_items'] > 0) &
(df_orders_full['order_total_value'] > 0) &
(df_orders_full['total_item_price'] >= 0) &
(df_orders_full['total_freight_value'] >= 0)
]
# Quantify the impact of the filtering process
print(f"Dataframe df_orders_full shape: {df_orders_full.shape}")
Dataframe df_orders_full shape: (88853, 15)
Check cleanliness and key uniqueness
In this final verification stage, the dataset undergoes a rigorous quality audit to ensure it is "analysis-ready." This includes a global null-value assessment to confirm the success of previous imputation steps and a uniqueness check on the order_id primary key. These controls guarantee that the analytical grain remains strictly one record per order, preventing double-counting in downstream financial reporting.
# Ensure zero missingness across all features
null_counts = df_orders_full.isnull().sum()
if null_counts.sum() == 0:
    print("There are no null values in dataset")
else:
    print("There are some null values existing in dataset")
    print(null_counts[null_counts > 0])
# Validate the primary key at the order grain
total_rows = df_orders_full.shape[0]
unique_orders = df_orders_full['order_id'].nunique()
if total_rows == unique_orders:
    print("Rows are unique")
else:
    print(f"There are {total_rows - unique_orders} duplicated rows")
There are some null values existing in dataset
top_product_category    866
dtype: int64
Rows are unique
Data Transformation
Advanced sanity checks are applied to the df_orders_full dataset to ensure internal consistency between interrelated metrics. By enforcing strict logical constraints, data integrity is solidified for downstream feature engineering.
Sanity checks on counts and values
- The total item count must logically be greater than or equal to the count of unique products
- The total order value must encompass the combined sum of item prices and freight charges
# Apply logical consistency constraints to ensure transactional integrity
df_orders_full = df_orders_full[
(df_orders_full['num_items'] >= df_orders_full['num_unique_products']) &
(df_orders_full['order_total_value'] >= df_orders_full['total_item_price'] + df_orders_full['total_freight_value'])
]
# Audit the dataset following the application of logical constraints
print(f"Dataframe df_orders_full shape: {df_orders_full.shape}")
Dataframe df_orders_full shape: (46076, 15)
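Note that strict inequalities on floating-point totals can discard rows that fail the check only by rounding error. A hedged alternative is to allow a small tolerance when comparing the order total against the sum of its components; the column names below match the notebook, while the data and the tolerance value are illustrative assumptions:

```python
import pandas as pd

# Invented example rows; the middle one undershoots the component sum
# by less than the tolerance and should be kept
df = pd.DataFrame({
    'order_total_value':   [100.0, 99.995, 50.0],
    'total_item_price':    [80.0,  85.0,   60.0],
    'total_freight_value': [20.0,  15.0,   10.0],
})

tol = 0.01  # illustrative tolerance, not a value from the notebook
components = df['total_item_price'] + df['total_freight_value']
valid = df['order_total_value'] >= components - tol

print(valid.tolist())
```

Whether such a tolerance is appropriate depends on how order_total_value was produced; if the large row drop seen above (88,853 to 46,076) stems partly from rounding, a tolerance check would make the filter less aggressive.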
Create order_value_per_item
By calculating the order_value_per_item, a normalized unit-value feature is established. This metric is essential for identifying high-margin transactions and segmenting consumer purchasing behavior based on average item cost rather than gross transaction totals.
# order_value_per_item: Captures the average economic value of each unit within a basket
df_orders_full['order_value_per_item'] = df_orders_full['total_item_price'] / df_orders_full['num_items']
# Validate the addition of the new feature dimension
print(f"Dataframe df_orders_full shape: {df_orders_full.shape}")
Dataframe df_orders_full shape: (46076, 16)
Create order_size_category
Segmentation logic is applied to categorize transactions based on their physical volume. By defining the order_size_category attribute, the continuous num_items variable is transformed into discrete classes (Small, Medium, and Large). This categorization facilitates high-level comparative analysis, allowing for the identification of distinct purchasing patterns and logistics requirements across different order scales.
# Small: 1-2 items | Medium: 3-5 items | Large: >5 items
conditions = [
    (df_orders_full['num_items'] <= 2),
    (df_orders_full['num_items'] <= 5),
    (df_orders_full['num_items'] > 5)
]
choices = ['Small','Medium','Large']
df_orders_full['order_size_category'] = np.select(conditions, choices, default='unknown')
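The same Small/Medium/Large bucketing can also be expressed with `pd.cut`, which makes the bin edges explicit and returns an ordered categorical. A minimal sketch on invented values, with bin edges matching the comment above (1-2, 3-5, >5):

```python
import pandas as pd

# Invented item counts covering each bucket boundary
num_items = pd.Series([1, 2, 3, 5, 6, 12])

# Right-inclusive bins: (0, 2] -> Small, (2, 5] -> Medium, (5, inf) -> Large
size_category = pd.cut(
    num_items,
    bins=[0, 2, 5, float('inf')],
    labels=['Small', 'Medium', 'Large'],
)

print(size_category.tolist())
```

Compared with np.select, pd.cut cannot silently mislabel a row when conditions overlap, and the resulting categorical keeps Small < Medium < Large ordering for free.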
Final transformation check
A comprehensive state-of-health check is performed on the fully transformed df_orders_full dataset. By verifying the absence of null values across all engineered features and auditing the final schema via info(), the data is certified as "production-ready." This ensures that downstream exploratory analytics and modeling are built upon a clean, high-integrity foundation where all data types and record counts are strictly validated.
# Ensure feature engineering did not introduce missingness
print(df_orders_full.isnull().sum())
# Inspect data types and memory usage
df_orders_full.info()
# Capture the terminal shape of the workflow
print(f"Final Shape of Dataframe df_orders_full {df_orders_full.shape}")
# Review the enriched record structure
display(df_orders_full.head())
order_id                    0
order_status                0
order_purchase_hour         0
order_purchase_dayofweek    0
order_purchase_month        0
order_total_value           0
num_items                   0
num_unique_products         0
num_unique_sellers          0
total_item_price            0
avg_item_price              0
total_freight_value         0
top_product_category      523
customer_state              0
payment_type                0
order_value_per_item        0
order_size_category         0
dtype: int64

<class 'pandas.DataFrame'>
Index: 46076 entries, 0 to 99997
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   order_id                  46076 non-null  str
 1   order_status              46076 non-null  str
 2   order_purchase_hour       46076 non-null  int64
 3   order_purchase_dayofweek  46076 non-null  int64
 4   order_purchase_month      46076 non-null  int64
 5   order_total_value         46076 non-null  float64
 6   num_items                 46076 non-null  int64
 7   num_unique_products       46076 non-null  int64
 8   num_unique_sellers        46076 non-null  int64
 9   total_item_price          46076 non-null  float64
 10  avg_item_price            46076 non-null  float64
 11  total_freight_value       46076 non-null  float64
 12  top_product_category      45553 non-null  str
 13  customer_state            46076 non-null  str
 14  payment_type              46076 non-null  str
 15  order_value_per_item      46076 non-null  float64
 16  order_size_category       46076 non-null  str
dtypes: float64(5), int64(6), str(6)
memory usage: 6.3 MB
Final Shape of Dataframe df_orders_full (46076, 17)
| | order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category | customer_state | payment_type | order_value_per_item | order_size_category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction | Massachusetts | voucher | 352.420029 | Small |
| 5 | sdv-id-dbopoJ | delivered | 19 | 2 | 3 | 1556.667902 | 1 | 1 | 1 | 289.242639 | 1354.621410 | 15.394619 | health_beauty | Vermont | credit_card | 289.242639 | Small |
| 8 | sdv-id-FSEOvM | delivered | 15 | 4 | 8 | 62.060506 | 1 | 1 | 1 | 26.893468 | 48.485654 | 18.751282 | luggage_accessories | South Carolina | debit_card | 26.893468 | Small |
| 12 | sdv-id-bQcBUR | delivered | 21 | 0 | 8 | 73.873470 | 1 | 1 | 1 | 37.790896 | 75.704909 | 8.670875 | computers_accessories | Kentucky | credit_card | 37.790896 | Small |
| 14 | sdv-id-MPxIXB | delivered | 13 | 5 | 5 | 361.961537 | 3 | 3 | 3 | 169.528323 | 50.132979 | 34.731146 | pet_shop | Missouri | voucher | 56.509441 | Medium |
# Persist the high-integrity analytical dataset for downstream consumption
df_orders_full.to_csv('ecommerce_orders_cleaned.csv', index=False)
Part 2: EDA & Data Visualization
We now reload the cleaned dataset and explore it. Reloading the validated ecommerce_orders_cleaned.csv into a fresh workspace establishes a stateless environment for visualization and statistical analysis, ensuring that all subsequent insights derive from the finalized single source of truth and remain consistent across the reporting lifecycle.
# Load the analytical record
df_orders_cleaned = pd.read_csv('ecommerce_orders_cleaned.csv')
# Verify data availability and dimensional integrity for the EDA session
display(df_orders_cleaned.head())
print(f"Cleaned Dataframe Shape: {df_orders_cleaned.shape}")
| | order_id | order_status | order_purchase_hour | order_purchase_dayofweek | order_purchase_month | order_total_value | num_items | num_unique_products | num_unique_sellers | total_item_price | avg_item_price | total_freight_value | top_product_category | customer_state | payment_type | order_value_per_item | order_size_category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sdv-id-whzjUX | shipped | 10 | 4 | 4 | 744.312535 | 1 | 1 | 1 | 352.420029 | 369.966521 | 68.790159 | construction_tools_construction | Massachusetts | voucher | 352.420029 | Small |
| 1 | sdv-id-dbopoJ | delivered | 19 | 2 | 3 | 1556.667902 | 1 | 1 | 1 | 289.242639 | 1354.621410 | 15.394619 | health_beauty | Vermont | credit_card | 289.242639 | Small |
| 2 | sdv-id-FSEOvM | delivered | 15 | 4 | 8 | 62.060506 | 1 | 1 | 1 | 26.893468 | 48.485654 | 18.751282 | luggage_accessories | South Carolina | debit_card | 26.893468 | Small |
| 3 | sdv-id-bQcBUR | delivered | 21 | 0 | 8 | 73.873470 | 1 | 1 | 1 | 37.790896 | 75.704909 | 8.670875 | computers_accessories | Kentucky | credit_card | 37.790896 | Small |
| 4 | sdv-id-MPxIXB | delivered | 13 | 5 | 5 | 361.961537 | 3 | 3 | 3 | 169.528323 | 50.132979 | 34.731146 | pet_shop | Missouri | voucher | 56.509441 | Medium |
Cleaned Dataframe Shape: (46076, 17)
Unique Categories and Payment Types
A high-level census of the marketplace dimensions is conducted to quantify product diversity and financial channel preferences. By identifying the cardinality of product categories and payment methods, the operational breadth of the platform is established. The subsequent ranking of high-volume categories and dominant payment types provides critical insights into consumer demand and transaction behavior, essential for targeted marketing and payment infrastructure optimization.
# Quantify categorical cardinality and primary transactional drivers
unique_categories = df_orders_cleaned['top_product_category'].nunique()
unique_payments = df_orders_cleaned['payment_type'].nunique()
print(f"Number of unique product categories: {unique_categories}")
print(f"Number of unique payment types: {unique_payments}")
# Identify the top 5 high-velocity product categories
top_categories = df_orders_cleaned['top_product_category'].value_counts().head(5)
print("--- Top 5 Product Categories ---")
print(top_categories)
# Identify the top 3 dominant payment channels
top_payments = df_orders_cleaned['payment_type'].value_counts().head(3)
print("--- Top 3 Payment Types ---")
print(top_payments)
Number of unique product categories: 71
Number of unique payment types: 5
--- Top 5 Product Categories ---
top_product_category
bed_bath_table           5895
health_beauty            4011
furniture_decor          3412
computers_accessories    3386
telephony                2570
Name: count, dtype: int64
--- Top 3 Payment Types ---
payment_type
credit_card    27776
voucher        13812
points          2532
Name: count, dtype: int64
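To express these raw counts as shares of all orders, `value_counts(normalize=True)` returns fractions instead of counts. A small sketch on an invented payment series (the notebook would call this on df_orders_cleaned['payment_type']):

```python
import pandas as pd

# Invented toy series standing in for the payment_type column
payment_type = pd.Series(
    ['credit_card', 'voucher', 'credit_card', 'credit_card', 'points']
)

# normalize=True divides each count by the total, sorted descending by share
shares = payment_type.value_counts(normalize=True)

print(shares.round(2).to_dict())
```

On the real data this would show directly, for example, what fraction of all orders are paid by credit card, which is often more useful than absolute counts for comparing channels over time.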
Bar Chart: Orders by Product Category
The transactional volume is visualized across the diverse product landscape to identify primary market drivers. By executing a ranked frequency analysis of the top_product_category attribute, a comprehensive demand profile is established. This visualization is essential for inventory prioritization and understanding which vertical segments contribute most significantly to the platform's order throughput.
# Configure visual parameters for executive reporting
plt.figure(figsize=(12,15))
# Determine categorical ranking by descending transaction frequency
category_order = df_orders_cleaned['top_product_category'].value_counts().index
# Render categorical velocity distribution
sns.countplot(
data = df_orders_cleaned,
y = 'top_product_category',
order = category_order,
color='steelblue',
)
# Refine chart aesthetics for professional presentation
plt.title('Total Orders by Product Category', fontsize = 16, pad = 25)
plt.xlabel('Number of Orders', fontsize = 12)
plt.ylabel('Product Category', fontsize = 12)
plt.show()
Brief interpretation
The top 5 categories hold the vast majority of orders, indicating a high concentration of demand for these products, which span furniture and lifestyle goods from health & beauty to tech. There is a long tail of categories with significantly lower order counts, reflecting the large inventory the store holds.
Overall, this suggests the store is known to its customer base primarily for its top few categories.
Box Plot: Order Value vs Number of Items
The relationship between physical order volume and gross financial value is evaluated using a distributional lens. By plotting order_total_value against the discrete num_items variable, the workflow identifies how capital allocation shifts as consumers increase their purchase quantity. This visualization is pivotal for detecting price outliers and understanding the variance in spending patterns across different basket sizes.
# Configure canvas for distributional bivariate analysis
plt.figure(figsize=(12,7))
# Generate box plot to visualize value distribution and extreme outliers
sns.boxplot(
data = df_orders_cleaned,
x = 'num_items',
y = 'order_total_value',
color='steelblue'
)
# Enhance aesthetics for professional reporting
plt.title('Order Total Value Distribution by Number of Items', fontsize = 16, pad = 20)
plt.xlabel('Number of Items in Order', fontsize = 12)
plt.ylabel('Order Total Value ($)', fontsize = 12)
plt.show()
Brief interpretation
While total order value generally correlates positively with item count, the trend is non-linear: as baskets grow, total value shows diminishing marginal returns. This is likely due to a shift in basket composition, since bulk buyers often add lower-priced items rather than multiplying high-ticket goods, preventing a linear rise in cost.
Orders with fewer items exhibit the highest variance and extreme right-skewed outliers, reflecting the wide variety of products sold: a single item is just as likely to be a high-value electronic (e.g., a laptop) as a low-cost consumable (e.g., a pen). As item count increases, the mix of cheap and expensive products averages out the total price, smoothing the volatility and narrowing the range of outliers.
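Given the heavy right skew noted above, re-plotting the same box plot with a logarithmic y-axis can make the per-basket distributions easier to compare, since extreme outliers no longer compress the boxes. A sketch using the notebook's column names on a small invented frame:

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this runs headless
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Invented stand-in for df_orders_cleaned with a skewed value column
df = pd.DataFrame({
    'num_items':         [1,    1,     1,      2,    2,     3,     3,     5],
    'order_total_value': [15.0, 300.0, 2500.0, 80.0, 600.0, 120.0, 900.0, 400.0],
})

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='num_items', y='order_total_value',
            color='steelblue', ax=ax)
ax.set_yscale('log')  # compresses extreme outliers so the medians stay readable
ax.set_title('Order Total Value by Number of Items (log scale)')
fig.savefig('order_value_log.png')
```

A log scale is only a visualization aid here; summary statistics and any downstream modeling should still be computed on the original values unless a log transform is an explicit modeling choice.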