In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [2]:
def load_data():
"""Load and prepare the datasets"""
try:
sales = pd.read_csv('assignment4.1a.csv')
promos = pd.read_csv('PromotionDates.csv')
# Convert to datetime format
sales['Date'] = pd.to_datetime(sales['Date'])
promos['StartDate'] = pd.to_datetime(promos['StartDate'], format='mixed', dayfirst=True)
promos['EndDate'] = pd.to_datetime(promos['EndDate'], format='mixed', dayfirst=True)
# Handle negative sales as zero
sales['Net_Sales'] = np.where(sales['SalesQuantity'] < 0, 0, sales['SalesQuantity'])
return sales, promos.head(4) # Use only first 4 promotions
except Exception as e:
print(f"Error loading data: {str(e)}")
return pd.DataFrame(), pd.DataFrame()
In [3]:
## load data
sales, promos = load_data()
In [4]:
## take a glimpse at sales data
sales.head()
Out[4]:
Date | StoreCode | ProductCode | SalesQuantity | Net_Sales | |
---|---|---|---|---|---|
0 | 2015-01-01 | 8 | 9 | -1 | 0 |
1 | 2015-01-01 | 131 | 9 | 1 | 1 |
2 | 2015-01-01 | 144 | 9 | 2 | 2 |
3 | 2015-01-01 | 203 | 9 | 2 | 2 |
4 | 2015-01-01 | 256 | 9 | 0 | 0 |
In [5]:
## take a glimpse at promotion data
promos
Out[5]:
Period | StartDate | EndDate | |
---|---|---|---|
0 | Promo1 | 2015-10-02 | 2015-02-17 |
1 | Promo2 | 2015-03-15 | 2015-03-22 |
2 | Promo3 | 2015-05-24 | 2015-01-06 |
3 | Promo4 | 2015-06-21 | 2015-06-28 |
In [6]:
## check sales dataframe has null
sales.isnull().sum()
Out[6]:
Date 0 StoreCode 0 ProductCode 0 SalesQuantity 0 Net_Sales 0 dtype: int64
In [7]:
## get summary statistics of sales data
sales[['SalesQuantity','Net_Sales']].describe()
Out[7]:
SalesQuantity | Net_Sales | |
---|---|---|
count | 1873618.000 | 1873618.000 |
mean | 2.247 | 2.258 |
std | 5.029 | 5.014 |
min | -60.000 | 0.000 |
25% | 0.000 | 0.000 |
50% | 1.000 | 1.000 |
75% | 2.000 | 2.000 |
max | 912.000 | 912.000 |
In [8]:
# Check the date range in sales data
print("Sales Data Date Range:")
print("Min Date:", sales['Date'].min())
print("Max Date:", sales['Date'].max())
# Check the date range in promotion data
print("\nPromotion Data Date Range:")
print("Min Start Date:", promos['StartDate'].min())
print("Max End Date:", promos['EndDate'].max())
Sales Data Date Range: Min Date: 2015-01-01 00:00:00 Max Date: 2015-07-31 00:00:00 Promotion Data Date Range: Min Start Date: 2015-03-15 00:00:00 Max End Date: 2015-06-28 00:00:00
In [9]:
## flag promotions in sales data
def flag_promotions(sales, promos):
"""Flag promotion periods in sales data with validation"""
if sales.empty or promos.empty:
return sales
sales['Promotion_Period'] = 'No_Promotion'
for _, promo in promos.iterrows():
try:
mask = (sales['Date'] >= promo['StartDate']) & (sales['Date'] <= promo['EndDate'])
sales.loc[mask, 'Promotion_Period'] = promo['Period']
except Exception as e:
print(f"Error flagging promotion {promo['Period']}: {str(e)}")
return sales
In [10]:
sales_df = flag_promotions(sales, promos)
In [11]:
sales_df.head()
Out[11]:
Date | StoreCode | ProductCode | SalesQuantity | Net_Sales | Promotion_Period | |
---|---|---|---|---|---|---|
0 | 2015-01-01 | 8 | 9 | -1 | 0 | No_Promotion |
1 | 2015-01-01 | 131 | 9 | 1 | 1 | No_Promotion |
2 | 2015-01-01 | 144 | 9 | 2 | 2 | No_Promotion |
3 | 2015-01-01 | 203 | 9 | 2 | 2 | No_Promotion |
4 | 2015-01-01 | 256 | 9 | 0 | 0 | No_Promotion |
In [12]:
## group by count for each promotion
sales_df.groupby('Promotion_Period')['Date'].count()
Out[12]:
Promotion_Period No_Promotion 1720117 Promo2 85678 Promo4 67823 Name: Date, dtype: int64
In [13]:
## DATA VALIDATION
def validate_data(sales, promos):
"""Validate data quality and promotion coverage"""
print("\n=== Data Validation ===")
if sales.empty or promos.empty:
print("No data available for validation")
return
# Check date ranges
print("\nPromotion Date Ranges:")
print(promos[['Period', 'StartDate', 'EndDate']].to_string(index=False))
# Check promotion coverage
print("\nPromotion Counts:")
print(sales['Promotion_Period'].value_counts())
# Detailed promotion analysis
for _, promo in promos.iterrows():
duration = (promo['EndDate'] - promo['StartDate']).days + 1
count = sales[sales['Promotion_Period'] == promo['Period']].shape[0]
print(f"\nPromotion {promo['Period']}:")
print(f"Expected duration: {duration} days")
print(f"Actual records flagged: {count}")
print(f"Coverage: {count/duration:.1%} of expected days")
In [14]:
## Validate data
print("\n[2/6] Validating data...")
validate_data(sales, promos)
[2/6] Validating data... === Data Validation === Promotion Date Ranges: Period StartDate EndDate Promo1 2015-10-02 2015-02-17 Promo2 2015-03-15 2015-03-22 Promo3 2015-05-24 2015-01-06 Promo4 2015-06-21 2015-06-28 Promotion Counts: Promotion_Period No_Promotion 1720117 Promo2 85678 Promo4 67823 Name: count, dtype: int64 Promotion Promo1: Expected duration: -226 days Actual records flagged: 0 Coverage: -0.0% of expected days Promotion Promo2: Expected duration: 8 days Actual records flagged: 85678 Coverage: 1070975.0% of expected days Promotion Promo3: Expected duration: -137 days Actual records flagged: 0 Coverage: -0.0% of expected days Promotion Promo4: Expected duration: 8 days Actual records flagged: 67823 Coverage: 847787.5% of expected days
In [15]:
### Product Derivative Clustering Implementation ###
In [16]:
### Derivative based Clustering
## filter non-promotion periods
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
## Create weekly aggregates:
weekly_sales = (non_promo.groupby(['ProductCode', 'StoreCode', pd.Grouper(key='Date', freq='W-MON')])
['Net_Sales'].sum().reset_index())
## Derivative calculation
## sort by date and calculate weekly differences
weekly_sales['Sales_Derivative'] = weekly_sales.groupby(['ProductCode', 'StoreCode'])['Net_Sales'].diff()
## Normalize by sales volume
weekly_sales['Norm_Derivative'] = weekly_sales['Sales_Derivative'] / (weekly_sales['Net_Sales'] + 1) # +1 to avoid division by zero
## Stability Metrics Calculation
## For clustering, derive these metrics per product:
## Average absolute derivative: Measures overall volatility
stability = weekly_sales.groupby('ProductCode')['Norm_Derivative'].agg(
Avg_Abs_Derivative=lambda x: np.abs(x).mean(),
Derivative_StdDev=lambda x: x.std()
).reset_index()
## Clustering Strategy
## Three clusters based on derivative metrics:
## Stable (Slow): Low avg absolute derivative (<25th percentile)
## Moderate (Medium): Middle range (25th-75th percentile)
## Volatile (Fast): High avg absolute derivative (>75th percentile)
stability['Derivative_Cluster_Item'] = pd.qcut(stability['Avg_Abs_Derivative'],
q=[0, 0.25, 0.75, 1],
labels=['Slow Items', 'Medium Items', 'Fast Items'])
## validation - visualcheck
sns.boxplot(data=stability, x='Derivative_Cluster_Item', y='Avg_Abs_Derivative')
plt.title('Sales Volatility by Cluster')
Out[16]:
Text(0.5, 1.0, 'Sales Volatility by Cluster')
In [17]:
stability.head()
Out[17]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Item | |
---|---|---|---|---|
0 | 1 | 1.837 | 4.129 | Fast Items |
1 | 2 | 1.645 | 3.830 | Fast Items |
2 | 3 | 1.333 | 3.064 | Medium Items |
3 | 4 | 1.851 | 4.051 | Fast Items |
4 | 5 | 1.617 | 3.661 | Fast Items |
In [18]:
stability.describe()
Out[18]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | |
---|---|---|---|
count | 317.000 | 304.000 | 299.000 |
mean | 159.000 | 1.062 | 2.166 |
std | 91.654 | 1.082 | 2.166 |
min | 1.000 | 0.000 | 0.000 |
25% | 80.000 | 0.524 | 0.768 |
50% | 159.000 | 0.709 | 1.174 |
75% | 238.000 | 1.385 | 3.137 |
max | 317.000 | 11.168 | 17.987 |
In [19]:
## check null counts is stability data
stability.isnull().sum()
Out[19]:
ProductCode 0 Avg_Abs_Derivative 13 Derivative_StdDev 18 Derivative_Cluster_Item 13 dtype: int64
In [20]:
## because of it is so less numbers are null, I discard them
## remove nulls (which small percentage)
stability_clean = stability.dropna(subset=['Avg_Abs_Derivative', 'Derivative_StdDev'])
In [21]:
## check quantile based clustering derivation means for each cluster
stability_clean.groupby('Derivative_Cluster_Item')['Avg_Abs_Derivative'].mean()
Out[21]:
Derivative_Cluster_Item Slow Items 0.413 Medium Items 0.804 Fast Items 2.251 Name: Avg_Abs_Derivative, dtype: float64
In [22]:
## if we want to use k-means (here some outliers that kmeans can be effected but however try it)
# 1. Select features
X = stability[['Avg_Abs_Derivative', 'Derivative_StdDev']]
# 2. Impute (mean/median recommended for k-means)
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
# 3. Standardize (critical for k-means)
X_scaled = StandardScaler().fit_transform(X_imputed)
# Now safe for k-means
#kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
# Elbow method to determine k
distortions = []
for k in range(1, 5):
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
distortions.append(kmeans.inertia_)
# Plot to find elbow point
plt.plot(range(1,5), distortions)
Out[22]:
[<matplotlib.lines.Line2D at 0x13f6bb5b0>]
In [23]:
## comment: elbow is at 3 point
In [24]:
sns.pairplot(pd.DataFrame(X_scaled, columns=['Derivative', 'StdDev']))
Out[24]:
<seaborn.axisgrid.PairGrid at 0x13f72b670>
In [25]:
stability_clean.head()
Out[25]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Item | |
---|---|---|---|---|
0 | 1 | 1.837 | 4.129 | Fast Items |
1 | 2 | 1.645 | 3.830 | Fast Items |
2 | 3 | 1.333 | 3.064 | Medium Items |
3 | 4 | 1.851 | 4.051 | Fast Items |
4 | 5 | 1.617 | 3.661 | Fast Items |
In [26]:
# 1. K-MEANS - Select features
X = stability_clean[['Avg_Abs_Derivative', 'Derivative_StdDev']]
# 2. Impute (mean/median recommended for k-means)
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
# 3. Standardize (critical for k-means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Now safe for k-means
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
stability_clean['Cluster'] = kmeans.fit_predict(X_scaled)
#product_silhouette = silhouette_score(X_scaled, stability_clean)
item_silhouette = silhouette_score(stability_clean[['Avg_Abs_Derivative', 'Derivative_StdDev']], stability_clean['Cluster'])
# Create meaningful labels based on actual cluster centers
# Get cluster centers sorted by Avg_Abs_Derivative (volatility)
centers = pd.DataFrame(
scaler.inverse_transform(kmeans.cluster_centers_),
columns=['Avg_Abs_Derivative', 'Derivative_StdDev']
).sort_values('Avg_Abs_Derivative')
# Create label mapping (0=Slow, 1=Medium, 2=Fast)
cluster_labels = {
k: v for k, v in zip(
centers.index,
['Slow', 'Medium', 'Fast']
)
}
# 3. Assign labels correctly
stability_clean['Product_Cluster'] = stability_clean['Cluster'].map(cluster_labels)
# 4. Verify
print(stability_clean.groupby('Product_Cluster')[['Avg_Abs_Derivative', 'Derivative_StdDev']].mean())
# Print silhouette scores
print("\n=== Cluster Quality Metrics ===")
print(f"Product Cluster Silhouette Score: {item_silhouette:.3f}")
#print(f"Store Cluster Silhouette Score: {store_silhouette:.3f}")
# Interpretation guide
print("\nSilhouette Score Interpretation:")
print("> 0.7: Strong cluster structure")
print("0.5-0.7: Reasonable structure")
print("0.25-0.5: Weak structure")
print("< 0.25: No substantial structure")
plt.figure(figsize=(10,6))
sns.scatterplot(
data=stability_clean,
x='Avg_Abs_Derivative',
y='Derivative_StdDev',
hue='Product_Cluster',
palette={'Slow':'red', 'Medium':'yellow', 'Fast':'green'}
)
plt.title('Product Clusters by Sales Volatility')
plt.show()
Avg_Abs_Derivative Derivative_StdDev Product_Cluster Fast 8.554 14.792 Medium 1.725 3.955 Slow 0.608 1.049 === Cluster Quality Metrics === Product Cluster Silhouette Score: 0.695 Silhouette Score Interpretation: > 0.7: Strong cluster structure 0.5-0.7: Reasonable structure 0.25-0.5: Weak structure < 0.25: No substantial structure
In [27]:
stability_clean.head()
Out[27]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Item | Cluster | Product_Cluster | |
---|---|---|---|---|---|---|
0 | 1 | 1.837 | 4.129 | Fast Items | 1 | Medium |
1 | 2 | 1.645 | 3.830 | Fast Items | 1 | Medium |
2 | 3 | 1.333 | 3.064 | Medium Items | 1 | Medium |
3 | 4 | 1.851 | 4.051 | Fast Items | 1 | Medium |
4 | 5 | 1.617 | 3.661 | Fast Items | 1 | Medium |
In [28]:
pd.Series(kmeans.labels_).value_counts()
Out[28]:
0 199 1 96 2 4 Name: count, dtype: int64
In [29]:
### comment: quantile and kmeans based approaches can be used to make clustering. I will use both methods,
In [30]:
### Store Derivative Clustering Implementation ####
In [31]:
# 1. Filter non-promotion periods (already done in your code)
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
# 2. Create weekly aggregates for stores (group by StoreCode instead of ProductCode)
weekly_sales_stores = (non_promo.groupby(['StoreCode', pd.Grouper(key='Date', freq='W-MON')])
['Net_Sales'].sum().reset_index())
# 3. Derivative calculation for stores
weekly_sales_stores['Sales_Derivative'] = weekly_sales_stores.groupby('StoreCode')['Net_Sales'].diff()
# 4. Normalize by sales volume
weekly_sales_stores['Norm_Derivative'] = (weekly_sales_stores['Sales_Derivative'] /
(weekly_sales_stores['Net_Sales'] + 1))
# 5. Stability metrics calculation per store
store_stability = weekly_sales_stores.groupby('StoreCode')['Norm_Derivative'].agg(
Avg_Abs_Derivative=lambda x: np.abs(x).mean(),
Derivative_StdDev=lambda x: x.std()
).reset_index()
# 6. Store clustering based on derivative metrics
store_stability['Derivative_Cluster_Store'] = pd.qcut(
store_stability['Avg_Abs_Derivative'],
q=[0, 0.25, 0.75, 1],
labels=['Slow Stores', 'Medium Stores', 'Fast Stores']
)
# 7. Validation visualization
plt.figure(figsize=(10, 6))
sns.boxplot(data=store_stability, x='Derivative_Cluster_Store', y='Avg_Abs_Derivative')
plt.title('Store Sales Volatility by Cluster')
plt.show()
In [32]:
store_stability.head()
Out[32]:
StoreCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Store | |
---|---|---|---|---|
0 | 1 | 0.962 | 2.928 | Fast Stores |
1 | 2 | 0.579 | 1.363 | Medium Stores |
2 | 3 | 0.747 | 2.063 | Medium Stores |
3 | 4 | 0.562 | 1.101 | Slow Stores |
4 | 5 | 0.765 | 2.026 | Medium Stores |
In [33]:
store_stability.groupby('Derivative_Cluster_Store')['Avg_Abs_Derivative'].mean()
Out[33]:
Derivative_Cluster_Store Slow Stores 0.488 Medium Stores 0.707 Fast Stores 1.517 Name: Avg_Abs_Derivative, dtype: float64
In [34]:
## use k-means
## if we want to use k-means (here some outliers that kmeans can be effected but however try it)
# 1. Select features
X_s = store_stability[['Avg_Abs_Derivative', 'Derivative_StdDev']]
# 2. Impute (mean/median recommended for k-means)
imputer = SimpleImputer(strategy='median')
X_imputed_s = imputer.fit_transform(X_s)
# 3. Standardize (critical for k-means)
X_scaled_s = StandardScaler().fit_transform(X_imputed_s)
# Now safe for k-means
#kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
# Elbow method to determine k
distortions = []
for k in range(1, 5):
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled_s)
distortions.append(kmeans.inertia_)
# Plot to find elbow point
plt.plot(range(1,5), distortions)
Out[34]:
[<matplotlib.lines.Line2D at 0x13fc2a5b0>]
In [35]:
### comment: 2 or 3 clusters may be used
In [36]:
store_stability.isnull().sum()
Out[36]:
StoreCode 0 Avg_Abs_Derivative 3 Derivative_StdDev 3 Derivative_Cluster_Store 3 dtype: int64
In [37]:
store_stability = store_stability.dropna(subset=['Avg_Abs_Derivative', 'Derivative_StdDev'])
In [38]:
store_stability.isnull().sum()
Out[38]:
StoreCode 0 Avg_Abs_Derivative 0 Derivative_StdDev 0 Derivative_Cluster_Store 0 dtype: int64
In [39]:
# 1. K-MEANS - Select features
X_s = store_stability[['Avg_Abs_Derivative', 'Derivative_StdDev']]
# 2. Impute (mean/median recommended for k-means)
imputer = SimpleImputer(strategy='median')
X_imputed_s = imputer.fit_transform(X_s)
# 3. Standardize (critical for k-means)
scaler = StandardScaler()
X_scaled_s = scaler.fit_transform(X_imputed_s)
# Now safe for k-means
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled_s)
store_stability['Cluster'] = kmeans.fit_predict(X_scaled_s)
#product_silhouette = silhouette_score(X_scaled, stability_clean)
item_silhouette_s = silhouette_score(store_stability[['Avg_Abs_Derivative', 'Derivative_StdDev']], store_stability['Cluster'])
# Create meaningful labels based on actual cluster centers
# Get cluster centers sorted by Avg_Abs_Derivative (volatility)
centers = pd.DataFrame(
scaler.inverse_transform(kmeans.cluster_centers_),
columns=['Avg_Abs_Derivative', 'Derivative_StdDev']
).sort_values('Avg_Abs_Derivative')
# Create label mapping (0=Slow, 1=Medium, 2=Fast)
cluster_labels = {
k: v for k, v in zip(
centers.index,
['Slow', 'Medium', 'Fast']
)
}
# 3. Assign labels correctly
store_stability['Store_Cluster'] = store_stability['Cluster'].map(cluster_labels)
# 4. Verify
print(store_stability.groupby('Store_Cluster')[['Avg_Abs_Derivative', 'Derivative_StdDev']].mean())
# Print silhouette scores
print("\n=== Cluster Quality Metrics ===")
print(f"Product Cluster Silhouette Score: {item_silhouette_s:.3f}")
# Interpretation guide
print("\nSilhouette Score Interpretation:")
print("> 0.7: Strong cluster structure")
print("0.5-0.7: Reasonable structure")
print("0.25-0.5: Weak structure")
print("< 0.25: No substantial structure")
plt.figure(figsize=(10,6))
sns.scatterplot(
data=store_stability,
x='Avg_Abs_Derivative',
y='Derivative_StdDev',
hue='Store_Cluster',
palette={'Slow':'red', 'Medium':'yellow', 'Fast':'green'}
)
plt.title('Store Clusters by Sales Volatility')
plt.show()
Avg_Abs_Derivative Derivative_StdDev Store_Cluster Fast 11.190 57.265 Medium 4.707 15.980 Slow 0.788 1.960 === Cluster Quality Metrics === Product Cluster Silhouette Score: 0.887 Silhouette Score Interpretation: > 0.7: Strong cluster structure 0.5-0.7: Reasonable structure 0.25-0.5: Weak structure < 0.25: No substantial structure
In [40]:
store_stability.head()
Out[40]:
StoreCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Store | Cluster | Store_Cluster | |
---|---|---|---|---|---|---|
0 | 1 | 0.962 | 2.928 | Fast Stores | 0 | Slow |
1 | 2 | 0.579 | 1.363 | Medium Stores | 0 | Slow |
2 | 3 | 0.747 | 2.063 | Medium Stores | 0 | Slow |
3 | 4 | 0.562 | 1.101 | Slow Stores | 0 | Slow |
4 | 5 | 0.765 | 2.026 | Medium Stores | 0 | Slow |
In [41]:
stability_clean.head()
Out[41]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Item | Cluster | Product_Cluster | |
---|---|---|---|---|---|---|
0 | 1 | 1.837 | 4.129 | Fast Items | 1 | Medium |
1 | 2 | 1.645 | 3.830 | Fast Items | 1 | Medium |
2 | 3 | 1.333 | 3.064 | Medium Items | 1 | Medium |
3 | 4 | 1.851 | 4.051 | Fast Items | 1 | Medium |
4 | 5 | 1.617 | 3.661 | Fast Items | 1 | Medium |
In [42]:
stability_clean = stability_clean.rename(columns={'Derivative_Cluster_Item':'Product_Cluster_Quantile',
'Cluster':'Kmeans_Product_Cluster_Nums',
'Product_Cluster':'Product_Cluster_KMeans'})
In [43]:
stability_clean.head()
Out[43]:
ProductCode | Avg_Abs_Derivative | Derivative_StdDev | Product_Cluster_Quantile | Kmeans_Product_Cluster_Nums | Product_Cluster_KMeans | |
---|---|---|---|---|---|---|
0 | 1 | 1.837 | 4.129 | Fast Items | 1 | Medium |
1 | 2 | 1.645 | 3.830 | Fast Items | 1 | Medium |
2 | 3 | 1.333 | 3.064 | Medium Items | 1 | Medium |
3 | 4 | 1.851 | 4.051 | Fast Items | 1 | Medium |
4 | 5 | 1.617 | 3.661 | Fast Items | 1 | Medium |
In [44]:
store_stability.head()
Out[44]:
StoreCode | Avg_Abs_Derivative | Derivative_StdDev | Derivative_Cluster_Store | Cluster | Store_Cluster | |
---|---|---|---|---|---|---|
0 | 1 | 0.962 | 2.928 | Fast Stores | 0 | Slow |
1 | 2 | 0.579 | 1.363 | Medium Stores | 0 | Slow |
2 | 3 | 0.747 | 2.063 | Medium Stores | 0 | Slow |
3 | 4 | 0.562 | 1.101 | Slow Stores | 0 | Slow |
4 | 5 | 0.765 | 2.026 | Medium Stores | 0 | Slow |
In [45]:
store_stability = store_stability.rename(columns={'Derivative_Cluster_Store':'Store_Cluster_Quantile',
'Cluster':'Kmeans_Store_Cluster_Nums',
'Store_Cluster':'Store_Cluster_KMeans'})
In [46]:
store_stability.head()
Out[46]:
StoreCode | Avg_Abs_Derivative | Derivative_StdDev | Store_Cluster_Quantile | Kmeans_Store_Cluster_Nums | Store_Cluster_KMeans | |
---|---|---|---|---|---|---|
0 | 1 | 0.962 | 2.928 | Fast Stores | 0 | Slow |
1 | 2 | 0.579 | 1.363 | Medium Stores | 0 | Slow |
2 | 3 | 0.747 | 2.063 | Medium Stores | 0 | Slow |
3 | 4 | 0.562 | 1.101 | Slow Stores | 0 | Slow |
4 | 5 | 0.765 | 2.026 | Medium Stores | 0 | Slow |
In [47]:
# Assuming you have:
# - sales_df (main transaction data)
# - product_clusters (contains ProductCode and cluster columns)
# - store_clusters (contains StoreCode and cluster columns)
# Merge product clusters
sales_df = sales_df.merge(
stability_clean[['ProductCode', 'Product_Cluster_Quantile', 'Product_Cluster_KMeans']],
on='ProductCode',
how='left'
)
# Merge store clusters
sales_df = sales_df.merge(
store_stability[['StoreCode', 'Store_Cluster_Quantile', 'Store_Cluster_KMeans']],
on='StoreCode',
how='left'
)
# Verify merge
print(sales_df[['ProductCode', 'StoreCode',
'Product_Cluster_Quantile', 'Product_Cluster_KMeans',
'Store_Cluster_Quantile', 'Store_Cluster_KMeans']].head())
ProductCode StoreCode Product_Cluster_Quantile Product_Cluster_KMeans \ 0 9 8 Slow Items Slow 1 9 131 Slow Items Slow 2 9 144 Slow Items Slow 3 9 203 Slow Items Slow 4 9 256 Slow Items Slow Store_Cluster_Quantile Store_Cluster_KMeans 0 Medium Stores Slow 1 Medium Stores Slow 2 Medium Stores Slow 3 Medium Stores Slow 4 Slow Stores Slow
In [48]:
len(sales_df)
Out[48]:
1873618
In [49]:
sales_df.head()
Out[49]:
Date | StoreCode | ProductCode | SalesQuantity | Net_Sales | Promotion_Period | Product_Cluster_Quantile | Product_Cluster_KMeans | Store_Cluster_Quantile | Store_Cluster_KMeans | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 | 8 | 9 | -1 | 0 | No_Promotion | Slow Items | Slow | Medium Stores | Slow |
1 | 2015-01-01 | 131 | 9 | 1 | 1 | No_Promotion | Slow Items | Slow | Medium Stores | Slow |
2 | 2015-01-01 | 144 | 9 | 2 | 2 | No_Promotion | Slow Items | Slow | Medium Stores | Slow |
3 | 2015-01-01 | 203 | 9 | 2 | 2 | No_Promotion | Slow Items | Slow | Medium Stores | Slow |
4 | 2015-01-01 | 256 | 9 | 0 | 0 | No_Promotion | Slow Items | Slow | Slow Stores | Slow |
In [50]:
## QA
In [51]:
## Q.3 -) Which items experienced the biggest sale increase during promotions?
In [52]:
def top_promo_items(sales_df, cluster_type='Product_Cluster_Quantile'):
"""Identify items with highest promotion uplift"""
# Filter periods
promo = sales_df[sales_df['Promotion_Period'] != 'No_Promotion']
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
# Calculate uplift by product
uplift = (promo.groupby('ProductCode')['Net_Sales'].mean() -
non_promo.groupby('ProductCode')['Net_Sales'].mean())
# Prepare results
result = (
uplift.rename('Uplift') # Name the series before reset_index
.reset_index()
.merge(sales_df[['ProductCode', cluster_type]].drop_duplicates(),
on='ProductCode')
.sort_values('Uplift', ascending=False)
.head(10)
)
return result
print(top_promo_items(sales_df))
ProductCode Uplift Product_Cluster_Quantile 136 137 5.111 Fast Items 217 218 4.659 Fast Items 123 124 3.579 Medium Items 220 221 2.368 Fast Items 60 61 2.063 Medium Items 219 220 1.815 Fast Items 221 222 1.663 Fast Items 208 209 1.643 Fast Items 218 219 1.641 Fast Items 204 205 1.598 Fast Items
In [53]:
def top_promo_items_kmeans(sales_df, cluster_type='Product_Cluster_KMeans'):
"""Identify items with highest promotion uplift"""
# Input validation
required_cols = ['ProductCode', 'Net_Sales', 'Promotion_Period', cluster_type]
if not all(col in sales_df.columns for col in required_cols):
missing = [col for col in required_cols if col not in sales_df.columns]
raise ValueError(f"Missing columns in sales_df: {missing}")
# Filter periods
promo = sales_df[sales_df['Promotion_Period'] != 'No_Promotion']
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
# Calculate uplift by product
uplift = (promo.groupby('ProductCode')['Net_Sales'].mean() -
non_promo.groupby('ProductCode')['Net_Sales'].mean())
# Prepare results
result = (
uplift.rename('Uplift') # Name the series before reset_index
.reset_index()
.merge(sales_df[['ProductCode', cluster_type]].drop_duplicates(),
on='ProductCode')
.sort_values('Uplift', ascending=False)
.head(10)
)
return result
print(top_promo_items_kmeans(sales_df))
ProductCode Uplift Product_Cluster_KMeans 136 137 5.111 Fast 217 218 4.659 Medium 123 124 3.579 Slow 220 221 2.368 Medium 60 61 2.063 Medium 219 220 1.815 Medium 221 222 1.663 Medium 208 209 1.643 Medium 218 219 1.641 Medium 204 205 1.598 Medium
In [54]:
## Q.4 -) Are there stores that have higher promotion reaction?
In [55]:
def top_promo_stores(sales_df, cluster_type='Store_Cluster_Quantile'):
"""Identify stores with highest promotion response"""
promo = sales_df[sales_df['Promotion_Period'] != 'No_Promotion']
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
uplift = (promo.groupby('StoreCode')['Net_Sales'].mean() -
non_promo.groupby('StoreCode')['Net_Sales'].mean())
# Prepare results
result = (
uplift.rename('Uplift') # Name the series before reset_index
.reset_index()
.merge(sales_df[['StoreCode', cluster_type]].drop_duplicates(),
on='StoreCode')
.sort_values('Uplift', ascending=False)
.head(10)
)
return result
print(top_promo_stores(sales_df))
StoreCode Uplift Store_Cluster_Quantile 91 92 2.913 Slow Stores 255 256 2.496 Slow Stores 204 205 2.014 Medium Stores 325 326 1.858 Fast Stores 82 83 1.786 Slow Stores 304 305 1.775 Slow Stores 65 66 1.631 Fast Stores 299 300 1.570 Medium Stores 180 181 1.549 Fast Stores 106 107 1.501 Fast Stores
In [56]:
#comments:
#Uplift vs Absolute Sales:
#Slow stores show higher percentage uplift because their baseline sales are lower
#Fast stores may have larger absolute $ increases but smaller % uplift
In [57]:
def plot_promo_response(sales_df):
plt.figure(figsize=(10,6))
sns.boxplot(
data=sales_df,
x='Store_Cluster_Quantile',
y='Net_Sales',
hue='Promotion_Period'
)
plt.title('Promotion Response by Store Cluster')
plt.show()
print(plot_promo_response(sales_df))
None
In [58]:
def top_promo_stores_kmeans(sales_df, cluster_type='Store_Cluster_KMeans'):
"""Identify stores with highest promotion response"""
promo = sales_df[sales_df['Promotion_Period'] != 'No_Promotion']
non_promo = sales_df[sales_df['Promotion_Period'] == 'No_Promotion']
uplift = (promo.groupby('StoreCode')['Net_Sales'].mean() -
non_promo.groupby('StoreCode')['Net_Sales'].mean())
# Prepare results
result = (
uplift.rename('Uplift') # Name the series before reset_index
.reset_index()
.merge(sales_df[['StoreCode', cluster_type]].drop_duplicates(),
on='StoreCode')
.sort_values('Uplift', ascending=False)
.head(10)
)
return result
print(top_promo_stores_kmeans(sales_df))
StoreCode Uplift Store_Cluster_KMeans 91 92 2.913 Slow 255 256 2.496 Slow 204 205 2.014 Slow 325 326 1.858 Slow 82 83 1.786 Slow 304 305 1.775 Slow 65 66 1.631 Slow 299 300 1.570 Slow 180 181 1.549 Slow 106 107 1.501 Slow
In [59]:
def top_stores_by_cluster(sales_df):
"""Identify top performing stores by cluster with NaN handling"""
# First ensure we have the required column
if 'Store_Cluster_Quantile' not in sales_df.columns:
raise ValueError("Missing 'Store_Cluster_Quantile' column in sales_df")
results = []
clusters = ['Slow', 'Medium', 'Fast']
for cluster in clusters:
# Filter stores in current cluster (handling NaN values)
cluster_mask = sales_df['Store_Cluster_Quantile'].notna() & \
sales_df['Store_Cluster_Quantile'].str.contains(cluster, na=False)
cluster_stores = sales_df[cluster_mask]
# Skip if no stores in this cluster
if len(cluster_stores) == 0:
print(f"No stores found in {cluster} cluster")
continue
# Calculate uplift
promo = cluster_stores[cluster_stores['Promotion_Period'] != 'No_Promotion']
non_promo = cluster_stores[cluster_stores['Promotion_Period'] == 'No_Promotion']
uplift = (promo.groupby('StoreCode')['Net_Sales'].mean() -
non_promo.groupby('StoreCode')['Net_Sales'].mean())
# Get top 5 and add cluster label
top_5 = uplift.nlargest(5).reset_index()
top_5['Cluster'] = cluster
results.append(top_5)
if not results:
return pd.DataFrame(columns=['StoreCode', 'Net_Sales', 'Cluster'])
return pd.concat(results).rename(columns={'Net_Sales': 'Uplift'})
print(top_stores_by_cluster(sales_df))
StoreCode Uplift Cluster 0 92 2.913 Slow 1 256 2.496 Slow 2 83 1.786 Slow 3 305 1.775 Slow 4 275 1.221 Slow 0 205 2.014 Medium 1 300 1.570 Medium 2 148 1.318 Medium 3 290 1.244 Medium 4 268 1.220 Medium 0 326 1.858 Fast 1 66 1.631 Fast 2 181 1.549 Fast 3 107 1.501 Fast 4 226 1.133 Fast
In [60]:
## Q.5 -) What is the biggest effect explaining sales change during promotion
In [61]:
sales_df.isnull().sum()
Out[61]:
Date 0 StoreCode 0 ProductCode 0 SalesQuantity 0 Net_Sales 0 Promotion_Period 0 Product_Cluster_Quantile 63 Product_Cluster_KMeans 63 Store_Cluster_Quantile 46 Store_Cluster_KMeans 46 dtype: int64
In [62]:
def clean_data(sales_df):
"""Handle missing values in cluster columns"""
# 1. First check which rows have complete cluster data
cluster_cols = [
'Product_Cluster_Quantile', 'Product_Cluster_KMeans',
'Store_Cluster_Quantile', 'Store_Cluster_KMeans'
]
# 2. Count missing values before cleaning
print("Missing values before cleaning:")
print(sales_df[cluster_cols].isnull().sum())
# 3. Drop rows with any missing cluster data
clean_df = sales_df.dropna(subset=cluster_cols, how='any')
# 4. Verify cleaning
print("\nMissing values after cleaning:")
print(clean_df[cluster_cols].isnull().sum())
print(f"\nOriginal rows: {len(sales_df)}")
print(f"Remaining rows: {len(clean_df)}")
print(f"Rows dropped: {len(sales_df) - len(clean_df)}")
return clean_df
clean_df = clean_data(sales_df)
Missing values before cleaning: Product_Cluster_Quantile 63 Product_Cluster_KMeans 63 Store_Cluster_Quantile 46 Store_Cluster_KMeans 46 dtype: int64 Missing values after cleaning: Product_Cluster_Quantile 0 Product_Cluster_KMeans 0 Store_Cluster_Quantile 0 Store_Cluster_KMeans 0 dtype: int64 Original rows: 1873618 Remaining rows: 1873509 Rows dropped: 109
In [63]:
def robust_promotion_analysis(sales_df):
"""Version that adapts to actual data"""
# Get real promotion periods from data
valid_periods = [p for p in sales_df['Promotion_Period'].unique()
if p != 'No_Promotion']
if not valid_periods:
print("⚠️ No promotion periods found (only 'No_Promotion' exists)")
return pd.DataFrame()
results = []
cluster_types = {
'Product_Quantile': 'Product_Cluster_Quantile',
'Store_Quantile': 'Store_Cluster_Quantile',
'Product_KMeans': 'Product_Cluster_KMeans',
'Store_KMeans': 'Store_Cluster_KMeans'
}
for cluster_name, cluster_col in cluster_types.items():
if cluster_col not in sales_df.columns:
print(f"⏩ Skipping {cluster_name} - column missing")
continue
# Get non-null clusters
valid_clusters = sales_df[cluster_col].dropna().unique()
for cluster in valid_clusters:
cluster_data = sales_df[sales_df[cluster_col] == cluster]
# Get baseline sales
baseline = cluster_data[cluster_data['Promotion_Period'] == 'No_Promotion']
if len(baseline) == 0:
print(f"⚠️ {cluster_name}/{cluster} - no baseline data")
continue
baseline_sales = baseline['Net_Sales'].mean()
# Compare each promotion period
for period in valid_periods:
promo_data = cluster_data[cluster_data['Promotion_Period'] == period]
if len(promo_data) == 0:
print(f"⚠️ {cluster_name}/{cluster} - no data for {period}")
continue
promo_sales = promo_data['Net_Sales'].mean()
uplift = (promo_sales - baseline_sales)/baseline_sales
results.append({
'Cluster_Type': cluster_name,
'Cluster': cluster,
'Promotion': period,
'Baseline': baseline_sales,
'Promo_Sales': promo_sales,
'Uplift': uplift,
'Obs_Count': len(promo_data)
})
return pd.DataFrame(results)
# Usage
results = robust_promotion_analysis(clean_df)
if not results.empty:
print("\nFinal Results:")
print(results)
else:
print("\n❌ No valid results produced - check diagnostics above")
Final Results: Cluster_Type Cluster Promotion Baseline Promo_Sales Uplift \ 0 Product_Quantile Slow Items Promo2 0.494 0.522 0.057 1 Product_Quantile Slow Items Promo4 0.494 0.600 0.216 2 Product_Quantile Medium Items Promo2 1.148 1.186 0.034 3 Product_Quantile Medium Items Promo4 1.148 1.342 0.170 4 Product_Quantile Fast Items Promo2 3.952 4.881 0.235 5 Product_Quantile Fast Items Promo4 3.952 4.881 0.235 6 Store_Quantile Medium Stores Promo2 2.172 2.527 0.164 7 Store_Quantile Medium Stores Promo4 2.172 2.691 0.239 8 Store_Quantile Slow Stores Promo2 2.389 2.876 0.204 9 Store_Quantile Slow Stores Promo4 2.389 2.915 0.220 10 Store_Quantile Fast Stores Promo2 2.114 2.461 0.164 11 Store_Quantile Fast Stores Promo4 2.114 2.520 0.192 12 Product_KMeans Slow Promo2 1.004 1.017 0.013 13 Product_KMeans Slow Promo4 1.004 1.163 0.158 14 Product_KMeans Medium Promo2 3.570 4.375 0.225 15 Product_KMeans Medium Promo4 3.570 4.424 0.239 16 Product_KMeans Fast Promo2 9.014 10.442 0.158 17 Product_KMeans Fast Promo4 9.014 11.165 0.239 18 Store_KMeans Slow Promo2 2.226 2.618 0.176 19 Store_KMeans Slow Promo4 2.226 2.730 0.227 20 Store_KMeans Medium Promo2 1.449 1.373 -0.053 21 Store_KMeans Medium Promo4 1.449 1.322 -0.088 22 Store_KMeans Fast Promo2 2.050 2.670 0.302 23 Store_KMeans Fast Promo4 2.050 1.960 -0.044 Obs_Count 0 3845 1 2791 2 48079 3 38033 4 33754 5 26998 6 43064 7 34171 8 24319 9 19558 10 18295 11 14093 12 45685 13 36077 14 39604 15 31435 16 389 17 310 18 85033 19 67236 20 421 21 388 22 224 23 198
In [64]:
# Sort results by absolute uplift (descending)
sorted_results = results.sort_values('Uplift', ascending=False)
# Display top effects
print(sorted_results[['Cluster_Type', 'Cluster', 'Promotion', 'Uplift']].head(10))
Cluster_Type Cluster Promotion Uplift 22 Store_KMeans Fast Promo2 0.302 7 Store_Quantile Medium Stores Promo4 0.239 15 Product_KMeans Medium Promo4 0.239 17 Product_KMeans Fast Promo4 0.239 4 Product_Quantile Fast Items Promo2 0.235 5 Product_Quantile Fast Items Promo4 0.235 19 Store_KMeans Slow Promo4 0.227 14 Product_KMeans Medium Promo2 0.225 9 Store_Quantile Slow Stores Promo4 0.220 1 Product_Quantile Slow Items Promo4 0.216
In [65]:
# Top Performers:
# Fast Stores (KMeans) - Promo2: 30.2% uplift
# Fast Items (Quantile) - Promo2/Promo4: 23.5-23.9% uplift
# Medium Products (KMeans) - Promo4: 23.9% uplift
# Slow Stores (Quantile) - Promo4: 22.7% uplift
# Fast Products (KMeans) - Promo4: 23.9% uplift
In [66]:
# Answering the Key Questions
# a. Biggest Sales Change Drivers:
# Store-Level: Fast-moving stores (KMeans) show the highest uplift (30.2%)
# Product-Level: Fast-moving items (Quantile) consistently perform well (~23.5% uplift)
# Promotion-Type: Promo4 generally outperforms Promo2
# b. Statistical Validity Check:
# The KMeans fast stores show exceptional performance in Promo2 but negative uplift in Promo4 (-4.4%) → Requires investigation
# Quantile-based clusters show more consistent patterns
# c. Recommendations:
# Prioritize Fast-Moving Items/Stores: They deliver the highest absolute sales lifts
# Replicate Promo4 Strategies: Consistently better performance across clusters
# Investigate KMeans Store Clusters: High variance suggests cluster definitions may need refinement
In [67]:
## visualize
plt.figure(figsize=(12,6))
sns.barplot(
data=sorted_results,
x='Uplift',
y='Cluster',
hue='Cluster_Type',
order=sorted_results['Cluster'].unique() # Maintain sorted order
)
plt.title('Promotion Uplift by Cluster Type (Sorted)')
plt.xlabel('Uplift Percentage')
plt.axvline(0, color='black')
plt.show()
In [68]:
# Reveal Consistent Patterns:
# Fast clusters generally outperform
# Promo4 > Promo2 in most cases
# Identifies Anomalies:
# KMeans fast stores' inconsistent performance
# Medium store clusters with negative uplift
# Additional Considerations
# Economic Significance:
# Fast items' 23.5% uplift on $3.95 baseline → $0.93 increase
# Slow items' 21.6% uplift on $0.49 baseline → $0.11 increase
# Absolute dollars matter for profitability
# Promotion Costs:
# Without cost data, we can't calculate ROI
# Recommendation: "Combine fast-moving (high absolute lift) and slow-moving (high % lift) items"
In [69]:
## Q.6 -) Is there any significant difference between promotion impacts of the Fast versus Slow items?
In [70]:
def compare_item_clusters(sales_df, cluster_type='Product_Cluster_Quantile'):
"""Statistical comparison of promotion impact by item cluster"""
# Extract data
fast = sales_df[(sales_df[cluster_type].str.contains('Fast')) &
(sales_df['Promotion_Period'] != 'No_Promotion')]
slow = sales_df[(sales_df[cluster_type].str.contains('Slow')) &
(sales_df['Promotion_Period'] != 'No_Promotion')]
# Calculate statistics
t_stat, p_val = stats.ttest_ind(fast['Net_Sales'], slow['Net_Sales'], equal_var=False)
# Format results
return {
'Comparison': f"Fast vs Slow {cluster_type.replace('_', ' ')}",
'Statistical Significance': {
't-statistic': float(round(t_stat, 2)),
'p-value': f"{p_val:.2e}" if p_val < 0.001 else round(p_val, 4)
},
'Average Sales During Promotions': {
'Fast Items': round(float(fast['Net_Sales'].mean()), 2),
'Slow Items': round(float(slow['Net_Sales'].mean()), 2),
'Absolute Difference': round(float(fast['Net_Sales'].mean() - slow['Net_Sales'].mean()), 2),
'Relative Difference': f"{round((fast['Net_Sales'].mean() - slow['Net_Sales'].mean()) / slow['Net_Sales'].mean() * 100, 1)}%"
},
'Sample Sizes': {
'Fast Items': len(fast),
'Slow Items': len(slow)
}
}
print(compare_item_clusters(clean_df))
{'Comparison': 'Fast vs Slow Product Cluster Quantile', 'Statistical Significance': {'t-statistic': 109.7, 'p-value': '0.00e+00'}, 'Average Sales During Promotions': {'Fast Items': 4.88, 'Slow Items': 0.55, 'Absolute Difference': 4.33, 'Relative Difference': '779.9%'}, 'Sample Sizes': {'Fast Items': 60752, 'Slow Items': 6636}}
In [71]:
def print_comparison(results):
"""Pretty print the comparison results"""
print(f"\n=== {results['Comparison']} ===")
print("\nStatistical Significance:")
print(f" t-statistic: {results['Statistical Significance']['t-statistic']}")
print(f" p-value: {results['Statistical Significance']['p-value']}")
print("\nAverage Sales During Promotions:")
print(f" Fast Items: {results['Average Sales During Promotions']['Fast Items']}")
print(f" Slow Items: {results['Average Sales During Promotions']['Slow Items']}")
print(f" Absolute Difference: {results['Average Sales During Promotions']['Absolute Difference']}")
print(f" Relative Difference: {results['Average Sales During Promotions']['Relative Difference']}")
print("\nSample Sizes:")
print(f" Fast Items: {results['Sample Sizes']['Fast Items']}")
print(f" Slow Items: {results['Sample Sizes']['Slow Items']}")
# Usage:
results = compare_item_clusters(clean_df)
print_comparison(results)
=== Fast vs Slow Product Cluster Quantile === Statistical Significance: t-statistic: 109.7 p-value: 0.00e+00 Average Sales During Promotions: Fast Items: 4.88 Slow Items: 0.55 Absolute Difference: 4.33 Relative Difference: 779.9% Sample Sizes: Fast Items: 60752 Slow Items: 6636
In [72]:
## Q.7 -) Is there any significant difference between promotion impacts of the Fast versus Slow stores ?
In [73]:
def compare_store_clusters(sales_df, cluster_type='Store_Cluster_Quantile'):
"""Statistical comparison of promotion impact by store cluster with pretty output"""
# Data extraction
fast = sales_df[(sales_df[cluster_type].str.contains('Fast')) &
(sales_df['Promotion_Period'] != 'No_Promotion')]
slow = sales_df[(sales_df[cluster_type].str.contains('Slow')) &
(sales_df['Promotion_Period'] != 'No_Promotion')]
# Statistical analysis
t_stat, p_val = stats.ttest_ind(fast['Net_Sales'], slow['Net_Sales'], equal_var=False)
# Calculate effect sizes
fast_mean = fast['Net_Sales'].mean()
slow_mean = slow['Net_Sales'].mean()
abs_diff = fast_mean - slow_mean
rel_diff = (abs_diff / slow_mean) * 100
# Create pretty output
result = {
"analysis": f"Promotion Impact: Fast vs Slow {cluster_type.replace('_', ' ')}",
"statistical_significance": {
"t_statistic": round(float(t_stat), 2),
"p_value": f"{p_val:.3e}" if p_val < 0.001 else round(float(p_val), 4),
"interpretation": "Highly significant" if p_val < 0.001 else
"Significant" if p_val < 0.05 else "Not significant"
},
"performance_comparison": {
"fast_stores": {
"mean_sales": round(float(fast_mean), 2),
"sample_size": len(fast)
},
"slow_stores": {
"mean_sales": round(float(slow_mean), 2),
"sample_size": len(slow)
},
"absolute_difference": round(float(abs_diff), 2),
"relative_difference": f"{round(rel_diff, 1)}%"
},
"visualization": generate_comparison_plot(fast, slow, cluster_type)
}
return result
def generate_comparison_plot(fast_df, slow_df, cluster_type):
"""Generate visualization for cluster comparison"""
plt.figure(figsize=(10, 6))
# Prepare data for plotting
plot_data = pd.DataFrame({
'Cluster': ['Fast'] * len(fast_df) + ['Slow'] * len(slow_df),
'Net_Sales': np.concatenate([fast_df['Net_Sales'], slow_df['Net_Sales']])
})
# Create plot
sns.boxplot(x='Cluster', y='Net_Sales', data=plot_data)
plt.title(f"Promotion Sales Comparison\n{cluster_type.replace('_', ' ')}")
plt.ylabel("Net Sales During Promotions")
plt.xlabel("Store Cluster")
# Save plot to display in output
plt.savefig("store_cluster_comparison.png", bbox_inches='tight', dpi=100)
plt.close()
return "store_cluster_comparison.png"
def print_pretty_results(results):
"""Print formatted results to console"""
print("\n" + "="*50)
print(f"📊 {results['analysis']}")
print("="*50)
print("\n🔍 Statistical Significance:")
print(f" • t-statistic: {results['statistical_significance']['t_statistic']}")
print(f" • p-value: {results['statistical_significance']['p_value']}")
print(f" • Interpretation: {results['statistical_significance']['interpretation']}")
print("\n💰 Performance Comparison:")
print(f" Fast Stores:")
print(f" • Mean Sales: {results['performance_comparison']['fast_stores']['mean_sales']}")
print(f" • Sample Size: {results['performance_comparison']['fast_stores']['sample_size']}")
print(f" Slow Stores:")
print(f" • Mean Sales: {results['performance_comparison']['slow_stores']['mean_sales']}")
print(f" • Sample Size: {results['performance_comparison']['slow_stores']['sample_size']}")
print(f"\n ➡ Absolute Difference: {results['performance_comparison']['absolute_difference']}")
print(f" ➡ Relative Difference: {results['performance_comparison']['relative_difference']}")
print("\n📈 Visualization saved to:", results['visualization'])
print("\n" + "="*50)
# Example usage:
results = compare_store_clusters(sales_df)
print_pretty_results(results)
print(compare_store_clusters(clean_df))
================================================== 📊 Promotion Impact: Fast vs Slow Store Cluster Quantile ================================================== 🔍 Statistical Significance: • t-statistic: -8.4 • p-value: 4.685e-17 • Interpretation: Highly significant 💰 Performance Comparison: Fast Stores: • Mean Sales: 2.49 • Sample Size: 32388 Slow Stores: • Mean Sales: 2.89 • Sample Size: 43877 ➡ Absolute Difference: -0.41 ➡ Relative Difference: -14.1% 📈 Visualization saved to: store_cluster_comparison.png ================================================== {'analysis': 'Promotion Impact: Fast vs Slow Store Cluster Quantile', 'statistical_significance': {'t_statistic': -8.4, 'p_value': '4.685e-17', 'interpretation': 'Highly significant'}, 'performance_comparison': {'fast_stores': {'mean_sales': 2.49, 'sample_size': 32388}, 'slow_stores': {'mean_sales': 2.89, 'sample_size': 43877}, 'absolute_difference': -0.41, 'relative_difference': '-14.1%'}, 'visualization': 'store_cluster_comparison.png'}
In [74]:
# Direction: Slow stores outperformed Fast stores during promotions (reverse of typical expectations)
# Slow stores: $2.89 mean sales
# Fast stores: $2.49 mean sales
# Absolute difference: -$0.40 per transaction
# Statistical Significance:
# Extremely low p-value (4.685 × 10⁻¹⁷) → Confidence >99.999%
# Large t-statistic (-8.4) → Difference is 8.4 standard errors from null
# Effect Size:
# 14.1% relative performance gap
# Consistent across large samples (32k+ Fast vs 44k+ Slow store promotions)
In [75]:
# This finding relates to our complete analysis through:
# Cluster Validation:
# Confirms our store clustering method captures real behavioral differences
# Suggests "Slow" stores aren't inherently low-performing during promotions
# Promotion Strategy:
# Complements product-level findings (where Fast items usually perform better)
# Reveals store-level dynamics may override product-level patterns
# Data Quality:
# Large sample sizes reinforce result reliability
# Consistent direction across both quantile and k-means clustering (in full analysis)
# Why This Matters Beyond the Direct Question
# Strategic Implications:
# Cannot assume Fast stores automatically benefit more from promotions
# May need different promotion designs for Fast vs Slow stores
# Measurement Insight:
# Store velocity classification predicts inverse promotion response
# Suggests need to measure promotion elasticity by store type
# Research Questions Raised:
# Is this pattern specific to certain promotion types?
# Does it hold across all product categories?
# Are slow stores' customers more promotion-sensitive?
# One-Sentence Executive Summary
# "Contrary to expectations, our slow stores generated 14% higher sales during promotions than fast stores (p < 0.0001),
# suggesting store velocity classifications predict inverse promotion responsiveness in our current environment."
In [76]:
def calculate_cluster_performance(df):
"""Corrected implementation matching your actual analysis"""
# 1. Verify promotion flags exist
if 'Promotion_Period' not in df.columns:
raise ValueError("Missing promotion period flags")
# 2. Filter valid promotion periods (excluding 'No_Promotion')
promo_data = df[df['Promotion_Period'] != 'No_Promotion']
if len(promo_data) == 0:
raise ValueError("No promotion period data found")
# 3. Calculate baseline (non-promo sales)
baseline = df[df['Promotion_Period'] == 'No_Promotion'].groupby(
['ProductCode', 'StoreCode']
)['Net_Sales'].mean().reset_index(name='Baseline')
# 4. Merge with promotion data
merged = promo_data.merge(
baseline,
on=['ProductCode', 'StoreCode'],
how='left'
).dropna(subset=['Baseline'])
# 5. Calculate uplift by cluster
results = []
for cluster in ['Fast Items', 'Medium Items', 'Slow Items','Fast Stores', 'Medium Stores', 'Slow Stores','Fast', 'Medium', 'Slow']:
for cluster_type in ['Product_Cluster_Quantile','Product_Cluster_KMeans','Store_Cluster_Quantile','Store_Cluster_KMeans']:
cluster_data = merged[merged[cluster_type] == cluster]
if len(cluster_data) == 0:
continue
uplift = (cluster_data['Net_Sales'].mean() - cluster_data['Baseline'].mean()) / cluster_data['Baseline'].mean()
results.append({
'Cluster_Type': cluster_type,
'Cluster': cluster,
'Uplift': uplift,
'Baseline': cluster_data['Baseline'].mean(),
'Promo_Sales': cluster_data['Net_Sales'].mean()
#'Sample_Size': len(cluster_data)
})
return pd.DataFrame(results)
# Usage:
performance_df = calculate_cluster_performance(clean_df)
print(performance_df)
Cluster_Type Cluster Uplift Baseline Promo_Sales 0 Product_Cluster_Quantile Fast Items 0.255 3.888 4.881 1 Product_Cluster_Quantile Medium Items 0.100 1.142 1.257 2 Product_Cluster_Quantile Slow Items 0.125 0.496 0.558 3 Store_Cluster_Quantile Fast Stores 0.182 2.108 2.491 4 Store_Cluster_Quantile Medium Stores 0.208 2.157 2.606 5 Store_Cluster_Quantile Slow Stores 0.228 2.361 2.898 6 Product_Cluster_KMeans Fast 0.184 9.113 10.791 7 Store_Cluster_KMeans Fast 0.084 2.160 2.342 8 Product_Cluster_KMeans Medium 0.251 3.519 4.401 9 Store_Cluster_KMeans Medium -0.051 1.441 1.367 10 Product_Cluster_KMeans Slow 0.083 1.001 1.084 11 Store_Cluster_KMeans Slow 0.210 2.209 2.673
In [77]:
# ======================
# BASELINE MODELS
# ======================
In [78]:
# Ensure your merged data is clean
df = clean_df.copy()
# 1. Handle negative/zero sales
df['Net_Sales'] = df['SalesQuantity'].clip(lower=0)
# 2. Convert dates (if not already done)
df['Date'] = pd.to_datetime(df['Date'])
df['DayOfWeek'] = df['Date'].dt.day_name()
# 3. Verify promotion flags
print("Promotion periods check:")
print(df['Promotion_Period'].value_counts())
Promotion periods check: Promotion_Period No_Promotion 1720009 Promo2 85678 Promo4 67822 Name: count, dtype: int64
In [79]:
def get_baselines(df):
"""Calculate day-adjusted baselines for non-promo periods"""
# Use only clean non-promotion data
non_promo = df[
(df['Promotion_Period'] == 'No_Promotion') &
(df['Net_Sales'] > 0)
].copy()
# Calculate baseline = average sales per product-store-day
baselines = non_promo.groupby(
['ProductCode', 'StoreCode', 'DayOfWeek']
)['Net_Sales'].agg(
['mean', 'count']
).reset_index()
# Filter for reliable baselines (minimum 3 observations)
reliable_baselines = baselines[baselines['count'] >= 3].copy()
reliable_baselines.rename(columns={'mean': 'Baseline'}, inplace=True)
return reliable_baselines[['ProductCode', 'StoreCode', 'DayOfWeek', 'Baseline']]
baselines = get_baselines(df)
print(f"Generated {len(baselines)} reliable baselines")
Generated 152694 reliable baselines
In [80]:
# get baseline() - Core Purpose
# To establish day-of-week adjusted baseline sales expectations for each product-store combination during non-promotion periods, which will be used to:
# Measure promotion uplift (actual sales vs expected sales)
# Evaluate promotion effectiveness
# Forecast future promotion impacts
# Key Features
# Data Filtering:
# Only uses non-promotion periods (Promotion_Period == 'No_Promotion')
# Excludes negative/zero sales (Net_Sales > 0)
# Granular Baseline Calculation:
# Computes averages at the product-store-dayofweek level
# Captures both:
# Product/store specific demand patterns
# Weekly seasonality (day-of-week effects)
# Data Quality Control:
# Requires minimum 3 observations per baseline
# Returns only statistically reliable estimates
# Why This Approach?
# Day-of-Week Adjustment:
# Accounts for natural weekly cycles (e.g., weekend spikes)
# More accurate than simple product-store averages
In [81]:
def calculate_promo_uplift(df, baselines):
"""Safe uplift calculation with NaN protection"""
# Merge with promotion data
promo_data = df[df['Promotion_Period'] != 'No_Promotion'].copy()
merged = promo_data.merge(
baselines,
on=['ProductCode', 'StoreCode', 'DayOfWeek'],
how='left'
)
# Handle cases without baselines
merged['Baseline'] = merged['Baseline'].fillna(0)
# Calculate uplift metrics
merged['Abs_Uplift'] = merged['Net_Sales'] - merged['Baseline']
merged['Rel_Uplift'] = np.where(
merged['Baseline'] > 0,
merged['Abs_Uplift'] / merged['Baseline'],
0 # When no baseline exists
)
return merged
uplift_results = calculate_promo_uplift(df, baselines)
In [82]:
# calculate_promo_uplift() - Concept
# This function calculates promotional uplift by comparing net sales during promotions against baseline sales,
# handling missing baselines by defaulting to zero. It returns absolute and relative uplift metrics while protecting against NaN values.
In [83]:
def analyze_clusters(uplift_df):
"""Aggregate results by your existing clusters"""
results = []
for cluster_type in ['Product_Cluster_Quantile', 'Store_Cluster_Quantile']:
cluster_results = uplift_df.groupby(cluster_type).agg({
'Rel_Uplift': ['mean', 'count'],
'Abs_Uplift': 'mean',
'Net_Sales': 'mean',
'Baseline': 'mean'
})
# Flatten multi-index columns
cluster_results.columns = ['_'.join(col).strip() for col in cluster_results.columns.values]
cluster_results = cluster_results.reset_index()
cluster_results['Cluster_Type'] = cluster_type
results.append(cluster_results)
return pd.concat(results)
cluster_analysis = analyze_clusters(uplift_results)
In [84]:
# analyze_clusters () - Concept
# This function aggregates promotion uplift results by product and store clusters, calculating mean uplift, sales, and baseline metrics for each cluster type.
# It combines the results into a single DataFrame for easy comparison across different cluster categories.
In [85]:
def analyze_promotion_performance(uplift_results):
"""Aggregate results by existing Promotion_Period column"""
# Filter only rows with promotions (exclude 'No_Promotion')
promo_data = uplift_results[uplift_results['Promotion_Period'].str.startswith('Promo')].copy()
# Get top 5 best-performing promotions overall
top_promotions = promo_data.sort_values('Rel_Uplift', ascending=False).head(5)
# Aggregate by Promotion_Period and Cluster
promo_performance = promo_data.groupby(
['Promotion_Period', 'Product_Cluster_Quantile']
).agg({
'Rel_Uplift': ['mean', 'count'],
'Abs_Uplift': 'mean',
'Net_Sales': 'mean',
'Baseline': 'mean'
}).reset_index()
# Flatten multi-index columns
promo_performance.columns = ['_'.join(col).strip() if col[1] else col[0]
for col in promo_performance.columns.values]
return top_promotions, promo_performance
top_promos, promo_stats = analyze_promotion_performance(uplift_results)
In [86]:
# analyze_promotion_performance() - Concept
# This function analyzes promotion performance by:
# 1-) Identifying top 5 promotions by relative uplift
# 2-) Aggregating results by promotion period and product cluster
# 3-) Returning both detailed cluster-level stats and top performers
# It transforms raw uplift data into actionable insights about which promotions and product clusters perform best.
In [87]:
def print_promotion_insights(top_promos, promo_stats):
print("=== TOP 5 PROMOTIONS ===")
print(top_promos[['Promotion_Period', 'ProductCode', 'StoreCode',
'Rel_Uplift', 'Abs_Uplift']].to_string(index=False))
print("\n=== PERFORMANCE BY PROMOTION & CLUSTER ===")
# Pivot for better readability
pivot_table = promo_stats.pivot(
index='Promotion_Period',
columns='Product_Cluster_Quantile',
values=['Rel_Uplift_mean', 'Abs_Uplift_mean']
)
print(pivot_table.to_string())
In [88]:
def plot_promotion_breakdown(promo_stats):
plt.figure(figsize=(14, 6))
# Plot 1: Uplift by promotion
plt.subplot(1, 2, 1)
sns.barplot(data=promo_stats,
x='Promotion_Period',
y='Rel_Uplift_mean',
hue='Product_Cluster_Quantile')
plt.title('Relative Uplift by Promotion')
plt.axhline(0, color='black', linestyle='--')
# Plot 2: Absolute lift
plt.subplot(1, 2, 2)
sns.barplot(data=promo_stats,
x='Promotion_Period',
y='Abs_Uplift_mean',
hue='Product_Cluster_Quantile')
plt.title('Absolute Sales Lift by Promotion')
plt.tight_layout()
plt.show()
In [89]:
top_promos, promo_stats = analyze_promotion_performance(uplift_results)
print_promotion_insights(top_promos, promo_stats)
plot_promotion_breakdown(promo_stats)
=== TOP 5 PROMOTIONS === Promotion_Period ProductCode StoreCode Rel_Uplift Abs_Uplift Promo4 222 24 50.429 58.833 Promo4 210 189 47.000 47.000 Promo2 213 7 39.000 58.500 Promo2 218 256 38.503 888.913 Promo2 222 73 35.000 46.667 === PERFORMANCE BY PROMOTION & CLUSTER === Rel_Uplift_mean Abs_Uplift_mean Product_Cluster_Quantile Slow Items Medium Items Fast Items Slow Items Medium Items Fast Items Promotion_Period Promo2 -0.029 -0.102 0.153 0.401 0.259 0.889 Promo4 -0.032 -0.061 0.136 0.469 0.364 0.826
In [90]:
## Validation Step on New Data
In [91]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
def safe_mape(actual, predicted):
"""Calculate MAPE with zero-division protection"""
with np.errstate(divide='ignore', invalid='ignore'):
errors = np.abs((actual - predicted) / actual)
errors = errors[~np.isinf(errors) & ~np.isnan(errors)]
return np.mean(errors) * 100
def evaluate_promotion5_performance(historical_sales, new_data_path, promos):
"""
Evaluate baseline model performance on Promotion 5 using new data
Returns metrics and diagnostic plots
"""
try:
# 1. Load and prepare new data
new_data = pd.read_csv(new_data_path, parse_dates=['Date'])
new_data['Net_Sales'] = np.where(new_data['SalesQuantity'] < 0, 0, new_data['SalesQuantity'])
# 2. Flag promotions in new data
new_data = flag_promotions(new_data, promos)
# 3. Get Promotion 5 dates
promo5 = promos[promos['Period'] == 'Promo5'].iloc[0]
mask = (new_data['Date'] >= promo5['StartDate']) & (new_data['Date'] <= promo5['EndDate'])
promo5_data = new_data[mask]
if len(promo5_data) == 0:
raise ValueError(f"No data found for Promo5 ({promo5['StartDate'].date()} to {promo5['EndDate'].date()})")
# 4. Create baseline predictions
non_promo_baseline = historical_sales[
(historical_sales['Promotion_Period'] == 'No_Promotion') &
(historical_sales['Net_Sales'] > 0)]
baseline = non_promo_baseline.groupby(['ProductCode', 'StoreCode'])['Net_Sales'].mean().reset_index()
baseline.columns = ['ProductCode', 'StoreCode', 'Predicted_Sales']
# 5. Merge predictions with actuals
evaluation = promo5_data.merge(baseline, on=['ProductCode', 'StoreCode'], how='left')
evaluation = evaluation.dropna(subset=['Predicted_Sales'])
if len(evaluation) == 0:
raise ValueError("No matching product-store pairs between historical and new data")
# Filter out problematic values
evaluation = evaluation[(evaluation['Net_Sales'] > 0) & (evaluation['Predicted_Sales'] > 0)]
# 6. Calculate evaluation metrics
actual = evaluation['Net_Sales']
predicted = evaluation['Predicted_Sales']
metrics = {
'MAE': mean_absolute_error(actual, predicted),
'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
'MAPE': safe_mape(actual, predicted),
'Correlation': np.corrcoef(actual, predicted)[0, 1],
'Avg_Actual': actual.mean(),
'Avg_Predicted': predicted.mean(),
'Uplift': actual.mean() - predicted.mean(),
'Uplift_Pct': ((actual.mean() - predicted.mean()) / predicted.mean()) * 100,
'Sample_Size': len(evaluation)
}
# 7. Diagnostic plots
plt.figure(figsize=(15, 5))
# Actual vs Predicted scatter plot
plt.subplot(1, 3, 1)
sns.regplot(x=predicted, y=actual, line_kws={'color': 'red'})
plt.plot([0, max(predicted)], [0, max(predicted)], '--', color='gray')
plt.title('Actual vs Predicted Sales')
plt.xlabel('Predicted Sales')
plt.ylabel('Actual Sales')
# Error distribution
plt.subplot(1, 3, 2)
errors = actual - predicted
sns.histplot(errors, kde=True)
plt.title('Prediction Error Distribution')
plt.xlabel('Error (Actual - Predicted)')
# Uplift by cluster (if clusters exist)
if 'Product_Cluster' in evaluation.columns:
plt.subplot(1, 3, 3)
cluster_uplift = evaluation.groupby('Product_Cluster').agg({
'Net_Sales': 'mean',
'Predicted_Sales': 'mean'
})
cluster_uplift['Uplift_Pct'] = (cluster_uplift['Net_Sales'] - cluster_uplift['Predicted_Sales']) / cluster_uplift['Predicted_Sales'] * 100
sns.barplot(data=cluster_uplift.reset_index(), x='Product_Cluster', y='Uplift_Pct')
plt.title('Uplift by Product Cluster')
plt.ylabel('Uplift Percentage')
plt.axhline(0, color='black', linestyle='--')
plt.tight_layout()
plt.show()
return metrics, evaluation
except Exception as e:
print(f"Evaluation failed: {str(e)}")
return None, str(e)
# ===== MAIN EXECUTION =====
if __name__ == "__main__":
print("=== PROMOTION 5 EVALUATION ===")
# 1. Load historical data (from part A)
historical_sales = df.copy()
# 2. Load promotion dates (correcting Promo5 if needed)
promos = pd.read_csv('PromotionDates.csv', parse_dates=['StartDate', 'EndDate'])
promos.loc[promos['Period'] == 'Promo5', 'StartDate'] = pd.to_datetime('2015-09-01')
promos.loc[promos['Period'] == 'Promo5', 'EndDate'] = pd.to_datetime('2015-09-06')
# 3. Evaluate performance on new data
print("\nEvaluating Promotion 5 performance...")
metrics, results = evaluate_promotion5_performance(
historical_sales,
'assignment4.1b.csv',
promos
)
if metrics:
print("\n=== EVALUATION METRICS ===")
for k, v in metrics.items():
print(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}")
print("\n=== DIAGNOSTIC FINDINGS ===")
print("1. Goodness of Fit Measures:")
print(f"- MAE of {metrics['MAE']:.2f} indicates average prediction error")
print(f"- MAPE of {metrics['MAPE']:.2f}% shows percentage error")
print(f"- Correlation of {metrics['Correlation']:.2f} suggests prediction quality")
print("\n2. Model Performance:")
print(f"- Predicted uplift: {metrics['Uplift_Pct']:.1f}% (Actual: {metrics['Avg_Actual']:.1f} vs Predicted: {metrics['Avg_Predicted']:.1f})")
print("\n3. Problem Points:")
if metrics['MAPE'] > 30:
print("- High MAPE suggests model struggles with absolute sales volume")
if metrics['Correlation'] < 0.5:
print("- Low correlation indicates poor prediction of relative performance")
if 'Product_Cluster' in results.columns:
cluster_errors = results.groupby('Product_Cluster').apply(
lambda x: mean_absolute_error(x['Net_Sales'], x['Predicted_Sales'])
)
worst_cluster = cluster_errors.idxmax()
print(f"- Worst performance in {worst_cluster} product cluster (MAE: {cluster_errors[worst_cluster]:.2f})")
print("\n4. Suggested Improvements:")
print("- Incorporate seasonality and day-of-week effects")
print("- Add promotional intensity/type as a factor")
print("- Consider store-item interaction effects")
print("- Use more sophisticated baseline (e.g., moving average)")
else:
print("\nEvaluation failed - check error messages above")
=== PROMOTION 5 EVALUATION === Evaluating Promotion 5 performance...
=== EVALUATION METRICS === MAE: 2.59 RMSE: 5.70 MAPE: 97.74 Correlation: 0.52 Avg_Actual: 3.67 Avg_Predicted: 3.21 Uplift: 0.47 Uplift_Pct: 14.53 Sample_Size: 30434 === DIAGNOSTIC FINDINGS === 1. Goodness of Fit Measures: - MAE of 2.59 indicates average prediction error - MAPE of 97.74% shows percentage error - Correlation of 0.52 suggests prediction quality 2. Model Performance: - Predicted uplift: 14.5% (Actual: 3.7 vs Predicted: 3.2) 3. Problem Points: - High MAPE suggests model struggles with absolute sales volume 4. Suggested Improvements: - Incorporate seasonality and day-of-week effects - Add promotional intensity/type as a factor - Consider store-item interaction effects - Use more sophisticated baseline (e.g., moving average)
In [92]:
# What the Baseline Is
# It's not a regression model forecasting future sales
# It's a naive benchmark: "What sales would we expect without promotions, based purely on historical averages?"
# How It Predicts Promo5 Performance
# The assumption: "During Promo5, sales should deviate from this non-promoted baseline if the promotion worked"
# The "prediction" is counterfactual: "This is what sales would have been without promotion"
# Uplift = Actual Promo5 Sales - Baseline (Expected Non-Promo Sales)
In [93]:
# Model Performance Summary:
# 14.5% Uplift: Promo5 drove higher sales than the non-promotion baseline (3.67 vs 3.21 units)
# High MAPE (97.7%): Indicates large percentage errors for low-volume items (common when baseline sales are near zero)
# Moderate Correlation (0.52): Baseline captures some but not all sales patterns
# Why the Baseline Works Despite Limitations:
# The 14.5% uplift is meaningful because:
# Derived from clean non-promotion data (no contamination)
# Controls for inherent product/store variability
# High MAPE is expected with:
# Intermittent demand (many zero/low sales days)
# No temporal adjustments in this version
In [94]:
# QA
# 1. Measure for Goodness of Fit
# For promotion impact analysis, use:
# MAE (Mean Absolute Error) → Best for understanding average prediction error in units.
# MAPE (Mean Absolute Percentage Error) → Useful for relative error, but unreliable near zero sales.
# Correlation (Pearson r) → Measures if baseline predicts sales trends (0.52 is moderate).
# Uplift Consistency → Check if uplift % aligns with business expectations.
# Best for this case: MAE + Uplift % (since MAPE is inflated by low-volume items).
# 2. How Good Is the Current Model?
# Strengths:
# Detects Uplift (14.5%) → Baseline successfully identifies promotional lift.
# Controls for Variability → Product/store-specific baselines reduce noise.
# Simple & Explainable → Easy to interpret for stakeholders.
# Weaknesses:
# High MAPE (97.7%) → Poor absolute % accuracy (due to low sales days).
# No Seasonality → Misses weekly/monthly patterns (e.g., weekends sell more).
# No Promo Intensity → Treats all promotions equally (discount % not factored).
# Verdict: Decent for directional insights, but not precise enough for forecasting.
In [95]:
## Product Category Added Model
In [96]:
# ======================
# 1. DATA PREPARATION
# ======================
def load_data():
"""Load all datasets with robust error handling and category merging"""
try:
# Load main datasets
sales = pd.read_csv('assignment4.1a.csv', parse_dates=['Date'])
promos = pd.read_csv('PromotionDates.csv', parse_dates=['StartDate', 'EndDate'])
categories = pd.read_csv('assignment4.1c.csv')
# Validation data
val_data = pd.read_csv('assignment4.1b.csv', parse_dates=['Date'])
# Clean sales data
sales['Net_Sales'] = np.where(sales['SalesQuantity'] < 0, 0, sales['SalesQuantity'])
val_data['Net_Sales'] = np.where(val_data['SalesQuantity'] < 0, 0, val_data['SalesQuantity'])
# Fix invalid promotion dates
promos.loc[promos['Period'] == 'Promo1', 'StartDate'] = pd.to_datetime('2015-02-10')
promos.loc[promos['Period'] == 'Promo1', 'EndDate'] = pd.to_datetime('2015-02-17')
promos.loc[promos['Period'] == 'Promo5', 'StartDate'] = pd.to_datetime('2015-09-01')
promos.loc[promos['Period'] == 'Promo5', 'EndDate'] = pd.to_datetime('2015-09-06')
# Merge categories with proper validation
if not categories.empty:
sales = sales.merge(categories, on='ProductCode', how='left')
val_data = val_data.merge(categories, on='ProductCode', how='left')
print("Successfully merged product categories")
else:
print("Warning: No category data loaded")
sales['ProductGroup1'] = 'Unknown'
sales['ProductGroup2'] = -1
val_data['ProductGroup1'] = 'Unknown'
val_data['ProductGroup2'] = -1
return sales, promos, val_data
except Exception as e:
print(f"Error loading data: {str(e)}")
return None, None, None
def flag_promotions(df, promo_df):
"""Flag promotion periods in sales data"""
df['Promotion_Period'] = 'No_Promotion'
for _, promo in promo_df.iterrows():
# Validate date range
if promo['StartDate'] > promo['EndDate']:
print(f"Warning: Invalid date range for {promo['Period']}")
continue
mask = (df['Date'] >= promo['StartDate']) & (df['Date'] <= promo['EndDate'])
df.loc[mask, 'Promotion_Period'] = promo['Period']
return df
# ======================
# 2. CLUSTERING
# ======================
def create_clusters(sales):
"""Enhanced clustering with product categories and stability metrics"""
# Weekly sales during non-promotion periods
non_promo = sales[sales['Promotion_Period'] == 'No_Promotion']
# Handle missing categories
if 'ProductGroup1' not in non_promo.columns:
non_promo['ProductGroup1'] = 'Unknown'
if 'ProductGroup2' not in non_promo.columns:
non_promo['ProductGroup2'] = -1
weekly_sales = non_promo.groupby(
['ProductGroup1', 'ProductGroup2', 'ProductCode', 'StoreCode',
pd.Grouper(key='Date', freq='W-MON')]
)['Net_Sales'].sum().reset_index()
# Calculate stability metrics with error handling
try:
stability = weekly_sales.groupby(
['ProductGroup1', 'ProductGroup2', 'ProductCode', 'StoreCode']
).agg(
Avg_Sales=('Net_Sales', 'mean'),
Sales_Volatility=('Net_Sales', 'std')
).reset_index()
# Clean data - remove rows with NaN or infinite values
stability_clean = stability.replace([np.inf, -np.inf], np.nan).dropna()
if len(stability_clean) == 0:
raise ValueError("No valid data for clustering after cleaning")
# Scale features for clustering
scaler = StandardScaler()
features = stability_clean[['Avg_Sales', 'Sales_Volatility']]
scaled_features = scaler.fit_transform(features)
# K-means clustering with silhouette score validation
kmeans = KMeans(n_clusters=3, random_state=42)
stability_clean['Velocity_Cluster'] = kmeans.fit_predict(scaled_features)
# Calculate silhouette score
silhouette = silhouette_score(scaled_features, stability_clean['Velocity_Cluster'])
print(f"Clustering Silhouette Score: {silhouette:.3f}")
# Label clusters based on centroids (in original scale)
centroids = pd.DataFrame(
scaler.inverse_transform(kmeans.cluster_centers_),
columns=['Avg_Sales', 'Sales_Volatility']
).sort_values('Avg_Sales')
centroids['Cluster_Label'] = ['Slow', 'Medium', 'Fast']
cluster_map = dict(enumerate(centroids['Cluster_Label']))
stability_clean['Velocity_Cluster'] = stability_clean['Velocity_Cluster'].map(cluster_map)
# Merge back to sales data with INNER JOIN to exclude unclustered items
sales = sales.merge(
stability_clean[['ProductCode', 'StoreCode', 'Velocity_Cluster']],
on=['ProductCode', 'StoreCode'],
how='inner'
)
return sales
except Exception as e:
print(f"Clustering failed: {str(e)}")
sales['Velocity_Cluster'] = 'Unknown'
return sales
# ======================
# 3. ANALYSIS
# ======================
def analyze_promotion_impact(sales):
"""Robust promotion impact analysis with category hierarchy"""
results = []
# Handle missing category columns
if 'ProductGroup1' not in sales.columns:
sales['ProductGroup1'] = 'Unknown'
if 'ProductGroup2' not in sales.columns:
sales['ProductGroup2'] = -1
# Group by category hierarchy and velocity cluster
group_cols = ['ProductGroup1', 'ProductGroup2', 'Velocity_Cluster']
for (category, subcategory, cluster), group in sales.groupby(group_cols):
# Calculate baseline (non-promotion) sales with minimum observation check
baseline_group = group[group['Promotion_Period'] == 'No_Promotion']
if len(baseline_group) < 5: # Minimum observations threshold
continue
baseline = baseline_group['Net_Sales'].mean()
# Calculate promotion uplift for each promotion period
for promo in group['Promotion_Period'].unique():
if promo == 'No_Promotion':
continue
promo_group = group[group['Promotion_Period'] == promo]
if len(promo_group) < 3: # Minimum observations threshold
continue
promo_sales = promo_group['Net_Sales'].mean()
uplift = (promo_sales - baseline) / baseline if baseline > 0 else 0
# Statistical significance test
_, p_value = stats.ttest_ind(
baseline_group['Net_Sales'],
promo_group['Net_Sales'],
equal_var=False
)
results.append({
'ProductGroup1': category, # Changed from 'Category' to 'ProductGroup1'
'ProductGroup2': subcategory,
'Velocity_Cluster': cluster,
'Promotion': promo,
'Baseline_Sales': baseline,
'Promo_Sales': promo_sales,
'Uplift': uplift,
'P_Value': p_value,
'Obs_Count': len(promo_group),
'Sig_Level': 'Significant' if p_value < 0.05 else 'Not Significant'
})
return pd.DataFrame(results)
# ======================
# 4. VALIDATION
# ======================
def validate_model(val_data, promos, sales_with_clusters):
"""Comprehensive validation with Promo5 and category analysis
Returns:
tuple: (metrics_dict, category_results_df)
- metrics_dict: Dictionary of validation metrics (MAE, RMSE, etc.)
- category_results_df: DataFrame with uplift by ProductGroup1
"""
try:
# 1. Flag promotions in validation data
val_data = flag_promotions(val_data, promos)
# 2. Get Promo5 data (fixed to Sep 1-6, 2015)
promo5 = promos[promos['Period'] == 'Promo5'].iloc[0]
mask = (val_data['Date'] >= promo5['StartDate']) & (val_data['Date'] <= promo5['EndDate'])
promo5_data = val_data[mask]
if len(promo5_data) == 0:
raise ValueError(f"No Promo5 data found ({promo5['StartDate'].date()} to {promo5['EndDate'].date()})")
# 3. Merge with clusters from training data (INNER JOIN to exclude unknowns)
val_merged = promo5_data.merge(
sales_with_clusters[['ProductCode', 'StoreCode', 'Velocity_Cluster']].drop_duplicates(),
on=['ProductCode', 'StoreCode'],
how='inner' # Changed from 'left' to exclude unknowns
)
# 4. Ensure categories exist (ProductGroup1)
if 'ProductGroup1' not in val_merged.columns:
val_merged['ProductGroup1'] = 'Unknown'
# 5. Calculate baseline sales (non-promo average per category-cluster)
baseline = sales_with_clusters[
(sales_with_clusters['Promotion_Period'] == 'No_Promotion') &
(sales_with_clusters['Velocity_Cluster'] != 'Unknown')
].groupby(['ProductGroup1', 'Velocity_Cluster'])['Net_Sales'].mean().reset_index()
baseline.columns = ['ProductGroup1', 'Velocity_Cluster', 'Baseline_Sales']
# 6. Merge baselines and calculate uplift (INNER JOIN to ensure complete cases)
val_results = val_merged.merge(
baseline,
on=['ProductGroup1', 'Velocity_Cluster'],
how='inner'
)
if len(val_results) == 0:
raise ValueError("No valid product-store pairs with complete data")
# Calculate uplift safely (handle division by zero)
with np.errstate(divide='ignore', invalid='ignore'):
val_results['Uplift'] = np.where(
val_results['Baseline_Sales'] > 0,
(val_results['Net_Sales'] - val_results['Baseline_Sales']) / val_results['Baseline_Sales'],
0
)
# 7. Calculate validation metrics
metrics = {
'MAE': mean_absolute_error(val_results['Net_Sales'], val_results['Baseline_Sales']),
'RMSE': np.sqrt(mean_squared_error(val_results['Net_Sales'], val_results['Baseline_Sales'])),
'Correlation': np.corrcoef(val_results['Net_Sales'], val_results['Baseline_Sales'])[0, 1],
'Avg_Actual': val_results['Net_Sales'].mean(),
'Avg_Predicted': val_results['Baseline_Sales'].mean(),
'Avg_Uplift': val_results['Uplift'].mean(),
'Sample_Size': len(val_results)
}
# 8. Category-level results (ensure ProductGroup1 exists)
if 'ProductGroup1' in val_results.columns:
category_results = val_results.groupby('ProductGroup1').agg({
'Uplift': 'mean',
'Net_Sales': 'mean',
'Baseline_Sales': 'mean',
'Velocity_Cluster': lambda x: x.mode()[0] if len(x.mode()) > 0 else None
}).reset_index()
else:
category_results = pd.DataFrame(columns=['ProductGroup1', 'Uplift', 'Net_Sales', 'Baseline_Sales', 'Velocity_Cluster'])
return metrics, category_results
except Exception as e:
print(f"Validation error: {str(e)}")
# Return empty but properly structured results
empty_metrics = {
'MAE': None,
'RMSE': None,
'Correlation': None,
'Avg_Actual': None,
'Avg_Predicted': None,
'Avg_Uplift': None,
'Sample_Size': 0
}
empty_df = pd.DataFrame(columns=['ProductGroup1', 'Uplift', 'Net_Sales', 'Baseline_Sales', 'Velocity_Cluster'])
return empty_metrics, empty_df# ======================
# 5. REPORTING
# ======================
def generate_recommendations(results):
"""Generate data-driven category-specific strategies"""
recommendations = []
if results.empty:
return pd.DataFrame({'Message': ['No valid results for recommendations']})
# Ensure we're using ProductGroup1 instead of Category
group_col = 'ProductGroup1' if 'ProductGroup1' in results.columns else 'Category'
top_categories = results[results['Sig_Level'] == 'Significant'].groupby(group_col).agg(
Avg_Uplift=('Uplift', 'mean'),
Best_Cluster=('Velocity_Cluster', lambda x: x.value_counts().index[0]),
Sample_Size=('Obs_Count', 'sum')
).sort_values('Avg_Uplift', ascending=False)
for category, row in top_categories.iterrows():
rec = {
'Category': category,
'Avg_Uplift': f"{row['Avg_Uplift']:.1%}",
'Best_Performing_Cluster': row['Best_Cluster'],
'Sample_Size': row['Sample_Size'],
'Recommended_Strategy': ''
}
if row['Avg_Uplift'] > 0.2:
rec['Recommended_Strategy'] = (
f"Focus promotions on {row['Best_Cluster']} velocity items in this category. "
f"Expected uplift: {row['Avg_Uplift']:.1%}"
)
elif row['Avg_Uplift'] > 0.1:
rec['Recommended_Strategy'] = (
f"Moderate promotions for {row['Best_Cluster']} velocity items. "
f"Expected uplift: {row['Avg_Uplift']:.1%}"
)
else:
rec['Recommended_Strategy'] = (
"Limited promotion potential - consider bundling with higher-performing categories "
"or reassessing promotion mechanics"
)
recommendations.append(rec)
return pd.DataFrame(recommendations)
# ======================
# MAIN EXECUTION
# ======================
if __name__ == "__main__":
print("=== CATEGORY-AWARE PROMOTION IMPACT ANALYSIS ===")
# 1. Load and prepare data
print("\n[1/5] Loading and preparing data...")
sales, promos, val_data = load_data()
if sales is None:
exit()
# 2. Flag promotions in historical data
print("\n[2/5] Flagging promotion periods...")
sales = flag_promotions(sales, promos)
# 3. Create velocity clusters with categories
print("\n[3/5] Creating velocity clusters...")
sales = create_clusters(sales)
print("\nCluster distribution:")
print(sales['Velocity_Cluster'].value_counts())
# 4. Analyze promotion impact
print("\n[4/5] Analyzing promotion impact...")
uplift_results = analyze_promotion_impact(sales)
# 5. Validate on new data
print("\n[5/5] Validating on new data (Promo5)...")
val_metrics, val_category_results = validate_model(val_data, promos, sales)
# Generate recommendations
recommendations = generate_recommendations(
pd.concat([uplift_results, val_category_results]) if not val_category_results.empty else uplift_results
)
# Output results
print("\n=== TRAINING RESULTS ===")
if not uplift_results.empty:
print("\nTop Performing Categories (Historical):")
# Changed from 'Category' to 'ProductGroup1'
print(uplift_results.groupby('ProductGroup1')['Uplift'].mean()
.sort_values(ascending=False).head(5))
print("\nPromotion Effectiveness by Cluster:")
print(uplift_results.groupby('Velocity_Cluster')['Uplift'].mean()
.sort_values(ascending=False))
else:
print("No valid training results available")
print("\n=== VALIDATION RESULTS (PROMO5) ===")
if val_metrics:
print("\nValidation Metrics:")
for k, v in val_metrics.items():
print(f"{k}: {v:.3f}" if isinstance(v, float) else f"{k}: {v}")
if not val_category_results.empty:
print("\nCategory-Level Performance (Validation):")
print(val_category_results.sort_values('Uplift', ascending=False).to_string(index=False))
else:
print("No valid validation results available")
print("\n=== CATEGORY-SPECIFIC RECOMMENDATIONS ===")
print(recommendations.to_string(index=False))
# Visualizations
if not uplift_results.empty:
plt.figure(figsize=(14, 6))
# Uplift by Category and Cluster
plt.subplot(1, 2, 1)
# Changed from 'Category' to 'ProductGroup1'
sns.boxplot(
data=uplift_results[uplift_results['Sig_Level'] == 'Significant'],
x='ProductGroup1',
y='Uplift',
hue='Velocity_Cluster'
)
plt.title('Significant Uplift by Category and Cluster')
plt.xticks(rotation=45)
plt.axhline(0, color='black', linestyle='--')
# Validation Performance
if val_metrics:
plt.subplot(1, 2, 2)
metrics_df = pd.DataFrame.from_dict(val_metrics, orient='index', columns=['Value'])
metrics_df = metrics_df[metrics_df.index.isin(['MAE', 'RMSE', 'Correlation', 'Uplift_Pct'])]
sns.barplot(data=metrics_df.reset_index(), x='index', y='Value')
plt.title('Validation Metrics for Promo5')
plt.xlabel('Metric')
plt.ylabel('Value')
plt.tight_layout()
plt.show()
else:
print("\nCannot generate visualizations - no valid results available")
=== CATEGORY-AWARE PROMOTION IMPACT ANALYSIS === [1/5] Loading and preparing data... Successfully merged product categories [2/5] Flagging promotion periods... [3/5] Creating velocity clusters... Clustering Silhouette Score: 0.764 Cluster distribution: Velocity_Cluster Slow 1401832 Fast 393016 Medium 68736 Name: count, dtype: int64 [4/5] Analyzing promotion impact... [5/5] Validating on new data (Promo5)... === TRAINING RESULTS === Top Performing Categories (Historical): ProductGroup1 A 0.231 D 0.210 E 0.165 H 0.094 G 0.092 Name: Uplift, dtype: float64 Promotion Effectiveness by Cluster: Velocity_Cluster Slow 0.168 Fast 0.016 Medium 0.013 Name: Uplift, dtype: float64 === VALIDATION RESULTS (PROMO5) === Validation Metrics: MAE: 2.379 RMSE: 5.441 Correlation: 0.430 Avg_Actual: 2.772 Avg_Predicted: 2.185 Avg_Uplift: 0.305 Sample_Size: 40232 Category-Level Performance (Validation): ProductGroup1 Uplift Net_Sales Baseline_Sales Velocity_Cluster E 0.448 2.568 1.944 Slow J 0.443 1.320 1.048 Slow A 0.354 3.702 2.749 Slow G 0.323 3.573 3.049 Slow D 0.321 1.090 0.852 Slow H 0.244 0.869 0.699 Slow I 0.201 1.090 0.918 Slow B 0.147 0.811 0.710 Slow F 0.130 1.327 1.191 Slow C 0.012 2.689 2.639 Fast === CATEGORY-SPECIFIC RECOMMENDATIONS === Category Avg_Uplift Best_Performing_Cluster Sample_Size Recommended_Strategy E 38.2% Slow 5221.000 Focus promotions on Slow velocity items in this category. Expected uplift: 38.2% A 27.9% Slow 132645.000 Focus promotions on Slow velocity items in this category. Expected uplift: 27.9% H 23.4% Slow 14563.000 Focus promotions on Slow velocity items in this category. Expected uplift: 23.4% J 16.4% Slow 9954.000 Moderate promotions for Slow velocity items. Expected uplift: 16.4% D 14.9% Slow 9253.000 Moderate promotions for Slow velocity items. Expected uplift: 14.9% G 14.1% Slow 32732.000 Moderate promotions for Slow velocity items. Expected uplift: 14.1% C 9.8% Fast 3180.000 Limited promotion potential - consider bundling with higher-performing categories or reassessing promotion mechanics F 9.1% Slow 15064.000 Limited promotion potential - consider bundling with higher-performing categories or reassessing promotion mechanics I -5.1% Fast 15299.000 Limited promotion potential - consider bundling with higher-performing categories or reassessing promotion mechanics B -14.1% Slow 6165.000 Limited promotion potential - consider bundling with higher-performing categories or reassessing promotion mechanics
by Hincal Topcuoglu