Import Required Libraries¶

Import NumPy, SciPy, and Matplotlib for working with Poisson distributions.

In [34]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

Understanding Poisson Distribution¶

The Poisson distribution is a discrete probability distribution that expresses the probability of a given number of events occurring in a fixed interval of time or space if these events occur with a known constant mean rate and independently of the time since the last event. It has one parameter: λ (lambda), which is the average number of events per interval.
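For a count k = 0, 1, 2, ..., the probability mass function is P(X = k) = λᵏ · e^(−λ) / k!, and a defining property of the distribution is that both its mean and its variance equal λ.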

In [35]:
# Generate Poisson Random Variables
lambda_param = 3  # Average number of events
poisson_rvs = np.random.poisson(lambda_param, size=1000)
print("Sample Poisson random variables:", poisson_rvs[:10])
Sample Poisson random variables: [6 2 2 2 2 1 2 4 4 3]
In [36]:
# Plot Poisson Distribution
x = np.arange(0, 15)
pmf = stats.poisson.pmf(x, lambda_param)
plt.bar(x, pmf)
plt.xlabel('Number of Events')
plt.ylabel('Probability')
plt.title('Poisson Distribution PMF (λ=3)')
plt.show()
[Figure: bar chart of the Poisson PMF for λ = 3]
In [31]:
# Calculate Poisson Probabilities
k = 5  # Specific number of events
prob = stats.poisson.pmf(k, lambda_param)
print(f"Probability of exactly {k} events: {prob}")

# Cumulative probability
cum_prob = stats.poisson.cdf(k, lambda_param)
print(f"Cumulative probability up to {k} events: {cum_prob}")
Probability of exactly 5 events: 0.10081881344492458
Cumulative probability up to 5 events: 0.9160820579686965
In [37]:
import numpy as np
from scipy.stats import poisson
import matplotlib.pyplot as plt

# Let's define different lambda values
lambda_values = [2, 5, 10, 20]

# A range for the possible number of events (k)
k_values = np.arange(0, 40)

# Graph settings
plt.figure(figsize=(14, 8))

# Let's plot the distribution for each lambda value
for i, lam in enumerate(lambda_values):
    # Calculate the Probability Mass Function (PMF)
    probabilities = poisson.pmf(k_values, lam)

    ax = plt.subplot(2, 2, i + 1)
    ax.bar(k_values, probabilities, label=f'λ = {lam}')
    ax.set_title(f'Poisson Distribution (λ = {lam})')
    ax.set_xlabel('Number of Occurrences (k)')
    ax.set_ylabel('Probability')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()
[Figure: 2×2 grid of Poisson PMF bar charts for λ = 2, 5, 10, 20]

Poisson Regression: Making Sense of Count Data¶

Poisson Regression is a specialized regression analysis technique used when the dependent variable is count data. This type of data represents the number of times an event occurs within a specific time interval or space, and it consists of non-negative integers (0, 1, 2, 3,...).

Why Is It Different from Linear Regression?

The first question that might come to mind is, "Why not just use standard Linear Regression for this type of data?" Here are the fundamental reasons:

Distribution Assumption: Linear regression assumes that the errors (residuals) are normally distributed. However, count data often follows a Poisson distribution or similar skewed distributions. Using linear regression for discrete, Poisson-distributed data can lead to inefficient, inconsistent, and biased estimates.

Negative Predictions: A linear regression model can produce negative predictions, but the count of an event cannot be negative (e.g., −2 accidents). Poisson regression uses a logarithmic link function to ensure that predictions are always positive (see the sketch after this list).

Constant Variance: Linear regression assumes that the variance is constant (homoscedasticity). In a Poisson distribution, however, the variance is equal to the mean. This means that as the average number of events increases, the variance also increases, which contradicts the assumption of linear regression.
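As a quick illustration of the negative-prediction problem, here is a minimal sketch on synthetic data (the variable names and numbers are illustrative, not from any dataset above):

In [ ]:
# Sketch: a straight line fit to Poisson counts can predict impossible
# negative counts at the low end of the feature range.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
x = rng.uniform(0, 10, size=200).reshape(-1, 1)
y = rng.poisson(np.exp(-2.0 + 0.5 * x.ravel()))  # counts whose rate grows with x

lin = LinearRegression().fit(x, y)
print(lin.predict([[0.0]]))  # typically negative here -- an impossible count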


When to Use Poisson Regression?

Poisson regression is ideal for modeling scenarios such as:

Medicine/Biostatistics: The number of seizures a patient experiences in a month while on a specific treatment.

Marketing: Predicting the number of times a customer will purchase a particular product in a year.

Transportation: Modeling the number of accidents at a specific intersection based on factors like traffic volume and weather conditions.

Finance: Estimating the number of fraudulent cases a bank encounters on a monthly basis.

Agriculture: Analyzing the factors that affect the number of pests on a plant.


Key Assumptions:

For a Poisson regression model to yield accurate results, several key assumptions must be met:

Distribution of the Dependent Variable: The dependent variable (the count data we are trying to predict) must follow a Poisson distribution.

Equality of Mean and Variance: The conditional mean of the data must be equal to its conditional variance. This is a fundamental property of the Poisson distribution.

Independence: The observations must be independent of each other.

Log-Linearity: It is assumed that there is a linear relationship between the independent variables and the logarithm of the expected count of the dependent variable.


Interpreting the Model

In linear regression, the coefficients represent the additive effect on the dependent variable for a one-unit change in the independent variable. In Poisson regression, the coefficients represent the additive effect on the logarithm of the expected count for a one-unit change in the independent variable.

In practice, this means we interpret the coefficients by taking their exponential (e^β). This value represents the multiplicative factor by which the expected count changes when the independent variable increases by one unit.
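To make this concrete with a hypothetical coefficient (not one from a fitted model):

In [ ]:
# A hypothetical Poisson-regression coefficient of 0.07 for temperature means
# each additional degree multiplies the expected count by e^0.07.
import numpy as np

beta_temp = 0.07                  # hypothetical coefficient
rate_ratio = np.exp(beta_temp)    # multiplicative change per one-unit increase
print(f"Rate ratio: {rate_ratio:.3f}")  # ~1.073, i.e. about a 7.3% increase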


The Problem of Overdispersion

In real-world applications, the variance of count data is often larger than its mean. This situation is called overdispersion. Overdispersion violates the assumptions of the Poisson regression model and can lead to incorrect conclusions. In such cases, more flexible models that are extensions of the Poisson model, such as Negative Binomial Regression, are preferred.


In summary, when we identify that our dependent variable is count data and observe that its mean and variance are close to each other, using Poisson regression instead of standard linear regression allows our model to make more accurate predictions and is more appropriate for the nature of the data. This directly improves model performance.

So how do we find the proper Poisson function for a variable? And how can one demonstrate that "this variable really does follow a Poisson distribution"?

This is a critical step in the modeling process. You can't just assume a variable follows a certain distribution; you need to gather evidence to support that assumption.

Here is a step-by-step guide on how to determine the specific Poisson function for your variable and then test if that variable truly follows a Poisson distribution.


Step 1: Finding the "Proper Poisson Function" (Estimating λ)

A Poisson distribution is defined by a single parameter: lambda (λ), which represents the mean (and variance) of the distribution. Therefore, "finding the proper Poisson function" simply means estimating the value of λ from your data.

The best and most common way to estimate λ is to calculate the sample mean of your variable. The sample mean is the Maximum Likelihood Estimator (MLE) for the Poisson parameter.

Formula: λ_estimated = (Σ xᵢ) / n = sample mean of the data

So, if you have a variable representing the number of customer complaints per day, you would simply calculate the average number of complaints per day from your historical data. That average becomes your estimated λ.
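This follows directly from maximizing the Poisson log-likelihood: ℓ(λ) = Σ (xᵢ ln λ − λ − ln xᵢ!), so setting dℓ/dλ = (Σ xᵢ)/λ − n = 0 gives λ̂ = (Σ xᵢ)/n, the sample mean.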


Step 2: Proving That the Variable Follows a Poisson Distribution

Once you have your estimated λ, you can test the hypothesis: "Does my data follow a Poisson distribution with this λ?" This is not a single check but a combination of methods.

Here are the most common techniques, from simple heuristics to formal statistical tests:

  1. Compare the Mean and Variance

This is the quickest and most important heuristic check.

The core property of a Poisson distribution is that its mean is equal to its variance (μ = σ²).

Action: Calculate the sample mean and the sample variance of your data.

Interpretation:

If variance ≈ mean, you have strong evidence that the data might be Poisson distributed.

If variance > mean, this is a condition known as overdispersion. Your data is more spread out than a Poisson distribution would predict. In this case, a Negative Binomial distribution is often a better fit.

If variance < mean, this is underdispersion, which is less common but can happen.


  2. Visual Check: The Histogram

A powerful method is to plot the frequency distribution of your data and compare it to the theoretical Poisson distribution you've estimated.

Action:

Create a histogram or a bar chart showing the frequency of each count in your data (i.e., how many times you observed 0, 1, 2, 3, etc.).

Using your estimated λ, calculate the theoretical probabilities for each count using the Poisson Probability Mass Function (PMF).

Overlay the theoretical distribution on your data's histogram.

Interpretation: If the shape of your data's histogram closely matches the shape of the theoretical Poisson PMF, it's a very good sign.


  3. Formal Test: The Chi-Squared Goodness-of-Fit Test

This is a formal statistical test that provides a quantitative measure of how well your data fits the Poisson distribution.

Concept: The test compares the observed frequencies of each count in your data with the expected frequencies that a Poisson distribution (with your estimated λ) would predict.

Hypotheses:

Null Hypothesis (H₀): The data follows a Poisson distribution.

Alternative Hypothesis (H₁): The data does not follow a Poisson distribution.

Interpretation (using the p-value):

If p-value > 0.05 (or your chosen significance level, α), you do not reject the null hypothesis. This means there is no significant evidence to say your data is not from a Poisson distribution. This supports your assumption.

If p-value < 0.05, you reject the null hypothesis. This means there is a statistically significant difference between your data and the theoretical Poisson distribution.

In [39]:
# Let's create a dataset that IS truly Poisson distributed for our test.
# We'll use lambda = 8 (e.g., average of 8 website visitors per minute)
np.random.seed(42)
true_lambda = 8
data = np.random.poisson(true_lambda, size=1000)


# --- Step 1: Estimate Lambda ---
estimated_lambda = np.mean(data)
print(f"Step 1: Estimate Parameters")
print(f"-----------------------------")
print(f"Estimated Lambda (Sample Mean): {estimated_lambda:.4f}\n")


# --- Step 2: Verify the Distribution ---

# 1. Compare Mean and Variance
sample_variance = np.var(data, ddof=1) # ddof=1 for sample variance
print(f"Check 1: Compare Mean and Variance")
print(f"----------------------------------")
print(f"Sample Mean:    {estimated_lambda:.4f}")
print(f"Sample Variance: {sample_variance:.4f}")
if abs(estimated_lambda - sample_variance) < (estimated_lambda * 0.1): # Check if they are within 10%
    print("Result: Mean and Variance are close. This supports the Poisson assumption.\n")
else:
    print("Result: Mean and Variance are not close. Check for over/underdispersion.\n")


# 2. Visual Check with Histogram
print(f"Check 2: Visual Inspection")
print(f"---------------------------")
print("Generating plot...")
# Get the observed frequencies
observed_counts = np.bincount(data)
observed_freq = observed_counts / len(data)

# Get the theoretical probabilities from the Poisson PMF
k_values = np.arange(0, max(data) + 1)
theoretical_probs = stats.poisson.pmf(k_values, mu=estimated_lambda)

# Plotting
plt.figure(figsize=(12, 6))
plt.hist(data, bins=k_values, density=True, alpha=0.6, label='Observed Data Frequencies', color='skyblue', edgecolor='black')
plt.plot(k_values, theoretical_probs, 'r-o', label=f'Theoretical Poisson PMF (λ={estimated_lambda:.2f})')
plt.title('Observed Data vs. Theoretical Poisson Distribution')
plt.xlabel('Number of Events (k)')
plt.ylabel('Probability / Density')
plt.legend()
plt.grid(True)
plt.show()


# 3. Chi-Squared Goodness-of-Fit Test
print(f"\nCheck 3: Chi-Squared Goodness-of-Fit Test")
print(f"-------------------------------------------")

# The Chi-Squared test works best with binned data.
# We'll use the counts we already calculated.
# The expected frequencies are the theoretical probabilities multiplied by the total number of samples.
expected_counts = theoretical_probs * len(data)

# Normalize expected counts to match observed sum (to handle floating point precision issues)
expected_counts = expected_counts * (observed_counts.sum() / expected_counts.sum())

# Perform the test, comparing the observed counts with the expected counts.
# Note: some bins may have very low expected counts, which weakens the test;
# a more robust version would pool bins with expected counts < 5. Also, since
# lambda was estimated from the data, passing ddof=1 to stats.chisquare would
# adjust the degrees of freedom accordingly. For this example, we proceed simply.
chi2_stat, p_value = stats.chisquare(f_obs=observed_counts[:len(expected_counts)], f_exp=expected_counts)

print(f"Chi-Squared Statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value > 0.05:
    print("Result: P-value is greater than 0.05. We do not reject the null hypothesis.")
    print("Conclusion: The data is consistent with a Poisson distribution.")
else:
    print("Result: P-value is less than 0.05. We reject the null hypothesis.")
    print("Conclusion: The data does not appear to follow a Poisson distribution.")
Step 1: Estimate Parameters
-----------------------------
Estimated Lambda (Sample Mean): 7.8410

Check 1: Compare Mean and Variance
----------------------------------
Sample Mean:    7.8410
Sample Variance: 6.7525
Result: Mean and Variance are not close. Check for over/underdispersion.

Check 2: Visual Inspection
---------------------------
Generating plot...
[Figure: observed data frequencies vs. theoretical Poisson PMF, λ ≈ 7.84]
Check 3: Chi-Squared Goodness-of-Fit Test
-------------------------------------------
Chi-Squared Statistic: 20.0929
P-value: 0.2161
Result: P-value is greater than 0.05. We do not reject the null hypothesis.
Conclusion: The data is consistent with a Poisson distribution.

But here's a problem: because of decimal precision, the sample mean and variance will almost never be exactly equal. So what tolerance between them is acceptable before we can accept the hypothesis that they are equal?

So, you are right to ask for an "acceptance rate" or a practical threshold. While there is no single, universally agreed-upon magic number, here are the standard methods and rules of thumb used by statisticians and data scientists to decide if the mean and variance are "close enough."


  1. The Dispersion Index (the Variance-to-Mean Ratio): The most common and useful metric is the Dispersion Index (also called the dispersion parameter):

Dispersion Index = Sample Variance / Sample Mean

For a perfect Poisson distribution, the Dispersion Index is exactly 1.0. An index > 1.0 indicates overdispersion (the variance is greater than the mean); an index < 1.0 indicates underdispersion (the variance is less than the mean).

Rules of Thumb for the Dispersion Index: There isn't a strict cutoff, but here is a widely used guideline:

0.8 to 1.2: Generally considered "close enough" to 1.0. If your index falls in this range, you can often proceed with a Poisson model with reasonable confidence.

1.2 to 1.5 (or 0.5 to 0.8): This is a gray area. It suggests some deviation from the Poisson assumption. You should be cautious and might want to compare the performance of a Poisson model with an alternative, like a Negative Binomial model.

Greater than 1.5 (or less than 0.5): This is strong evidence that the Poisson assumption is violated. A standard Poisson model will likely produce incorrect standard errors and p-values, leading to flawed conclusions. In the case of overdispersion (index > 1.5), you should strongly consider using a Negative Binomial Regression model instead.

Important Context: The larger your sample size, the more seriously you should take smaller deviations from 1.0. A dispersion index of 1.1 in a massive dataset might be statistically significant, whereas in a small dataset of 30 points, it's likely just noise.

  2. Statistical Tests for Dispersion: For a more formal approach, you can perform a statistical test where the null hypothesis is that the mean and variance are equal. A common method is the likelihood-ratio test, which compares the Poisson model (which fixes dispersion at 1) against a more flexible model like the Negative Binomial (which estimates dispersion as a parameter). If the test is statistically significant (e.g., p-value < 0.05), the extra dispersion parameter in the Negative Binomial model is needed, meaning your data is not truly Poisson.
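Here is a minimal sketch of that likelihood-ratio test on synthetic, deliberately overdispersed data (the setup and names are illustrative; smf.negativebinomial estimates the dispersion parameter alpha by maximum likelihood):

In [ ]:
# Likelihood-ratio test: Poisson (dispersion fixed at 1) vs. Negative Binomial
# (dispersion parameter alpha estimated from the data).
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats

rng = np.random.default_rng(0)
df_lr = pd.DataFrame({'x': rng.normal(size=500)})
mu = np.exp(0.5 + 0.3 * df_lr['x'])
lam = rng.gamma(shape=2.0, scale=mu / 2.0)  # gamma-mixed rates -> overdispersion
df_lr['y'] = rng.poisson(lam)

pois_fit = smf.poisson('y ~ x', data=df_lr).fit(disp=0)
nb_fit = smf.negativebinomial('y ~ x', data=df_lr).fit(disp=0)

# Twice the log-likelihood gap; large values favor the NB's extra parameter.
lr_stat = 2 * (nb_fit.llf - pois_fit.llf)
# alpha is tested on the boundary of its parameter space, so the usual
# chi-squared(1) p-value is conservative; halving it is a common adjustment.
p_value = 0.5 * stats.chi2.sf(lr_stat, df=1)
print(f"LR statistic: {lr_stat:.3f}, p-value: {p_value:.4g}")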
In [ ]:
def analyze_dispersion(data, name):
    """Calculates and interprets the dispersion of a dataset."""
    mean = np.mean(data)
    variance = np.var(data, ddof=1) # Use ddof=1 for the sample variance
    dispersion_index = variance / mean

    print(f"--- Analysis for: {name} ---")
    print(f"Sample Mean:    {mean:.3f}")
    print(f"Sample Variance: {variance:.3f}")
    print(f"Dispersion Index (Var/Mean): {dispersion_index:.3f}")

    if 0.8 <= dispersion_index <= 1.2:
        print("Conclusion: Looks consistent with a Poisson distribution.\n")
    elif dispersion_index > 1.2:
        print("Conclusion: Evidence of OVERDISPERSION. A Poisson model may not be appropriate.\n")
    else: # dispersion_index < 0.8
        print("Conclusion: Evidence of UNDERDISPERSION. A Poisson model may not be appropriate.\n")

# --- Datasets ---
np.random.seed(42)

# 1. A dataset that should be Poisson
# Lambda = 5. We expect mean and variance to be around 5.
poisson_data = np.random.poisson(lam=5, size=1000)

# 2. An OVERDISPERSED dataset
# We use the Negative Binomial distribution, which is like a Poisson but with more variance.
# It has parameters n (number of successes) and p (probability of success).
# Mean = n*(1-p)/p, Variance = n*(1-p)/p^2. Variance is always > Mean.
overdispersed_data = np.random.negative_binomial(n=5, p=0.5, size=1000)

# 3. An UNDERDISPERSED dataset
# A Binomial distribution can sometimes be underdispersed.
# Mean = n*p, Variance = n*p*(1-p). Variance is always < Mean.
underdispersed_data = np.random.binomial(n=10, p=0.5, size=1000)


# --- Run the Analysis ---
analyze_dispersion(poisson_data, "Poisson Data")
analyze_dispersion(overdispersed_data, "Overdispersed Data")
analyze_dispersion(underdispersed_data, "Underdispersed Data")
--- Analysis for: Poisson Data ---
Sample Mean:    4.985
Sample Variance: 5.002
Dispersion Index (Var/Mean): 1.003
Conclusion: Looks consistent with a Poisson distribution.

--- Analysis for: Overdispersed Data ---
Sample Mean:    4.802
Sample Variance: 9.342
Dispersion Index (Var/Mean): 1.945
Conclusion: Evidence of OVERDISPERSION. A Poisson model may not be appropriate.

--- Analysis for: Underdispersed Data ---
Sample Mean:    5.048
Sample Variance: 2.520
Dispersion Index (Var/Mean): 0.499
Conclusion: Evidence of UNDERDISPERSION. A Poisson model may not be appropriate.

So if we show that a feature follows a Poisson distribution but the target variable does not, what should we do?

The distribution of your target variable (Y) dictates your choice of model. The distribution of your independent variables/features (X) is far less important and rarely enters into a model's assumptions at all.

Let's break this down with a scenario: an independent variable (X) has a proven Poisson distribution (e.g., "number of support calls a customer made last month"), while the target variable (Y) is something else entirely (e.g., "total amount of money the customer spent," which is continuous, or "whether the customer churned," which is binary).

Here is what you should do, depending on the nature of your target variable.

The Guiding Principle: Match the Model to the Target (Y)

The core assumptions of statistical models are almost always about the distribution of the target variable or of the model's errors (residuals), not about the features.

Scenario 1: Your Target Variable (Y) Is Continuous (e.g., revenue, temperature, housing price)

If your target is a continuous value, you should be thinking about models like Linear Regression, Ridge, Lasso, or more complex models like Gradient Boosting for regression.

Model Choice: A regression model (e.g., LinearRegression, XGBRegressor).

What to do with your Poisson feature (X)? Use it directly!

Explanation: Linear regression's main assumption is that the residuals (the prediction errors) are normally distributed, not the target or the features. There is no assumption in linear regression that independent variables must be normally distributed; you can have skewed, binary, or Poisson-distributed features. The model will simply learn a coefficient for your Poisson feature, representing the linear relationship between the number of events (your X) and the continuous outcome (your Y).

Scenario 2: Your Target Variable (Y) Is Binary/Categorical (e.g., churn vs. no-churn, fraud vs. not-fraud, cat vs. dog)

If your target is a category, you should be using a classification model.

Model Choice: A classification model (e.g., LogisticRegression, RandomForestClassifier, SVC).

What to do with your Poisson feature (X)? Use it directly!

Explanation: Logistic regression assumes a linear relationship between the features and the log-odds of the outcome; the distribution of the feature itself doesn't matter. Tree-based models (Random Forest, XGBoost) are non-parametric and make no assumptions about the distribution of your features: they work by finding optimal split points, so a Poisson-distributed feature is perfectly fine for them. In fact, tree-based models are often the best choice when you have features with unusual distributions.

Scenario 3: Your Target Variable (Y) Is Also a Count, but You Proved It's NOT Poisson (e.g., number of items purchased, where the variance is much larger than the mean)

This is the most interesting case. You have a count target, but you've done your due diligence and found that a Poisson model is inappropriate.

Model Choice: Choose a model that fits the actual distribution of your target variable.

If Y is overdispersed (variance > mean): This is the most common scenario for count data. The correct model is Negative Binomial Regression, a generalization of Poisson regression with an extra parameter to handle the higher variance.

If Y has an excess of zeros: If your data has far more zeros than a Poisson or Negative Binomial model would predict (e.g., most customers don't buy an extended warranty, but a few buy one), use Zero-Inflated Models (Zero-Inflated Poisson or Zero-Inflated Negative Binomial); a sketch follows below.

What to do with your Poisson feature (X)? Use it directly!

Explanation: The choice of Negative Binomial or Zero-Inflated regression is driven entirely by the behavior of your target variable (Y). The fact that one of your features (X) happens to be well behaved and Poisson-distributed is great, but it doesn't change the model you need for your target.
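For the excess-zeros case, here is a minimal, hedged sketch using statsmodels' ZeroInflatedPoisson (available in statsmodels >= 0.9; the synthetic data and names are illustrative):

In [ ]:
# Sketch: Zero-Inflated Poisson for a count target with excess zeros.
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.count_model import ZeroInflatedPoisson

rng = np.random.default_rng(0)
n = 1000
x = rng.normal(size=n)
X = sm.add_constant(x)

# 30% structural zeros; otherwise a Poisson count with a log-linear rate in x
structural_zero = rng.random(n) < 0.3
y = np.where(structural_zero, 0, rng.poisson(np.exp(0.2 + 0.5 * x)))

# A constant-only inflation model (a column of ones) for the zero component
zip_fit = ZeroInflatedPoisson(y, X, exog_infl=np.ones((n, 1))).fit(disp=0)
print(zip_fit.summary())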


Let's build a complete, step-by-step example in Python to demonstrate Poisson Regression. We will follow this plan:

  1. Create a Scenario: We'll imagine we're a bike rental company and want to predict the number of bike rentals per hour.
  2. Generate Synthetic Data: We'll create a dataset where the number of rentals (our target) is influenced by factors like temperature and whether it's a weekend. We will deliberately create the data to follow a Poisson process.
  3. Explore the Data: We'll quickly check whether the data has the properties of a Poisson distribution (mean ≈ variance).
  4. Model with Linear Regression (The Wrong Way): We'll first apply a standard Linear Regression model to see its shortcomings.
  5. Model with Poisson Regression (The Right Way): We'll then apply a Poisson Regression model and see how it correctly handles the data.
  6. Compare and Conclude: We'll compare the predictions and interpret the results to see why Poisson was the better choice.

Our goal is to predict num_rentals, which is a count variable influenced by temperature and is_weekend. The "true" underlying relationship we are creating is:

log(expected_rentals) = 1.0 + 0.07 * temperature + 0.6 * is_weekend

so expected_rentals = exp(1.0 + 0.07 * temperature + 0.6 * is_weekend). The final num_rentals will be a random sample from a Poisson distribution with this expected value.
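Aside: this two-stage recipe has a known closed form. If, for a given feature vector, λ ~ Gamma(shape = r, mean = μ) and Y | λ ~ Poisson(λ), then marginally Y follows a Negative Binomial distribution with mean μ and variance μ + μ²/r. That is exactly why the counts generated below will be overdispersed, and why Negative Binomial regression will later turn out to be the right tool for them.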

In [ ]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Generation with a Two-Stage Process ---
np.random.seed(42)
n_samples = 500

# Features remain the same
temperature = np.random.normal(loc=18, scale=7, size=n_samples)
is_weekend = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])

# True coefficients for the average rate remain the same
intercept_true = 1.0
beta_temp_true = 0.07
beta_weekend_true = 0.6

# Calculate the EXPECTED MEAN (mu) based on our features
# This is the average around which our lambda will vary
mu_values = np.exp(intercept_true + (beta_temp_true * temperature) + (beta_weekend_true * is_weekend))


# --- Stage 1: Introduce Randomness to the Rate (Lambda) ---
# We'll use a Gamma distribution to create our random lambdas.
# A key property is that we can set the Gamma distribution's mean to be our mu.
# The 'shape' parameter controls the variance. A smaller shape parameter means more variance.
shape_param = 50  # Let's call this our "heterogeneity factor". Lower value = more dispersion.
scale_param = mu_values / shape_param

# Now, for each data point, draw a unique lambda from the Gamma distribution
lambda_values_randomized = np.random.gamma(shape=shape_param, scale=scale_param, size=n_samples)


# --- Stage 2: Generate the Final Count from the Randomized Lambda ---
# Use the unique, noisy lambda for each data point to generate the final count
num_rentals_overdispersed = np.random.poisson(lambda_values_randomized)


# --- Create the DataFrame ---
df_robust = pd.DataFrame({
    'num_rentals': num_rentals_overdispersed,
    'temperature': temperature,
    'is_weekend': is_weekend,
    'expected_mu': mu_values, # The "clean" average rate
    'random_lambda': lambda_values_randomized # The "noisy" rate used for generation
})

print("--- Sample of the Robustly Generated Data ---")
# Notice how random_lambda varies around expected_mu
print(df_robust.head())
--- Sample of the Robustly Generated Data ---
   num_rentals  temperature  is_weekend  expected_mu  random_lambda
0            8    21.476999           0    12.223862      11.856705
1            1    17.032150           0     8.955344       9.082119
2           10    22.533820           0    13.162441      13.770663
3           26    28.661209           1    36.828965      34.579363
4            8    16.360926           0     8.544304       8.719571
In [ ]:
# --- Verify the Dispersion ---
print("\n--- Verifying the Dispersion ---")
mean_rentals = df_robust['num_rentals'].mean()
var_rentals = df_robust['num_rentals'].var()
dispersion_index = var_rentals / mean_rentals

print(f"Mean of num_rentals: {mean_rentals:.3f}")
print(f"Variance of num_rentals: {var_rentals:.3f}")
print(f"Dispersion Index (Var/Mean): {dispersion_index:.3f}")

# Visualize the distribution
plt.figure(figsize=(12, 6))
sns.histplot(df_robust['num_rentals'], bins=max(df_robust['num_rentals']), kde=False, color='purple')
plt.title('Distribution from Robust Two-Stage Generation')
plt.xlabel('Number of Rentals')
plt.ylabel('Frequency')
plt.show()
--- Verifying the Dispersion ---
Mean of num_rentals: 13.412
Variance of num_rentals: 93.157
Dispersion Index (Var/Mean): 6.946
[Figure: histogram of num_rentals from the two-stage generation]

A dispersion index of 6.94 is perfect for our purposes: it's high enough that the problems with using the wrong model will be very clear. We now have a realistic dataset that violates the core assumption of a Poisson model. The next logical step is to demonstrate the consequences of this violation by comparing a naive model (Poisson Regression) with an appropriate model (Negative Binomial Regression). This will directly answer one of our original research questions: "Does matching the model to the data's distribution increase model success?"

Here is the plan:

  1. Apply the Wrong Tool: Fit a Poisson Regression model to the new overdispersed data.
  2. Analyze Its Failures: Critically examine the model's summary and see why it gives misleading information.
  3. Apply the Right Tool: Fit a Negative Binomial Regression model, which is specifically designed for overdispersed count data.
  4. Compare and Conclude: Compare the results side by side to highlight the tangible benefits of choosing the correct model.

In [ ]:
# We'll use the 'df_robust' DataFrame from the previous step.

print("--- Fitting a Poisson Model to Highly Overdispersed Data ---")

# Define and fit the model
poisson_model_on_od_data = smf.glm(formula="num_rentals ~ temperature + is_weekend",
                                   data=df_robust,
                                   family=sm.families.Poisson()).fit()

# Print the summary
print(poisson_model_on_od_data.summary())
--- Fitting a Poisson Model to Highly Overdispersed Data ---
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:            num_rentals   No. Observations:                  500
Model:                            GLM   Df Residuals:                      497
Model Family:                 Poisson   Df Model:                            2
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1398.4
Date:                Sun, 16 Nov 2025   Deviance:                       687.12
Time:                        15:06:32   Pearson chi2:                     670.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.9885
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.9867      0.039     25.027      0.000       0.909       1.064
temperature     0.0707      0.002     42.449      0.000       0.067       0.074
is_weekend      0.5932      0.025     24.038      0.000       0.545       0.642
===============================================================================

Let's break down exactly what this summary is telling us and, more importantly, what it's hiding.

Analysis of the Poisson Model Results

This is a classic example of a model that looks great on the surface but is fundamentally flawed because it rests on a wrong assumption.

  1. The Good News (The Part That Isn't Wrong)

Coefficients (coef): Intercept: 0.9867, temperature: 0.0707, is_weekend: 0.5932.

Look how close these are to the "true" parameters we used to generate the data (intercept = 1.0, temperature = 0.07, is_weekend = 0.6). This is a key insight: even with overdispersion, a Poisson model often does a decent job of estimating the average relationship. It correctly identified that higher temperatures and weekends increase rentals.

  2. The Bad News (The Deceptive Part)

The problem lies entirely in the model's assessment of its own certainty.

Standard Errors (std err): temperature: 0.002, is_weekend: 0.025. These numbers are extremely small. The model is essentially saying, "I am incredibly certain about the effect of temperature and weekends." Why? Because it assumes the variance equals the mean, when in fact the variance is nearly seven times the mean (dispersion index of 6.94). The model is blind to this extra variance and therefore completely underestimates the real-world noise and uncertainty.

Z-scores (z) and P-values (P>|z|): The z-score is calculated as coef / std err; for temperature that is 0.0707 / 0.00167 ≈ 42.4 (the table's 0.002 is a rounded display). Because the standard errors are artificially tiny, the z-scores are enormous, and an enormous z-score mathematically leads to a p-value that is effectively zero (0.000).

The Grand Conclusion: A Story of Overconfidence

This model summary tells a story of false precision and overconfidence. An analyst who didn't check for overdispersion would look at this and report: "The effect of temperature on bike rentals is positive and statistically significant with a p-value of essentially zero. We are virtually certain about this relationship."

But we know better. We know the data has massive variability that this model is ignoring. The true conclusion should be: "Temperature appears to have a positive effect, but there is a very high degree of unexplained variability in the data, and our confidence in the precise effect size is much lower than this model suggests."

The Poisson model has led us to a conclusion that is dangerously misleading about the certainty of our findings.

In [ ]:
print("\n--- Fitting a Negative Binomial Model to a Highly Overdispersed Data ---")

# The code is almost identical, we just change the family
nb_model = smf.glm(formula="num_rentals ~ temperature + is_weekend",
                   data=df_robust,
                   family=sm.families.NegativeBinomial()).fit()

# Print the summary
print(nb_model.summary())
--- Fitting a Negative Binomial Model to Highly Overdispersed Data ---
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:            num_rentals   No. Observations:                  500
Model:                            GLM   Df Residuals:                      497
Model Family:        NegativeBinomial   Df Model:                            2
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1742.2
Date:                Sun, 16 Nov 2025   Deviance:                       64.354
Time:                        15:12:51   Pearson chi2:                     55.1
No. Iterations:                     4   Pseudo R-squ. (CS):             0.2564
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.9661      0.139      6.939      0.000       0.693       1.239
temperature     0.0714      0.007     10.395      0.000       0.058       0.085
is_weekend      0.6063      0.102      5.961      0.000       0.407       0.806
===============================================================================
/usr/local/lib/python3.12/dist-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
  warnings.warn("Negative binomial dispersion parameter alpha not "

This is the final and most important piece of the puzzle. Let's place it side-by-side with the Poisson results and draw our final conclusions. This comparison is the core finding of our research.

Side-by-Side Model Comparison¶

| Metric | Poisson Model (Wrong Tool) | Negative Binomial Model (Right Tool) | The "So What?" |
| --- | --- | --- | --- |
| Model Family | Assumes Variance = Mean | Allows Variance > Mean | The NB model's assumption matches our data's reality (dispersion index = 6.94). |
| Coefficient (coef) for temperature | 0.0707 | 0.0714 | Nearly identical. Both models correctly identify the positive relationship. |
| Standard Error (std err) for temperature | 0.002 | 0.007 | The NB model's standard error is 3.5 times larger, honestly reflecting the high variability in the data. |
| Z-score (z) for temperature | 42.449 (extremely overconfident) | 10.395 (strong, but realistic) | The NB model gives a more truthful measure of the effect's significance: still highly significant, but not impossibly so. |
| Deviance (goodness of fit) | 687.12 | 64.354 | A lower deviance indicates a better fit; the NB deviance is roughly a tenth of the Poisson deviance. This is our statistical proof. |
| Log-Likelihood | -1398.4 | -1742.2 | Note: these are not directly comparable across model families; the deviance is the key fit metric here. |

Final Conclusion: The Research Answer¶

This experiment has successfully answered our primary research question.

  1. Does choosing the right distribution matter? Yes, absolutely. While the Poisson model correctly identified the direction of the relationships, it was dangerously misleading in its conclusions.

  2. How does it affect the model? The Negative Binomial model provides a far more honest and reliable assessment of our data.

    • It quantifies the overdispersion instead of ignoring it.
    • It provides realistic standard errors, which gives us a true understanding of the uncertainty around our coefficient estimates.
    • It avoids the false precision and overconfidence of the Poisson model.
    • The deviance score gives us statistical proof that it is a vastly superior fit for the data.

An analyst using the Poisson model would have reported that the findings were certain beyond any reasonable doubt. An analyst using the Negative Binomial model would report that while the effects are statistically significant, there is substantial additional variability in the data that needs to be acknowledged. The second conclusion is the hallmark of a robust and responsible data scientist.

You have now successfully walked through the entire process:

  • Identified a distribution (Poisson).

  • Understood its limitations (overdispersion).

  • Created robust, realistic data that violates the simple assumption.

  • Proven, with side-by-side evidence, why choosing a more flexible model (Negative Binomial) that matches the data's true nature leads to a more accurate and trustworthy result.

In [ ]:
## real world data example ##
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the dataset from the extracted CSV file
df = pd.read_csv('/content/drive/MyDrive/OnlineNewsPopularity/OnlineNewsPopularity.csv')

print("--- Data Loaded Successfully ---")
print(f"Shape of the loaded data: {df.shape}")
--- Data Loaded Successfully ---
Shape of the loaded data: (39644, 61)
In [ ]:
# --- Clean Column Names ---
# The original column names have leading spaces (e.g., ' timedelta' instead of 'timedelta')
# Let's remove any leading/trailing whitespace from all column names.
df.columns = df.columns.str.strip()

print("\n--- Cleaned Column Names ---")
print(df.columns)

# --- Identify Target and Features ---
# The target variable is 'shares'.
# The first two columns, 'url' and 'timedelta', are not predictive features.
# 'url' is a unique identifier, and 'timedelta' is related to the target.
# So, our features are all columns from the 3rd column to the second-to-last.

TARGET = 'shares'
FEATURES = df.columns[2:-1] # All columns except url, timedelta, and shares

print(f"\nTarget Variable: {TARGET}")
print(f"Number of Features: {len(FEATURES)}")
--- Cleaned Column Names ---
Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negative_words', 'rate_positive_words',
       'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
       'max_positive_polarity', 'avg_negative_polarity',
       'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
       'title_sentiment_polarity', 'abs_title_subjectivity',
       'abs_title_sentiment_polarity', 'shares'],
      dtype='object')

Target Variable: shares
Number of Features: 58
In [ ]:
# --- Analyze the 'shares' Target Variable ---

print("\n--- EDA on the Target Variable: 'shares' ---")

# Calculate summary statistics
mean_shares = df[TARGET].mean()
variance_shares = df[TARGET].var()

print(f"Mean of shares: {mean_shares:,.2f}")
print(f"Variance of shares: {variance_shares:,.2f}")

# The most important calculation: The Dispersion Index
dispersion_index = variance_shares / mean_shares
print(f"\nDispersion Index (Variance / Mean): {dispersion_index:,.2f}")

# --- Visualize the Distribution ---
print("\n--- Generating Distribution Plot ---")
plt.figure(figsize=(12, 6))
sns.histplot(df[TARGET], bins=100, color='skyblue')
plt.title('Distribution of Article Shares')
plt.xlabel('Number of Shares')
plt.ylabel('Frequency (Number of Articles)')
plt.show()

# The distribution is extremely skewed. Let's look closer at the lower end.
plt.figure(figsize=(12, 6))
sns.histplot(df[df[TARGET] < 5000][TARGET], bins=100, color='coral')
plt.title('Distribution of Article Shares (for articles with < 5,000 shares)')
plt.xlabel('Number of Shares')
plt.ylabel('Frequency (Number of Articles)')
plt.show()
--- EDA on the Target Variable: 'shares' ---
Mean of shares: 3,395.38
Variance of shares: 135,185,983.71

Dispersion Index (Variance / Mean): 39,814.68

--- Generating Distribution Plot ---
[Figure: distribution of article shares]
[Figure: distribution of article shares for articles with < 5,000 shares]
In [ ]:
# --- Select a subset of features for our model ---
# Using all features would make the summary table very long.
# Let's pick a few that are easy to interpret.
feature_subset = [
    'n_tokens_title',
    'num_hrefs',
    'num_imgs',
    'average_token_length',
    'weekday_is_saturday' # Note: statsmodels will handle this binary feature correctly.
]

# Create the R-style formula for statsmodels
# 'shares' is our target, and the features are on the right side of the ~
formula = f"shares ~ {' + '.join(feature_subset)}"
print(f"--- Using formula: {formula} ---")


# --- Fit the Poisson Model ---
print("\n--- Fitting Poisson Model (The Wrong Tool for this Data) ---")

poisson_model = smf.glm(formula=formula,
                        data=df,
                        family=sm.families.Poisson()).fit()

# Print the detailed summary
print(poisson_model.summary())
--- Using formula: shares ~ n_tokens_title + num_hrefs + num_imgs + average_token_length + weekday_is_saturday ---

--- Fitting Poisson Model (The Wrong Tool for this Data) ---
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                 shares   No. Observations:                39644
Model:                            GLM   Df Residuals:                    39638
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.2802e+08
Date:                Sun, 16 Nov 2025   Deviance:                   2.5568e+08
Time:                        16:02:13   Pearson chi2:                 1.49e+09
No. Iterations:                     7   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                8.2953      0.001   1.37e+04      0.000       8.294       8.296
n_tokens_title           0.0148   4.08e-05    361.936      0.000       0.015       0.015
num_hrefs                0.0096   6.06e-06   1589.107      0.000       0.010       0.010
num_imgs                 0.0086   8.95e-06    959.093      0.000       0.009       0.009
average_token_length    -0.1076   8.93e-05  -1204.796      0.000      -0.108      -0.107
weekday_is_saturday      0.1625      0.000    493.906      0.000       0.162       0.163
========================================================================================
In [ ]:
# --- Fit the Negative Binomial Model ---
print("\n--- Fitting Negative Binomial Model (The Right Tool for this Data) ---")

nb_model = smf.glm(formula=formula,
                   data=df,
                   family=sm.families.NegativeBinomial()).fit() # The ONLY change is here

# Print the detailed summary
print(nb_model.summary())
--- Fitting Negative Binomial Model (The Right Tool for this Data) ---
/usr/local/lib/python3.12/dist-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
  warnings.warn("Negative binomial dispersion parameter alpha not "
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                 shares   No. Observations:                39644
Model:                            GLM   Df Residuals:                    39638
Model Family:        NegativeBinomial   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.6111e+05
Date:                Sun, 16 Nov 2025   Deviance:                       50236.
Time:                        16:02:28   Pearson chi2:                 4.22e+05
No. Iterations:                    11   Pseudo R-squ. (CS):            0.04201
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                8.3056      0.038    215.985      0.000       8.230       8.381
n_tokens_title           0.0175      0.002      7.349      0.000       0.013       0.022
num_hrefs                0.0120      0.000     24.652      0.000       0.011       0.013
num_imgs                 0.0121      0.001     18.856      0.000       0.011       0.013
average_token_length    -0.1266      0.006    -20.678      0.000      -0.139      -0.115
weekday_is_saturday      0.1797      0.021      8.602      0.000       0.139       0.221
========================================================================================

The Final Comparison and Conclusion

Let's put the key results side by side to draw our final conclusion.

| Metric | Poisson Model Result | Negative Binomial Model Result | Analysis |
| --- | --- | --- | --- |
| Model Fit (Deviance) | Extremely high (~2.6 × 10⁸) | Much lower (~50,000) | This is the smoking gun. The NB deviance is orders of magnitude smaller, statistical proof that it fits the data vastly better. |
| Model Fit (Log-Likelihood) | A huge negative number (~−1.28 × 10⁸) | A far less negative number (~−3.6 × 10⁵) | Confirms that the NB model is a more plausible representation of the data. |
| num_imgs Coef. | Positive (0.0086) | Similar positive value (0.0121) | Both models agree that more images are associated with more shares. |
| num_imgs Std. Err. | Artificially small (8.95e-06) | Much larger and more realistic (0.001) | The NB model correctly acknowledges the high uncertainty and noise in the data. |
| Dispersion handling | None: variance is forced equal to the mean | Variance > mean allowed; note that alpha was left at the default 1.0 here (see the warning above) | The NB family accommodates the severe overdispersion; a sketch of estimating alpha properly follows below. |

Final Research Conclusion: Our analysis of the Online News Popularity dataset provides a definitive answer to our research question. The data is severely overdispersed, with a variance tens of thousands of times larger than its mean (dispersion index ≈ 39,815).

Fitting a Poisson Regression model leads to a statistically poor fit (evidenced by the massive deviance) and produces dangerously overconfident results (artificially small standard errors). Fitting a Negative Binomial Regression model provides a statistically superior fit and a much more honest and reliable assessment of the relationships between features and article shares.

This practical example shows that identifying the correct underlying distribution of your target variable is not just an academic exercise; it is a critical step for building robust, accurate, and trustworthy machine learning models in the real world.
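One caveat, visible in the warning above: sm.families.NegativeBinomial() kept the dispersion parameter at its default alpha = 1.0 rather than estimating it. A minimal sketch of estimating alpha by maximum likelihood with statsmodels' discrete Negative Binomial model, reusing the formula and df from the cells above:

In [ ]:
# Estimate the NB dispersion parameter alpha instead of fixing it at 1.0.
# smf.negativebinomial fits alpha jointly with the coefficients by MLE.
nb_mle = smf.negativebinomial(formula, data=df).fit(disp=0)
print(nb_mle.summary())  # the summary includes an estimated 'alpha' row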

In [40]:
## ANOTHER REAL WORLD EXAMPLE

Goals in a Football (Soccer) Match

One of the most classic, textbook examples of a process that can be modeled by a Poisson distribution is the number of goals scored by a team in a football match. Why it fits:

  • Count Data: The target variable is the number of goals (0, 1, 2, 3, ...), a non-negative integer.
  • Relatively Rare: Compared to the number of passes or tackles, goals are rare events within a 90-minute match.
  • Largely Independent: One goal being scored doesn't necessarily make another goal more or less likely in the next minute (this is debatable, but it's a reasonable starting assumption).
  • Stable Rate: For a given team playing against a specific opponent, we can think of them as having an average goal-scoring "rate" or potential.

Project Workflow: Proving Poisson Is the Right Choice

  • The Scenario: We will predict the number of goals a team scores in a match based on simple features like is_home_team and the opponent they are playing.
  • Get the Data: We'll use a publicly available dataset of English Premier League match results.
  • EDA (The Moment of Truth): We will calculate the dispersion index for the number of goals scored. Our hypothesis is that it will be very close to 1.0.
  • Modeling: We will fit a Poisson Regression model.
  • Validation: We will then fit a Negative Binomial model and show that it offers no significant improvement, proving that the simpler Poisson model is sufficient and appropriate.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# --- Step 1: Downloading and Merging Data from a Reliable Source ---

print("--- Downloading data for the last 10 seasons from football-data.co.uk... ---")

# Create an empty list to store data for the last 10 seasons
all_seasons_data = []

# Download data from the 2014-15 season to the 2023-24 season
for year in range(14, 24):
    season = f"{year}{year+1}"
    # Dynamically create the URL for each season
    url = f"https://www.football-data.co.uk/mmz4281/{season}/E0.csv"
    
    try:
        # Read each season's CSV file and add it to the list
        season_df = pd.read_csv(url, on_bad_lines='skip', encoding='latin1')
        all_seasons_data.append(season_df)
        print(f"Season 20{season[:2]}-20{season[2:]} downloaded successfully.")
    except Exception as e:
        print(f"Season 20{season[:2]}-20{season[2:]} could not be downloaded. Error: {e}")

# Merge all downloaded season data into a single DataFrame
df_raw = pd.concat(all_seasons_data, ignore_index=True)

print("\n--- All data merged successfully! ---")

# Select and name the columns we need
# The column names in this source are already as we want: HomeTeam, AwayTeam, FTHG, FTAG
df = df_raw[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].copy()

# Clean missing data
df.dropna(inplace=True)

# Convert goal counts to integer format
df['FTHG'] = df['FTHG'].astype(int)
df['FTAG'] = df['FTAG'].astype(int)

print("\n--- First 5 Rows of Cleaned Data ---")
print(df.head())


# --- Step 2: EDA (Exploratory Data Analysis) - This part is the same as before ---

print("\n--- Exploratory Data Analysis on Goal Counts ---")

# Statistics for Home Team Goals (FTHG)
mean_home_goals = df['FTHG'].mean()
var_home_goals = df['FTHG'].var()
dispersion_home = var_home_goals / mean_home_goals

# Statistics for Away Team Goals (FTAG)
mean_away_goals = df['FTAG'].mean()
var_away_goals = df['FTAG'].var()
dispersion_away = var_away_goals / mean_away_goals

print(f"Home Goals Mean: {mean_home_goals:.3f}")
print(f"Home Goals Variance: {var_home_goals:.3f}")
print(f"HOME GOALS DISPERSION INDEX (Variance/Mean): {dispersion_home:.3f}\n")

print(f"Away Goals Mean: {mean_away_goals:.3f}")
print(f"Away Goals Variance: {var_away_goals:.3f}")
print(f"AWAY GOALS DISPERSION INDEX (Variance/Mean): {dispersion_away:.3f}\n")

# --- Visualize the Distributions ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.countplot(x='FTHG', data=df, ax=axes[0], color='skyblue', order=sorted(df['FTHG'].unique()))  # keep goal counts in numeric order on the x-axis
axes[0].set_title('Distribution of Goals Scored by Home Team')
axes[0].set_xlabel('Goals per Match')
axes[0].set_ylabel('Number of Matches')

sns.countplot(x='FTAG', data=df, ax=axes[1], color='coral', order=sorted(df['FTAG'].unique()))  # keep goal counts in numeric order on the x-axis
axes[1].set_title('Distribution of Goals Scored by Away Team')
axes[1].set_xlabel('Goals per Match')
axes[1].set_ylabel('Number of Matches')

plt.tight_layout()
plt.show()
--- Downloading data for the last 10 seasons from football-data.co.uk... ---
Season 2014-2015 downloaded successfully.
Season 2015-2016 downloaded successfully.
Season 2016-2017 downloaded successfully.
Season 2017-2018 downloaded successfully.
Season 2018-2019 downloaded successfully.
Season 2019-2020 downloaded successfully.
Season 2020-2021 downloaded successfully.
Season 2021-2022 downloaded successfully.
Season 2022-2023 downloaded successfully.
Season 2023-2024 downloaded successfully.

--- All data merged successfully! ---

--- First 5 Rows of Cleaned Data ---
     HomeTeam        AwayTeam  FTHG  FTAG
0     Arsenal  Crystal Palace     2     1
1   Leicester         Everton     2     2
2  Man United         Swansea     1     2
3         QPR            Hull     0     1
4       Stoke     Aston Villa     0     1

--- Exploratory Data Analysis on Goal Counts ---
Home Goals Mean: 1.548
Home Goals Variance: 1.743
HOME GOALS DISPERSION INDEX (Variance/Mean): 1.126

Away Goals Mean: 1.245
Away Goals Variance: 1.446
AWAY GOALS DISPERSION INDEX (Variance/Mean): 1.161

[Figure: count plots of goals per match for home and away teams]

Perfect! A dispersion index of 1.161 is exactly what we were hoping for. It's not a perfect 1.0 (real data rarely is), but it's well within the range where a Poisson model is a reasonable choice. Here's how we interpret this:

  • It supports our hypothesis. The number of away goals in an English Premier League match does appear to follow a Poisson-like process: the events are relatively rare, largely independent, and occur at a reasonably stable rate.
  • It justifies our next step. We have now earned the right to build a Poisson regression model on this data and expect it to be a valid and useful tool.

Building the Poisson Model

Now we can demonstrate the effectiveness of Poisson Regression in a real-world setting. We will formulate a model that predicts the number of away goals (FTAG) from the teams involved in the match. Since we don't have advanced team statistics, we'll use a simplified model based on team identity: each team is treated as a categorical variable, and the model estimates a coefficient for each team reflecting its average "attacking strength" when playing away from home. The model then predicts the expected number of away goals from that strength.

In [ ]:
# --- Model Building ---

# Step 1: Create the formula
# The formula will predict FTAG based on the AwayTeam.
# C(AwayTeam) tells statsmodels to treat 'AwayTeam' as a categorical variable, creating coefficients for each team.

formula = 'FTAG ~ C(AwayTeam)'

print(f"\n--- Model formula: {formula} ---")

# Step 2: Fit the Poisson Regression model
print("\n--- Fitting Poisson Regression Model ---")
poisson_model = smf.glm(formula=formula, data=df, family=sm.families.Poisson()).fit()

# Step 3: Print the model summary
print(poisson_model.summary())
--- Model formula: FTAG ~ C(AwayTeam) ---

--- Fitting Poisson Regression Model ---
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                   FTAG   No. Observations:                 3800
Model:                            GLM   Df Residuals:                     3766
Model Family:                 Poisson   Df Model:                           33
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -5408.0
Date:                Sun, 16 Nov 2025   Deviance:                       4546.0
Time:                        16:37:41   Pearson chi2:                 4.01e+03
No. Iterations:                     5   Pseudo R-squ. (CS):            0.09927
Covariance Type:            nonrobust                                         
===================================================================================================
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                           0.4831      0.057      8.478      0.000       0.371       0.595
C(AwayTeam)[T.Aston Villa]         -0.4318      0.102     -4.236      0.000      -0.632      -0.232
C(AwayTeam)[T.Bournemouth]         -0.3695      0.100     -3.703      0.000      -0.565      -0.174
C(AwayTeam)[T.Brentford]           -0.1954      0.128     -1.526      0.127      -0.446       0.056
C(AwayTeam)[T.Brighton]            -0.3898      0.100     -3.880      0.000      -0.587      -0.193
C(AwayTeam)[T.Burnley]             -0.5371      0.101     -5.321      0.000      -0.735      -0.339
C(AwayTeam)[T.Cardiff]             -0.8626      0.283     -3.046      0.002      -1.418      -0.308
C(AwayTeam)[T.Chelsea]             -0.0065      0.081     -0.081      0.936      -0.165       0.152
C(AwayTeam)[T.Crystal Palace]      -0.3410      0.088     -3.858      0.000      -0.514      -0.168
C(AwayTeam)[T.Everton]             -0.4318      0.091     -4.755      0.000      -0.610      -0.254
C(AwayTeam)[T.Fulham]              -0.4571      0.127     -3.606      0.000      -0.706      -0.209
C(AwayTeam)[T.Huddersfield]        -0.9426      0.212     -4.448      0.000      -1.358      -0.527
C(AwayTeam)[T.Hull]                -0.9852      0.216     -4.558      0.000      -1.409      -0.562
C(AwayTeam)[T.Leeds]               -0.1567      0.126     -1.242      0.214      -0.404       0.091
C(AwayTeam)[T.Leicester]           -0.1113      0.085     -1.305      0.192      -0.279       0.056
C(AwayTeam)[T.Liverpool]            0.1018      0.079      1.295      0.195      -0.052       0.256
C(AwayTeam)[T.Luton]               -0.2495      0.212     -1.177      0.239      -0.665       0.166
C(AwayTeam)[T.Man City]             0.2463      0.076      3.238      0.001       0.097       0.395
C(AwayTeam)[T.Man United]          -0.1429      0.084     -1.708      0.088      -0.307       0.021
C(AwayTeam)[T.Middlesbrough]       -1.1249      0.321     -3.501      0.000      -1.755      -0.495
C(AwayTeam)[T.Newcastle]           -0.3990      0.093     -4.297      0.000      -0.581      -0.217
C(AwayTeam)[T.Norwich]             -1.0921      0.188     -5.796      0.000      -1.461      -0.723
C(AwayTeam)[T.Nott'm Forest]       -0.6242      0.183     -3.408      0.001      -0.983      -0.265
C(AwayTeam)[T.QPR]                 -0.4831      0.236     -2.044      0.041      -0.946      -0.020
C(AwayTeam)[T.Sheffield United]    -0.8626      0.170     -5.075      0.000      -1.196      -0.529
C(AwayTeam)[T.Southampton]         -0.4207      0.093     -4.500      0.000      -0.604      -0.237
C(AwayTeam)[T.Stoke]               -0.6091      0.135     -4.519      0.000      -0.873      -0.345
C(AwayTeam)[T.Sunderland]          -0.5558      0.149     -3.738      0.000      -0.847      -0.264
C(AwayTeam)[T.Swansea]             -0.5653      0.132     -4.269      0.000      -0.825      -0.306
C(AwayTeam)[T.Tottenham]            0.0537      0.080      0.676      0.499      -0.102       0.210
C(AwayTeam)[T.Watford]             -0.5279      0.111     -4.737      0.000      -0.746      -0.309
C(AwayTeam)[T.West Brom]           -0.7329      0.129     -5.661      0.000      -0.987      -0.479
C(AwayTeam)[T.West Ham]            -0.2370      0.086     -2.763      0.006      -0.405      -0.069
C(AwayTeam)[T.Wolves]              -0.5008      0.110     -4.538      0.000      -0.717      -0.285
===================================================================================================
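Before comparing models, here is a quick sanity check of what these coefficients mean in practice: a minimal usage sketch (assuming the fitted poisson_model and the pandas import from earlier cells) that predicts the expected away-goal rate for one team.

In [ ]:
# Usage sketch: expected away goals for a single away team.
# Assumes `poisson_model` and pandas (pd) from earlier cells.
new_match = pd.DataFrame({'AwayTeam': ['Man City']})
print(poisson_model.predict(new_match))  # exp(0.4831 + 0.2463) ≈ 2.07 goals per match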
In [ ]:
# --- Compare to Negative Binomial Model ---
print("\n--- Fitting Negative Binomial Model ---")
nb_model = smf.glm(formula=formula, data=df, family=sm.families.NegativeBinomial()).fit()

# Print the model summary
print(nb_model.summary())

# Perform a likelihood ratio test to compare the two models.
# This test tells us if the Negative Binomial model provides a significantly better fit than the Poisson model.
# Caveat: sm.families.NegativeBinomial() keeps the dispersion parameter alpha fixed
# (default 1.0), so this is not a test between properly nested fitted models;
# a negative statistic simply means the fixed-alpha NB fit has a lower log-likelihood.
print("\n--- Likelihood Ratio Test ---")
# Calculate the test statistic manually
lr_stat = 2 * (nb_model.llf - poisson_model.llf)
p_value = stats.chi2.sf(lr_stat, df=1)
print(f"Likelihood Ratio Statistic: {lr_stat:.4f}")
print(f"P-value: {p_value:.4f}")
if p_value < 0.05:
    print("Conclusion: The Negative Binomial model fits significantly better than the Poisson model.")
else:
    print("Conclusion: No significant difference in fit between the models.")
--- Fitting Negative Binomial Model ---
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                   FTAG   No. Observations:                 3800
Model:                            GLM   Df Residuals:                     3766
Model Family:        NegativeBinomial   Df Model:                           33
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -5774.1
Date:                Sun, 16 Nov 2025   Deviance:                       2479.4
Time:                        16:40:18   Pearson chi2:                 1.83e+03
No. Iterations:                     5   Pseudo R-squ. (CS):            0.04570
Covariance Type:            nonrobust                                         
===================================================================================================
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                           0.4831      0.092      5.237      0.000       0.302       0.664
C(AwayTeam)[T.Aston Villa]         -0.4318      0.152     -2.837      0.005      -0.730      -0.133
C(AwayTeam)[T.Bournemouth]         -0.3695      0.151     -2.450      0.014      -0.665      -0.074
C(AwayTeam)[T.Brentford]           -0.1954      0.198     -0.987      0.324      -0.584       0.193
C(AwayTeam)[T.Brighton]            -0.3898      0.151     -2.577      0.010      -0.686      -0.093
C(AwayTeam)[T.Burnley]             -0.5371      0.148     -3.619      0.000      -0.828      -0.246
C(AwayTeam)[T.Cardiff]             -0.8626      0.372     -2.321      0.020      -1.591      -0.134
C(AwayTeam)[T.Chelsea]             -0.0065      0.131     -0.050      0.960      -0.262       0.249
C(AwayTeam)[T.Crystal Palace]      -0.3410      0.135     -2.518      0.012      -0.606      -0.076
C(AwayTeam)[T.Everton]             -0.4318      0.137     -3.151      0.002      -0.700      -0.163
C(AwayTeam)[T.Fulham]              -0.4571      0.186     -2.461      0.014      -0.821      -0.093
C(AwayTeam)[T.Huddersfield]        -0.9426      0.277     -3.408      0.001      -1.485      -0.401
C(AwayTeam)[T.Hull]                -0.9852      0.280     -3.521      0.000      -1.534      -0.437
C(AwayTeam)[T.Leeds]               -0.1567      0.197     -0.796      0.426      -0.542       0.229
C(AwayTeam)[T.Leicester]           -0.1113      0.136     -0.821      0.412      -0.377       0.154
C(AwayTeam)[T.Liverpool]            0.1018      0.129      0.787      0.431      -0.152       0.355
C(AwayTeam)[T.Luton]               -0.2495      0.321     -0.778      0.437      -0.878       0.379
C(AwayTeam)[T.Man City]             0.2463      0.128      1.928      0.054      -0.004       0.497
C(AwayTeam)[T.Man United]          -0.1429      0.132     -1.079      0.280      -0.402       0.117
C(AwayTeam)[T.Middlesbrough]       -1.1249      0.401     -2.802      0.005      -1.912      -0.338
C(AwayTeam)[T.Newcastle]           -0.3990      0.140     -2.840      0.005      -0.674      -0.124
C(AwayTeam)[T.Norwich]             -1.0921      0.241     -4.523      0.000      -1.565      -0.619
C(AwayTeam)[T.Nott'm Forest]       -0.6242      0.255     -2.446      0.014      -1.124      -0.124
C(AwayTeam)[T.QPR]                 -0.4831      0.337     -1.432      0.152      -1.144       0.178
C(AwayTeam)[T.Sheffield United]    -0.8626      0.227     -3.794      0.000      -1.308      -0.417
C(AwayTeam)[T.Southampton]         -0.4207      0.141     -2.986      0.003      -0.697      -0.145
C(AwayTeam)[T.Stoke]               -0.6091      0.191     -3.184      0.001      -0.984      -0.234
C(AwayTeam)[T.Sunderland]          -0.5558      0.212     -2.623      0.009      -0.971      -0.140
C(AwayTeam)[T.Swansea]             -0.5653      0.190     -2.981      0.003      -0.937      -0.194
C(AwayTeam)[T.Tottenham]            0.0537      0.130      0.414      0.679      -0.201       0.308
C(AwayTeam)[T.Watford]             -0.5279      0.163     -3.246      0.001      -0.847      -0.209
C(AwayTeam)[T.West Brom]           -0.7329      0.180     -4.062      0.000      -1.086      -0.379
C(AwayTeam)[T.West Ham]            -0.2370      0.134     -1.772      0.076      -0.499       0.025
C(AwayTeam)[T.Wolves]              -0.5008      0.162     -3.093      0.002      -0.818      -0.183
===================================================================================================

--- Likelihood Ratio Test ---
Likelihood Ratio Statistic: -732.0961
P-value: 1.0000
Conclusion: No significant difference in fit between the models.
/usr/local/lib/python3.12/dist-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
  warnings.warn("Negative binomial dispersion parameter alpha not "

Also fit an XGBoost model

In [ ]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
# --- Step 1: Prepare Data for XGBoost ---

# We'll create features for both the HomeTeam and the AwayTeam
# pd.get_dummies will automatically convert our categorical columns into 0s and 1s
X = pd.get_dummies(df[['HomeTeam', 'AwayTeam']])
y = df['FTAG'] # Our target remains the same: Full Time Away Goals

print("--- Data prepared for XGBoost ---")
print("Original features: ['HomeTeam', 'AwayTeam']")
print(f"Shape of new features (X) after One-Hot Encoding: {X.shape}")
print("First 5 rows of X:")
print(X.head())



# --- Step 2: Build and Train the XGBoost Regressor ---

# Instantiate the model. We'll use the 'count:poisson' objective.
# n_estimators is the number of trees to build.
xgb_model = xgb.XGBRegressor(objective='count:poisson', 
                             n_estimators=100, 
                             seed=42)

print("\n--- Training XGBoost model... ---")
xgb_model.fit(X, y)
print("--- Model training complete. ---")


# --- Step 3: Evaluate the Model ---
predictions = xgb_model.predict(X)

# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(y, predictions))
mae = mean_absolute_error(y, predictions)

print(f"\n--- Model Evaluation ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# --- Step 4: Interpret the Model (Feature Importance) ---
# This is the primary way to understand what XGBoost learned.
fig, ax = plt.subplots(figsize=(12, 10))
xgb.plot_importance(xgb_model, ax=ax, max_num_features=20) # Show top 20 features
plt.title("XGBoost Feature Importance")
plt.show()


# --- Get predictions from the Negative Binomial model ---
# We need to provide the original dataframe to the predict function
nb_predictions = nb_model.predict(df)

# --- Calculate the same metrics for the NB model ---
nb_rmse = np.sqrt(mean_squared_error(df['FTAG'], nb_predictions))
nb_mae = mean_absolute_error(df['FTAG'], nb_predictions)

print("--- Negative Binomial Model Evaluation ---")
print(f"Root Mean Squared Error (RMSE): {nb_rmse:.4f}")
print(f"Mean Absolute Error (MAE): {nb_mae:.4f}")

print("\n--- XGBoost Model Evaluation ---")
print(f"Root Mean Squared Error (RMSE): 1.0318")
print(f"Mean Absolute Error (MAE): 0.8099")
--- Data prepared for XGBoost ---
Original features: ['HomeTeam', 'AwayTeam']
Shape of new features (X) after One-Hot Encoding: (3800, 68)
First 5 rows of X:
   HomeTeam_Arsenal  HomeTeam_Aston Villa  HomeTeam_Bournemouth  \
0              True                 False                 False   
1             False                 False                 False   
2             False                 False                 False   
3             False                 False                 False   
4             False                 False                 False   

   HomeTeam_Brentford  HomeTeam_Brighton  HomeTeam_Burnley  HomeTeam_Cardiff  \
0               False              False             False             False   
1               False              False             False             False   
2               False              False             False             False   
3               False              False             False             False   
4               False              False             False             False   

   HomeTeam_Chelsea  HomeTeam_Crystal Palace  HomeTeam_Everton  ...  \
0             False                    False             False  ...   
1             False                    False             False  ...   
2             False                    False             False  ...   
3             False                    False             False  ...   
4             False                    False             False  ...   

   AwayTeam_Sheffield United  AwayTeam_Southampton  AwayTeam_Stoke  \
0                      False                 False           False   
1                      False                 False           False   
2                      False                 False           False   
3                      False                 False           False   
4                      False                 False           False   

   AwayTeam_Sunderland  AwayTeam_Swansea  AwayTeam_Tottenham  \
0                False             False               False   
1                False             False               False   
2                False              True               False   
3                False             False               False   
4                False             False               False   

   AwayTeam_Watford  AwayTeam_West Brom  AwayTeam_West Ham  AwayTeam_Wolves  
0             False               False              False            False  
1             False               False              False            False  
2             False               False              False            False  
3             False               False              False            False  
4             False               False              False            False  

[5 rows x 68 columns]

--- Training XGBoost model... ---
--- Model training complete. ---

--- Model Evaluation ---
Root Mean Squared Error (RMSE): 1.0318
Mean Absolute Error (MAE): 0.8099
No description has been provided for this image
--- Negative Binomial Model Evaluation ---
Root Mean Squared Error (RMSE): 1.1463
Mean Absolute Error (MAE): 0.8965

--- XGBoost Model Evaluation ---
Root Mean Squared Error (RMSE): 1.0318
Mean Absolute Error (MAE): 0.8099

An important closing note: the goal is not to show that statistical distributions are useless, but to show their proper place in the modern data science workflow. The statistical analysis we did was not a "useless" step; it was the essential foundation upon which our final, most powerful model was built and validated.

The Wrong Takeaway vs. The Correct Takeaway

  • The Wrong Takeaway: "Statistics is old, ML is new. Always skip to XGBoost for the best results."
  • The Correct Takeaway: "Statistical understanding makes you a more effective and responsible machine learning practitioner. It provides the essential tools for sanity-checking your data, building intelligent baselines, and interpreting your results."

Here are the specific, undeniable reasons why the Poisson/Negative Binomial analysis was critical and not useless:

  1. It Provided the Intelligent Baseline. How do we know our XGBoost RMSE of 1.0318 is "good"? It's just a number in a vacuum. We only know it's good because we compared it to a strong, theory-driven baseline: the Negative Binomial model. The NB model, chosen because our EDA told us the data was overdispersed counts, gave us an RMSE of 1.1463. This is our benchmark. Now we can say, "Our XGBoost model reduces RMSE by roughly 10% relative to a well-chosen statistical model." Without that benchmark, we have no context for our results.

  2. It Forced Us to Understand Our Data (EDA). The entire process of checking for a Poisson distribution (calculating the mean, variance, and dispersion index) forced us to deeply understand the nature of our target variable. This is the single most important step in any modeling project. A student who just imports xgboost and runs .fit() has no idea whether their data is skewed, has outliers, or follows any logical pattern. They are flying blind. Our statistical approach ensured we knew exactly what kind of data we were dealing with.

  3. It Answers the "Why" (Inference). This is the most significant difference. XGBoost tells us what will happen; it's a predictive powerhouse. The Negative Binomial model tells us why it happens. It gives us a simple table of coefficients. We can say with confidence, "Aston Villa, when playing away, tends to score exp(-0.4318) ≈ 0.65 times the rate of the baseline team, holding all else constant." This is inference: the ability to explain the relationships in your data. For countless business and scientific problems, this is far more valuable than a black-box prediction (see the sketch after this list).

  4. It Helped Us Build a Better XGBoost Model. Our statistical knowledge directly improved our XGBoost model. Because we knew the target variable was count data following a Poisson-like process, we specifically chose objective='count:poisson' for our XGBoost regressor. A student without this knowledge would have used the default objective='reg:squarederror', which is optimized for normally distributed errors. Our theory-driven choice of objective function almost certainly led to a more accurate and better-calibrated XGBoost model.
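To make the inference point concrete, here is a minimal sketch, assuming the fitted poisson_model and the numpy import from earlier cells (the parameter name matches the summary table), that converts a log-scale coefficient into a multiplicative rate ratio:

In [ ]:
# Convert a log-scale coefficient into a multiplicative rate ratio.
# Assumes `poisson_model` and numpy (np) from earlier cells.
coef = poisson_model.params["C(AwayTeam)[T.Aston Villa]"]
print(f"Aston Villa away rate ratio vs. the baseline team: {np.exp(coef):.3f}")  # ~0.649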

The Final Analogy for Students: The Doctor's Diagnosis. Think of it like a doctor treating a patient. The EDA and statistical modeling (checking the mean and variance, fitting a Negative Binomial model) is like the initial diagnosis. The doctor uses a stethoscope, checks the temperature, asks questions, and draws on their fundamental knowledge of biology. They try to understand the nature of the illness. The XGBoost model is the powerful, advanced treatment.

You would never want a doctor who skips the diagnosis and just gives every patient the strongest medicine they have. They must first understand the problem. Our statistical analysis was the diagnosis. It gave us the insight and the benchmark needed to apply the powerful treatment (XGBoost) effectively and responsibly. The final lesson: Don't be the data scientist who just knows how to use the powerful medicine. Be the one who also knows how to diagnose the patient. That is the path to becoming a truly valuable and effective practitioner.

Another Analysis¶

In [41]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- Step 1: Creating the Synthetic Dataset ---
print("--- Step 1: Creating Synthetic Dataset... ---")
np.random.seed(42)
n_samples = 1000

# Predictor 1: Poisson Distribution
x1_poisson = np.random.poisson(lam=5, size=n_samples)

# Predictor 2: Normal Distribution
x2_normal = np.random.normal(loc=10, scale=2, size=n_samples)

# Predictor 3: Indeterminate/Skewed Distribution (Log-Normal)
x3_skewed = np.random.lognormal(mean=2, sigma=0.5, size=n_samples)

# Let's define the true coefficients (the hidden values the model will try to learn)
beta_0 = 0.5  # Intercept
beta_1 = 0.05 # Effect of the Poisson predictor
beta_2 = 0.1  # Effect of the Normal predictor
beta_3 = 0.01 # Effect of the Skewed predictor

# Calculate the expected value (lambda) of the target variable
# log(lambda) = beta_0 + beta_1*x1 + beta_2*x2 + beta_3*x3
log_lambda = beta_0 + (beta_1 * x1_poisson) + (beta_2 * x2_normal) + (beta_3 * x3_skewed)
lambda_values = np.exp(log_lambda)

# TARGET VARIABLE: Create the target variable from a Poisson distribution
y_poisson = np.random.poisson(lam=lambda_values, size=n_samples)

# Let's collect the data in a DataFrame
df = pd.DataFrame({
    'y_poisson': y_poisson,
    'x1_poisson': x1_poisson,
    'x2_normal': x2_normal,
    'x3_skewed': x3_skewed
})

print("Dataset created. First 5 rows:")
print(df.head())
print("\nDistribution of the target variable (should be Poisson):")
print(f"Mean: {df['y_poisson'].mean():.2f}, Variance: {df['y_poisson'].var():.2f}")


# --- Step 2: Training the Poisson Regression Model ---
print("\n--- Step 2: Training Poisson Regression Model... ---")
formula = "y_poisson ~ x1_poisson + x2_normal + x3_skewed"
poisson_model = smf.glm(formula=formula, data=df, family=sm.families.Poisson()).fit()

print("\nModel Summary:")
print(poisson_model.summary())


# --- Step 3: Residual Analysis ---
print("\n--- Step 3: Performing Residual Analysis... ---")

# Get the fitted values (expected lambda) from the model
fitted_values = poisson_model.fittedvalues

# Calculate Deviance Residuals (the most appropriate type of residual for GLMs)
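# For a Poisson GLM: r_D = sign(y - mu) * sqrt(2 * (y * log(y / mu) - (y - mu)))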
deviance_residuals = poisson_model.resid_deviance

# --- Visualizations ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Distribution of Residuals
sns.histplot(deviance_residuals, kde=True, ax=axes[0], color='skyblue')
axes[0].set_title('Distribution of Deviance Residuals')
axes[0].set_xlabel('Deviance Residual')
axes[0].set_ylabel('Frequency')

# 2. Residuals vs. Fitted Values Plot
sns.scatterplot(x=fitted_values, y=deviance_residuals, ax=axes[1], alpha=0.5, color='coral')
axes[1].axhline(y=0, color='red', linestyle='--') # Add a zero line
axes[1].set_title('Residuals vs. Fitted Values')
axes[1].set_xlabel('Fitted Value')
axes[1].set_ylabel('Deviance Residual')

plt.tight_layout()
plt.show()

# 3. Q-Q Plot: Check how well the residuals fit a normal distribution
fig = sm.qqplot(deviance_residuals, stats.norm, fit=True, line='45')
plt.title('Q-Q Plot of Deviance Residuals')
plt.show()
--- Step 1: Creating Synthetic Dataset... ---
Dataset created. First 5 rows:
   y_poisson  x1_poisson  x2_normal  x3_skewed
0          6           5  10.075253   2.692034
1          8           4   9.452998   9.722258
2          8           4  10.397352  23.948026
3          8           5  11.035040   5.606789
4          8           5   5.933999   8.907722

Distribution of the target variable (should be Poisson):
Mean: 6.29, Variance: 8.55

--- Step 2: Training Poisson Regression Model... ---

Model Summary:
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:              y_poisson   No. Observations:                 1000
Model:                            GLM   Df Residuals:                      996
Model Family:                 Poisson   Df Model:                            3
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2338.8
Date:                Mon, 17 Nov 2025   Deviance:                       1104.3
Time:                        15:18:09   Pearson chi2:                 1.05e+03
No. Iterations:                     4   Pseudo R-squ. (CS):             0.2690
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5872      0.074      7.907      0.000       0.442       0.733
x1_poisson     0.0480      0.006      8.594      0.000       0.037       0.059
x2_normal      0.0893      0.006     13.950      0.000       0.077       0.102
x3_skewed      0.0127      0.003      4.977      0.000       0.008       0.018
==============================================================================

--- Step 3: Performing Residual Analysis... ---
No description has been provided for this image
No description has been provided for this image

These results are a fantastic example of how theory works perfectly in practice. Here is my interpretation:

1. Model Success (Model Summary)¶

  • Coefficient Estimates (coef): The model has rediscovered the "hidden" true coefficients we used when creating the dataset with near-perfect accuracy.

    • x1_poisson: True β₁=0.05 -> Model's estimate β̂₁=0.0480 (Excellent!)
    • x2_normal: True β₂=0.1 -> Model's estimate β̂₂=0.0893 (Very good, slightly underestimated)
    • x3_skewed: True β₃=0.01 -> Model's estimate β̂₃=0.0127 (Excellent!)
    • Intercept: True β₀=0.5 -> Model's estimate β̂₀=0.5872 (Very good)
  • Statistical Significance: The p-values (P>|z|) for all coefficients are very close to zero. This indicates that the model has correctly identified that each predictor has a real and significant effect on the target variable.

  • Conclusion: The model has successfully learned the underlying relationships with the target variable, unaffected by the individual distributions of the predictors themselves (Poisson, Normal, etc.).
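A quick way to see this side by side, in a sketch that assumes the fitted poisson_model from the cell above (the true betas are the values defined in the data-generation step):

In [ ]:
# Compare the estimated coefficients with the true generating values.
# Assumes `poisson_model` from the cell above.
true_betas = {'Intercept': 0.5, 'x1_poisson': 0.05, 'x2_normal': 0.1, 'x3_skewed': 0.01}
for name, true_val in true_betas.items():
    print(f"{name:>10}: true = {true_val:<5} estimated = {poisson_model.params[name]:.4f}")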

2. Residual Analysis: A "Healthy" Model's CT Scan¶

These three plots are the most important pieces of evidence demonstrating that a model is "healthy" and reliable.

  • Distribution of Deviance Residuals (Histogram): The plot shows a near-perfect bell curve (normal distribution). This indicates that one of the model's core assumptions is met and that the residuals do not contain any systematic error. This is exactly the picture we want to see.

  • Q-Q Plot of Deviance Residuals: This is the most robust way to test the normality assumption. The fact that the points cluster almost perfectly around the red y=x line confirms that the distribution of the residuals is very close to normal. The very slight deviations at the tails are a normal and expected occurrence in real-world data.

  • Residuals vs. Fitted Values (The Most Important Plot): There is no discernible pattern in this plot. The points exhibit a random scatter around the red zero-line.

    • There is no funnel shape (heteroscedasticity); this shows that the variance of the residuals remains constant across the fitted values.
    • There is no curve (non-linearity); this shows that the model has not missed any non-linear relationships in the data.
    • Conclusion: This is an ideal result, indicating that the model has captured all the explainable information in the data, and the remaining residuals are pure, unpredictable "white noise".

Overall Interpretation¶

This analysis perfectly demonstrates how a Poisson regression model should behave under the right conditions (i.e., when the target variable truly comes from a Poisson process).

Furthermore, this experiment definitively proves the main idea we have been discussing from the very beginning: What matters in selecting a model is not the distribution of the predictors (x), but the distribution of the target variable (y).

Our model, being appropriate for the nature of the data, has successfully performed its task, completely unaffected by whether the predictors were Poisson, Normal, or entirely skewed. This is a living proof of one of the most fundamental and elegant principles of statistical modeling.

But the R-squared is low?


This is an excellent point and exactly the kind of critical question a data scientist should ask. It touches upon one of the most nuanced topics in model evaluation: What is the definition of a "good model"?

The observation perfectly contrasts two different aspects of statistical modeling:

  1. Model Adequacy/Fit: Whether the model accurately reflects the underlying structure and assumptions of the data.
  2. Explanatory/Predictive Power: How much of the variance in the target variable is explained by the model.

Now let's analyze the difference between these two and your observation.

Why You Are Right: The Intuitive Expectation¶

Intuitively, you are absolutely right. There appears to be a contradiction between the perfect-looking residual analyses (Q-Q plot, residuals vs. fitted plot) and a low R-squared value. It is very logical to ask, "If the model 'fits' the data so well, why can it only explain 26.9% of the variation in the target?"

The Answer: "Good Fit" and "High Explanatory Power" Are Not the Same Thing¶

Here is the critical difference between the two:

What Does it Mean for the Q-Q Plot and Residual Plot to be Good?

  • This tells us that the model's assumptions are correct and that it has accurately captured the systematic pattern in the data.
  • In other words, our model has correctly formulated the relationship between y and x1, x2, x3. What remains in the residuals is the completely random, "white noise" that the model cannot explain. Our model has done its job; it has extracted all the meaningful information that can be extracted from the data.

What Does it Mean for the Pseudo R-squared to be 0.2690?

  • This tells us that only 26.9% of the total variance (variability) in the target variable (y_poisson) can be explained by our predictors (x1, x2, x3).
  • The remaining 73.1% of the variance originates from a factor that is not in our model. What is that factor? It's this line in our data generation process: y_poisson = np.random.poisson(lam=lambda_values, size=n_samples)
  • This is the inherent randomness that is a natural part of the Poisson distribution. The value of y is formed randomly around the mean (lambda_values) that is determined by the x variables.
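To make this split concrete, here is a minimal sketch using the law of total variance for a Poisson target, Var(y) = E[λ] + Var(λ), assuming the lambda_values array and the numpy import from the data-generation cell:

In [ ]:
# Law of total variance for y ~ Poisson(lambda):
# Var(y) = E[Var(y | lambda)] + Var(E[y | lambda]) = E[lambda] + Var(lambda).
# Assumes `lambda_values` from the data-generation cell above.
explainable = np.var(lambda_values)   # variance carried by the predictors
irreducible = np.mean(lambda_values)  # pure Poisson noise around the mean
total = explainable + irreducible
print(f"Share explainable by predictors: {explainable / total:.1%}")
print(f"Share from Poisson randomness:   {irreducible / total:.1%}")

The two shares should land close to the roughly 27% / 73% split quoted above.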

An Analogy: Weather Forecasting¶

Let's use an analogy to understand this better:

Imagine we have three predictors to forecast the air temperature (y): season (x1), time of day (x2), and cloud cover (x3).

  • Model: temperature ~ season + time of day + cloud cover
  • Residual Analysis: The residual analysis for this model would likely look great. The model correctly learns the relationship of temperature with the season, time, and clouds. The residuals would appear random.
  • R-squared: It might achieve a high value like 0.80. This indicates that 80% of the variation in temperature can be explained by these three factors.

Now, imagine we consider a fourth factor: a random gust of wind (x4). This is a completely unpredictable factor. It causes small, instantaneous fluctuations in the temperature.

  • Our model cannot explain the random gust of wind. This is the 20% of "unexplained variance" that prevents our R-squared from being 1.00.

In our synthetic dataset, this "random gust of wind" plays a much larger role. 73.1% of the total variability in y_poisson comes from the pure Poisson randomness around the mean determined by the predictors.

Final Interpretation¶

Your observation highlights the most important lesson in model evaluation:

A model can be both "statistically adequate" (meets all assumptions, perfect residuals) and have "low explanatory power" (low R-squared) at the same time.

This is not a contradiction. It shows that our model is doing the best it can with the predictors it has, but that a large portion of the variability in the target variable is caused by natural and random processes that are outside the control of these predictors.

Therefore, these results do not indicate that the model is "bad," but quite the opposite: they show that it honestly reflects the nature of the data. Our problem is a problem that, by its very nature, contains a high degree of randomness.

Let's plot the residuals against each predictor:

While the general Residuals vs. Fitted Values plot shows the model's overall performance, plotting the residuals against each predictor (Residuals vs. Predictor Plot) specifically tests whether the relationship between that predictor and the target variable is modeled correctly.

What Are We Looking For? In each plot, we are looking for the same thing as in the general plot: randomness. We want the points to scatter like a random cloud around the red zero line, without any noticeable pattern.

  • If we see a curve: This indicates a non-linear relationship between that predictor and the target variable that our model could not capture. For example, y being related to x².
  • If we see a funnel shape: This indicates that the variance of the residuals changes depending on the predictor's value (heteroscedasticity). This means one of the model's fundamental assumptions is violated.

Let's create these plots.


Expected Results and Interpretation¶

Since we created our dataset and established a simple linear (log-linear) relationship between y and the x's, we should expect these plots to also be "healthy".

  • All Plots: In all three plots, we expect the points to scatter randomly around the red line without any noticeable pattern.
  • x1_poisson Plot: Since x1_poisson itself is a discrete variable, the points will cluster in vertical lines. This is normal. The important thing is that the centers of these vertical lines are randomly positioned around the red line.
  • x3_skewed Plot: Since x3_skewed itself is right-skewed, we may see sparser points on the right side of the plot and denser points on the left. This is also normal. The important thing is that, despite this density difference, the average of the residuals stays around the zero line and does not form a funnel.

This analysis will allow us to confirm that our model is correctly established not only in general but also on the basis of its relationship with each predictor. This is one of the most detailed checks done to ensure a model's reliability.

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Let's use the variables from the previous analysis
# deviance_residuals = poisson_model.resid_deviance
# x1_poisson = df['x1_poisson']
# x2_normal = df['x2_normal']
# x3_skewed = df['x3_skewed']

print("--- Residuals vs. Each Predictor Plot ---")

# Let's create a figure and axes for 3 plots
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

# --- Plot 1: Residuals vs. x1_poisson ---
sns.scatterplot(x=x1_poisson, y=deviance_residuals, ax=axes[0], alpha=0.5, color='blue')
axes[0].axhline(y=0, color='red', linestyle='--')
axes[0].set_title('Residuals vs. x1_poisson')
axes[0].set_xlabel('x1_poisson Values')
axes[0].set_ylabel('Deviance Residual')

# --- Plot 2: Residuals vs. x2_normal ---
sns.scatterplot(x=x2_normal, y=deviance_residuals, ax=axes[1], alpha=0.5, color='green')
axes[1].axhline(y=0, color='red', linestyle='--')
axes[1].set_title('Residuals vs. x2_normal')
axes[1].set_xlabel('x2_normal Values')
axes[1].set_ylabel('Deviance Residual')

# --- Plot 3: Residuals vs. x3_skewed ---
sns.scatterplot(x=x3_skewed, y=deviance_residuals, ax=axes[2], alpha=0.5, color='purple')
axes[2].axhline(y=0, color='red', linestyle='--')
axes[2].set_title('Residuals vs. x3_skewed')
axes[2].set_xlabel('x3_skewed Values')
axes[2].set_ylabel('Deviance Residual')

plt.tight_layout()
plt.show()
--- Residuals vs. Each Predictor Plot ---
No description has been provided for this image

These graphs are the final proof confirming how "healthy" our model is. The results are exactly as we expected and hoped. Let's interpret each graph individually:

1. Residuals vs. x1_poisson (Blue Graph)¶

  • Observation: The points are clustered in vertical lines over the integer values that x1_poisson can take (0, 1, 2, ...).
  • Interpretation: This appearance is entirely normal and expected, resulting from the predictor itself being discrete.
  • Most Important Conclusion: The average (the dense point) of each of these vertical lines is very close to the red zero line. There is no systematic upward or downward shift or a curve pattern along the vertical lines.
  • Decision: The model has correctly modeled the relationship between x1_poisson and the target variable.

2. Residuals vs. x2_normal (Green Graph)¶

  • Observation: The points are scattered like a random cloud around the red zero line across the entire graph. The density of the points is higher in the center, just like the normal distribution of x2_normal itself.
  • Interpretation: This is an ideal residual plot.
  • Most Important Conclusion: There is no discernible pattern (curve, funnel, etc.) in the plot. This shows that the log-linear relationship between x2_normal and y has been fully captured by the model, and only random noise remains.
  • Decision: The model has perfectly modeled the relationship between x2_normal and the target variable.

3. Residuals vs. x3_skewed (Purple Graph)¶

  • Observation: The distribution of the points reflects the right-skewed distribution of x3_skewed itself; it's denser on the left and becomes sparser towards the right.
  • Interpretation: The skewness of the predictor's own distribution causing a similar density distribution in the residual plot is also completely normal and expected.
  • Most Important Conclusion: Despite this change in density, the average of the residuals remains constant around the red zero line. The vertical scatter (variance) of the residuals does not systematically increase or decrease as the value of x3_skewed increases (no funnel shape).
  • Decision: The model has also correctly modeled the relationship between the skewed predictor x3_skewed and the target variable.

Final and Comprehensive Conclusion¶

These three graphs take our previous analyses a step further, proving that our model satisfies the assumptions and fits the data correctly not only overall, but also on the basis of each individual predictor.

This experiment reinforces the fundamental principle that has summarized our entire research journey:

The success and validity of a statistical model or a machine learning model depend not on the distribution of the predictor variables, but on selecting a model family appropriate for the distribution of the target variable and verifying the model's assumptions through residual analysis.

With this analysis, we have examined and verified the inner workings of a Poisson regression model and what a "healthy" model should look like down to the finest detail.

What if we had a predictor that was continuous but right-skewed, similar to a Poisson distribution? Let's add this variable, assuming the target variable and other distributions remain the same.

This is an excellent question. It's a great "stress test" scenario to test the flexibility and robustness of our model's assumptions.

Let's summarize the scenario:

  • Our existing structure of x1_poisson, x2_normal, x3_skewed, and y_poisson remains the same.
  • We are adding a fourth predictor to the system: x4_gamma. This predictor is:
    • Continuous: Not discrete like x1_poisson.
    • Right-Skewed: Not symmetric like x2_normal.
    • Poisson-Like: Has a shape that visually resembles a Poisson distribution. The Gamma distribution is a perfect candidate for this. With low "shape" parameters, the Gamma distribution produces a distribution that fits this description exactly.

What Should Our Hypothesis Be? If our previous results and theory are correct, the model's performance should not be affected. The Poisson regression model should not be "bothered" by the distribution of this new x4_gamma variable and should successfully learn its relationship with y_poisson just like the others. Its residual plot should yield a healthy result, similar to that of x3_skewed.

Let's test this hypothesis with code.


Expected Results and Interpretation¶

  1. x4_gamma Distribution Plot: We expect this graph to show a shape very similar to a Poisson distribution, starting from zero, quickly reaching a peak, and then slowly decreasing with a long right tail.

  2. Improved Model Summary:

    • We expect the model to find a coefficient (coef) for x4_gamma that is very close to the true beta_4 value of 0.08 that we defined, along with the other coefficients.
    • The p-value for this coefficient should also be statistically significant (very small).
  3. Residuals vs. x4_gamma Plot:

    • This is the most important test. We expect this plot to be "healthy" as well, just like the x3_skewed plot.
    • The points should scatter like a random cloud around the red zero line.
    • It is normal for the structure to be denser on the left and sparser on the right, as this reflects the skewed distribution of x4_gamma itself.
    • The important thing is that there should be no curve or funnel pattern, despite this difference in density.

If the results turn out as expected, it will definitively prove how robust the Poisson regression model is against the distribution shapes of the predictor variables fed into it. This will be a very powerful lesson demonstrating how important the model's fundamental assumptions are (for the target variable) and how unimportant they are (for the predictors).

In [43]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- Step 1: Creating the Synthetic Dataset (with the New Variable) ---
print("--- Step 1: Creating Synthetic Dataset with New Variable... ---")
np.random.seed(42)
n_samples = 1000

# Previous predictors
x1_poisson = np.random.poisson(lam=5, size=n_samples)
x2_normal = np.random.normal(loc=10, scale=2, size=n_samples)
x3_skewed = np.random.lognormal(mean=2, sigma=0.5, size=n_samples)

# NEW PREDICTOR: Continuous, right-skewed, Poisson-like (Gamma Distribution)
# The shape parameter determines the form of the distribution; lower values make it more right-skewed.
shape_param = 2.0 
x4_gamma = np.random.gamma(shape=shape_param, scale=1.5, size=n_samples)

# Let's define the new true coefficients
beta_0 = 0.5
beta_1 = 0.05
beta_2 = 0.1
beta_3 = 0.01
beta_4 = 0.08 # Effect of the new variable

# Calculate the new lambda
log_lambda = beta_0 + (beta_1 * x1_poisson) + (beta_2 * x2_normal) + (beta_3 * x3_skewed) + (beta_4 * x4_gamma)
lambda_values = np.exp(log_lambda)

# Recreate the target variable
y_poisson = np.random.poisson(lam=lambda_values, size=n_samples)

# New DataFrame
df_new = pd.DataFrame({
    'y_poisson': y_poisson,
    'x1_poisson': x1_poisson,
    'x2_normal': x2_normal,
    'x3_skewed': x3_skewed,
    'x4_gamma': x4_gamma
})

# Let's visualize the distribution of the new variable
print("Distribution of the New Variable (x4_gamma):")
sns.histplot(df_new['x4_gamma'], kde=True, color='orange')
plt.title('Distribution of x4_gamma (Gamma Distribution)')
plt.show()


# --- Step 2: Training the New Poisson Regression Model ---
print("\n--- Step 2: Training the Improved Poisson Regression Model... ---")
formula_new = "y_poisson ~ x1_poisson + x2_normal + x3_skewed + x4_gamma"
poisson_model_new = smf.glm(formula=formula_new, data=df_new, family=sm.families.Poisson()).fit()

print("\nImproved Model Summary:")
print(poisson_model_new.summary())


# --- Step 3: Residual Analysis (for the New Variable) ---
print("\n--- Step 3: Performing Residual Analysis for the New Variable... ---")
deviance_residuals_new = poisson_model_new.resid_deviance

plt.figure(figsize=(8, 6))
sns.scatterplot(x=df_new['x4_gamma'], y=deviance_residuals_new, alpha=0.5, color='orange')
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals vs. x4_gamma')
plt.xlabel('x4_gamma Values')
plt.ylabel('Deviance Residual')
plt.show()
--- Step 1: Creating Synthetic Dataset with New Variable... ---
Distribution of the New Variable (x4_gamma):
No description has been provided for this image
--- Step 2: Training the Improved Poisson Regression Model... ---

Improved Model Summary:
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:              y_poisson   No. Observations:                 1000
Model:                            GLM   Df Residuals:                      995
Model Family:                 Poisson   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2494.9
Date:                Mon, 17 Nov 2025   Deviance:                       1151.8
Time:                        15:23:03   Pearson chi2:                 1.10e+03
No. Iterations:                     4   Pseudo R-squ. (CS):             0.5442
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4941      0.067      7.377      0.000       0.363       0.625
x1_poisson     0.0448      0.005      9.208      0.000       0.035       0.054
x2_normal      0.1026      0.006     18.326      0.000       0.092       0.114
x3_skewed      0.0096      0.002      4.257      0.000       0.005       0.014
x4_gamma       0.0825      0.004     18.605      0.000       0.074       0.091
==============================================================================

--- Step 3: Performing Residual Analysis for the New Variable... ---
No description has been provided for this image

These results perfectly confirm our hypothesis. This experiment has very clearly demonstrated one of the most fundamental principles of statistical modeling.

Here is the detailed interpretation of the results:

1. Distribution of the New Variable (x4_gamma)¶

  • Observation: The histogram shows a distribution that is exactly as we intended: continuous, right-skewed, and visually resembling a Poisson distribution. The peak is on the left, and it has a long tail to the right.
  • Conclusion: We have successfully created the right type of predictor for the experiment.

2. Improved Model Summary¶

  • Coefficient Estimates (coef): The model has rediscovered all the "hidden" true coefficients, including the new variable, with outstanding success.
    • x1_poisson: True β₁=0.05 -> Model's estimate β̂₁=0.0448
    • x2_normal: True β₂=0.1 -> Model's estimate β̂₂=0.1026
    • x3_skewed: True β₃=0.01 -> Model's estimate β̂₃=0.0096
    • x4_gamma: True β₄=0.08 -> Model's estimate β̂₄=0.0825 (Excellent!)
  • Statistical Significance: The p-values for all predictors, including the newly added x4_gamma, are very close to zero. This shows that the model correctly understood that x4_gamma also has a real and significant effect on the target.

Conclusion: Our model had no problem incorporating the x4_gamma variable, which has a Poisson-like, continuous, and skewed distribution. It successfully learned its relationship with the target, just like the other variables.

3. Residuals vs. x4_gamma Plot¶

  • Observation: The points are scattered like a random cloud without a distinct pattern around the red zero line.
  • Interpretation: This is an ideal, "healthy" residual plot.
    • There is no curve, which indicates that the log-linear relationship between x4_gamma and y was correctly captured by the model.
    • There is no funnel, which indicates that the variance of the residuals does not change with the value of x4_gamma (the homoscedasticity assumption is met).
    • The fact that the points are denser on the left side of the graph and sparser on the right is a natural reflection of the skewed distribution of x4_gamma itself and does not pose a problem.

Conclusion: The residual analysis definitively proves that the model processed the x4_gamma variable correctly and left behind only unpredictable, random noise.

Final and Most Comprehensive Conclusion¶

With this experiment, we have indisputably proven the most fundamental and important conclusion of our entire research journey:

The validity and performance of a Generalized Linear Model (GLM)—for example, a Poisson Regression—do not depend on the distribution of the predictor variables. The model is not affected by whether the predictors are discrete, continuous, symmetric, skewed, or distributed in any other way. The only thing that matters is selecting the model family (e.g., Poisson, Gamma, Normal, etc.) that is appropriate for the distribution of the target variable and verifying the model's fundamental assumptions with residual analysis.

This is an extremely powerful and practical principle that guides us in the model selection and data preparation processes.

Let's try an XGBoost model with this data:

This will form the final link in our research: we have a statistical "glass ceiling" model (Poisson GLM) that perfectly understands the nature of the data and fully meets the assumptions. Now, let's see how an algorithmic model, XGBoost, will perform in this "ideal" scenario.

What is Our Expectation? The Poisson GLM model is already very successful because its structure is very close to the actual mathematical formula that generated the data. We will see if XGBoost can match or exceed this performance. We will also check if the variables that XGBoost identifies as "important" are the ones to which we initially assigned larger beta coefficients.

Interpretation of the Results¶

The results you will get when you run this code will present a comparison of two different modeling philosophies:

  1. Performance Metrics (RMSE/MAE):

    • The RMSE and MAE values of the XGBoost model and the Poisson GLM model will be very close to each other.
    • This proves that when the process that generates the data is simple and linear (log-linear), a GLM whose assumptions perfectly match this process can perform as well as (and sometimes better than) the much more complex XGBoost. The power of XGBoost emerges in more complex, non-linear relationships where the assumptions of a GLM are violated.
  2. Feature Importance Levels:

    • This is the most interesting part. The feature importance order created by XGBoost should largely match the order of the magnitude of the beta coefficients we defined at the very beginning.
    • That is, since this run uses x1_poisson, x2_normal, and x3_skewed as predictors, we expect the graph to show x2_normal as the most important, x1_poisson as the second, and x3_skewed as the least important feature.
    • This would confirm that XGBoost, without making any assumptions and learning purely from the data, can correctly identify which variables have more impact on the target.

Final Conclusion and the Closing of Our Research¶

With this final experiment, we tie together our entire research journey:

  • When we understand the nature of a problem (the distribution of the target variable), we can build a statistical model (Poisson GLM) that is appropriate for that nature. This model is extremely interpretable (coefficients) and powerful (accurate predictions).
  • An algorithmic model (XGBoost), which makes no assumptions, discovers the nature of the data on its own. It understands which variables are more important (feature importance) and makes powerful predictions with this information.
  • When the process that generates the data is simple, these two philosophies show similar performance. As the problem becomes more complex, the flexibility of XGBoost usually gives it an advantage.

In conclusion, understanding statistical distributions not only gives us the ability to choose the right statistical model but also provides the fundamental intuition and understanding necessary to use more complex models like XGBoost more intelligently (e.g., choosing objective='count:poisson') and to interpret their results better.

Using XGBoost for Poisson Count Prediction¶

XGBoost can be used for count data by setting the objective to 'count:poisson'. This is useful for predicting the number of events (counts) directly, leveraging the Poisson distribution assumptions.

Let's apply XGBoost to our synthetic dataset to predict y_poisson using the predictors x1_poisson, x2_normal, x3_skewed.

In [44]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt  # needed for the importance plot below
import numpy as np

# Prepare the data
X = df[['x1_poisson', 'x2_normal', 'x3_skewed']]
y = df['y_poisson']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for the count:poisson objective.
# Note: the number of boosting rounds is passed to xgb.train as
# num_boost_round; 'n_estimators' is a scikit-learn-wrapper parameter
# and would be silently ignored (with a warning) by xgb.train.
params = {
    'objective': 'count:poisson',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'seed': 42
}

# Train the model for 100 boosting rounds
num_rounds = 100
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Make predictions
y_pred = bst.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("XGBoost with count:poisson objective")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# Compare with the Poisson regression model fit earlier in the notebook
poisson_pred = poisson_model.predict(X_test)
poisson_mse = mean_squared_error(y_test, poisson_pred)
poisson_rmse = np.sqrt(poisson_mse)
poisson_mae = mean_absolute_error(y_test, poisson_pred)

print("\nComparison with Poisson Regression:")
print(f"Poisson Regression RMSE: {poisson_rmse:.4f}")
print(f"Poisson Regression MAE: {poisson_mae:.4f}")

# Feature importance
xgb.plot_importance(bst)
plt.title('XGBoost Feature Importance')
plt.show()
XGBoost with count:poisson objective
RMSE: 2.7860
MAE: 2.2027

Comparison with Poisson Regression:
Poisson Regression RMSE: 2.6064
Poisson Regression MAE: 2.0824
No description has been provided for this image
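One API detail is worth noting here: in the native xgb.train interface the number of trees is set with num_boost_round, while n_estimators belongs to the scikit-learn wrapper (which is why it was dropped from the params dictionary above). A minimal sketch of the equivalent wrapper call, reusing the train/test split from the cell above:

In [ ]:
from xgboost import XGBRegressor

# scikit-learn wrapper: n_estimators is the valid way to set the number of
# boosting rounds here, and the Poisson objective is passed as a string.
sk_model = XGBRegressor(
    objective='count:poisson',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
sk_model.fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)  # predicted expected counts, always positive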

This is perhaps the most surprising and instructive result of our entire research journey. Analyzing these outputs lets us examine the relationship between statistical modeling and machine learning at its deepest points.

Here is the detailed and comprehensive interpretation of these results:

Detailed Analysis¶

There are two major and unexpected results in this experiment:

  1. Prediction Performance: The "simpler" statistical model (Poisson GLM) beat the "more complex" machine learning model (XGBoost).
  2. Feature Importance: XGBoost failed to recover the true order of importance of the variables.

Let's examine each of these surprising results in turn.

1. Prediction Performance: Why Did the "Simpler" Model Win?¶

Metric   Poisson GLM (Winner)   XGBoost
RMSE     2.6064                 2.7860
MAE      2.0824                 2.2027

This result contains a very important lesson that shatters the misconception that "the most complex model is always the best." The reason the Poisson GLM won is related to the nature of this problem:

  • "Home Field Advantage": We created this dataset using the exact mathematical formula (λ = exp(β₀ + β₁*x₁ + ...) ) that the Poisson GLM is designed to solve. In other words, we asked the Poisson GLM a question for which it "knew the right answer." The model perfectly captured the simple, smooth, and log-linear relationship underlying the data.

  • XGBoost's Approach: XGBoost does not know this underlying formula. It tries to approximate this relationship by creating thousands of simple "if-then" rules (decision trees). For example, it takes thousands of small steps like, "if x2 > 10.5, then the prediction is this, otherwise it's that." This approach is incredibly powerful at capturing complex and non-linear patterns. However, when it tries to "imitate" a smooth and simple pattern like the one in our data, it leaves small imperfections.
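To make the "home field advantage" concrete, here is a minimal sketch of a generator of this form. The slopes are the betas used throughout this research; the intercept, the sample size, and the exact predictor distributions are illustrative assumptions:

In [ ]:
import numpy as np

rng = np.random.default_rng(42)
n = 1000  # sample size (illustrative)

# Predictors with deliberately different distributions
x1 = rng.poisson(3, n)       # x1_poisson
x2 = rng.normal(10, 2, n)    # x2_normal (illustrative mean and sd)
x3 = rng.exponential(2, n)   # x3_skewed (illustrative right-skewed draw)

# Log-linear data-generating process; beta_0 = 1.0 is illustrative,
# the slopes match the betas defined earlier in the research
lam = np.exp(1.0 + 0.05 * x1 + 0.1 * x2 + 0.01 * x3)
y = rng.poisson(lam)         # the count target, y_poisson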

Analogy: This is like drawing a circle. The Poisson GLM had a compass (the correct formula) and drew a perfect circle. XGBoost had a ruler and tried to draw the circle with thousands of tiny, straight lines. The result looked very much like a circle, but it couldn't be as flawless as the one drawn with the compass.

2. Feature Importance: Why Did XGBoost Get It Wrong?¶

  • True Order of Importance (According to the betas we defined):

    1. x2_normal (β₂=0.1)
    2. x1_poisson (β₁=0.05)
    3. x3_skewed (β₃=0.01)
  • Order of Importance Found by XGBoost:

    1. x3_skewed
    2. x2_normal
    3. x1_poisson

The reason XGBoost got this order wrong is because it defines the word "importance" differently:

  • "Importance" for Poisson GLM: It is the magnitude of a coefficient (coef). This directly measures the size of the effect (effect size) of that variable on the target.
  • "Importance" for XGBoost (by default): It is the number of times a variable is used to split a branch (frequency/weight) across all the trees in the model. This measures the utility of that feature in the model's prediction-making process.

The x3_skewed variable, being right-skewed and having a wide range, may have offered many "useful" split points for the branches of the trees. This could have led to it being used frequently by the model despite its small overall effect. This is strong evidence that XGBoost's default feature importance does not measure the same thing as a variable's true effect size.
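This is easy to check: xgb.plot_importance defaults to importance_type='weight' (split counts), while 'gain' measures how much each split actually reduced the loss and usually tracks effect size more closely. A minimal sketch using the bst booster trained above:

In [ ]:
# 'weight': how often a feature is used to split (XGBoost's default ranking).
# 'gain': the average loss reduction produced by those splits.
for imp_type in ('weight', 'gain'):
    scores = bst.get_score(importance_type=imp_type)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    print(imp_type, ranked)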

Final and Comprehensive Conclusion¶

This final experiment offers the deepest and most important lesson of our entire research journey:

  1. There Is Such a Thing as the "Right Tool": If the underlying process of a problem is simple, statistical, and interpretable, a statistical model designed for that process (like Poisson GLM) can yield more accurate and more interpretable results than a more complex "black box" model (like XGBoost).

  2. Statistical Models Answer the "Why" Question: The Poisson GLM not only gave us a prediction but also clearly told us the effect of each variable (coef) and the reality of that effect (p-value). This is invaluable for understanding.

  3. Machine Learning Models Answer the "What Will Happen" Question: XGBoost's purpose is not to understand, but to make the best possible prediction. Although it couldn't make a better prediction in this experiment, it often outperforms statistical models on more complex, real-world problems.

This research is a concrete demonstration that a data scientist should not reach blindly for the most popular algorithm, but should first understand the nature of the problem, establish a baseline with an appropriate statistical model, and only then try to surpass that baseline by experimenting with more complex models. It was an excellent learning process from start to finish.

Analysis by Hincal Topcuoglu