Density Estimation¶
The Density Estimation assignment requires you to estimate probability distributions for your student habits and academic outcomes dataset. Since a previous investigation confirmed that the 80,000-student data is not normally distributed, the most appropriate method detailed in the sources is the Gaussian Mixture Model (GMM), which is fitted with the Expectation-Maximization (E-M) algorithm.
Like Kernel Density Estimation (KDE), the GMM is a density estimation technique: it models the joint probability distribution $p(\vec{x})$ as a weighted sum of Gaussian components, which also allows you to find "natural groupings" or clusters within your data.
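Written out for the two variables used below, and with the diagonal covariance the code assumes, the estimated density has the standard mixture form

$$p(x, y) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\!\left(x \mid \mu_{x,k}, \sigma_{x,k}^2\right) \, \mathcal{N}\!\left(y \mid \mu_{y,k}, \sigma_{y,k}^2\right), \qquad \sum_{k=1}^{K} \pi_k = 1,$$

where the mixing weights $\pi_k$ correspond to pc in the code and the per-component means and variances correspond to mux, muy, varx, vary.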
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial import Voronoi, voronoi_plot_2d
import time
LOAD DATA (Actual 80,000-Student Dataset)¶
# --- 1. LOAD DATA (Actual 80,000-student dataset) ---
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# Select the input (X) and output (Y) variables for 2D density estimation:
x = df['study_hours_per_day'].values
y = df['exam_score'].values
# Ensure data consistency
if len(x) != len(y):
    min_len = min(len(x), len(y))
    x = x[:min_len]
    y = y[:min_len]
npts = len(x) # N is the number of data points
print(f"Data successfully loaded. N={npts} data points.")
Data successfully loaded. N=80000 data points.
# --- 2. SETUP PARAMETERS ---
nclusters = 3 # K or M: Hyperparameter defining the number of components
nsteps = 25 # Number of E-M iterations (Should be sufficient for convergence)
nplot = 100 # Grid size for plotting density
np.random.seed(0)
log_likelihoods = []
# --- 3. INITIALIZE MODEL PARAMETERS ---
# Choose starting points for cluster means (mux, muy) randomly from data
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
# Initialize overall variance (varx, vary) and cluster weights (pc)
# We assume diagonal covariance (variance) for simplicity in this 2D example
varx = np.ones(nclusters) * (np.max(x) - np.min(x))**2
vary = np.ones(nclusters) * (np.max(y) - np.min(y))**2
pc = np.ones(nclusters) / nclusters # p(c_m)
# Plot initial state (optional visual check)
fig, ax = plt.subplots()
plt.plot(x, y, '.')
plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary), fmt='r.', markersize=20)
plt.autoscale()
plt.title('GMM Initialization: Before Iteration')
plt.show()
Expectation-Maximization (E-M) Iteration¶
This loop performs the iterative process that maximizes the log-likelihood of the model parameters.
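For reference, these are the standard update equations that the vectorized code below implements for a diagonal-covariance mixture (the code additionally adds a small constant to the variances for numerical stability):

$$\text{E-step:}\quad \gamma_{ik} = p(c_k \mid \vec{v}_i) = \frac{\pi_k \, p(\vec{v}_i \mid c_k)}{\sum_{j=1}^{K} \pi_j \, p(\vec{v}_i \mid c_j)}$$

$$\text{M-step:}\quad N_k = \sum_{i=1}^{N} \gamma_{ik}, \qquad \pi_k = \frac{N_k}{N}, \qquad \vec{\mu}_k = \frac{1}{N_k}\sum_{i=1}^{N} \gamma_{ik}\,\vec{v}_i, \qquad \sigma_k^2 = \frac{1}{N_k}\sum_{i=1}^{N} \gamma_{ik}\,(\vec{v}_i - \vec{\mu}_k)^2$$

Here $\vec{v}_i = (x_i, y_i)$ is one student's data point and the variance update is applied separately to each dimension.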
# --- 4. EXPECTATION-MAXIMIZATION (E-M) ITERATION ---
for i in range(nsteps):
    # --- E-STEP (Expectation): Calculate Responsibilities p(c|x,y) ---
    # Construct matrices for vectorized operations (N rows x M columns)
    xm = np.outer(x, np.ones(nclusters))
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)
    varym = np.outer(np.ones(npts), vary)
    # Calculate p(v|c) - Gaussian probability of data point (v=x,y) given cluster (c)
    # This assumes independent variance components, approximating multivariate Gaussian N(x|mu, Sigma)
    pvgc = (1 / np.sqrt(2 * np.pi * varxm)) * np.exp(- (xm - muxm) ** 2 / (2 * varxm)) * \
           (1 / np.sqrt(2 * np.pi * varym)) * np.exp(- (ym - muym) ** 2 / (2 * varym))
    # Calculate p(v,c) = p(v|c) * p(c) (Joint probability, where p(c) is the mixing weight)
    pvc = pvgc * np.outer(np.ones(npts), pc)
    # Calculate pcgv = p(c|v) - Conditional probability (Responsibility)
    pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))
    # --- M-STEP (Maximization): Update Model Parameters ---
    # Calculate Nk = sum of responsibilities for each cluster
    Nk = np.sum(pcgv, 0)
    # Update expansion weights pi_k = Nk / N [23, 24]
    pc = Nk / npts
    # Update means mu_k = sum(gamma * x) / Nk [23, 24]
    mux = np.sum(xm * pcgv, 0) / Nk
    muy = np.sum(ym * pcgv, 0) / Nk
    # Update variances Sigma_k = sum(gamma * (x - mu_k)^2) / Nk [23, 24]
    # Small constant (0.1) added to variances to prevent singularity/division by zero
    varx = 0.1 + np.sum((xm - muxm) ** 2 * pcgv, 0) / Nk
    vary = 0.1 + np.sum((ym - muym) ** 2 * pcgv, 0) / Nk
# Optional plot after the final iteration
fig, ax = plt.subplots()
plt.plot(x, y, '.')
plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary), fmt='r.', markersize=20)
plt.autoscale()
plt.title('GMM Fit: After E-M Iteration')
plt.show()
Visualization of Estimated Density¶
The final visualization step evaluates the overall probability density function $p(x, y)$ on a grid by summing the contribution of each weighted Gaussian component.
# --- 5. VISUALIZATION OF THE ESTIMATED DISTRIBUTION ---
xplot = np.linspace(np.min(x), np.max(x), nplot)
yplot = np.linspace(np.min(y), np.max(y), nplot)
(X, Y) = np.meshgrid(xplot, yplot)
p = np.zeros((nplot, nplot))
# Calculate the final density p(x,y) by summing the weighted Gaussian components
for c in range(nclusters):
    # Gaussian density for X dimension
    px = np.exp(- (X - mux[c]) ** 2 / (2 * varx[c])) / np.sqrt(2 * np.pi * varx[c])
    # Gaussian density for Y dimension
    py = np.exp(- (Y - muy[c]) ** 2 / (2 * vary[c])) / np.sqrt(2 * np.pi * vary[c])
    # p(x,y) += p(x|c) * p(y|c) * p(c)
    p += px * py * pc[c]
# Plot the 3D surface of the estimated probability distribution
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, p, cmap='viridis')
ax.set_title('Estimated Probability Distribution (GMM)')
ax.set_xlabel('Study Hours (X)')
ax.set_ylabel('Exam Score (Y)')
ax.set_zlabel('Density p(x,y)')
plt.show()
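The same weighted sum can also be evaluated at a single point, for example to score how typical an individual student is under the fitted model. A minimal sketch of such a helper, reusing the fitted parameters from the E-M run above (the example input values are arbitrary):

# Evaluate the fitted mixture density at one (study hours, exam score) point.
def gmm_density(x0, y0):
    px = np.exp(- (x0 - mux) ** 2 / (2 * varx)) / np.sqrt(2 * np.pi * varx)
    py = np.exp(- (y0 - muy) ** 2 / (2 * vary)) / np.sqrt(2 * np.pi * vary)
    return np.sum(pc * px * py)

print(gmm_density(4.0, 70.0))  # arbitrary example point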
# --- 4 (revised). EXPECTATION-MAXIMIZATION (E-M) ITERATION WITH LOG-LIKELIHOOD TRACKING ---
for i in range(nsteps):
    # --- E-STEP (Expectation): Calculate Responsibilities p(c|x,y) ---
    # Construct matrices for vectorized operations (N rows x M columns)
    xm = np.outer(x, np.ones(nclusters))
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)
    varym = np.outer(np.ones(npts), vary)
    # Calculate p(v|c) - Gaussian probability of data point (v=x,y) given cluster (c)
    # This assumes independent variance components, approximating multivariate Gaussian N(x|mu, Sigma)
    pvgc = (1 / np.sqrt(2 * np.pi * varxm)) * np.exp(- (xm - muxm) ** 2 / (2 * varxm)) * \
           (1 / np.sqrt(2 * np.pi * varym)) * np.exp(- (ym - muym) ** 2 / (2 * varym))
    # Calculate p(v,c) = p(v|c) * p(c) (Joint probability, where p(c) is the mixing weight)
    pvc = pvgc * np.outer(np.ones(npts), pc)
    # Calculate pcgv = p(c|v) - Conditional probability (Responsibility)
    pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))
    # --- M-STEP (Maximization): Update Model Parameters ---
    # Calculate Nk = sum of responsibilities for each cluster
    Nk = np.sum(pcgv, 0)
    # Use a small floor (e.g., 1e-9) to prevent division by zero for empty clusters.
    Nk_stable = Nk + 1e-9
    # 1. Update expansion weights pi_k = Nk / N
    pc = Nk / npts
    # 2. Update means mu_k = sum(gamma * x) / Nk
    mux = np.sum(xm * pcgv, 0) / Nk_stable
    muy = np.sum(ym * pcgv, 0) / Nk_stable
    # 3. Update variances Sigma_k = sum(gamma * (x - mu_k)^2) / Nk
    # Note: 0.1 added for variance stabilization
    varx = 0.1 + np.sum((xm - muxm) ** 2 * pcgv, 0) / Nk_stable
    vary = 0.1 + np.sum((ym - muym) ** 2 * pcgv, 0) / Nk_stable
    # 4. Log-Likelihood Tracking
    marginal_likelihood = np.sum(pvc, axis=1)
    epsilon = 1e-100  # Stabilization constant to prevent log(0)
    current_log_likelihood = np.sum(np.log(marginal_likelihood + epsilon))
    log_likelihoods.append(current_log_likelihood)
    # Optional: Print current log-likelihood to monitor convergence
    print(f"Step {i+1}/{nsteps}: Log-Likelihood = {current_log_likelihood:.2f}")
Step 1/25: Log-Likelihood = -439080.41
Step 2/25: Log-Likelihood = -437649.27
Step 3/25: Log-Likelihood = -436240.28
Step 4/25: Log-Likelihood = -434593.65
Step 5/25: Log-Likelihood = -432622.33
Step 6/25: Log-Likelihood = -431170.33
Step 7/25: Log-Likelihood = -430580.48
Step 8/25: Log-Likelihood = -430318.85
Step 9/25: Log-Likelihood = -430161.70
Step 10/25: Log-Likelihood = -430049.17
Step 11/25: Log-Likelihood = -429960.60
Step 12/25: Log-Likelihood = -429887.30
Step 13/25: Log-Likelihood = -429825.07
Step 14/25: Log-Likelihood = -429771.63
Step 15/25: Log-Likelihood = -429725.53
Step 16/25: Log-Likelihood = -429685.72
Step 17/25: Log-Likelihood = -429651.40
Step 18/25: Log-Likelihood = -429621.84
Step 19/25: Log-Likelihood = -429596.44
Step 20/25: Log-Likelihood = -429574.65
Step 21/25: Log-Likelihood = -429555.97
Step 22/25: Log-Likelihood = -429539.98
Step 23/25: Log-Likelihood = -429526.30
Step 24/25: Log-Likelihood = -429514.58
Step 25/25: Log-Likelihood = -429504.55
# --- 4b. PLOT CONVERGENCE ---
if log_likelihoods:
    # Filter out non-finite values (NaN/Inf) for plotting stability
    ll_array = np.array(log_likelihoods)
    finite_indices = np.isfinite(ll_array)
    ll_filtered = ll_array[finite_indices]
    # Use the actual number of recorded steps, so the plot still works if the cell is re-run
    steps_filtered = np.arange(1, len(ll_array) + 1)[finite_indices]
    if len(ll_filtered) > 0:
        plt.figure(figsize=(8, 6))
        plt.plot(steps_filtered, ll_filtered, marker='o', linestyle='-')
        plt.title('Log-Likelihood Convergence of GMM (E-M)')
        plt.xlabel('Iteration Step')
        plt.ylabel('Log-Likelihood L(θ)')
        plt.grid(True)
        plt.show()
    else:
        print("Error: All log-likelihood values resulted in NaN/Inf. Cannot plot convergence.")
Model Selection (BIC Evaluation)¶
This step runs the GMM algorithm repeatedly for varying values of $K$ (e.g., 1 to 10) and applies the Bayesian Information Criterion (BIC) to identify the model that best balances fit and complexity.
Placement: Create a new code cell immediately following the convergence plot (Step 4b). Use this code to determine the final value of nclusters before re-running the final E-M fit and 3D visualization.
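Optionally, if scikit-learn is available in your environment, the hand-rolled BIC sweep below can be sanity-checked against sklearn.mixture.GaussianMixture, which provides a bic() method for the same criterion. This is a cross-check only, not part of the assignment code:

# Optional cross-check of the BIC sweep using scikit-learn (assumes sklearn is installed).
from sklearn.mixture import GaussianMixture

X2 = np.column_stack([x, y])          # N x 2 data matrix
sk_bic = []
for K in range(1, 11):
    gm = GaussianMixture(n_components=K, covariance_type='diag', random_state=0).fit(X2)
    sk_bic.append(gm.bic(X2))         # lower BIC is better
print("sklearn-suggested K:", int(np.argmin(sk_bic)) + 1)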
# --- 6. MODEL SELECTION: FIND OPTIMAL K USING BIC ---
k_range = range(1, 11)  # Test models with 1 to 10 components
bic_scores = []
n_data_points = npts

# Degrees of Freedom (P) for a diagonal-covariance GMM in D=2 dimensions: P = 5*K - 1
def calculate_degrees_of_freedom(K):
    # P = (K-1 mixture weights) + (K * 2 means) + (K * 2 variances)
    return 5 * K - 1

print("Running E-M models for K=1 to 10 to calculate BIC scores...")
# Re-run the core E-M logic (without plotting) for multiple K values
for K in k_range:
    nclusters_k = K
    current_L = -np.inf  # Initialize log-likelihood
    # Initialize parameters randomly for current K
    indices = np.random.uniform(low=0, high=n_data_points, size=nclusters_k).astype(int)
    mux = x[indices]
    muy = y[indices]
    varx = np.ones(nclusters_k) * (np.max(x) - np.min(x))**2
    vary = np.ones(nclusters_k) * (np.max(y) - np.min(y))**2
    pc = np.ones(nclusters_k) / nclusters_k
    # E-M loop (simplified for multiple runs)
    for i in range(nsteps):
        # E-STEP Setup
        xm = np.outer(x, np.ones(nclusters_k))
        ym = np.outer(y, np.ones(nclusters_k))
        muxm = np.outer(np.ones(n_data_points), mux)
        muym = np.outer(np.ones(n_data_points), muy)
        varxm = np.outer(np.ones(n_data_points), varx)
        varym = np.outer(np.ones(n_data_points), vary)
        # E-STEP Calculation
        pvgc = (1 / np.sqrt(2 * np.pi * varxm)) * np.exp(- (xm - muxm) ** 2 / (2 * varxm)) * \
               (1 / np.sqrt(2 * np.pi * varym)) * np.exp(- (ym - muym) ** 2 / (2 * varym))
        pvc = pvgc * np.outer(np.ones(n_data_points), pc)
        # Log-Likelihood Check and M-STEP Setup
        marginal_likelihood = np.sum(pvc, axis=1)
        epsilon = 1e-100
        current_L_new = np.sum(np.log(marginal_likelihood + epsilon))
        # Check for convergence based on marginal improvement
        if np.abs(current_L_new - current_L) < 1e-6 and i > 5:
            current_L = current_L_new
            break
        current_L = current_L_new
        # M-STEP Parameter Update
        pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters_k))
        Nk = np.sum(pcgv, 0)
        Nk_stable = Nk + 1e-9
        pc = Nk / n_data_points
        mux = np.sum(xm * pcgv, 0) / Nk_stable
        muy = np.sum(ym * pcgv, 0) / Nk_stable
        varx = 0.1 + np.sum((xm - muxm) ** 2 * pcgv, 0) / Nk_stable
        vary = 0.1 + np.sum((ym - muym) ** 2 * pcgv, 0) / Nk_stable
    # Calculate BIC score: BIC = -2 * L + P * log(N) [3]
    P = calculate_degrees_of_freedom(K)
    BIC = (-2 * current_L) + (P * np.log(n_data_points))
    bic_scores.append(BIC)
    print(f"K={K}: Log-L={current_L:.2f}, BIC={BIC:.2f}")

# Identify the optimal K (minimum BIC); the BIC curve itself is plotted in a later cell
optimal_K_index = np.argmin(bic_scores)
optimal_K = k_range[optimal_K_index]
print(f"\nOptimal number of clusters (K) based on BIC: {optimal_K}")
Running E-M models for K=1 to 10 to calculate BIC scores...
K=1: Log-L=-478679.92, BIC=957404.99
K=2: Log-L=-451980.44, BIC=904062.48
K=3: Log-L=-444282.58, BIC=888723.22
K=4: Log-L=-435469.53, BIC=871153.57
K=5: Log-L=-427840.65, BIC=855952.25
K=6: Log-L=-432832.65, BIC=865992.71
K=7: Log-L=-427001.24, BIC=854386.34
K=8: Log-L=-426208.61, BIC=852857.51
K=9: Log-L=-426539.33, BIC=853575.41
K=10: Log-L=-426991.49, BIC=854536.17

Optimal number of clusters (K) based on BIC: 8
The result that the optimal number of clusters based on BIC is $K=8$ provides crucial information about the intrinsic structure of your student habits and academic outcomes data, and it dictates how you should proceed with the subsequent analysis.
Here is what this result means and how you can apply it in your other data science work, particularly within the context of Density Estimation and Clustering.
1. Meaning of the BIC Result ($K=8$)¶
The Bayesian Information Criterion (BIC) is a model selection criterion used to determine the appropriate complexity—in this case, the number of mixture components ($K$)—for your Gaussian Mixture Model (GMM).
Optimal Model Complexity: BIC balances the model's fit to the data (the likelihood, $L$) against the model's complexity (the number of parameters, $P$). The goal is to find the model structure that minimizes the BIC score.
- $K=8$ means that the GMM with 8 Gaussian components is the model that best explains the variance in your student data without unnecessarily overfitting it.
- Using $K=8$ suggests that your joint distribution (e.g., study_hours_per_day vs. exam_score) is multi-modal and complex enough to be represented by eight distinct underlying subgroups of students.
Avoiding Mismatch and Overfitting: Using a fixed $K$ (like the $K=3$ you initially chose) risks model mismatch if $K$ is too small, or overfitting if $K$ is too large, capturing spurious artifacts or noise. Since BIC suggested $K=8$, using 8 components provides a statistically justified architecture.
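For reference, the criterion computed in the code above combines the maximized log-likelihood $L$ with the parameter count $P$ of the diagonal-covariance, two-dimensional GMM:

$$\mathrm{BIC}(K) = -2L + P \ln N, \qquad P = \underbrace{(K-1)}_{\text{weights}} + \underbrace{2K}_{\text{means}} + \underbrace{2K}_{\text{variances}} = 5K - 1.$$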
2. How to Use $K=8$ in Your Density Estimation Assignment¶
The primary use of this result is to finalize your GMM density estimation:
- Re-run the Final GMM Fit: You must re-run the entire Expectation-Maximization (E-M) algorithm (Step 4) using nclusters = 8; a minimal re-run sketch follows this list. The parameters obtained from this final run ($\vec{\mu}_k$, $\Sigma_k$, $\pi_k$ for $k=1$ to $8$) will be the maximum likelihood estimates for your density function $p(\vec{x})$.
- Refined Visualization: Your final visualization (Step 5) will display the probability density surface calculated from eight weighted Gaussian components, providing a much more accurate representation of the student data's structure than the initial three-component model.
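A minimal sketch of that re-run, assuming the arrays and cells defined earlier in this notebook (x, y, npts, nsteps) are already in memory:

# Minimal sketch of the final re-run with the BIC-selected K.
nclusters = 8
np.random.seed(0)
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux, muy = x[indices], y[indices]
varx = np.ones(nclusters) * (np.max(x) - np.min(x))**2
vary = np.ones(nclusters) * (np.max(y) - np.min(y))**2
pc = np.ones(nclusters) / nclusters
log_likelihoods = []
# ...then re-execute the E-M iteration cell (Step 4, revised) and the visualization cell (Step 5).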
3. Using $K=8$ in Other Data Science Work (Clustering and Predictive Modeling)¶
The result from density estimation, which identified $K=8$, forms a powerful foundation for subsequent unsupervised and supervised learning tasks:
A. Clustering (Discovering Subgroups)¶
GMM is fundamentally an algorithm for density estimation, but it is often used for clustering by modeling the distribution of the data. The 8 components now directly correspond to 8 inferred clusters or "natural groupings" among your students.
- Soft Assignments: Use the final converged GMM model (with $K=8$) to perform soft assignments using the posterior probability $p(c \mid \vec{x})$, also known as the responsibility $\gamma(z_{ik})$. This is the probability that each individual student belongs to each of the eight clusters (see the sketch after this list).
- Interpretation: Analyze the properties (mean, variance) of the eight clusters. For example, Cluster 1 might represent "High Study Hours/High Exam Score" students, while Cluster 8 might represent "Low Motivation/Low GPA" students. This segmentation provides insight into the structure of your unlabeled data.
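A minimal sketch of the soft and hard assignments, assuming pcgv holds the responsibility matrix from the converged $K=8$ E-M run:

# Soft assignments: responsibilities gamma_ik = p(c_k | student i); hard labels via argmax.
responsibilities = pcgv                               # shape (N, 8)
hard_labels = np.argmax(responsibilities, axis=1)     # most probable cluster per student

# Per-cluster summaries of the two modeled variables
for k in range(nclusters):
    members = hard_labels == k
    if members.any():
        print(f"Cluster {k}: {members.sum()} students, "
              f"mean study hours = {x[members].mean():.2f}, "
              f"mean exam score = {y[members].mean():.1f}")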
B. Predictive Modeling (Supervised Learning)¶
The $K=8$ result can improve your previous predictive models (e.g., the linear regression model for exam_score):
- Feature Engineering: You can introduce the GMM cluster assignment as a new feature in your predictive models. Since GMM provides soft assignments, you can add 8 new features to your dataset, where the $k^{th}$ feature is the probability $\gamma_k$ that the student belongs to cluster $k$ (see the sketch after this list).
- Contextualizing Predictions: By incorporating the probability of belonging to one of the 8 statistically derived subgroups, your regression or classification models can potentially make more accurate, contextual predictions about exam_score or dropout_risk. This moves beyond simply fitting global linear trends and leverages the intrinsic, non-linear, multi-modal structure found by the density estimator.
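A minimal sketch of adding those eight probabilities as DataFrame columns (the column names are illustrative, and pcgv is again assumed to come from the converged $K=8$ run):

# Add the K=8 soft-assignment probabilities as new features (illustrative column names).
gmm_features = pd.DataFrame(
    pcgv,
    columns=[f"gmm_cluster_{k}_prob" for k in range(nclusters)],
    index=df.index[:npts],
)
df_with_clusters = df.join(gmm_features)
print(df_with_clusters.filter(like="gmm_cluster_").head())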
By identifying $K=8$, you have moved from a simple guess to a statistically justifiable hyperparameter selection, making your resulting model more robust and reliable for both density visualization and downstream analytical tasks.
plt.figure(figsize=(10, 6))
plt.plot(k_range, bic_scores, marker='o', linestyle='-', label='BIC Score')
plt.axvline(optimal_K, color='r', linestyle='--', label=f'Optimal K={optimal_K}')
plt.title('GMM Model Selection using Bayesian Information Criterion (BIC)')
plt.xlabel('Number of Components (K)')
plt.ylabel('BIC Score (Minimize)')
plt.legend()
plt.grid(True)
plt.show()
# Set the optimal K for the final E-M fit and visualization (Steps 4-5)
nclusters = optimal_K