Dawa Tshering - Fab Futures - Data Science

Density Estimation¶
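The sections below fit a two-dimensional Gaussian mixture model (GMM) with diagonal covariance to the students' Accuracy (%) and Score (%) values and estimate its parameters with expectation-maximization (E-M). The density being estimated, matching the code in each cell, is

$$
p(x, y) = \sum_{c=1}^{K} \pi_c \,\mathcal{N}\!\left(x \mid \mu_{x,c}, \sigma_{x,c}^{2}\right)\,\mathcal{N}\!\left(y \mid \mu_{y,c}, \sigma_{y,c}^{2}\right),
$$

where $x$ is Accuracy, $y$ is Score, and each component $c$ has a mixing weight $\pi_c$, means $\mu_{x,c}, \mu_{y,c}$, and variances $\sigma_{x,c}^{2}, \sigma_{y,c}^{2}$.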

Data before iteration¶

In [3]:
import numpy as np
import matplotlib.pyplot as plt

# --- 1. EXTRACT DATA FROM Large Numbers.pdf ---
# Columns used: Accuracy (%), Score (%)
data = [
    ("Abhishek Subba", 69.10, 95.90),
    ("Abishek Adhikari", 63.71, 100.64),
    ("Anjana Subba", 82.00, 108.46),
    ("Arpan Rai", 82.81, 101.28),
    ("Arpana Ghimirey", 78.34, 65.26),
    ("Chimi Dolma Gurung", 70.38, 60.00),
    ("Dawa Kelwang Keltshok", 73.07, 100.26),
    ("Jamyang Gurung", 77.57, 100.13),
    ("Jamyang Tenzin Namgyel", 75.23, 102.18),
    ("Jigme Tenzin Wangpo", 75.83, 100.26),
    ("Karma Dema Chokey", 75.55, 101.03),
    ("Kishan Rai", 79.70, 102.56),
    ("Kuenga Rinchen", 62.28, 57.82),
    ("Leki Tshomo", 65.09, 100.26),
    ("Lhakey Choden", 83.23, 58.85),
    ("Melan Rai", 71.27, 57.44),
    ("Mercy Jeshron Subba", 67.59, 100.77),
    ("Najimul Mia", 56.79, 101.03),
    ("Nima Kelwang Keltshok", 77.49, 100.64),
    ("Radha Dulal", 72.74, 102.56),
    ("Rigyel Singer", 76.60, 100.90),
    ("Susil Acharja", 73.73, 101.79),
    ("Tashi Tshokey Wangmo", 81.97, 102.56),
    ("Tashi Wangchuk", 85.39, 60.51),
    ("Tenzin Sonam Dolkar", 80.88, 103.59),
    ("Yeshey Tshoki", 85.73, 52.82),
    ("Yogira Kami", 80.36, 100.38),
]

names = [row[0] for row in data]
x = np.array([row[1] for row in data])  # Accuracy (%)
y = np.array([row[2] for row in data])  # Score (%)

npts = len(x)
nclusters = 3  # e.g., low/mid/high performers

# --- 2. INITIALIZE MODEL PARAMETERS ---
np.random.seed(42)  # for reproducibility

# Choose starting points for cluster means randomly from data
# (uniform index sampling can repeat a point; the next cell uses replace=False sampling instead)
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]

# Initialize variances and weights
varx = np.ones(nclusters) * (np.max(x) - np.min(x))**2 
vary = np.ones(nclusters) * (np.max(y) - np.min(y))**2
pc = np.ones(nclusters) / nclusters  # uniform prior: p(c_m)

# --- 3. PLOT INITIAL STATE ---
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'bo', label='Students', alpha=0.7)
ax.errorbar(mux, muy, 
            xerr=np.sqrt(varx), yerr=np.sqrt(vary), 
            fmt='ro', markersize=12, capsize=5, 
            label='Initial Cluster Centers ± 1σ')

# Optional: annotate names for outliers (e.g., low score)
for i, (xi, yi) in enumerate(zip(x, y)):
    if yi < 70:  # highlight low-score students
        ax.text(xi+0.2, yi, names[i].split()[0], fontsize=7, color='gray')

ax.set_xlabel('Accuracy (%)')
ax.set_ylabel('Score (%)')
ax.set_title('GMM Initialization: Before Iteration (Using Real Data)')
ax.grid(True, linestyle='--', alpha=0.5)
ax.legend()
plt.tight_layout()
plt.show()

# Optional: print initialized parameters
print("Initial cluster centers (Accuracy, Score):")
for i in range(nclusters):
    print(f"  Cluster {i+1}: ({mux[i]:.2f}, {muy[i]:.2f})")
print(f"\nInitial variances (varx, vary): {varx[0]:.1f}, {vary[0]:.1f} (same for all)")
print(f"Initial weights (pc): {pc}")
[Figure: GMM initialization, student Accuracy vs. Score with initial cluster centers ±1σ]
Initial cluster centers (Accuracy, Score):
  Cluster 1: (75.55, 101.03)
  Cluster 2: (85.73, 52.82)
  Cluster 3: (72.74, 102.56)

Initial variances (varx, vary): 837.5, 3095.8 (same for all)
Initial weights (pc): [0.33333333 0.33333333 0.33333333]

Expectation-Maximization (E-M) Iteration¶
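Each iteration alternates an E-step, which computes the responsibility of every component for every student, with an M-step, which re-estimates the weights, means, and variances from those responsibilities. In the notation of the code below (pcgv, Nk, pc, mux/muy, varx/vary), the updates are

$$
\gamma_{ic} = \frac{\pi_c\,\mathcal{N}(x_i \mid \mu_{x,c}, \sigma_{x,c}^2)\,\mathcal{N}(y_i \mid \mu_{y,c}, \sigma_{y,c}^2)}{\sum_{c'} \pi_{c'}\,\mathcal{N}(x_i \mid \mu_{x,c'}, \sigma_{x,c'}^2)\,\mathcal{N}(y_i \mid \mu_{y,c'}, \sigma_{y,c'}^2)},
$$

$$
N_c = \sum_i \gamma_{ic}, \qquad \pi_c = \frac{N_c}{n}, \qquad \mu_{x,c} = \frac{1}{N_c}\sum_i \gamma_{ic}\, x_i, \qquad \sigma_{x,c}^2 = \frac{1}{N_c}\sum_i \gamma_{ic}\,(x_i - \mu_{x,c})^2,
$$

and analogously for $y$.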

In [5]:
import numpy as np
import matplotlib.pyplot as plt

# --- 1. LOAD & PREPARE DATA FROM Large Numbers.pdf ---
data = [
    ("Abhishek Subba", 69.10, 95.90),
    ("Abishek Adhikari", 63.71, 100.64),
    ("Anjana Subba", 82.00, 108.46),
    ("Arpan Rai", 82.81, 101.28),
    ("Arpana Ghimirey", 78.34, 65.26),
    ("Chimi Dolma Gurung", 70.38, 60.00),
    ("Dawa Kelwang Keltshok", 73.07, 100.26),
    ("Jamyang Gurung", 77.57, 100.13),
    ("Jamyang Tenzin Namgyel", 75.23, 102.18),
    ("Jigme Tenzin Wangpo", 75.83, 100.26),
    ("Karma Dema Chokey", 75.55, 101.03),
    ("Kishan Rai", 79.70, 102.56),
    ("Kuenga Rinchen", 62.28, 57.82),
    ("Leki Tshomo", 65.09, 100.26),
    ("Lhakey Choden", 83.23, 58.85),
    ("Melan Rai", 71.27, 57.44),
    ("Mercy Jeshron Subba", 67.59, 100.77),
    ("Najimul Mia", 56.79, 101.03),
    ("Nima Kelwang Keltshok", 77.49, 100.64),
    ("Radha Dulal", 72.74, 102.56),
    ("Rigyel Singer", 76.60, 100.90),
    ("Susil Acharja", 73.73, 101.79),
    ("Tashi Tshokey Wangmo", 81.97, 102.56),
    ("Tashi Wangchuk", 85.39, 60.51),
    ("Tenzin Sonam Dolkar", 80.88, 103.59),
    ("Yeshey Tshoki", 85.73, 52.82),
    ("Yogira Kami", 80.36, 100.38),
]

names = [row[0] for row in data]
x = np.array([row[1] for row in data])  # Accuracy (%)
y = np.array([row[2] for row in data])  # Score (%)

npts = len(x)
nclusters = 3
nsteps = 20
np.random.seed(42)

# --- 2. INITIALIZE MODEL PARAMETERS ---
indices = np.random.choice(npts, size=nclusters, replace=False)
mux = x[indices].copy()
muy = y[indices].copy()

varx = np.ones(nclusters) * ((x.max() - x.min()) ** 2)
vary = np.ones(nclusters) * ((y.max() - y.min()) ** 2)
pc = np.ones(nclusters) / nclusters

# --- 3. EXPECTATION-MAXIMIZATION ITERATION ---
log_likelihoods = []

for i in range(nsteps):
    # --- E-STEP ---
    # Broadcast to shape (npts, nclusters)
    xm = x[:, None]               # (npts, 1)
    ym = y[:, None]               # (npts, 1)
    muxm = mux[None, :]           # (1, nclusters)
    muym = muy[None, :]           # (1, nclusters)
    varxm = varx[None, :]
    varym = vary[None, :]
    pcm = pc[None, :]
    
    # Gaussian per dimension (diagonal covariance)
    px = (1 / np.sqrt(2 * np.pi * varxm)) * np.exp(-0.5 * (xm - muxm)**2 / varxm)
    py = (1 / np.sqrt(2 * np.pi * varym)) * np.exp(-0.5 * (ym - muym)**2 / varym)
    pvgc = px * py                # p(v|c)
    
    pvc = pvgc * pcm              # p(v,c)
    sum_pvc = np.sum(pvc, axis=1, keepdims=True)  # (npts, 1)
    pcgv = pvc / (sum_pvc + 1e-12)  # p(c|v), avoid division by zero
    
    # Log-likelihood (optional, for monitoring)
    log_likelihood = np.sum(np.log(np.sum(pvc + 1e-12, axis=1)))
    log_likelihoods.append(log_likelihood)
    
    # --- M-STEP ---
    Nk = np.sum(pcgv, axis=0)     # shape (nclusters,)
    pc = Nk / npts
    
    mux = np.sum(xm * pcgv, axis=0) / Nk
    muy = np.sum(ym * pcgv, axis=0) / Nk
    
    eps = 1e-6  # small variance regularizer to keep components from collapsing
    varx = eps + np.sum((xm - mux[None, :])**2 * pcgv, axis=0) / Nk
    vary = eps + np.sum((ym - muy[None, :])**2 * pcgv, axis=0) / Nk

# --- 4. PLOT FINAL RESULT ---
plt.figure(figsize=(8, 6))
plt.plot(x, y, 'bo', label='Students', alpha=0.7)

# Plot final cluster centers with ±1σ ellipses (as error bars)
plt.errorbar(mux, muy, 
             xerr=np.sqrt(varx), yerr=np.sqrt(vary),
             fmt='ro', markersize=12, capsize=5, 
             label='Final Cluster Centers ±1σ')

# Highlight low-score outliers
low_score_mask = y < 70
plt.plot(x[low_score_mask], y[low_score_mask], 'rx', markersize=10, label='Low-Score Group')

plt.xlabel('Accuracy (%)')
plt.ylabel('Score (%)')
plt.title('GMM After EM Convergence (nsteps=20)')
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

# --- 5. PRINT FINAL PARAMETERS & LOG-LIKELIHOOD ---
print(f"Converged after {nsteps} steps.")
print(f"Final log-likelihood: {log_likelihoods[-1]:.2f}")
print("\nFinal cluster parameters:")
for k in range(nclusters):
    print(f"  Cluster {k+1}: "
          f"μ_acc={mux[k]:.2f}%, μ_score={muy[k]:.2f}%, "
          f"σ_acc={np.sqrt(varx[k]):.2f}, σ_score={np.sqrt(vary[k]):.2f}, "
          f"π={pc[k]:.3f}")
[Figure: GMM after 20 EM steps, student data with final cluster centers ±1σ and low-score group marked]
Completed 20 EM steps.
Final log-likelihood: -165.39

Final cluster parameters:
  Cluster 1: μ_acc=78.13%, μ_score=102.04%, σ_acc=3.22, σ_score=2.14, π=0.476
  Cluster 2: μ_acc=67.68%, μ_score=100.14%, σ_acc=6.20, σ_score=1.79, π=0.265
  Cluster 3: μ_acc=76.66%, μ_score=58.96%, σ_acc=8.28, σ_score=3.47, π=0.259
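As a quick follow-up, the soft responsibilities from the final E-step can be collapsed into hard cluster labels. This is a minimal sketch, not part of the recorded run; it assumes pcgv, names, and nclusters from the cell above are still in scope.

In [ ]:
import numpy as np

# Hard assignment: pick the component with the largest responsibility per student
labels = np.argmax(pcgv, axis=1)

for k in range(nclusters):
    members = [names[i] for i in np.where(labels == k)[0]]
    print(f"Cluster {k+1} ({len(members)} students): {', '.join(members)}")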

Visualization of Estimated Density¶

In [7]:
import numpy as np
import matplotlib.pyplot as plt

# --- 1. DATA (from Large Numbers.pdf) ---
# Accuracy (%) → x, Score (%) → y
data = [
    ("Abhishek Subba", 69.10, 95.90),
    ("Abishek Adhikari", 63.71, 100.64),
    ("Anjana Subba", 82.00, 108.46),
    ("Arpan Rai", 82.81, 101.28),
    ("Arpana Ghimirey", 78.34, 65.26),
    ("Chimi Dolma Gurung", 70.38, 60.00),
    ("Dawa Kelwang Keltshok", 73.07, 100.26),
    ("Jamyang Gurung", 77.57, 100.13),
    ("Jamyang Tenzin Namgyel", 75.23, 102.18),
    ("Jigme Tenzin Wangpo", 75.83, 100.26),
    ("Karma Dema Chokey", 75.55, 101.03),
    ("Kishan Rai", 79.70, 102.56),
    ("Kuenga Rinchen", 62.28, 57.82),
    ("Leki Tshomo", 65.09, 100.26),
    ("Lhakey Choden", 83.23, 58.85),
    ("Melan Rai", 71.27, 57.44),
    ("Mercy Jeshron Subba", 67.59, 100.77),
    ("Najimul Mia", 56.79, 101.03),
    ("Nima Kelwang Keltshok", 77.49, 100.64),
    ("Radha Dulal", 72.74, 102.56),
    ("Rigyel Singer", 76.60, 100.90),
    ("Susil Acharja", 73.73, 101.79),
    ("Tashi Tshokey Wangmo", 81.97, 102.56),
    ("Tashi Wangchuk", 85.39, 60.51),
    ("Tenzin Sonam Dolkar", 80.88, 103.59),
    ("Yeshey Tshoki", 85.73, 52.82),
    ("Yogira Kami", 80.36, 100.38),
]

x = np.array([row[1] for row in data])  # Accuracy (%)
y = np.array([row[2] for row in data])  # Score (%)

# --- 2. SET GMM PARAMETERS FOR THE DENSITY PLOT ---
# Placeholder parameters, roughly representative of a converged fit; for an exact
# match, substitute the final mux, muy, varx, vary, and pc from the E-M cell above.
mux = np.array([62.0, 75.5, 82.5])      # e.g., low/mid/high accuracy centers
muy = np.array([57.5, 101.0, 102.5])    # corresponding score centers
varx = np.array([30.0, 15.0, 8.0])      # variances (Accuracy)
vary = np.array([20.0,  2.5, 1.2])      # variances (Score)
pc   = np.array([0.15, 0.55, 0.30])     # mixing weights

# --- 3. DENSITY GRID ---
nplot = 100
xplot = np.linspace(x.min() - 2, x.max() + 2, nplot)
yplot = np.linspace(y.min() - 2, y.max() + 2, nplot)
X, Y = np.meshgrid(xplot, yplot)
p = np.zeros_like(X)

# Sum over clusters
for c in range(len(pc)):
    px = np.exp(-0.5 * (X - mux[c])**2 / varx[c]) / np.sqrt(2 * np.pi * varx[c])
    py = np.exp(-0.5 * (Y - muy[c])**2 / vary[c]) / np.sqrt(2 * np.pi * vary[c])
    p += px * py * pc[c]

# --- 4. PLOT: 3D SURFACE + DATA OVERLAY ---
fig = plt.figure(figsize=(12, 5))

# 3D Surface
ax1 = fig.add_subplot(121, projection='3d')
surf = ax1.plot_surface(X, Y, p, cmap='viridis', alpha=0.9, linewidth=0, antialiased=True)
ax1.scatter(x, y, np.zeros_like(x), color='red', s=30, label='Students', depthshade=False)
ax1.set_xlabel('Accuracy (%)')
ax1.set_ylabel('Score (%)')
ax1.set_zlabel('Density p(x,y)')
ax1.set_title('Estimated GMM Density (3D)')
ax1.view_init(elev=30, azim=-60)
fig.colorbar(surf, ax=ax1, shrink=0.5, aspect=10)

# 2D Contour (more interpretable)
ax2 = fig.add_subplot(122)
contour = ax2.contourf(X, Y, p, levels=20, cmap='viridis')
ax2.scatter(x, y, c='white', edgecolor='k', s=40, label='Students')
ax2.set_xlabel('Accuracy (%)')
ax2.set_ylabel('Score (%)')
ax2.set_title('GMM Density Contours + Data')
ax2.legend()
fig.colorbar(contour, ax=ax2)

plt.tight_layout()
plt.show()
[Figure: 3D surface and 2D contour views of the estimated GMM density with student data overlaid]
In [8]:
import numpy as np
import matplotlib.pyplot as plt

# --- 1. PARSE DATA FROM Large Numbers.pdf ---
# Extract Accuracy (%) and Score (%), convert to float
raw_data = [
    ("Abhishek Subba", "69.10%", "95.90%"),
    ("Abishek Adhikari", "63.71%", "100.64%"),
    ("Anjana Subba", "82.00%", "108.46%"),
    ("Arpan Rai", "82.81%", "101.28%"),
    ("Arpana Ghimirey", "78.34%", "65.26%"),
    ("Chimi Dolma Gurung", "70.38%", "60.00%"),
    ("Dawa Kelwang Keltshok", "73.07%", "100.26%"),
    ("Jamyang Gurung", "77.57%", "100.13%"),
    ("Jamyang Tenzin Namgyel", "75.23%", "102.18%"),
    ("Jigme Tenzin Wangpo", "75.83%", "100.26%"),
    ("Karma Dema Chokey", "75.55%", "101.03%"),
    ("Kishan Rai", "79.70%", "102.56%"),
    ("Kuenga Rinchen", "62.28%", "57.82%"),
    ("Leki Tshomo", "65.09%", "100.26%"),
    ("Lhakey Choden", "83.23%", "58.85%"),
    ("Melan Rai", "71.27%", "57.44%"),
    ("Mercy Jeshron Subba", "67.59%", "100.77%"),
    ("Najimul Mia", "56.79%", "101.03%"),
    ("Nima Kelwang Keltshok", "77.49%", "100.64%"),
    ("Radha Dulal", "72.74%", "102.56%"),
    ("Rigyel Singer", "76.60%", "100.90%"),
    ("Susil Acharja", "73.73%", "101.79%"),
    ("Tashi Tshokey Wangmo", "81.97%", "102.56%"),
    ("Tashi Wangchuk", "85.39%", "60.51%"),
    ("Tenzin Sonam Dolkar", "80.88%", "103.59%"),
    ("Yeshey Tshoki", "85.73%", "52.82%"),
    ("Yogira Kami", "80.36%", "100.38%"),
]

names = [r[0] for r in raw_data]
x = np.array([float(r[1].rstrip('%')) for r in raw_data])  # Accuracy (%)
y = np.array([float(r[2].rstrip('%')) for r in raw_data])  # Score (%)

npts = len(x)
nclusters = 3
nsteps = 50
np.random.seed(2025)

# --- 2. INITIALIZE GMM PARAMETERS ---
# Randomly choose centers from data
indices = np.random.choice(npts, size=nclusters, replace=False)
mux = x[indices].astype(float).copy()
muy = y[indices].astype(float).copy()

# Initial variances: use global range², but scaled
varx = np.full(nclusters, (x.max() - x.min())**2 / 4)
vary = np.full(nclusters, (y.max() - y.min())**2 / 4)
pc = np.full(nclusters, 1.0 / nclusters)

log_likelihoods = []

# --- 3. E-M ITERATION WITH STABILITY & EARLY STOPPING ---
tolerance = 1e-5
for i in range(nsteps):
    # --- E-STEP: Compute responsibilities p(c|x,y) ---
    # Broadcast data against cluster parameters to shape (npts, nclusters)
    xm = x[:, None]          # (npts, 1)
    ym = y[:, None]          # (npts, 1)
    muxm = mux[None, :]      # (1, nclusters)
    muym = muy[None, :]
    varxm = varx[None, :]
    varym = vary[None, :]
    pcm = pc[None, :]        # (1, nclusters)
    
    # Gaussian densities (diagonal covariance)
    px = np.exp(-0.5 * (xm - muxm)**2 / varxm) / np.sqrt(2 * np.pi * varxm)
    py = np.exp(-0.5 * (ym - muym)**2 / varym) / np.sqrt(2 * np.pi * varym)
    pvgc = px * py           # p(x,y | c)
    
    pvc = pvgc * pcm         # p(c, x, y)
    sum_pvc = np.sum(pvc, axis=1, keepdims=True)  # (npts, 1)
    
    # Responsibility: p(c | x,y)
    pcgv = pvc / (sum_pvc + 1e-15)
    
    # --- M-STEP: Update parameters ---
    Nk = np.sum(pcgv, axis=0)                 # (nclusters,)
    Nk_safe = np.maximum(Nk, 1e-8)            # prevent division by zero
    
    # 1. Mixing weights
    pc = Nk / npts
    
    # 2. Means
    mux = np.sum(xm * pcgv, axis=0) / Nk_safe
    muy = np.sum(ym * pcgv, axis=0) / Nk_safe
    
    # 3. Variances (with floor for stability)
    varx = np.sum((xm - mux[None, :])**2 * pcgv, axis=0) / Nk_safe
    vary = np.sum((ym - muy[None, :])**2 * pcgv, axis=0) / Nk_safe
    
    # Variance floor (prevents collapse)
    var_floor = 1e-4
    varx = np.maximum(varx, var_floor)
    vary = np.maximum(vary, var_floor)
    
    # --- LOG-LIKELIHOOD ---
    log_eps = 1e-12
    loglik = np.sum(np.log(np.sum(pvc, axis=1) + log_eps))
    log_likelihoods.append(loglik)
    
    # Optional: Print progress
    print(f"Step {i+1:2d}/{nsteps}: Log-Likelihood = {loglik:9.3f}")
    
    # Early stopping
    if i > 0 and abs(log_likelihoods[-1] - log_likelihoods[-2]) < tolerance:
        print(f"→ Converged at step {i+1}")
        break

# --- 4. FINAL PLOT: DATA + CLUSTER CENTERS ---
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c='steelblue', label='Students', alpha=0.7)

# Plot final means with ±1σ error bars
plt.errorbar(mux, muy,
             xerr=np.sqrt(varx), yerr=np.sqrt(vary),
             fmt='ro', markersize=10, capsize=4,
             label='Cluster Centers ±1σ')

# Highlight low-score outliers (Score < 70%)
low_mask = y < 70
if np.any(low_mask):
    plt.scatter(x[low_mask], y[low_mask], 
                facecolors='none', edgecolors='red', s=100, 
                linewidth=2, label='Low-Score Group')

plt.xlabel('Accuracy (%)')
plt.ylabel('Score (%)')
plt.title('GMM After EM (Converged)')
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

# --- 5. SUMMARY ---
print("\n✅ Final GMM Parameters:")
for k in range(nclusters):
    print(f"Cluster {k+1}: "
          f"μ_acc = {mux[k]:5.2f}%, "
          f"μ_score = {muy[k]:6.2f}%, "
          f"σ_acc = {np.sqrt(varx[k]):5.2f}, "
          f"σ_score = {np.sqrt(vary[k]):5.2f}, "
          f"π = {pc[k]:.3f}")
Step  1/50: Log-Likelihood =  -226.443
Step  2/50: Log-Likelihood =  -208.937
Step  3/50: Log-Likelihood =  -208.572
Step  4/50: Log-Likelihood =  -207.783
Step  5/50: Log-Likelihood =  -205.086
Step  6/50: Log-Likelihood =  -193.200
Step  7/50: Log-Likelihood =  -182.898
Step  8/50: Log-Likelihood =  -178.456
Step  9/50: Log-Likelihood =  -175.307
Step 10/50: Log-Likelihood =  -172.033
Step 11/50: Log-Likelihood =  -165.974
Step 12/50: Log-Likelihood =  -163.879
Step 13/50: Log-Likelihood =  -162.985
Step 14/50: Log-Likelihood =  -162.758
Step 15/50: Log-Likelihood =  -162.631
Step 16/50: Log-Likelihood =  -162.545
Step 17/50: Log-Likelihood =  -162.480
Step 18/50: Log-Likelihood =  -162.429
Step 19/50: Log-Likelihood =  -162.388
Step 20/50: Log-Likelihood =  -162.356
Step 21/50: Log-Likelihood =  -162.329
Step 22/50: Log-Likelihood =  -162.307
Step 23/50: Log-Likelihood =  -162.289
Step 24/50: Log-Likelihood =  -162.272
Step 25/50: Log-Likelihood =  -162.258
Step 26/50: Log-Likelihood =  -162.244
Step 27/50: Log-Likelihood =  -162.230
Step 28/50: Log-Likelihood =  -162.217
Step 29/50: Log-Likelihood =  -162.203
Step 30/50: Log-Likelihood =  -162.189
Step 31/50: Log-Likelihood =  -162.172
Step 32/50: Log-Likelihood =  -162.153
Step 33/50: Log-Likelihood =  -162.129
Step 34/50: Log-Likelihood =  -162.098
Step 35/50: Log-Likelihood =  -162.055
Step 36/50: Log-Likelihood =  -161.990
Step 37/50: Log-Likelihood =  -161.887
Step 38/50: Log-Likelihood =  -161.709
Step 39/50: Log-Likelihood =  -161.394
Step 40/50: Log-Likelihood =  -160.902
Step 41/50: Log-Likelihood =  -160.369
Step 42/50: Log-Likelihood =  -160.011
Step 43/50: Log-Likelihood =  -159.865
Step 44/50: Log-Likelihood =  -159.820
Step 45/50: Log-Likelihood =  -159.805
Step 46/50: Log-Likelihood =  -159.800
Step 47/50: Log-Likelihood =  -159.798
Step 48/50: Log-Likelihood =  -159.797
Step 49/50: Log-Likelihood =  -159.797
Step 50/50: Log-Likelihood =  -159.797
[Figure: GMM after EM (converged), student data with cluster centers ±1σ and low-score group circled]
✅ Final GMM Parameters:
Cluster 1: μ_acc = 70.89%, μ_score = 100.61%, σ_acc =  7.47, σ_score =  0.33, π = 0.335
Cluster 2: μ_acc = 77.28%, μ_score = 101.98%, σ_acc =  4.30, σ_score =  2.83, π = 0.405
Cluster 3: μ_acc = 76.66%, μ_score =  58.96%, σ_acc =  8.28, σ_score =  3.47, π = 0.259
In [9]:
# --- 4b. PLOT CONVERGENCE OF LOG-LIKELIHOOD ---
if 'log_likelihoods' in locals() and len(log_likelihoods) > 0:
    ll_array = np.array(log_likelihoods)
    
    # Remove non-finite entries (NaN/Inf) — robust against numerical issues
    finite_mask = np.isfinite(ll_array)
    ll_clean = ll_array[finite_mask]
    steps_clean = np.arange(1, len(ll_array) + 1)[finite_mask]

    if len(ll_clean) == 0:
        print("⚠️ Warning: No finite log-likelihood values recorded. Convergence plot skipped.")
    else:
        plt.figure(figsize=(8, 5))
        plt.plot(steps_clean, ll_clean, 
                 marker='o', markersize=5, linestyle='-', linewidth=2,
                 color='tab:blue', label='Log-Likelihood')
        
        # Optional: annotate final value
        plt.annotate(f"Final: {ll_clean[-1]:.2f}",
                     xy=(steps_clean[-1], ll_clean[-1]),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=10, ha='left', va='bottom',
                     bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.7))
        
        plt.title('GMM EM Algorithm: Log-Likelihood Convergence', fontsize=14)
        plt.xlabel('EM Iteration', fontsize=12)
        plt.ylabel('Log-Likelihood $\\mathcal{L}(\\theta)$', fontsize=12)
        plt.grid(True, which='both', linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.legend()
        plt.show()
        
        # Optional diagnostic: print convergence delta
        if len(ll_clean) > 1:
            delta = np.abs(ll_clean[-1] - ll_clean[-2])
            print(f"→ Final improvement: |ΔLL| = {delta:.6f}")
            if delta < 1e-5:
                print("✅ Convergence criterion satisfied (ΔLL < 1e-5).")
else:
    print("No log-likelihood history found. Ensure `log_likelihoods` list is defined and updated in the EM loop.")
[Figure: log-likelihood convergence over EM iterations]
→ Final improvement: |ΔLL| = 0.000110
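A useful sanity check on this curve: E-M is guaranteed not to decrease the log-likelihood, $\mathcal{L}(\theta^{(t+1)}) \ge \mathcal{L}(\theta^{(t)})$, so a correct implementation should produce a monotonically non-decreasing plot like the one above.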

Model Selection (BIC Evaluation)¶
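The Bayesian Information Criterion (BIC) balances fit against model size. For the diagonal-covariance GMM used here, each component contributes two means and two variances, and the mixture has $K-1$ free weights, so

$$
\mathrm{BIC}(K) = -2\,\mathcal{L}(\hat{\theta}_K) + (5K - 1)\,\ln N,
$$

and the $K$ with the smallest BIC is preferred. The gmm_bic function below computes exactly this.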

In [10]:
import numpy as np
import matplotlib.pyplot as plt

# --- 1. LOAD AND PREPROCESS DATA FROM Large Numbers.pdf ---
data = [
    ("Abhishek Subba", 69.10, 6951, 748, 95.90, 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"),
    ("Abishek Adhikari", 63.71, 4985, 785, 100.64, 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"),
    ("Anjana Subba", 82.00, 5311, 846, 108.46, 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"),
    ("Arpan Rai", 82.81, 5547, 790, 101.28, 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"),
    ("Arpana Ghimirey", 78.34, 4773, 509, 65.26, 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"),
    ("Chimi Dolma Gurung", 70.38, 4093, 468, 60.00, 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"),
    ("Dawa Kelwang Keltshok", 73.07, 4601, 782, 100.26, 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"),
    ("Jamyang Gurung", 77.57, 5469, 781, 100.13, 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"),
    ("Jamyang Tenzin Namgyel", 75.23, 5180, 797, 102.18, 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"),
    ("Jigme Tenzin Wangpo", 75.83, 5037, 782, 100.26, 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"),
    ("Karma Dema Chokey", 75.55, 16432, 788, 101.03, 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"),
    ("Kishan Rai", 79.70, 4460, 800, 102.56, 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"),
    ("Kuenga Rinchen", 62.28, 9502, 451, 57.82, 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"),
    ("Leki Tshomo", 65.09, 15455, 782, 100.26, 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"),
    ("Lhakey Choden", 83.23, 2665, 459, 58.85, 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"),
    ("Melan Rai", 71.27, 7520, 448, 57.44, 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"),
    ("Mercy Jeshron Subba", 67.59, 7630, 786, 100.77, 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"),
    ("Najimul Mia", 56.79, 10148, 788, 101.03, 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"),
    ("Nima Kelwang Keltshok", 77.49, 5491, 785, 100.64, 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"),
    ("Radha Dulal", 72.74, 7431, 800, 102.56, 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"),
    ("Rigyel Singer", 76.60, 10525, 787, 100.90, 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"),
    ("Susil Acharja", 73.73, 5372, 794, 101.79, 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"),
    ("Tashi Tshokey Wangmo", 81.97, 9897, 800, 102.56, 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"),
    ("Tashi Wangchuk", 85.39, 5708, 472, 60.51, 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"),
    ("Tenzin Sonam Dolkar", 80.88, 9247, 808, 103.59, 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"),
    ("Yeshey Tshoki", 85.73, 2958, 412, 52.82, 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"),
    ("Yogira Kami", 80.36, 7782, 783, 100.38, 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"),
]

# Choose features: Accuracy (%) & Score (%) → natural 2D grouping
x = np.array([row[1] for row in data])  # Accuracy
y = np.array([row[4] for row in data])  # Score (%)
npts = len(x)

# --- 2. BIC FUNCTION FOR DIAGONAL GMM (D=2) ---
def gmm_bic(log_likelihood, K, N):
    """
    Compute BIC for diagonal-covariance GMM in D=2.
    Parameters:
        log_likelihood: final log-likelihood L(θ)
        K: number of components
        N: number of data points
    Returns:
        BIC = -2*L + dof * log(N)
    Degrees of freedom:
        - (K-1) mixing weights (sum to 1)
        - K*2 means (μ_x, μ_y)
        - K*2 variances (σ²_x, σ²_y)
        → dof = (K-1) + 2K + 2K = 5K - 1
    """
    dof = 5 * K - 1
    return -2 * log_likelihood + dof * np.log(N)

# --- 3. BIC MODEL SELECTION LOOP ---
k_range = range(1, 11)  # Test K = 1 to 10
bic_scores = []
log_likelihoods = []
np.random.seed(2025)  # reproducible across runs

print("🔍 Running BIC model selection (K = 1 to 10)...")

for K in k_range:
    # ---- Initialize GMM for current K ----
    # Randomly pick K distinct data points as initial centers (simple random seeding)
    indices = np.random.choice(npts, size=K, replace=False)
    mux = x[indices].copy()
    muy = y[indices].copy()
    varx = np.full(K, ((x.max() - x.min()) / 3)**2)
    vary = np.full(K, ((y.max() - y.min()) / 3)**2)
    pc = np.full(K, 1.0 / K)
    
    # Store best log-likelihood (run 3 restarts to avoid local optima)
    best_loglik = -np.inf
    for restart in range(3):
        mux_r = x[np.random.choice(npts, K, replace=False)].copy()
        muy_r = y[np.random.choice(npts, K, replace=False)].copy()
        varx_r = varx.copy()
        vary_r = vary.copy()
        pc_r = pc.copy()
        
        # EM iterations
        loglik_hist = []
        for step in range(100):
            # E-step (vectorized, stable)
            xm = x[:, None]
            ym = y[:, None]
            muxm = mux_r[None, :]
            muym = muy_r[None, :]
            varxm = varx_r[None, :]
            varym = vary_r[None, :]
            pcm = pc_r[None, :]
            
            # Compute responsibilities
            px = np.exp(-0.5 * (xm - muxm)**2 / (varxm + 1e-8)) / np.sqrt(2 * np.pi * (varxm + 1e-8))
            py = np.exp(-0.5 * (ym - muym)**2 / (varym + 1e-8)) / np.sqrt(2 * np.pi * (varym + 1e-8))
            pvc = px * py * pcm  # p(x,y,c)
            sum_pvc = np.sum(pvc, axis=1, keepdims=True)
            pcgv = pvc / (sum_pvc + 1e-15)
            
            # Log-likelihood
            loglik = np.sum(np.log(np.sum(pvc, axis=1) + 1e-12))
            loglik_hist.append(loglik)
            
            # M-step
            Nk = np.sum(pcgv, axis=0)
            Nk_safe = np.maximum(Nk, 1e-8)
            
            pc_r = Nk / npts
            mux_r = np.sum(xm * pcgv, axis=0) / Nk_safe
            muy_r = np.sum(ym * pcgv, axis=0) / Nk_safe
            
            varx_r = np.sum((xm - mux_r[None, :])**2 * pcgv, axis=0) / Nk_safe
            vary_r = np.sum((ym - muy_r[None, :])**2 * pcgv, axis=0) / Nk_safe
            
            # Variance floor
            varx_r = np.maximum(varx_r, 1e-4)
            vary_r = np.maximum(vary_r, 1e-4)
            
            # Early stopping
            if step > 5 and abs(loglik_hist[-1] - loglik_hist[-2]) < 1e-6:
                break
        
        if loglik > best_loglik:
            best_loglik = loglik
    
    # Compute BIC
    bic = gmm_bic(best_loglik, K, npts)
    bic_scores.append(bic)
    log_likelihoods.append(best_loglik)
    print(f"  K={K:2d}: LogL = {best_loglik:8.2f} → BIC = {bic:8.2f}")

# --- 4. DETERMINE OPTIMAL K ---
optimal_K = k_range[np.argmin(bic_scores)]
print(f"\n✅ Optimal number of clusters (min BIC): K = {optimal_K}")

# --- 5. PLOT BIC CURVE ---
plt.figure(figsize=(8, 5))
plt.plot(k_range, bic_scores, 'bo-', linewidth=2, markersize=6, label='BIC')
plt.axvline(optimal_K, color='red', linestyle='--', alpha=0.7, label=f'Optimal K = {optimal_K}')
plt.title('BIC Model Selection for GMM (Accuracy vs. Score %)', fontsize=13)
plt.xlabel('Number of Components (K)', fontsize=12)
plt.ylabel('BIC Score', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()

# Optional: Print full table
print("\n📊 BIC Summary Table:")
print(f"{'K':>2} | {'Log-Likelihood':>14} | {'BIC':>10}")
print("-" * 28)
for K, ll, bic in zip(k_range, log_likelihoods, bic_scores):
    star = " ←" if K == optimal_K else ""
    print(f"{K:>2} | {ll:>14.2f} | {bic:>10.2f}{star}")
🔍 Running BIC model selection (K = 1 to 10)...
  K= 1: LogL =  -209.25 → BIC =   431.68
  K= 2: LogL =  -169.69 → BIC =   369.05
  K= 3: LogL =  -163.57 → BIC =   373.28
  K= 4: LogL =  -144.81 → BIC =   352.24
  K= 5: LogL =  -138.57 → BIC =   356.24
  K= 6: LogL =  -129.45 → BIC =   354.47
  K= 7: LogL =  -112.90 → BIC =   337.86
  K= 8: LogL =  -105.77 → BIC =   340.08
  K= 9: LogL =   -93.41 → BIC =   331.83
  K=10: LogL =   -91.54 → BIC =   344.58

✅ Optimal number of clusters (min BIC): K = 9
[Figure: BIC vs. number of components K, with the minimum at K = 9]
📊 BIC Summary Table:
 K | Log-Likelihood |        BIC
----------------------------
 1 |        -209.25 |     431.68
 2 |        -169.69 |     369.05
 3 |        -163.57 |     373.28
 4 |        -144.81 |     352.24
 5 |        -138.57 |     356.24
 6 |        -129.45 |     354.47
 7 |        -112.90 |     337.86
 8 |        -105.77 |     340.08
 9 |         -93.41 |     331.83 ←
10 |         -91.54 |     344.58
In [11]:
# --- Plot BIC vs. K with enhanced visualization ---
plt.figure(figsize=(9, 5.5))
plt.plot(k_range, bic_scores, 
         marker='o', markersize=6, linestyle='-', linewidth=2, 
         color='tab:blue', label='BIC Score')

# Highlight optimal K
optimal_idx = np.argmin(bic_scores)
optimal_K = k_range[optimal_idx]
optimal_bic = bic_scores[optimal_idx]

plt.axvline(optimal_K, color='red', linestyle='--', linewidth=1.5,
            label=f'Optimal K = {optimal_K}', alpha=0.8)
plt.scatter([optimal_K], [optimal_bic], 
            color='red', s=100, zorder=5, edgecolor='k')

# Annotate optimal point
plt.annotate(f'K={optimal_K}\nBIC={optimal_bic:.1f}',
             xy=(optimal_K, optimal_bic), 
             xytext=(10, -20), textcoords='offset points',
             fontsize=10, ha='left', va='center',
             bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.6),
             arrowprops=dict(arrowstyle='->', color='red', lw=1))

# Labels & aesthetics
plt.title('GMM Model Selection via BIC\n(Accuracy % vs. Score %)', fontsize=14, pad=15)
plt.xlabel('Number of Components (K)', fontsize=12)
plt.ylabel('BIC Score (Lower = Better)', fontsize=12)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(k_range)
plt.tight_layout()
plt.show()

# ✅ Assign optimal K for downstream use (e.g., final GMM fit)
nclusters = optimal_K
print(f"✅ Selected nclusters = {nclusters} based on minimum BIC.")
[Figure: annotated BIC vs. K curve highlighting the optimal K]
✅ Selected nclusters = 9 based on minimum BIC.
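As an optional cross-check (not executed here), scikit-learn's GaussianMixture fits the same diagonal-covariance model and reports BIC directly. The sketch below assumes scikit-learn is installed and that x and y from the cells above are still in scope; its chosen K may differ slightly from the manual loop because the initializations differ.

In [ ]:
import numpy as np
from sklearn.mixture import GaussianMixture

# Stack the two features into an (npts, 2) design matrix
X = np.column_stack([x, y])

bics = []
for K in range(1, 11):
    gm = GaussianMixture(n_components=K, covariance_type='diag',
                         n_init=3, random_state=2025).fit(X)
    bics.append(gm.bic(X))
    print(f"K={K:2d}: BIC = {bics[-1]:.2f}")

print("Best K by scikit-learn BIC:", int(np.argmin(bics)) + 1)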