Density Estimation¶
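The cells below fit a two-dimensional Gaussian mixture with diagonal covariance to the per-student Accuracy (x) and Score (y) values and estimate its parameters with expectation-maximization. As a brief note on the model the code assumes (not stated explicitly in the cells):

$$p(x, y) \;=\; \sum_{c=1}^{K} \pi_c \,\mathcal{N}\!\left(x \mid \mu_{x,c}, \sigma^2_{x,c}\right)\mathcal{N}\!\left(y \mid \mu_{y,c}, \sigma^2_{y,c}\right), \qquad \sum_{c=1}^{K} \pi_c = 1,$$

where the mixing weights $\pi_c$, means $(\mu_{x,c}, \mu_{y,c})$, and per-dimension variances $(\sigma^2_{x,c}, \sigma^2_{y,c})$ correspond to the arrays `pc`, `mux`, `muy`, `varx`, `vary` used throughout.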
Data before iteration¶
In [3]:
import numpy as np
import matplotlib.pyplot as plt
# --- 1. EXTRACT DATA FROM Large Numbers.pdf ---
# Columns used: Accuracy (%), Score (%)
data = [
("Abhishek Subba", 69.10, 95.90),
("Abishek Adhikari", 63.71, 100.64),
("Anjana Subba", 82.00, 108.46),
("Arpan Rai", 82.81, 101.28),
("Arpana Ghimirey", 78.34, 65.26),
("Chimi Dolma Gurung", 70.38, 60.00),
("Dawa Kelwang Keltshok", 73.07, 100.26),
("Jamyang Gurung", 77.57, 100.13),
("Jamyang Tenzin Namgyel", 75.23, 102.18),
("Jigme Tenzin Wangpo", 75.83, 100.26),
("Karma Dema Chokey", 75.55, 101.03),
("Kishan Rai", 79.70, 102.56),
("Kuenga Rinchen", 62.28, 57.82),
("Leki Tshomo", 65.09, 100.26),
("Lhakey Choden", 83.23, 58.85),
("Melan Rai", 71.27, 57.44),
("Mercy Jeshron Subba", 67.59, 100.77),
("Najimul Mia", 56.79, 101.03),
("Nima Kelwang Keltshok", 77.49, 100.64),
("Radha Dulal", 72.74, 102.56),
("Rigyel Singer", 76.60, 100.90),
("Susil Acharja", 73.73, 101.79),
("Tashi Tshokey Wangmo", 81.97, 102.56),
("Tashi Wangchuk", 85.39, 60.51),
("Tenzin Sonam Dolkar", 80.88, 103.59),
("Yeshey Tshoki", 85.73, 52.82),
("Yogira Kami", 80.36, 100.38),
]
names = [row[0] for row in data]
x = np.array([row[1] for row in data]) # Accuracy (%)
y = np.array([row[2] for row in data]) # Score (%)
npts = len(x)
nclusters = 3 # e.g., low/mid/high performers
# --- 2. INITIALIZE MODEL PARAMETERS ---
np.random.seed(42)  # for reproducibility
# Choose starting points for cluster means by sampling random indices
# (note: uniform sampling can repeat an index; later cells use np.random.choice with replace=False)
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
# Initialize variances and weights
varx = np.ones(nclusters) * (np.max(x) - np.min(x))**2
vary = np.ones(nclusters) * (np.max(y) - np.min(y))**2
pc = np.ones(nclusters) / nclusters # uniform prior: p(c_m)
# --- 3. PLOT INITIAL STATE ---
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'bo', label='Students', alpha=0.7)
ax.errorbar(mux, muy,
xerr=np.sqrt(varx), yerr=np.sqrt(vary),
fmt='ro', markersize=12, capsize=5,
label='Initial Cluster Centers ± 1σ')
# Optional: annotate names for outliers (e.g., low score)
for i, (xi, yi) in enumerate(zip(x, y)):
    if yi < 70:  # highlight low-score students
        ax.text(xi + 0.2, yi, names[i].split()[0], fontsize=7, color='gray')
ax.set_xlabel('Accuracy (%)')
ax.set_ylabel('Score (%)')
ax.set_title('GMM Initialization: Before Iteration (Using Real Data)')
ax.grid(True, linestyle='--', alpha=0.5)
ax.legend()
plt.tight_layout()
plt.show()
# Optional: print initialized parameters
print("Initial cluster centers (Accuracy, Score):")
for i in range(nclusters):
    print(f"  Cluster {i+1}: ({mux[i]:.2f}, {muy[i]:.2f})")
print(f"\nInitial variances (varx, vary): {varx[0]:.1f}, {vary[0]:.1f} (same for all)")
print(f"Initial weights (pc): {pc}")
Initial cluster centers (Accuracy, Score):
  Cluster 1: (75.55, 101.03)
  Cluster 2: (85.73, 52.82)
  Cluster 3: (72.74, 102.56)

Initial variances (varx, vary): 837.5, 3095.8 (same for all)
Initial weights (pc): [0.33333333 0.33333333 0.33333333]
Expectation-Maximization (E-M) Iteration¶
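For reference, the updates implemented in the cell below are the standard EM equations for this diagonal-covariance mixture. E-step (responsibilities, stored as `pcgv` in the code):

$$\gamma_{nc} \;=\; \frac{\pi_c \,\mathcal{N}(x_n \mid \mu_{x,c}, \sigma^2_{x,c})\,\mathcal{N}(y_n \mid \mu_{y,c}, \sigma^2_{y,c})}{\sum_{c'} \pi_{c'}\,\mathcal{N}(x_n \mid \mu_{x,c'}, \sigma^2_{x,c'})\,\mathcal{N}(y_n \mid \mu_{y,c'}, \sigma^2_{y,c'})}.$$

M-step (with $N_c = \sum_n \gamma_{nc}$):

$$\pi_c = \frac{N_c}{N}, \qquad \mu_{x,c} = \frac{\sum_n \gamma_{nc}\, x_n}{N_c}, \qquad \sigma^2_{x,c} = \frac{\sum_n \gamma_{nc}\,(x_n - \mu_{x,c})^2}{N_c},$$

and analogously for $y$. The quantity tracked for monitoring is the log-likelihood $\log L = \sum_n \log \sum_c \pi_c \, p(x_n, y_n \mid c)$.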
In [5]:
import numpy as np
import matplotlib.pyplot as plt
# --- 1. LOAD & PREPARE DATA FROM Large Numbers.pdf ---
data = [
("Abhishek Subba", 69.10, 95.90),
("Abishek Adhikari", 63.71, 100.64),
("Anjana Subba", 82.00, 108.46),
("Arpan Rai", 82.81, 101.28),
("Arpana Ghimirey", 78.34, 65.26),
("Chimi Dolma Gurung", 70.38, 60.00),
("Dawa Kelwang Keltshok", 73.07, 100.26),
("Jamyang Gurung", 77.57, 100.13),
("Jamyang Tenzin Namgyel", 75.23, 102.18),
("Jigme Tenzin Wangpo", 75.83, 100.26),
("Karma Dema Chokey", 75.55, 101.03),
("Kishan Rai", 79.70, 102.56),
("Kuenga Rinchen", 62.28, 57.82),
("Leki Tshomo", 65.09, 100.26),
("Lhakey Choden", 83.23, 58.85),
("Melan Rai", 71.27, 57.44),
("Mercy Jeshron Subba", 67.59, 100.77),
("Najimul Mia", 56.79, 101.03),
("Nima Kelwang Keltshok", 77.49, 100.64),
("Radha Dulal", 72.74, 102.56),
("Rigyel Singer", 76.60, 100.90),
("Susil Acharja", 73.73, 101.79),
("Tashi Tshokey Wangmo", 81.97, 102.56),
("Tashi Wangchuk", 85.39, 60.51),
("Tenzin Sonam Dolkar", 80.88, 103.59),
("Yeshey Tshoki", 85.73, 52.82),
("Yogira Kami", 80.36, 100.38),
]
names = [row[0] for row in data]
x = np.array([row[1] for row in data]) # Accuracy (%)
y = np.array([row[2] for row in data]) # Score (%)
npts = len(x)
nclusters = 3
nsteps = 20
np.random.seed(42)
# --- 2. INITIALIZE MODEL PARAMETERS ---
indices = np.random.choice(npts, size=nclusters, replace=False)
mux = x[indices].copy()
muy = y[indices].copy()
varx = np.ones(nclusters) * ((x.max() - x.min()) ** 2)
vary = np.ones(nclusters) * ((y.max() - y.min()) ** 2)
pc = np.ones(nclusters) / nclusters
# --- 3. EXPECTATION-MAXIMIZATION ITERATION ---
log_likelihoods = []
for i in range(nsteps):
    # --- E-STEP ---
    # Broadcast to shape (npts, nclusters)
    xm = x[:, None]        # (npts, 1)
    ym = y[:, None]        # (npts, 1)
    muxm = mux[None, :]    # (1, nclusters)
    muym = muy[None, :]    # (1, nclusters)
    varxm = varx[None, :]
    varym = vary[None, :]
    pcm = pc[None, :]
    # Gaussian per dimension (diagonal covariance)
    px = (1 / np.sqrt(2 * np.pi * varxm)) * np.exp(-0.5 * (xm - muxm)**2 / varxm)
    py = (1 / np.sqrt(2 * np.pi * varym)) * np.exp(-0.5 * (ym - muym)**2 / varym)
    pvgc = px * py                                    # p(v|c)
    pvc = pvgc * pcm                                  # p(v,c)
    sum_pvc = np.sum(pvc, axis=1, keepdims=True)      # (npts, 1)
    pcgv = pvc / (sum_pvc + 1e-12)                    # p(c|v), avoid division by zero
    # Log-likelihood (optional, for monitoring); epsilon added after summing over clusters
    log_likelihood = np.sum(np.log(np.sum(pvc, axis=1) + 1e-12))
    log_likelihoods.append(log_likelihood)
    # --- M-STEP ---
    Nk = np.sum(pcgv, axis=0)                         # shape (nclusters,)
    pc = Nk / npts
    mux = np.sum(xm * pcgv, axis=0) / Nk
    muy = np.sum(ym * pcgv, axis=0) / Nk
    eps = 1e-6  # small variance regularizer to keep variances strictly positive
    varx = eps + np.sum((xm - mux[None, :])**2 * pcgv, axis=0) / Nk
    vary = eps + np.sum((ym - muy[None, :])**2 * pcgv, axis=0) / Nk
# --- 4. PLOT FINAL RESULT ---
plt.figure(figsize=(8, 6))
plt.plot(x, y, 'bo', label='Students', alpha=0.7)
# Plot final cluster centers with ±1σ error bars (one per axis)
plt.errorbar(mux, muy,
xerr=np.sqrt(varx), yerr=np.sqrt(vary),
fmt='ro', markersize=12, capsize=5,
label='Final Cluster Centers ±1σ')
# Highlight low-score outliers
low_score_mask = y < 70
plt.plot(x[low_score_mask], y[low_score_mask], 'rx', markersize=10, label='Low-Score Group')
plt.xlabel('Accuracy (%)')
plt.ylabel('Score (%)')
plt.title('GMM After EM (nsteps=20, fixed iteration count)')
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()
# --- 5. PRINT FINAL PARAMETERS & LOG-LIKELIHOOD ---
print(f"Finished {nsteps} EM steps (fixed iteration count).")
print(f"Final log-likelihood: {log_likelihoods[-1]:.2f}")
print("\nFinal cluster parameters:")
for k in range(nclusters):
    print(f"  Cluster {k+1}: "
          f"μ_acc={mux[k]:.2f}%, μ_score={muy[k]:.2f}%, "
          f"σ_acc={np.sqrt(varx[k]):.2f}, σ_score={np.sqrt(vary[k]):.2f}, "
          f"π={pc[k]:.3f}")
Finished 20 EM steps (fixed iteration count).
Final log-likelihood: -165.39

Final cluster parameters:
  Cluster 1: μ_acc=78.13%, μ_score=102.04%, σ_acc=3.22, σ_score=2.14, π=0.476
  Cluster 2: μ_acc=67.68%, μ_score=100.14%, σ_acc=6.20, σ_score=1.79, π=0.265
  Cluster 3: μ_acc=76.66%, μ_score=58.96%, σ_acc=8.28, σ_score=3.47, π=0.259
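The E-step above multiplies per-dimension Gaussian densities directly, which can underflow once a component's variance becomes small. A minimal log-space variant of the same E-step, shown only as a sketch; it assumes SciPy is available and reuses the `x`, `y`, `mux`, `muy`, `varx`, `vary`, `pc` arrays from the cell above:

import numpy as np
from scipy.special import logsumexp  # assumes SciPy is installed

# Per-dimension Gaussian log-densities, shape (npts, nclusters)
log_px = -0.5 * ((x[:, None] - mux[None, :])**2 / varx[None, :]
                 + np.log(2 * np.pi * varx[None, :]))
log_py = -0.5 * ((y[:, None] - muy[None, :])**2 / vary[None, :]
                 + np.log(2 * np.pi * vary[None, :]))

log_pvc = log_px + log_py + np.log(pc[None, :])      # log p(v, c)
log_pv = logsumexp(log_pvc, axis=1, keepdims=True)   # log p(v)
pcgv = np.exp(log_pvc - log_pv)                      # responsibilities p(c | v)
log_likelihood = log_pv.sum()                        # same quantity tracked above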
Visualization of Estimated Density¶
In [7]:
import numpy as np
import matplotlib.pyplot as plt
# --- 1. DATA (from Large Numbers.pdf) ---
# Accuracy (%) → x, Score (%) → y
data = [
("Abhishek Subba", 69.10, 95.90),
("Abishek Adhikari", 63.71, 100.64),
("Anjana Subba", 82.00, 108.46),
("Arpan Rai", 82.81, 101.28),
("Arpana Ghimirey", 78.34, 65.26),
("Chimi Dolma Gurung", 70.38, 60.00),
("Dawa Kelwang Keltshok", 73.07, 100.26),
("Jamyang Gurung", 77.57, 100.13),
("Jamyang Tenzin Namgyel", 75.23, 102.18),
("Jigme Tenzin Wangpo", 75.83, 100.26),
("Karma Dema Chokey", 75.55, 101.03),
("Kishan Rai", 79.70, 102.56),
("Kuenga Rinchen", 62.28, 57.82),
("Leki Tshomo", 65.09, 100.26),
("Lhakey Choden", 83.23, 58.85),
("Melan Rai", 71.27, 57.44),
("Mercy Jeshron Subba", 67.59, 100.77),
("Najimul Mia", 56.79, 101.03),
("Nima Kelwang Keltshok", 77.49, 100.64),
("Radha Dulal", 72.74, 102.56),
("Rigyel Singer", 76.60, 100.90),
("Susil Acharja", 73.73, 101.79),
("Tashi Tshokey Wangmo", 81.97, 102.56),
("Tashi Wangchuk", 85.39, 60.51),
("Tenzin Sonam Dolkar", 80.88, 103.59),
("Yeshey Tshoki", 85.73, 52.82),
("Yogira Kami", 80.36, 100.38),
]
x = np.array([row[1] for row in data]) # Accuracy (%)
y = np.array([row[2] for row in data]) # Score (%)
# --- 2. GMM PARAMETERS FOR THE DENSITY PLOT ---
# Illustrative parameter values in the spirit of the converged EM run above
# (they do not reproduce it exactly); substitute the actual EM output if desired.
mux = np.array([62.0, 75.5, 82.5])    # e.g., low/mid/high accuracy centers
muy = np.array([57.5, 101.0, 102.5])  # corresponding score centers
varx = np.array([30.0, 15.0, 8.0])    # variances (Accuracy)
vary = np.array([20.0, 2.5, 1.2])     # variances (Score)
pc = np.array([0.15, 0.55, 0.30])     # mixing weights
# --- 3. DENSITY GRID ---
nplot = 100
xplot = np.linspace(x.min() - 2, x.max() + 2, nplot)
yplot = np.linspace(y.min() - 2, y.max() + 2, nplot)
X, Y = np.meshgrid(xplot, yplot)
p = np.zeros_like(X)
# Sum over clusters
for c in range(len(pc)):
    px = np.exp(-0.5 * (X - mux[c])**2 / varx[c]) / np.sqrt(2 * np.pi * varx[c])
    py = np.exp(-0.5 * (Y - muy[c])**2 / vary[c]) / np.sqrt(2 * np.pi * vary[c])
    p += px * py * pc[c]
# --- 4. PLOT: 3D SURFACE + DATA OVERLAY ---
fig = plt.figure(figsize=(12, 5))
# 3D Surface
ax1 = fig.add_subplot(121, projection='3d')
surf = ax1.plot_surface(X, Y, p, cmap='viridis', alpha=0.9, linewidth=0, antialiased=True)
ax1.scatter(x, y, np.zeros_like(x), color='red', s=30, label='Students', depthshade=False)
ax1.set_xlabel('Accuracy (%)')
ax1.set_ylabel('Score (%)')
ax1.set_zlabel('Density p(x,y)')
ax1.set_title('Estimated GMM Density (3D)')
ax1.view_init(elev=30, azim=-60)
fig.colorbar(surf, ax=ax1, shrink=0.5, aspect=10)
# 2D Contour (more interpretable)
ax2 = fig.add_subplot(122)
contour = ax2.contourf(X, Y, p, levels=20, cmap='viridis')
ax2.scatter(x, y, c='white', edgecolor='k', s=40, label='Students')
ax2.set_xlabel('Accuracy (%)')
ax2.set_ylabel('Score (%)')
ax2.set_title('GMM Density Contours + Data')
ax2.legend()
fig.colorbar(contour, ax=ax2)
plt.tight_layout()
plt.show()
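If the density needs to be evaluated elsewhere (for example with the actual EM output instead of the illustrative parameters above), the grid loop can be wrapped in a small helper. A sketch only; the name `gmm_density` is introduced here and is not part of the original cells:

import numpy as np

def gmm_density(X, Y, mux, muy, varx, vary, pc):
    """Diagonal-covariance GMM density p(x, y) evaluated on arrays X, Y."""
    p = np.zeros_like(X, dtype=float)
    for c in range(len(pc)):
        px = np.exp(-0.5 * (X - mux[c])**2 / varx[c]) / np.sqrt(2 * np.pi * varx[c])
        py = np.exp(-0.5 * (Y - muy[c])**2 / vary[c]) / np.sqrt(2 * np.pi * vary[c])
        p += pc[c] * px * py
    return p

# Usage with the grid defined above:
# p = gmm_density(X, Y, mux, muy, varx, vary, pc)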
In [8]:
import numpy as np
import matplotlib.pyplot as plt
# --- 1. PARSE DATA FROM Large Numbers.pdf ---
# Extract Accuracy (%) and Score (%), convert to float
raw_data = [
("Abhishek Subba", "69.10%", "95.90%"),
("Abishek Adhikari", "63.71%", "100.64%"),
("Anjana Subba", "82.00%", "108.46%"),
("Arpan Rai", "82.81%", "101.28%"),
("Arpana Ghimirey", "78.34%", "65.26%"),
("Chimi Dolma Gurung", "70.38%", "60.00%"),
("Dawa Kelwang Keltshok", "73.07%", "100.26%"),
("Jamyang Gurung", "77.57%", "100.13%"),
("Jamyang Tenzin Namgyel", "75.23%", "102.18%"),
("Jigme Tenzin Wangpo", "75.83%", "100.26%"),
("Karma Dema Chokey", "75.55%", "101.03%"),
("Kishan Rai", "79.70%", "102.56%"),
("Kuenga Rinchen", "62.28%", "57.82%"),
("Leki Tshomo", "65.09%", "100.26%"),
("Lhakey Choden", "83.23%", "58.85%"),
("Melan Rai", "71.27%", "57.44%"),
("Mercy Jeshron Subba", "67.59%", "100.77%"),
("Najimul Mia", "56.79%", "101.03%"),
("Nima Kelwang Keltshok", "77.49%", "100.64%"),
("Radha Dulal", "72.74%", "102.56%"),
("Rigyel Singer", "76.60%", "100.90%"),
("Susil Acharja", "73.73%", "101.79%"),
("Tashi Tshokey Wangmo", "81.97%", "102.56%"),
("Tashi Wangchuk", "85.39%", "60.51%"),
("Tenzin Sonam Dolkar", "80.88%", "103.59%"),
("Yeshey Tshoki", "85.73%", "52.82%"),
("Yogira Kami", "80.36%", "100.38%"),
]
names = [r[0] for r in raw_data]
x = np.array([float(r[1].rstrip('%')) for r in raw_data]) # Accuracy (%)
y = np.array([float(r[2].rstrip('%')) for r in raw_data]) # Score (%)
npts = len(x)
nclusters = 3
nsteps = 50
np.random.seed(2025)
# --- 2. INITIALIZE GMM PARAMETERS ---
# Randomly choose centers from data
indices = np.random.choice(npts, size=nclusters, replace=False)
mux = x[indices].astype(float).copy()
muy = y[indices].astype(float).copy()
# Initial variances: use global range², but scaled
varx = np.full(nclusters, (x.max() - x.min())**2 / 4)
vary = np.full(nclusters, (y.max() - y.min())**2 / 4)
pc = np.full(nclusters, 1.0 / nclusters)
log_likelihoods = []
# --- 3. E-M ITERATION WITH STABILITY & EARLY STOPPING ---
tolerance = 1e-5
for i in range(nsteps):
    # --- E-STEP: Compute responsibilities p(c|x,y) ---
    # Broadcasting to shape (npts, nclusters); readable and memory-friendly (no explicit outer products)
    xm = x[:, None]        # (npts, 1)
    ym = y[:, None]        # (npts, 1)
    muxm = mux[None, :]    # (1, nclusters)
    muym = muy[None, :]
    varxm = varx[None, :]
    varym = vary[None, :]
    pcm = pc[None, :]      # (1, nclusters)
    # Gaussian densities (diagonal covariance)
    px = np.exp(-0.5 * (xm - muxm)**2 / varxm) / np.sqrt(2 * np.pi * varxm)
    py = np.exp(-0.5 * (ym - muym)**2 / varym) / np.sqrt(2 * np.pi * varym)
    pvgc = px * py                                   # p(x,y | c)
    pvc = pvgc * pcm                                 # p(c, x, y)
    sum_pvc = np.sum(pvc, axis=1, keepdims=True)     # (npts, 1)
    # Responsibility: p(c | x,y)
    pcgv = pvc / (sum_pvc + 1e-15)
    # --- M-STEP: Update parameters ---
    Nk = np.sum(pcgv, axis=0)                        # (nclusters,)
    Nk_safe = np.maximum(Nk, 1e-8)                   # prevent division by zero
    # 1. Mixing weights
    pc = Nk / npts
    # 2. Means
    mux = np.sum(xm * pcgv, axis=0) / Nk_safe
    muy = np.sum(ym * pcgv, axis=0) / Nk_safe
    # 3. Variances (with floor for stability)
    varx = np.sum((xm - mux[None, :])**2 * pcgv, axis=0) / Nk_safe
    vary = np.sum((ym - muy[None, :])**2 * pcgv, axis=0) / Nk_safe
    # Variance floor (prevents collapse)
    var_floor = 1e-4
    varx = np.maximum(varx, var_floor)
    vary = np.maximum(vary, var_floor)
    # --- LOG-LIKELIHOOD ---
    log_eps = 1e-12
    loglik = np.sum(np.log(np.sum(pvc, axis=1) + log_eps))
    log_likelihoods.append(loglik)
    # Optional: print progress
    print(f"Step {i+1:2d}/{nsteps}: Log-Likelihood = {loglik:9.3f}")
    # Early stopping
    if i > 0 and abs(log_likelihoods[-1] - log_likelihoods[-2]) < tolerance:
        print(f"→ Converged at step {i+1}")
        break
# --- 4. FINAL PLOT: DATA + CLUSTER CENTERS ---
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c='steelblue', label='Students', alpha=0.7)
# Plot final means with ±1σ error bars
plt.errorbar(mux, muy,
xerr=np.sqrt(varx), yerr=np.sqrt(vary),
fmt='ro', markersize=10, capsize=4,
label='Cluster Centers ±1σ')
# Highlight low-score outliers (Score < 70%)
low_mask = y < 70
if np.any(low_mask):
    plt.scatter(x[low_mask], y[low_mask],
                facecolors='none', edgecolors='red', s=100,
                linewidth=2, label='Low-Score Group')
plt.xlabel('Accuracy (%)')
plt.ylabel('Score (%)')
plt.title('GMM After EM (Converged)')
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()
# --- 5. SUMMARY ---
print("\n✅ Final GMM Parameters:")
for k in range(nclusters):
    print(f"Cluster {k+1}: "
          f"μ_acc = {mux[k]:5.2f}%, "
          f"μ_score = {muy[k]:6.2f}%, "
          f"σ_acc = {np.sqrt(varx[k]):5.2f}, "
          f"σ_score = {np.sqrt(vary[k]):5.2f}, "
          f"π = {pc[k]:.3f}")
Step  1/50: Log-Likelihood =  -226.443
Step  2/50: Log-Likelihood =  -208.937
Step  3/50: Log-Likelihood =  -208.572
Step  4/50: Log-Likelihood =  -207.783
Step  5/50: Log-Likelihood =  -205.086
Step  6/50: Log-Likelihood =  -193.200
Step  7/50: Log-Likelihood =  -182.898
Step  8/50: Log-Likelihood =  -178.456
Step  9/50: Log-Likelihood =  -175.307
Step 10/50: Log-Likelihood =  -172.033
Step 11/50: Log-Likelihood =  -165.974
Step 12/50: Log-Likelihood =  -163.879
Step 13/50: Log-Likelihood =  -162.985
Step 14/50: Log-Likelihood =  -162.758
Step 15/50: Log-Likelihood =  -162.631
Step 16/50: Log-Likelihood =  -162.545
Step 17/50: Log-Likelihood =  -162.480
Step 18/50: Log-Likelihood =  -162.429
Step 19/50: Log-Likelihood =  -162.388
Step 20/50: Log-Likelihood =  -162.356
Step 21/50: Log-Likelihood =  -162.329
Step 22/50: Log-Likelihood =  -162.307
Step 23/50: Log-Likelihood =  -162.289
Step 24/50: Log-Likelihood =  -162.272
Step 25/50: Log-Likelihood =  -162.258
Step 26/50: Log-Likelihood =  -162.244
Step 27/50: Log-Likelihood =  -162.230
Step 28/50: Log-Likelihood =  -162.217
Step 29/50: Log-Likelihood =  -162.203
Step 30/50: Log-Likelihood =  -162.189
Step 31/50: Log-Likelihood =  -162.172
Step 32/50: Log-Likelihood =  -162.153
Step 33/50: Log-Likelihood =  -162.129
Step 34/50: Log-Likelihood =  -162.098
Step 35/50: Log-Likelihood =  -162.055
Step 36/50: Log-Likelihood =  -161.990
Step 37/50: Log-Likelihood =  -161.887
Step 38/50: Log-Likelihood =  -161.709
Step 39/50: Log-Likelihood =  -161.394
Step 40/50: Log-Likelihood =  -160.902
Step 41/50: Log-Likelihood =  -160.369
Step 42/50: Log-Likelihood =  -160.011
Step 43/50: Log-Likelihood =  -159.865
Step 44/50: Log-Likelihood =  -159.820
Step 45/50: Log-Likelihood =  -159.805
Step 46/50: Log-Likelihood =  -159.800
Step 47/50: Log-Likelihood =  -159.798
Step 48/50: Log-Likelihood =  -159.797
Step 49/50: Log-Likelihood =  -159.797
Step 50/50: Log-Likelihood =  -159.797
✅ Final GMM Parameters:
Cluster 1: μ_acc = 70.89%, μ_score = 100.61%, σ_acc =  7.47, σ_score =  0.33, π = 0.335
Cluster 2: μ_acc = 77.28%, μ_score = 101.98%, σ_acc =  4.30, σ_score =  2.83, π = 0.405
Cluster 3: μ_acc = 76.66%, μ_score =  58.96%, σ_acc =  8.28, σ_score =  3.47, π = 0.259
In [9]:
# --- 4b. PLOT CONVERGENCE OF LOG-LIKELIHOOD ---
if 'log_likelihoods' in locals() and len(log_likelihoods) > 0:
    ll_array = np.array(log_likelihoods)
    # Remove non-finite entries (NaN/Inf), robust against numerical issues
    finite_mask = np.isfinite(ll_array)
    ll_clean = ll_array[finite_mask]
    steps_clean = np.arange(1, len(ll_array) + 1)[finite_mask]
    if len(ll_clean) == 0:
        print("⚠️ Warning: No finite log-likelihood values recorded. Convergence plot skipped.")
    else:
        plt.figure(figsize=(8, 5))
        plt.plot(steps_clean, ll_clean,
                 marker='o', markersize=5, linestyle='-', linewidth=2,
                 color='tab:blue', label='Log-Likelihood')
        # Optional: annotate final value
        plt.annotate(f"Final: {ll_clean[-1]:.2f}",
                     xy=(steps_clean[-1], ll_clean[-1]),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=10, ha='left', va='bottom',
                     bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.7))
        plt.title('GMM EM Algorithm: Log-Likelihood Convergence', fontsize=14)
        plt.xlabel('EM Iteration', fontsize=12)
        plt.ylabel('Log-Likelihood $\\mathcal{L}(\\theta)$', fontsize=12)
        plt.grid(True, which='both', linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.legend()
        plt.show()
        # Optional diagnostic: print convergence delta
        if len(ll_clean) > 1:
            delta = np.abs(ll_clean[-1] - ll_clean[-2])
            print(f"→ Final improvement: |ΔLL| = {delta:.6f}")
            if delta < 1e-5:
                print("✅ Convergence criterion satisfied (ΔLL < 1e-5).")
else:
    print("No log-likelihood history found. Ensure `log_likelihoods` list is defined and updated in the EM loop.")
→ Final improvement: |ΔLL| = 0.000110
Model Selection (BIC Evaluation)¶
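For reference, the quantity minimized below is the Bayesian Information Criterion; for the diagonal-covariance 2-D mixture used here the free parameters are $(K-1)$ mixing weights, $2K$ means, and $2K$ variances:

$$\mathrm{BIC}(K) \;=\; -2\log L(\hat\theta_K) + \mathrm{dof}(K)\,\ln N, \qquad \mathrm{dof}(K) = (K-1) + 2K + 2K = 5K - 1.$$

Lower is better; the $\mathrm{dof}(K)\ln N$ term penalizes models with more components.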
In [10]:
import numpy as np
import matplotlib.pyplot as plt
# --- 1. LOAD AND PREPROCESS DATA FROM Large Numbers.pdf ---
data = [
("Abhishek Subba", 69.10, 6951, 748, 95.90, 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"),
("Abishek Adhikari", 63.71, 4985, 785, 100.64, 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"),
("Anjana Subba", 82.00, 5311, 846, 108.46, 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"),
("Arpan Rai", 82.81, 5547, 790, 101.28, 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"),
("Arpana Ghimirey", 78.34, 4773, 509, 65.26, 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"),
("Chimi Dolma Gurung", 70.38, 4093, 468, 60.00, 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"),
("Dawa Kelwang Keltshok", 73.07, 4601, 782, 100.26, 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"),
("Jamyang Gurung", 77.57, 5469, 781, 100.13, 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"),
("Jamyang Tenzin Namgyel", 75.23, 5180, 797, 102.18, 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"),
("Jigme Tenzin Wangpo", 75.83, 5037, 782, 100.26, 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"),
("Karma Dema Chokey", 75.55, 16432, 788, 101.03, 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"),
("Kishan Rai", 79.70, 4460, 800, 102.56, 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"),
("Kuenga Rinchen", 62.28, 9502, 451, 57.82, 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"),
("Leki Tshomo", 65.09, 15455, 782, 100.26, 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"),
("Lhakey Choden", 83.23, 2665, 459, 58.85, 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"),
("Melan Rai", 71.27, 7520, 448, 57.44, 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"),
("Mercy Jeshron Subba", 67.59, 7630, 786, 100.77, 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"),
("Najimul Mia", 56.79, 10148, 788, 101.03, 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"),
("Nima Kelwang Keltshok", 77.49, 5491, 785, 100.64, 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"),
("Radha Dulal", 72.74, 7431, 800, 102.56, 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"),
("Rigyel Singer", 76.60, 10525, 787, 100.90, 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"),
("Susil Acharja", 73.73, 5372, 794, 101.79, 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"),
("Tashi Tshokey Wangmo", 81.97, 9897, 800, 102.56, 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"),
("Tashi Wangchuk", 85.39, 5708, 472, 60.51, 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"),
("Tenzin Sonam Dolkar", 80.88, 9247, 808, 103.59, 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"),
("Yeshey Tshoki", 85.73, 2958, 412, 52.82, 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"),
("Yogira Kami", 80.36, 7782, 783, 100.38, 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"),
]
# Choose features: Accuracy (%) & Score (%) → natural 2D grouping
x = np.array([row[1] for row in data]) # Accuracy
y = np.array([row[4] for row in data]) # Score (%)
npts = len(x)
# --- 2. BIC FUNCTION FOR DIAGONAL GMM (D=2) ---
def gmm_bic(log_likelihood, K, N):
    """
    Compute BIC for a diagonal-covariance GMM in D=2.
    Parameters:
        log_likelihood: final log-likelihood L(θ)
        K: number of components
        N: number of data points
    Returns:
        BIC = -2*L + dof * log(N)
    Degrees of freedom:
        - (K-1) mixing weights (sum to 1)
        - K*2 means (μ_x, μ_y)
        - K*2 variances (σ²_x, σ²_y)
        → dof = (K-1) + 2K + 2K = 5K - 1
    """
    dof = 5 * K - 1
    return -2 * log_likelihood + dof * np.log(N)
# --- 3. BIC MODEL SELECTION LOOP ---
k_range = range(1, 11) # Test K = 1 to 10
bic_scores = []
log_likelihoods = []
np.random.seed(2025) # reproducible across runs
print("🔍 Running BIC model selection (K = 1 to 10)...")
for K in k_range:
    # ---- Initialize GMM for current K ----
    # Random initialization: pick K distinct data points as starting centers
    # (each restart below redraws its own centers; varx/vary/pc serve as shared starting values)
    indices = np.random.choice(npts, size=K, replace=False)
    mux = x[indices].copy()
    muy = y[indices].copy()
    varx = np.full(K, ((x.max() - x.min()) / 3)**2)
    vary = np.full(K, ((y.max() - y.min()) / 3)**2)
    pc = np.full(K, 1.0 / K)
    # Keep the best log-likelihood over 3 restarts to reduce the risk of poor local optima
    best_loglik = -np.inf
    for restart in range(3):
        mux_r = x[np.random.choice(npts, K, replace=False)].copy()
        muy_r = y[np.random.choice(npts, K, replace=False)].copy()
        varx_r = varx.copy()
        vary_r = vary.copy()
        pc_r = pc.copy()
        # EM iterations
        loglik_hist = []
        for step in range(100):
            # E-step (vectorized, with small epsilons for stability)
            xm = x[:, None]
            ym = y[:, None]
            muxm = mux_r[None, :]
            muym = muy_r[None, :]
            varxm = varx_r[None, :]
            varym = vary_r[None, :]
            pcm = pc_r[None, :]
            # Compute responsibilities
            px = np.exp(-0.5 * (xm - muxm)**2 / (varxm + 1e-8)) / np.sqrt(2 * np.pi * (varxm + 1e-8))
            py = np.exp(-0.5 * (ym - muym)**2 / (varym + 1e-8)) / np.sqrt(2 * np.pi * (varym + 1e-8))
            pvc = px * py * pcm                               # p(x,y,c)
            sum_pvc = np.sum(pvc, axis=1, keepdims=True)
            pcgv = pvc / (sum_pvc + 1e-15)
            # Log-likelihood
            loglik = np.sum(np.log(np.sum(pvc, axis=1) + 1e-12))
            loglik_hist.append(loglik)
            # M-step
            Nk = np.sum(pcgv, axis=0)
            Nk_safe = np.maximum(Nk, 1e-8)
            pc_r = Nk / npts
            mux_r = np.sum(xm * pcgv, axis=0) / Nk_safe
            muy_r = np.sum(ym * pcgv, axis=0) / Nk_safe
            varx_r = np.sum((xm - mux_r[None, :])**2 * pcgv, axis=0) / Nk_safe
            vary_r = np.sum((ym - muy_r[None, :])**2 * pcgv, axis=0) / Nk_safe
            # Variance floor
            varx_r = np.maximum(varx_r, 1e-4)
            vary_r = np.maximum(vary_r, 1e-4)
            # Early stopping
            if step > 5 and abs(loglik_hist[-1] - loglik_hist[-2]) < 1e-6:
                break
        if loglik > best_loglik:
            best_loglik = loglik
    # Compute BIC for this K using the best restart
    bic = gmm_bic(best_loglik, K, npts)
    bic_scores.append(bic)
    log_likelihoods.append(best_loglik)
    print(f"  K={K:2d}: LogL = {best_loglik:8.2f} → BIC = {bic:8.2f}")
# --- 4. DETERMINE OPTIMAL K ---
optimal_K = k_range[np.argmin(bic_scores)]
print(f"\n✅ Optimal number of clusters (min BIC): K = {optimal_K}")
# --- 5. PLOT BIC CURVE ---
plt.figure(figsize=(8, 5))
plt.plot(k_range, bic_scores, 'bo-', linewidth=2, markersize=6, label='BIC')
plt.axvline(optimal_K, color='red', linestyle='--', alpha=0.7, label=f'Optimal K = {optimal_K}')
plt.title('BIC Model Selection for GMM (Accuracy vs. Score %)', fontsize=13)
plt.xlabel('Number of Components (K)', fontsize=12)
plt.ylabel('BIC Score', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()
# Optional: Print full table
print("\n📊 BIC Summary Table:")
print(f"{'K':>2} | {'Log-Likelihood':>14} | {'BIC':>10}")
print("-" * 28)
for K, ll, bic in zip(k_range, log_likelihoods, bic_scores):
    star = " ←" if K == optimal_K else ""
    print(f"{K:>2} | {ll:>14.2f} | {bic:>10.2f}{star}")
🔍 Running BIC model selection (K = 1 to 10)...
  K= 1: LogL =  -209.25 → BIC =   431.68
  K= 2: LogL =  -169.69 → BIC =   369.05
  K= 3: LogL =  -163.57 → BIC =   373.28
  K= 4: LogL =  -144.81 → BIC =   352.24
  K= 5: LogL =  -138.57 → BIC =   356.24
  K= 6: LogL =  -129.45 → BIC =   354.47
  K= 7: LogL =  -112.90 → BIC =   337.86
  K= 8: LogL =  -105.77 → BIC =   340.08
  K= 9: LogL =   -93.41 → BIC =   331.83
  K=10: LogL =   -91.54 → BIC =   344.58

✅ Optimal number of clusters (min BIC): K = 9
📊 BIC Summary Table:
 K | Log-Likelihood |        BIC
----------------------------
 1 |        -209.25 |     431.68
 2 |        -169.69 |     369.05
 3 |        -163.57 |     373.28
 4 |        -144.81 |     352.24
 5 |        -138.57 |     356.24
 6 |        -129.45 |     354.47
 7 |        -112.90 |     337.86
 8 |        -105.77 |     340.08
 9 |         -93.41 |     331.83 ←
10 |         -91.54 |     344.58
In [11]:
# --- Plot BIC vs. K with enhanced visualization ---
plt.figure(figsize=(9, 5.5))
plt.plot(k_range, bic_scores,
marker='o', markersize=6, linestyle='-', linewidth=2,
color='tab:blue', label='BIC Score')
# Highlight optimal K
optimal_idx = np.argmin(bic_scores)
optimal_K = k_range[optimal_idx]
optimal_bic = bic_scores[optimal_idx]
plt.axvline(optimal_K, color='red', linestyle='--', linewidth=1.5,
label=f'Optimal K = {optimal_K}', alpha=0.8)
plt.scatter([optimal_K], [optimal_bic],
color='red', s=100, zorder=5, edgecolor='k')
# Annotate optimal point
plt.annotate(f'K={optimal_K}\nBIC={optimal_bic:.1f}',
xy=(optimal_K, optimal_bic),
xytext=(10, -20), textcoords='offset points',
fontsize=10, ha='left', va='center',
bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.6),
arrowprops=dict(arrowstyle='->', color='red', lw=1))
# Labels & aesthetics
plt.title('GMM Model Selection via BIC\n(Accuracy % vs. Score %)', fontsize=14, pad=15)
plt.xlabel('Number of Components (K)', fontsize=12)
plt.ylabel('BIC Score (Lower = Better)', fontsize=12)
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(k_range)
plt.tight_layout()
plt.show()
# ✅ Assign optimal K for downstream use (e.g., final GMM fit)
nclusters = optimal_K
print(f"✅ Selected nclusters = {nclusters} based on minimum BIC.")
✅ Selected nclusters = 9 based on minimum BIC.
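As an optional cross-check (not part of the EM code above), scikit-learn's `GaussianMixture` with `covariance_type='diag'` fits the same family of models and reports BIC directly. A minimal sketch, assuming scikit-learn is installed and that `x` and `y` are still in memory; the K it selects may differ from the loop above because of initialization and regularization differences:

import numpy as np
from sklearn.mixture import GaussianMixture

X_feat = np.column_stack([x, y])   # (npts, 2): Accuracy (%) and Score (%)
bic_sklearn = []
for K in range(1, 11):
    gm = GaussianMixture(n_components=K, covariance_type='diag',
                         n_init=3, random_state=2025)
    gm.fit(X_feat)
    bic_sklearn.append(gm.bic(X_feat))

best_K = int(np.argmin(bic_sklearn)) + 1
print(f"scikit-learn BIC minimum at K = {best_K}")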
In [ ]: