Expectation-Maximization (E-M)¶
- apparently circular logic
- use model to update latent (hidden) variables
- use latent variables to updade model
- iteration maximizes likelihood
- can use momentum for local minima
Clustering¶
- group similar data points
k-means¶
- E-M clustering algorithm
- choose a set of anchor points randomly from data
- iteration
- assign data points to the closest anchor
- update anchor to the mean of closest points
- can assess number of clusters by plotting total distance of points to anchors versus their number and look for the "elbow"
Voronoi tesselation¶
- partition of space closest to anchor points
In [20]:
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
import pandas as pd
#
# k-means parameters
#
nsteps = 1000
momentum = 0. # (kept for consistency, not used)
#
# import data from CSV
#
data = pd.read_csv('~/work/rinchen-khandu/datasets/student_depression_dataset.csv')
# ✅ FIX: Use Age and Depression columns
# Drop missing values to avoid errors
data = data[["Age", "Depression"]].dropna()
x = data["Age"].values
y = data["Depression"].values
#
# k-means function
#
def kmeans(x, y, momentum, nclusters):
# choose random initial cluster centers
indices = np.random.randint(0, len(x), nclusters)
mux = x[indices]
muy = y[indices]
for _ in range(nsteps):
# distance of each point to each cluster center
X = np.outer(x, np.ones(len(mux)))
Y = np.outer(y, np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)), mux)
Muy = np.outer(np.ones(len(x)), muy)
distances = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
mins = np.argmin(distances, axis=1)
# update cluster centers
for i in range(len(mux)):
index = np.where(mins == i)[0]
if len(index) > 0:
mux[i] = np.mean(x[index])
muy[i] = np.mean(y[index])
# compute total distance (for elbow method)
total_distance = 0
for i in range(len(mux)):
index = np.where(mins == i)[0]
total_distance += np.sum(
np.sqrt((x[index] - mux[i])**2 + (y[index] - muy[i])**2)
)
return mux, muy, total_distance
#
# plot k-means clusters
#
def plot_kmeans(x, y, mux, muy):
plt.figure()
plt.plot(x, y, '.', alpha=0.6)
plt.plot(mux, muy, 'r.', markersize=20)
plt.xlabel("Age")
plt.ylabel("Depression Score")
plt.title(f"{len(mux)} clusters (K-Means)")
plt.show()
#
# plot Voronoi diagram
#
def plot_Voronoi(x, y, mux, muy):
plt.figure()
plt.plot(x, y, '.', alpha=0.6)
vor = Voronoi(np.column_stack((mux, muy)))
voronoi_plot_2d(
vor,
show_points=True,
show_vertices=False,
point_size=20
)
plt.xlabel("Age")
plt.ylabel("Sleep Duration")
plt.title(f"{len(mux)} clusters (Voronoi)")
plt.show()
#
# run clustering for different cluster counts
#
distances = np.zeros(5)
mux, muy, distances[0] = kmeans(x, y, momentum, 1)
plot_kmeans(x, y, mux, muy)
mux, muy, distances[1] = kmeans(x, y, momentum, 2)
plot_kmeans(x, y, mux, muy)
mux, muy, distances[2] = kmeans(x, y, momentum, 3)
plot_Voronoi(x, y, mux, muy)
mux, muy, distances[3] = kmeans(x, y, momentum, 4)
plot_Voronoi(x, y, mux, muy)
mux, muy, distances[4] = kmeans(x, y, momentum, 5)
plot_Voronoi(x, y, mux, muy)
#
# elbow method plot
#
plt.figure()
plt.plot(np.arange(1, 6), distances, 'o-')
plt.xlabel("Number of clusters")
plt.ylabel("Total distance to clusters")
plt.title("Elbow Method (Age vs Depression)")
plt.show()
--------------------------------------------------------------------------- QhullError Traceback (most recent call last) Cell In[20], line 109 106 plot_kmeans(x, y, mux, muy) 108 mux, muy, distances[2] = kmeans(x, y, momentum, 3) --> 109 plot_Voronoi(x, y, mux, muy) 111 mux, muy, distances[3] = kmeans(x, y, momentum, 4) 112 plot_Voronoi(x, y, mux, muy) Cell In[20], line 83, in plot_Voronoi(x, y, mux, muy) 80 plt.figure() 81 plt.plot(x, y, '.', alpha=0.6) ---> 83 vor = Voronoi(np.column_stack((mux, muy))) 84 voronoi_plot_2d( 85 vor, 86 show_points=True, 87 show_vertices=False, 88 point_size=20 89 ) 91 plt.xlabel("Age") File scipy/spatial/_qhull.pyx:2680, in scipy.spatial._qhull.Voronoi.__init__() File scipy/spatial/_qhull.pyx:356, in scipy.spatial._qhull._Qhull.__init__() QhullError: QH6154 Qhull precision error: Initial simplex is flat (facet 1 is coplanar with the interior point) While executing: | qhull v Qz Qbb Qc Options selected for Qhull 2020.2.r 2020/08/31: run-id 1577805114 voronoi Qz-infinity-point Qbbound-last Qcoplanar-keep _pre-merge _zero-centrum Qinterior-keep Pgood _max-width 11 Error-roundoff 4.3e-14 _one-merge 3e-13 Visible-distance 8.6e-14 U-max-coplanar 8.6e-14 Width-outside 1.7e-13 _wide-facet 5.1e-13 _maxoutside 3.4e-13 The input to qhull appears to be less than 3 dimensional, or a computation has overflowed. Qhull could not construct a clearly convex simplex from points: - p2(v4): 25 0 10 - p3(v3): 25 0 31 - p0(v2): 31 0 26 - p1(v1): 20 0 0 The center point is coplanar with a facet, or a vertex is coplanar with a neighboring facet. The maximum round off error for computing distances is 4.3e-14. The center point, facets and distances to the center point are as follows: center point 25.2 0 16.91 facet p3 p0 p1 distance= 0 facet p2 p0 p1 distance= 0 facet p2 p3 p1 distance= 0 facet p2 p3 p0 distance= 0 These points either have a maximum or minimum x-coordinate, or they maximize the determinant for k coordinates. Trial points are first selected from points that maximize a coordinate. The min and max coordinates for each dimension are: 0: 19.92 30.87 difference= 10.95 1: 0 0 difference= 0 2: 0 30.87 difference= 30.87 If the input should be full dimensional, you have several options that may determine an initial simplex: - use 'QJ' to joggle the input and make it full dimensional - use 'QbB' to scale the points to the unit cube - use 'QR0' to randomly rotate the input for different maximum points - use 'Qs' to search all points for the initial simplex - use 'En' to specify a maximum roundoff error less than 4.3e-14. - trace execution with 'T3' to see the determinant for each point. If the input is lower dimensional: - use 'QJ' to joggle the input and make it full dimensional - use 'Qbk:0Bk:0' to delete coordinate k from the input. You should pick the coordinate with the least range. The hull will have the correct topology. - determine the flat containing the points, rotate the points into a coordinate plane, and delete the other coordinates. - add one or more points to make the input full dimensional.
modeling functions¶
- cluster function in this example is locally quadratic
In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#
# Load your own data
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
data = data[["Age", "Depression"]].dropna()
x = data["Age"].values
y = data["Depression"].values
# sort data (important for plotting smooth curves)
idx = np.argsort(x)
x = x[idx]
y = y[idx]
npts = len(x)
#
# function cluster-weighted modeling parameters
#
nclusters = 5
minvar = 0.01
niterations = 100
np.random.seed(10)
xplot = np.copy(x)
nplot = npts
#
# initialize parameters
#
mux = np.random.choice(x, nclusters)
beta = 0.1 * np.random.rand(3, nclusters)
varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters
#
# ---------- BEFORE ITERATION ----------
#
pxgc = np.exp(-(np.outer(x, np.ones(nclusters))
- np.outer(np.ones(npts), mux))**2
/ (2 * np.outer(np.ones(npts), varx))) \
/ np.sqrt(2 * np.pi * np.outer(np.ones(npts), varx))
pygxc = np.exp(-(np.outer(y, np.ones(nclusters))
- (np.outer(np.ones(npts), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :])))**2
/ (2 * np.outer(np.ones(npts), vary))) \
/ np.sqrt(2 * np.pi * np.outer(np.ones(npts), vary))
pxc = pxgc * np.outer(np.ones(npts), pc)
pxyc = pygxc * pxc
pcgxy = pxyc / np.outer(np.sum(pxyc, 1), np.ones(nclusters))
plt.figure()
plt.plot(x, y, '.', alpha=0.6)
yplot = np.sum((np.outer(np.ones(nplot), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :])) * pxc, 1) / np.sum(pxc, 1)
plt.plot(xplot, yplot)
yvarplot = np.sum((vary +
(np.outer(np.ones(nplot), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :]))**2) * pxc, 1) \
/ np.sum(pxc, 1) - yplot**2
ystdplot = np.sqrt(yvarplot)
plt.plot(xplot, yplot + ystdplot, 'k--')
plt.plot(xplot, yplot - ystdplot, 'k--')
plt.title("Function Clusters BEFORE EM Iteration")
plt.xlabel("Age")
plt.ylabel("Depression Score")
plt.show()
#
# ---------- EM ITERATION ----------
#
for step in range(niterations):
pxgc = np.exp(-(np.outer(x, np.ones(nclusters))
- np.outer(np.ones(npts), mux))**2
/ (2 * np.outer(np.ones(npts), varx))) \
/ np.sqrt(2 * np.pi * np.outer(np.ones(npts), varx))
pygxc = np.exp(-(np.outer(y, np.ones(nclusters))
- (np.outer(np.ones(npts), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :])))**2
/ (2 * np.outer(np.ones(npts), vary))) \
/ np.sqrt(2 * np.pi * np.outer(np.ones(npts), vary))
pxc = pxgc * np.outer(np.ones(npts), pc)
pxyc = pygxc * pxc
pcgxy = pxyc / np.outer(np.sum(pxyc, 1), np.ones(nclusters))
pc = np.sum(pcgxy, 0) / npts
mux = np.sum(np.outer(x, np.ones(nclusters)) * pcgxy, 0) / (npts * pc)
varx = minvar + np.sum((np.outer(x, np.ones(nclusters))
- np.outer(np.ones(npts), mux))**2
* pcgxy, 0) / (npts * pc)
a = np.array([
np.sum(np.outer(y, np.ones(nclusters)) * pcgxy, 0) / (npts * pc),
np.sum(np.outer(y * x, np.ones(nclusters)) * pcgxy, 0) / (npts * pc),
np.sum(np.outer(y * x * x, np.ones(nclusters)) * pcgxy, 0) / (npts * pc)
])
B = np.zeros((3, 3, nclusters))
for c in range(nclusters):
B[:, :, c] = [
[1,
np.sum(x * pcgxy[:, c]) / (npts * pc[c]),
np.sum(x * x * pcgxy[:, c]) / (npts * pc[c])],
[np.sum(x * pcgxy[:, c]) / (npts * pc[c]),
np.sum(x * x * pcgxy[:, c]) / (npts * pc[c]),
np.sum(x * x * x * pcgxy[:, c]) / (npts * pc[c])],
[np.sum(x * x * pcgxy[:, c]) / (npts * pc[c]),
np.sum(x * x * x * pcgxy[:, c]) / (npts * pc[c]),
np.sum(x * x * x * x * pcgxy[:, c]) / (npts * pc[c])]
]
beta[:, c] = np.linalg.inv(B[:, :, c]) @ a[:, c]
vary = minvar + np.sum((np.outer(y, np.ones(nclusters))
- (np.outer(np.ones(npts), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :])))**2
* pcgxy, 0) / (npts * pc)
#
# ---------- AFTER ITERATION ----------
#
plt.figure()
plt.plot(x, y, '.', alpha=0.6)
yplot = np.sum((np.outer(np.ones(nplot), beta[0, :])
+ np.outer(x, beta[1, :])
+ np.outer(x * x, beta[2, :])) * pxc, 1) / np.sum(pxc, 1)
plt.plot(xplot, yplot)
ystdplot = np.sqrt(yvarplot)
plt.plot(xplot, yplot + ystdplot, 'k--')
plt.plot(xplot, yplot - ystdplot, 'k--')
plt.title("Function Clusters AFTER EM Iteration")
plt.xlabel("Age")
plt.ylabel("Depression Score")
plt.show()
/tmp/ipykernel_496/3830952134.py:58: RuntimeWarning: invalid value encountered in divide pcgxy = pxyc / np.outer(np.sum(pxyc, 1), np.ones(nclusters))
/tmp/ipykernel_496/3830952134.py:102: RuntimeWarning: invalid value encountered in divide pcgxy = pxyc / np.outer(np.sum(pxyc, 1), np.ones(nclusters))
modeling states¶
- cluster function is a table of state probabilities
In [27]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#
# state cluster-weighted modeling parameters
#
dim = 2
nstates = 2
nclusters = 5
momentum = 0.9
rate = 0.1
min_var = 0.1
niterations = 100
np.random.seed(10)
#
# ---------- LOAD YOUR OWN DATA ----------
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
data = data[["Age", "Depression"]].dropna()
# define states using median split
threshold = data["Depression"].median()
states = (data["Depression"] > threshold).astype(int).values
# points matrix (2 × N)
points = np.vstack((data["Age"].values,
data["Depression"].values))
npts = points.shape[1]
#
# ---------- INITIALIZATION ----------
#
indices = np.random.randint(0, npts, nclusters)
means = points[:, indices]
dmean = np.zeros((dim, nclusters))
variances = np.outer(np.var(points, axis=1), np.ones(nclusters))
pc = np.ones(nclusters) / nclusters # p(c)
pxgc = np.zeros((nclusters, npts)) # p(x|c)
psgxc = np.ones((nclusters, nstates)) / nstates # p(s|x,c)
#
# ---------- PLOT DATA ----------
#
plt.figure()
plt.plot(points[0, states == 0], points[1, states == 0],
ls='', marker='o', label='Low Depression')
plt.plot(points[0, states == 1], points[1, states == 1],
ls='', marker='x', label='High Depression')
plt.xlabel("Age")
plt.ylabel("Depression Score")
plt.legend()
#
# ---------- PLOT VARIANCE CLUSTERS ----------
#
def plot_var():
for cluster in range(nclusters):
x, y = means[:, cluster]
dx = np.sqrt(variances[0, cluster])
dy = np.sqrt(variances[1, cluster])
label = np.argmax(psgxc[cluster, :])
plt.plot([x - dx, x + dx], [y, y], 'k')
plt.plot([x, x], [y - dy, y + dy], 'k')
plt.plot([x], [y], 'k', marker=f'${label}$', markersize=15)
plot_var()
plt.title("Variance Clusters BEFORE EM")
plt.show()
#
# ---------- EM ITERATION ----------
#
for _ in range(niterations):
# E-step
for c in range(nclusters):
mean = np.outer(means[:, c], np.ones(npts))
var = np.outer(variances[:, c], np.ones(npts))
pxgc[c, :] = np.prod(
np.exp(-(points - mean) ** 2 / (2 * var)) /
np.sqrt(2 * np.pi * var), axis=0
)
pxc = pxgc * pc[:, None]
pcgx = pxc / np.sum(pxc, axis=0)
psxc = psgxc[:, states] * pxc
pcgsx = psxc / np.sum(psxc, axis=0)
pc = np.sum(pcgsx, axis=1) / npts
# M-step
for c in range(nclusters):
newmean = (
momentum * dmean[:, c] +
np.sum(points * pcgsx[c, :], axis=1) /
np.sum(pcgsx[c, :])
)
dmean[:, c] = newmean - means[:, c]
means[:, c] += rate * dmean[:, c]
m = means[:, c][:, None]
variances[:, c] = (
np.sum((points - m) ** 2 * pcgsx[c, :], axis=1) /
np.sum(pcgsx[c, :]) + min_var
)
for s in range(nstates):
idx = states == s
psgxc[:, s] = np.sum(pcgsx[:, idx], axis=1) / np.sum(pcgsx, axis=1)
#
# ---------- FINAL PLOTS ----------
#
plt.figure()
plt.plot(points[0, states == 0], points[1, states == 0],
ls='', marker='o', label='Low Depression')
plt.plot(points[0, states == 1], points[1, states == 1],
ls='', marker='x', label='High Depression')
plot_var()
plt.xlabel("Age")
plt.ylabel("Depression Score")
plt.legend()
plt.title("Variance Clusters AFTER EM")
plt.show()
In [ ]: