[Your-Name-Here] - Fab Futures - Data Science
Home About
In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

# -------------------------------
# LOAD AND CLEAN CSV
# -------------------------------
df = pd.read_csv("Baselinedata_Grade VIII E..csv")
df = df.dropna(how="all")

rename_map = {
    "Unnamed: 1": "name",
    "Unnamed: 4": "exploration",
    "Unnamed: 5": "communication",
    "Unnamed: 6": "creativity"
}
df = df.rename(columns=rename_map)
df = df[["name", "exploration", "communication", "creativity"]]
df = df.dropna(subset=["name"])

# -------------------------------
# CONVERT RATINGS TO NUMBERS
# -------------------------------
rating_map = {
    "Very Good": 3,
    "Good": 2,
    "Need improvement": 1
}

df["exploration_score"] = df["exploration"].map(rating_map)
df["communication_score"] = df["communication"].map(rating_map)
df["creativity_score"] = df["creativity"].map(rating_map)

# Remove rows with any NaN in scores
df = df.dropna(subset=["exploration_score", "communication_score", "creativity_score"])

# -------------------------------
# PREPARE DATA
# -------------------------------
X = df[["exploration_score", "communication_score", "creativity_score"]].values
k = 3
np.random.seed(0)

# Pick valid initial centroids (no NaNs)
indices = np.random.choice(len(X), k, replace=False)
centroids = X[indices].astype(float)

# -------------------------------
# K-MEANS WITH NaN PROTECTION
# -------------------------------
def assign_clusters(X, centroids):
    d = np.sqrt(((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2))
    return np.argmin(d, axis=1)

def recompute_centroids(X, labels, k):
    new_centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        pts = X[labels == i]
        if len(pts) == 0:
            # Avoid NaN – pick a random point as centroid
            new_centroids[i] = X[np.random.randint(0, len(X))]
        else:
            new_centroids[i] = np.nanmean(pts, axis=0)
    return new_centroids

for _ in range(15):
    labels = assign_clusters(X, centroids)
    centroids = recompute_centroids(X, labels, k)

# -------------------------------
# VORONOI + CLUSTER PLOT (Cloud Style)
# -------------------------------
fig, ax = plt.subplots(figsize=(8,6))

# Add small jitter to avoid overlapping points
jitter = 0.08
X_jittered = X[:, :2] + np.random.uniform(-jitter, jitter, X[:, :2].shape)

# Scatter plot with jitter
scatter = ax.scatter(
    X_jittered[:,0], X_jittered[:,1],
    c=labels,
    cmap="viridis",
    s=100,
    edgecolor="black",
    alpha=0.7
)

# Ensure centroids have no NaNs
vor_centroids = centroids[:, :2]
if np.isnan(vor_centroids).any():
    for i in range(len(vor_centroids)):
        if np.isnan(vor_centroids[i]).any():
            vor_centroids[i] = X[np.random.randint(0, len(X)), :2]

# Create Voronoi diagram
vor = Voronoi(vor_centroids)

# Plot colored Voronoi regions
patches = []
colors = plt.cm.viridis(np.linspace(0, 1, k))

for region_index, color in zip(vor.point_region, colors):
    region = vor.regions[region_index]
    if not -1 in region and len(region) > 0:
        polygon = [vor.vertices[i] for i in region]
        patches.append(Polygon(polygon, True, facecolor=color, alpha=0.2))

p = PatchCollection(patches, match_original=True)
ax.add_collection(p)

# Plot Voronoi edges
voronoi_plot_2d(vor, ax=ax, show_vertices=False, line_colors="black", line_width=1, point_size=0)

plt.xlabel("Exploration Score")
plt.ylabel("Communication Score")
plt.title("Student Clusters (K-means with 3 Skills)")
plt.grid(True)
plt.show()

# -------------------------------
# OUTPUT STUDENT CLUSTERS
# -------------------------------
df["cluster"] = labels
print(df[["name", "exploration", "communication", "creativity", "cluster"]])
No description has been provided for this image
                      name       exploration     communication  \
2            Ashish Wakley              Good              Good   
3           Dorji Tshewang              Good              Good   
4            Jigme Namgyel  Need improvement  Need improvement   
5            Kelden Drukda  Need improvement  Need improvement   
6             Kinga Tobgay              Good              Good   
7            Kinley Dendup              Good              Good   
8          Kinley Tshering              Good  Need improvement   
9     Lekden Thujee Drakpa  Need improvement  Need improvement   
10             Pema Namgay         Very Good         Very Good   
11            Rinzin Dorji              Good  Need improvement   
12             Samten Nima              Good  Need improvement   
13         Sherub Phuntsho              Good              Good   
14         Sherub Wangchuk              Good              Good   
15            Sonam Rabten              Good              Good   
16            Sonam Tobden              Good         Very Good   
17            Tashi Yoezer              Good              Good   
18   Thukten Rigtsel Dorji              Good         Very Good   
19           Yonten Yoezer              Good  Need improvement   
20              Dechen Pem              Good         Very Good   
21           Dechen Zangmo              Good  Need improvement   
22            Dorji Choden         Very Good         Very Good   
23             Jigme Metho  Need improvement  Need improvement   
24            Kamala Sunar  Need improvement  Need improvement   
25          Kinzang Lhazin  Need improvement              Good   
26           Namgay Choden              Good         Very Good   
27               Pema Dema              Good  Need improvement   
28             Pema Wangmo              Good              Good   
29           Pema Yangchen              Good              Good   
30           Sangay Choden              Good              Good   
32           Tenzin Chokey              Good  Need improvement   
33  Thukten Ngawang Choden         Very Good         Very Good   
34       Tshering Choden S              Good              Good   
35       Tshering Choden Z              Good              Good   
36        Tshering Yangzom              Good              Good   
37              Ugyen Dema              Good              Good   
38            Ugyen lhaden              Good  Need improvement   
39           Ugyen Yangzom              Good         Very Good   
40              Yeshi lham  Need improvement  Need improvement   

          creativity  cluster  
2          Very Good        1  
3               Good        1  
4   Need improvement        0  
5          Very Good        1  
6   Need improvement        0  
7   Need improvement        0  
8   Need improvement        0  
9   Need improvement        0  
10         Very Good        2  
11  Need improvement        0  
12  Need improvement        0  
13  Need improvement        0  
14  Need improvement        0  
15  Need improvement        0  
16  Need improvement        2  
17              Good        1  
18              Good        2  
19  Need improvement        0  
20  Need improvement        2  
21              Good        1  
22              Good        2  
23              Good        1  
24  Need improvement        0  
25              Good        1  
26  Need improvement        2  
27  Need improvement        0  
28  Need improvement        0  
29  Need improvement        0  
30  Need improvement        0  
32              Good        1  
33         Very Good        2  
34              Good        1  
35              Good        1  
36              Good        1  
37              Good        1  
38  Need improvement        0  
39              Good        2  
40  Need improvement        0  
In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# -------------------------------
# LOAD AND CLEAN CSV
# -------------------------------
# Skip the first row (file title) and use the second row as header
df = pd.read_csv("Baselinedata_Grade VIII E..csv", header=1)

# Strip spaces and rename columns
df.columns = df.columns.str.strip()
df = df.rename(columns={
    "Name": "Name",
    "Exploration": "Exploration",
    "Communication": "Communication",
    "Creativity": "Creativity"
})

# Keep only relevant columns
df = df[["Name", "Exploration", "Communication", "Creativity"]].dropna(subset=["Name"])

# Map ratings to numbers
rating_map = {"Very Good": 3, "Good": 2, "Need improvement": 1}
df["Exploration_score"] = df["Exploration"].map(rating_map)
df["Communication_score"] = df["Communication"].map(rating_map)
df["Creativity_score"] = df["Creativity"].map(rating_map)

# Drop rows with missing scores
df = df.dropna(subset=["Exploration_score", "Communication_score", "Creativity_score"])

# -------------------------------
# PREPARE DATA
# -------------------------------
# Use all three scores for 3D clustering
points = df[["Exploration_score", "Communication_score", "Creativity_score"]].T.values  # 3 x npoints
npts = points.shape[1]

# Cluster parameters
nclusters = 5
np.random.seed(10)

# -------------------------------
# INITIALIZE CLUSTER PARAMETERS
# -------------------------------
indices = np.random.randint(0, npts, nclusters)
means = points[:, indices]
dmean = np.zeros_like(means)
variances = np.outer(np.var(points, axis=1), np.ones(nclusters))
pc = np.ones(nclusters) / nclusters  # cluster probabilities
pxgc = np.zeros((nclusters, npts))

# -------------------------------
# FUNCTION TO PLOT CLUSTERS IN 3D
# -------------------------------
def plot_3d_clusters(points, means, title="3D Clusters"):
    fig = plt.figure(figsize=(10,7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(points[0, :], points[1, :], points[2, :], c='skyblue', edgecolor='black', s=100)
    for cluster in range(means.shape[1]):
        ax.scatter(means[0, cluster], means[1, cluster], means[2, cluster], 
                   marker=f'${cluster}$', s=200, c='red')
    ax.set_xlabel("Exploration Score")
    ax.set_ylabel("Communication Score")
    ax.set_zlabel("Creativity Score")
    ax.set_title(title)
    plt.show()

# -------------------------------
# PLOT INITIAL DATA
# -------------------------------
plot_3d_clusters(points, means, title="Clusters Before EM Iteration")

# -------------------------------
# EM ITERATION
# -------------------------------
niterations = 50
momentum = 0.9
rate = 0.1
min_var = 0.1

for i in range(niterations):
    # E-step: probability of each point given cluster
    for cluster in range(nclusters):
        mean = np.outer(means[:, cluster], np.ones(npts))
        variance = np.outer(variances[:, cluster], np.ones(npts))
        pxgc[cluster, :] = np.prod(np.exp(-(points - mean) ** 2 / (2 * variance)) / np.sqrt(2 * np.pi * variance), 0)
    
    pxc = pxgc * np.outer(pc, np.ones(npts))
    pcgx = pxc / np.outer(np.ones(nclusters), np.sum(pxc, axis=0))

    # M-step: update cluster weights and parameters
    pc = np.sum(pcgx, axis=1) / npts
    for cluster in range(nclusters):
        newmean = momentum * dmean[:, cluster] + np.sum(points * np.outer(np.ones(points.shape[0]), pcgx[cluster, :]), axis=1) / np.sum(pcgx[cluster, :])
        dmean[:, cluster] = newmean - means[:, cluster]
        means[:, cluster] += rate * dmean[:, cluster]
        m = np.outer(means[:, cluster], np.ones(npts))
        variances[:, cluster] = np.sum((points - m) ** 2 * np.outer(np.ones(points.shape[0]), pcgx[cluster, :]), axis=1) / np.sum(pcgx[cluster, :]) + min_var

# -------------------------------
# PLOT FINAL CLUSTERS
# -------------------------------
plot_3d_clusters(points, means, title="Clusters After EM Iteration")

# -------------------------------
# ASSIGN CLUSTERS TO STUDENTS
# -------------------------------
student_clusters = np.argmax(pcgx, axis=0)
df["Cluster"] = student_clusters
print(df[["Name", "Exploration", "Communication", "Creativity", "Cluster"]])
No description has been provided for this image
No description has been provided for this image
                      Name       Exploration     Communication  \
1            Ashish Wakley              Good              Good   
2           Dorji Tshewang              Good              Good   
3            Jigme Namgyel  Need improvement  Need improvement   
4            Kelden Drukda  Need improvement  Need improvement   
5             Kinga Tobgay              Good              Good   
6            Kinley Dendup              Good              Good   
7          Kinley Tshering              Good  Need improvement   
8     Lekden Thujee Drakpa  Need improvement  Need improvement   
9              Pema Namgay         Very Good         Very Good   
10            Rinzin Dorji              Good  Need improvement   
11             Samten Nima              Good  Need improvement   
12         Sherub Phuntsho              Good              Good   
13         Sherub Wangchuk              Good              Good   
14            Sonam Rabten              Good              Good   
15            Sonam Tobden              Good         Very Good   
16            Tashi Yoezer              Good              Good   
17   Thukten Rigtsel Dorji              Good         Very Good   
18           Yonten Yoezer              Good  Need improvement   
19              Dechen Pem              Good         Very Good   
20           Dechen Zangmo              Good  Need improvement   
21            Dorji Choden         Very Good         Very Good   
22             Jigme Metho  Need improvement  Need improvement   
23            Kamala Sunar  Need improvement  Need improvement   
24          Kinzang Lhazin  Need improvement              Good   
25           Namgay Choden              Good         Very Good   
26               Pema Dema              Good  Need improvement   
27             Pema Wangmo              Good              Good   
28           Pema Yangchen              Good              Good   
29           Sangay Choden              Good              Good   
31           Tenzin Chokey              Good  Need improvement   
32  Thukten Ngawang Choden         Very Good         Very Good   
33       Tshering Choden S              Good              Good   
34       Tshering Choden Z              Good              Good   
35        Tshering Yangzom              Good              Good   
36              Ugyen Dema              Good              Good   
37            Ugyen lhaden              Good  Need improvement   
38           Ugyen Yangzom              Good         Very Good   
39              Yeshi lham  Need improvement  Need improvement   

          Creativity  Cluster  
1          Very Good        4  
2               Good        4  
3   Need improvement        0  
4          Very Good        2  
5   Need improvement        4  
6   Need improvement        4  
7   Need improvement        0  
8   Need improvement        0  
9          Very Good        1  
10  Need improvement        0  
11  Need improvement        0  
12  Need improvement        4  
13  Need improvement        4  
14  Need improvement        4  
15  Need improvement        4  
16              Good        4  
17              Good        4  
18  Need improvement        0  
19  Need improvement        4  
20              Good        4  
21              Good        1  
22              Good        2  
23  Need improvement        0  
24              Good        2  
25  Need improvement        4  
26  Need improvement        0  
27  Need improvement        4  
28  Need improvement        4  
29  Need improvement        4  
31              Good        4  
32         Very Good        1  
33              Good        4  
34              Good        4  
35              Good        4  
36              Good        4  
37  Need improvement        0  
38              Good        4  
39  Need improvement        0  
In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -------------------------------
# LOAD CSV
# -------------------------------
# Skip the first row (file title) and use the second row as header
df = pd.read_csv("Baselinedata_Grade VIII E..csv", header=1)

# Strip spaces and rename columns
df.columns = df.columns.str.strip()
df = df.rename(columns={
    "Name": "Name",
    "Exploration": "Exploration",
    "Communication": "Communication",
    "Creativity": "Creativity"
})

# Keep only relevant columns
df = df[["Name", "Exploration", "Communication", "Creativity"]].dropna(subset=["Name"])

# Map ratings to numbers
rating_map = {"Very Good": 3, "Good": 2, "Need improvement": 1}
df["Exploration_score"] = df["Exploration"].map(rating_map)
df["Communication_score"] = df["Communication"].map(rating_map)

# Drop rows with missing scores
df = df.dropna(subset=["Exploration_score", "Communication_score"])

# -------------------------------
# PREPARE DATA (2D for clustering)
# -------------------------------
points = df[["Exploration_score", "Communication_score"]].T.values  # 2 x npoints
npts = points.shape[1]

# -------------------------------
# CLUSTER PARAMETERS
# -------------------------------
dim = 2
nstates = 2
nclusters = 5
momentum = 0.9
rate = 0.1
min_var = 0.1
niterations = 100
np.random.seed(10)

# -------------------------------
# INITIALIZE CLUSTER ARRAYS
# -------------------------------
indices = np.random.randint(0, npts, nclusters)
means = points[:, indices]
dmean = np.zeros((dim, nclusters))
covariances = np.zeros((nclusters, dim, dim))
covariances[:,0,0] = np.var(points[0,:])
covariances[:,1,1] = np.var(points[1,:])
pc = np.ones(nclusters)/nclusters  # cluster probabilities
pxgc = np.zeros((nclusters, npts))  # p(x|c)
psgxc = np.ones((nclusters, nstates))/nstates  # p(s|x,c)

# -------------------------------
# PLOT FUNCTION
# -------------------------------
def plot_covar():
    for cluster in range(means.shape[1]):
        x = means[0, cluster]
        y = means[1, cluster]
        w, v = np.linalg.eig(covariances[cluster])
        w = np.sqrt(w)
        plt.plot([x-w[0]*v[0,0], x+w[0]*v[0,0]], [y-w[0]*v[1,0], y+w[0]*v[1,0]], 'k')
        plt.plot([x-w[1]*v[0,1], x+w[1]*v[0,1]], [y-w[1]*v[1,1], y+w[1]*v[1,1]], 'k')
        plt.plot([x],[y], 'k', marker=f'${cluster}$', markersize=15)

# -------------------------------
# INITIAL PLOT
# -------------------------------
plt.figure(figsize=(8,6))
plt.scatter(points[0,:], points[1,:], c='skyblue', edgecolor='black', s=100)
plot_covar()
plt.xlabel("Exploration Score")
plt.ylabel("Communication Score")
plt.title("Covariance Clusters Before EM Iteration")
plt.grid(True)
plt.show()

# -------------------------------
# EM ITERATION
# -------------------------------
for i in range(niterations):
    for cluster in range(nclusters):
        mean = np.outer(means[:,cluster], np.ones(npts))
        cinv = np.linalg.pinv(covariances[cluster])
        pxgc[cluster,:] = (np.sqrt(np.linalg.det(cinv))/(2*np.pi)**(dim/2)) * \
                          np.exp(-np.sum((points-mean)*(cinv@(points-mean)),0)/2)
    pxc = pxgc * np.outer(pc, np.ones(npts))
    pcgx = pxc / np.outer(np.ones(nclusters), np.sum(pxc,0))
    psxc = psgxc[:, np.zeros(npts,dtype=int)] * pxc  # simplified states
    pcgsx = psxc / np.outer(np.ones(nclusters), np.sum(psxc,0))
    pc = np.sum(pcgsx,1)/npts
    
    for cluster in range(nclusters):
        newmean = momentum*dmean[:,cluster] + np.sum(points*np.outer(np.ones(dim), pcgsx[cluster,:]),1)/np.sum(pcgsx[cluster,:])
        dmean[:,cluster] = newmean - means[:,cluster]
        means[:,cluster] += rate*dmean[:,cluster]
        m = np.outer(means[:,cluster], np.ones(npts))
        for d in range(dim):
            covariances[cluster][:,d] = np.sum(np.outer(np.ones(dim), points[d,:]-m[d,:])*(points-m) * \
                                               np.outer(np.ones(dim), pcgsx[cluster,:]),1)/np.sum(np.outer(np.ones(dim), pcgsx[cluster,:]),1) + min_var

# -------------------------------
# PLOT FINAL CLUSTERS
# -------------------------------
plt.figure(figsize=(8,6))
plt.scatter(points[0,:], points[1,:], c='skyblue', edgecolor='black', s=100)
plot_covar()
plt.xlabel("Exploration Score")
plt.ylabel("Communication Score")
plt.title("Covariance Clusters After EM Iteration")
plt.grid(True)
plt.show()

# -------------------------------
# ASSIGN CLUSTER TO STUDENTS
# -------------------------------
student_clusters = np.argmax(pxgc, axis=0)
df["Cluster"] = student_clusters
print(df[["Name", "Exploration", "Communication", "Cluster"]])
No description has been provided for this image
No description has been provided for this image
                      Name       Exploration     Communication  Cluster
1            Ashish Wakley              Good              Good        2
2           Dorji Tshewang              Good              Good        2
3            Jigme Namgyel  Need improvement  Need improvement        2
4            Kelden Drukda  Need improvement  Need improvement        2
5             Kinga Tobgay              Good              Good        2
6            Kinley Dendup              Good              Good        2
7          Kinley Tshering              Good  Need improvement        0
8     Lekden Thujee Drakpa  Need improvement  Need improvement        2
9              Pema Namgay         Very Good         Very Good        2
10            Rinzin Dorji              Good  Need improvement        0
11             Samten Nima              Good  Need improvement        0
12         Sherub Phuntsho              Good              Good        2
13         Sherub Wangchuk              Good              Good        2
14            Sonam Rabten              Good              Good        2
15            Sonam Tobden              Good         Very Good        1
16            Tashi Yoezer              Good              Good        2
17   Thukten Rigtsel Dorji              Good         Very Good        1
18           Yonten Yoezer              Good  Need improvement        0
19              Dechen Pem              Good         Very Good        1
20           Dechen Zangmo              Good  Need improvement        0
21            Dorji Choden         Very Good         Very Good        2
22             Jigme Metho  Need improvement  Need improvement        2
23            Kamala Sunar  Need improvement  Need improvement        2
24          Kinzang Lhazin  Need improvement              Good        1
25           Namgay Choden              Good         Very Good        1
26               Pema Dema              Good  Need improvement        0
27             Pema Wangmo              Good              Good        2
28           Pema Yangchen              Good              Good        2
29           Sangay Choden              Good              Good        2
31           Tenzin Chokey              Good  Need improvement        0
32  Thukten Ngawang Choden         Very Good         Very Good        2
33       Tshering Choden S              Good              Good        2
34       Tshering Choden Z              Good              Good        2
35        Tshering Yangzom              Good              Good        2
36              Ugyen Dema              Good              Good        2
37            Ugyen lhaden              Good  Need improvement        0
38           Ugyen Yangzom              Good         Very Good        1
39              Yeshi lham  Need improvement  Need improvement        2
In [32]:
import matplotlib.pyplot as plt
import numpy as np

# -------------------------------
# function cluster-weighted modeling parameters
# -------------------------------
nclusters = 5
xmin = -5
xmax = 5
npts = 100
minvar = 0.01
niterations = 100
np.random.seed(10)

# -------------------------------
# generate synthetic data
# -------------------------------
x = np.linspace(xmin, xmax, npts)
y = np.tanh(x) + 0.25 * np.random.rand(npts)
xplot = np.copy(x)

# initialize cluster parameters
mux = xmin + (xmax - xmin) * np.random.rand(nclusters)
beta = 0.1 * np.random.rand(3, nclusters)
varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters

# -------------------------------
# plot before iteration
# -------------------------------
pxgc = np.exp(-((np.outer(x, np.ones(nclusters)) - np.outer(np.ones(npts), mux))**2) /
              (2 * np.outer(np.ones(npts), varx))) / np.sqrt(2 * np.pi * np.outer(np.ones(npts), varx))

pygxc = np.exp(-((np.outer(y, np.ones(nclusters)) -
                  (np.outer(np.ones(npts), beta[0, :]) +
                   np.outer(x, beta[1, :]) +
                   np.outer(x*x, beta[2, :])))**2) /
               (2 * np.outer(np.ones(npts), vary))) / np.sqrt(2 * np.pi * np.outer(np.ones(npts), vary))

pxc = pxgc * np.outer(np.ones(npts), pc)
pxyc = pygxc * pxc
pcgxy = pxyc / np.outer(np.sum(pxyc, axis=1), np.ones(nclusters))

plt.figure(figsize=(8,5))
plt.plot(x, y, '.', label='Data')
yplot = np.sum((np.outer(np.ones(npts), beta[0, :]) +
                np.outer(x, beta[1, :]) +
                np.outer(x*x, beta[2, :])) * pxc, 1) / np.sum(pxc, 1)
plt.plot(xplot, yplot, label='Cluster estimate')
plt.title('Function clusters before iteration')
plt.legend()
plt.show()

# -------------------------------
# EM iteration
# -------------------------------
for step in range(niterations):
    # E-step
    pxgc = np.exp(-((np.outer(x, np.ones(nclusters)) - np.outer(np.ones(npts), mux))**2) /
                  (2 * np.outer(np.ones(npts), varx))) / np.sqrt(2 * np.pi * np.outer(np.ones(npts), varx))

    pygxc = np.exp(-((np.outer(y, np.ones(nclusters)) -
                      (np.outer(np.ones(npts), beta[0, :]) +
                       np.outer(x, beta[1, :]) +
                       np.outer(x*x, beta[2, :])))**2) /
                   (2 * np.outer(np.ones(npts), vary))) / np.sqrt(2 * np.pi * np.outer(np.ones(npts), vary))

    pxc = pxgc * np.outer(np.ones(npts), pc)
    pxyc = pygxc * pxc
    pcgxy = pxyc / np.outer(np.sum(pxyc, axis=1), np.ones(nclusters))

    # M-step
    pc = np.sum(pcgxy, 0) / npts
    mux = np.sum(np.outer(x, np.ones(nclusters)) * pcgxy, 0) / (npts * pc)
    varx = minvar + np.sum((np.outer(x, np.ones(nclusters)) - np.outer(np.ones(npts), mux))**2 * pcgxy, 0) / (npts * pc)

    # update beta using least squares
    a = np.zeros((3, nclusters))
    for c in range(nclusters):
        a[:, c] = np.array([
            np.sum(y * pcgxy[:, c]),
            np.sum(y * x * pcgxy[:, c]),
            np.sum(y * x * x * pcgxy[:, c])
        ]) / (npts * pc[c])

    B = np.zeros((3, 3, nclusters))
    for c in range(nclusters):
        B[:, :, c] = np.array([
            [np.sum(pcgxy[:, c]), np.sum(x * pcgxy[:, c]), np.sum(x*x * pcgxy[:, c])],
            [np.sum(x * pcgxy[:, c]), np.sum(x*x * pcgxy[:, c]), np.sum(x*x*x * pcgxy[:, c])],
            [np.sum(x*x * pcgxy[:, c]), np.sum(x*x*x * pcgxy[:, c]), np.sum(x*x*x*x * pcgxy[:, c])]
        ])
        # Solve least squares instead of matrix inversion
        beta[:, c], _, _, _ = np.linalg.lstsq(B[:, :, c], a[:, c], rcond=None)

    vary = minvar + np.sum((np.outer(y, np.ones(nclusters)) -
                            (np.outer(np.ones(npts), beta[0, :]) +
                             np.outer(x, beta[1, :]) +
                             np.outer(x*x, beta[2, :])))**2 * pcgxy, 0) / (npts * pc)

# -------------------------------
# plot after EM iteration
# -------------------------------
plt.figure(figsize=(8,5))
plt.plot(x, y, '.', label='Data')
yplot = np.sum((np.outer(np.ones(npts), beta[0, :]) +
                np.outer(x, beta[1, :]) +
                np.outer(x*x, beta[2, :])) * pxc, 1) / np.sum(pxc, 1)
plt.plot(xplot, yplot, label='Cluster estimate')
plt.title('Function clusters after iteration')
plt.legend()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]: