Math Score Distribution and Gaussian Fit¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset; the parental-education tally below is computed but not used in this figure
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
rating_map = df['parental level of education'].value_counts()
df["education_count"] = df['parental level of education'].map(rating_map)
# Extract math scores and compute the sample mean and standard deviation
math_score = df['math score'].dropna().values
npts = len(math_score)
mean = np.mean(math_score)
stddev = np.std(math_score)
plt.figure(figsize=(10, 6))
# Histogram of observed scores, normalized to integrate to 1 (density=True)
plt.hist(math_score, bins=max(npts // 50, 5), density=True, alpha=0.6, color='gray', edgecolor='black')
# Rug plot: one tick mark per individual observation along the baseline
plt.plot(math_score, 0 * math_score, '|', ms=15, color='black')
# Gaussian fit: evaluate the normal PDF using the sample mean and standard deviation
xi = np.linspace(mean - 3 * stddev, mean + 3 * stddev, 100)
yi = np.exp(-(xi - mean)**2 / (2 * stddev**2)) / np.sqrt(2 * np.pi * stddev**2)
plt.plot(xi, yi, 'b', lw=2, label='Gaussian Fit')
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.title("Density Estimation: Gaussian Curve")
plt.legend()
plt.savefig("fig2.1.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Explanation¶
This is a density-estimation histogram for "Math Score": the grey bars show the frequency distribution of the observed data, the black tick marks along the baseline mark the individual observations (a rug plot), and the blue line is a fitted Gaussian (normal) curve. The distribution is roughly bell-shaped and unimodal, centered around a score near 75, suggesting that most scores cluster in the mid-to-high range. The blue curve models the underlying probability distribution and indicates how well the observed scores follow a theoretical normal distribution.
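The visual comparison can also be made quantitative. The cell below is a minimal cross-check (an addition, assuming scipy is available and the same CSV path as above): scipy.stats.norm.fit recovers the maximum-likelihood mean and standard deviation, which for a Gaussian coincide with np.mean and np.std, and scipy.stats.normaltest runs the D'Agostino-Pearson test of the null hypothesis that the scores are normally distributed.
import pandas as pd
from scipy import stats

# Added cross-check; assumes the same CSV path as the cell above
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
scores = df['math score'].dropna().values

# Maximum-likelihood Gaussian fit (equals the sample mean and std for a normal)
mu, sigma = stats.norm.fit(scores)
print(f"fitted mean = {mu:.2f}, stddev = {sigma:.2f}")

# D'Agostino-Pearson test; a small p-value is evidence against normality
stat, pvalue = stats.normaltest(scores)
print(f"normaltest statistic = {stat:.2f}, p-value = {pvalue:.4f}")
A large p-value here would be consistent with the bell shape seen in the figure; a small one would say the Gaussian is only an approximation.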
Density Estimation¶
Showing k-means clustering, Voronoi partitions of the centroids, and an elbow plot
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
# -----------------------------------------------------------
# 1. LOAD YOUR DATA
# -----------------------------------------------------------
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
xcol = "reading score"
ycol = "math score"
# Extract x and y as aligned NumPy arrays: drop rows with a missing
# value in either column so the (x, y) pairs stay matched
xy = data[[xcol, ycol]].dropna()
x = xy[xcol].values
y = xy[ycol].values
# -----------------------------------------------------------
# 2. K-MEANS PARAMETERS
# -----------------------------------------------------------
nsteps = 1000
momentum = 0. # not used, kept for compatibility
# -----------------------------------------------------------
# 3. K-MEANS FUNCTION
# -----------------------------------------------------------
def kmeans(x, y, momentum, nclusters):
    # momentum is accepted but unused; nsteps is read from the enclosing scope
    # Initialize centroids at randomly chosen, distinct data points
    # (cast to float so the mean updates below are not truncated to int)
    indices = np.random.choice(len(x), size=nclusters, replace=False)
    mux = x[indices].astype(float)
    muy = y[indices].astype(float)
    for _ in range(nsteps):
        # Distance from every point to every centroid
        X = np.outer(x, np.ones(len(mux)))
        Y = np.outer(y, np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)), mux)
        Muy = np.outer(np.ones(len(x)), muy)
        distances = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
        # Assign each point to its nearest centroid
        mins = np.argmin(distances, axis=1)
        # Move each centroid to the mean of its assigned points
        for i in range(len(mux)):
            index = np.where(mins == i)
            if len(index[0]) > 0:
                mux[i] = np.mean(x[index])
                muy[i] = np.mean(y[index])
    # Total within-cluster distance, used for the elbow plot
    distances = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        distances += np.sum(np.sqrt((x[index] - mux[i])**2 + (y[index] - muy[i])**2))
    return mux, muy, distances
# -----------------------------------------------------------
# 4. PLOTTING FUNCTIONS
# -----------------------------------------------------------
def plot_kmeans(x, y, mux, muy):
    fig, ax = plt.subplots()
    ax.plot(x, y, ".", alpha=0.5)
    ax.plot(mux, muy, "r.", markersize=20)
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    ax.set_title(f"{len(mux)} clusters (centroids)")
    plt.show()

def plot_Voronoi(x, y, mux, muy):
    fig, ax = plt.subplots()
    ax.plot(x, y, ".", alpha=0.5)
    vor = Voronoi(np.stack((mux, muy), axis=1))
    voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    ax.set_title(f"{len(mux)} clusters (Voronoi)")
    plt.show()
# -----------------------------------------------------------
# 5. RUN K-MEANS AND PLOT
# -----------------------------------------------------------
distances = []
for k in range(1, 6):
    mux, muy, d = kmeans(x, y, momentum, k)
    distances.append(d)
    # scipy's Voronoi needs at least 3 generating points in 2-D,
    # so fall back to a plain centroid scatter for k <= 2
    if k <= 2:
        plot_kmeans(x, y, mux, muy)
    else:
        plot_Voronoi(x, y, mux, muy)
# -----------------------------------------------------------
# 6. ELBOW PLOT
# -----------------------------------------------------------
plt.figure()
plt.plot(range(1, 6), distances, "o-")
plt.xlabel("Number of clusters")
plt.ylabel("Total distance to clusters")
plt.title("Elbow Plot")
plt.xticks(range(1, 6))
plt.show()
K-Means Clustering (3 clusters Voronoi)¶
Explanation¶
This graph visualizes the distribution of the data points (student score pairs, given the axis labels) and the cluster centers identified by the k-means algorithm:

Axes: The plot relates two variables, the reading score on the x-axis and the math score on the y-axis. Each small, blue, translucent dot represents an individual data point (a student's score pair).

Data Distribution: The data are widely dispersed but show a general positive correlation: students with higher reading scores tend to have higher math scores.

Cluster Centers (Centroids): The three large orange circles mark the centroids ($\mu$), the mean centers of the three clusters found by k-means. One centroid sits in the lower-left of the scatterplot (low reading, low math scores), one in the middle (average scores), and one in the upper-right (high reading, high math scores).

Clustering Interpretation: The algorithm has grouped the students into three distinct segments based on their scores, effectively categorizing them as "low-achievers," "average-achievers," and "high-achievers" across both subjects.

Voronoi Implication: The title indicates the visualization is related to a Voronoi diagram, which partitions the plane into regions around each centroid. The three orange points are the centers that define these boundaries: every data point is assigned to the cluster whose centroid it is closest to.

This plot is a classic result of an unsupervised machine-learning task, where the goal is to identify inherent groupings within the data.
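As a sanity check on the hand-rolled loop above, the same clustering can be reproduced with scikit-learn (an addition, assuming scikit-learn is installed). KMeans exposes cluster_centers_, which should land near the orange centroids, and inertia_, the within-cluster sum of squared distances that plays the same role as the elbow-plot quantity (squared rather than plain distances).
import pandas as pd
from sklearn.cluster import KMeans

# Same data as above, kept as aligned (reading, math) pairs
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
X = data[["reading score", "math score"]].dropna().values

# Record inertia for k = 1..5 to mirror the elbow plot above
inertias = []
for k in range(1, 6):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)
    if k == 3:
        print("k = 3 centroids:\n", km.cluster_centers_)

print("inertias:", inertias)
If the elbow in these inertias also appears around k = 3, that supports the three-segment reading of the figure above.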