[Karma Tshomo] - Fab Futures - Data Science

Voronoi tessellation¶

In [1]:
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
import pandas as pd


# k-means parameters

nsteps = 1000
momentum = 0.   # accepted by kmeans() for compatibility, but not used in this implementation
np.random.seed(10)

df = pd.read_csv("datasets/Housing.csv")
df.head()

# Select two numerical features
x = df["area"].values
y = df["price"].values


# k-means function (UNCHANGED)

def kmeans(x, y, momentum, nclusters):

    indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]

    for step in range(nsteps):

        X = np.outer(x, np.ones(len(mux)))
        Y = np.outer(y, np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)), mux)
        Muy = np.outer(np.ones(len(x)), muy)

        distances = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
        mins = np.argmin(distances, axis=1)

        for i in range(len(mux)):
            index = np.where(mins == i)
            mux[i] = np.mean(x[index])
            muy[i] = np.mean(y[index])

    total_distance = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        total_distance += np.sum(
            np.sqrt((x[index] - mux[i])**2 + (y[index] - muy[i])**2)
        )

    return mux, muy, total_distance


def plot_kmeans(x, y, mux, muy):
    plt.figure()
    plt.plot(x, y, '.', alpha=0.4)
    plt.plot(mux, muy, 'r.', markersize=20)
    plt.xlabel("Area")
    plt.ylabel("Price")
    plt.title(f"{len(mux)} clusters")
    plt.show()


def plot_voronoi(x, y, mux, muy):
    fig, ax = plt.subplots()
    ax.plot(x, y, '.', alpha=0.4)
    vor = Voronoi(np.stack((mux, muy), axis=1))
    # draw the Voronoi cells on the same axes as the data
    # (without ax=, voronoi_plot_2d opens a separate figure)
    voronoi_plot_2d(vor, ax=ax, show_vertices=False, show_points=True)
    ax.set_xlabel("Area")
    ax.set_ylabel("Price")
    ax.set_title(f"{len(mux)} clusters")
    plt.show()


#
# RUN K-MEANS FOR DIFFERENT CLUSTERS
#
distances = np.zeros(5)

mux, muy, distances[0] = kmeans(x, y, momentum, 1)
plot_kmeans(x, y, mux, muy)

mux, muy, distances[1] = kmeans(x, y, momentum, 2)
plot_kmeans(x, y, mux, muy)

mux, muy, distances[2] = kmeans(x, y, momentum, 3)
plot_voronoi(x, y, mux, muy)

mux, muy, distances[3] = kmeans(x, y, momentum, 4)
plot_voronoi(x, y, mux, muy)

mux, muy, distances[4] = kmeans(x, y, momentum, 5)
plot_voronoi(x, y, mux, muy)

#
# ELBOW PLOT
#
plt.figure()
plt.plot(np.arange(1, 6), distances, 'o-')
plt.xlabel("Number of clusters")
plt.ylabel("Total distance to clusters")
plt.title("Elbow Method for Housing Data")
plt.show()
[Figures: k-means cluster plots for k = 1 and 2, Voronoi partitions for k = 3, 4, and 5, and the elbow plot]

Interpretation:

K = 1 cluster:

All housing data points are assigned to a single cluster.

One centroid represents the entire dataset.

The total distance is very high, meaning the structure of the data is poorly captured.

K = 2 clusters:

The data is split into two broad groups based on area and price.

Points closer to each centroid belong to that cluster.

Some separation appears, but clusters are still too coarse.

K = 3 clusters (Voronoi plot):

The dataset is partitioned into three regions using Voronoi boundaries.

Each region contains points closest to one centroid.

This matches visible groupings in the housing data better.

K = 4 clusters:

More refined segmentation of the data.

Voronoi regions become smaller, capturing local structure.

Some clusters start to overlap due to similar prices or areas.

K = 5 clusters:

Very fine partitioning of the dataset.

Some clusters contain only a small number of points.

This indicates possible over-clustering.

Elbow Plot:

The total distance to clusters decreases as the number of clusters increases.

The rate of decrease slows after 3 clusters.

This suggests that 3 clusters is a reasonable choice for the housing data; a quick cross-check with scikit-learn is sketched below.
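As a cross-check on this reading, the following minimal sketch (it assumes scikit-learn is installed, which the rest of this notebook does not use) recomputes the elbow curve with sklearn's KMeans and prints the relative drop in inertia for each added cluster.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

df_chk = pd.read_csv("datasets/Housing.csv")
X = df_chk[["area", "price"]].values.astype(float)

inertias = []
for k in range(1, 6):
    km = KMeans(n_clusters=k, n_init=10, random_state=10).fit(X)
    # inertia_ is the sum of squared distances to the nearest centroid, so it
    # plays the same role as total_distance above (squared rather than summed sqrt)
    inertias.append(km.inertia_)

# relative improvement from adding one more cluster; the "elbow" is where
# this drops off sharply
drops = -np.diff(inertias) / np.array(inertias[:-1])
for k, d in zip(range(2, 6), drops):
    print(f"k={k}: inertia reduced by {100*d:.1f}% vs k={k-1}")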

Kernel Density Estimation¶

In [3]:
import time

#
# Gaussian Mixture Model parameters
#
nclusters = 3
nsteps = 25
nplot = 100
np.random.seed(0)

#
# load YOUR dataset (two numerical variables)
#
df = pd.read_csv("datasets/Housing.csv")

x = df["area"].values
y = df["price"].values

# normalize (important for GMM stability)
x = (x - np.mean(x)) / np.std(x)
y = (y - np.mean(y)) / np.std(y)

npts = len(x)

#
# choose starting points and initialize
#
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]

varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters

#
# plot before iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.', alpha=0.4)
plt.errorbar(mux, muy,
             xerr=np.sqrt(varx),
             yerr=np.sqrt(vary),
             fmt='r.', markersize=15)
plt.autoscale()
plt.title('before iteration')
plt.show()

#
# do E-M iterations
#
for i in range(nsteps):

    # broadcast data and current parameters to (npts, nclusters) arrays
    xm = np.outer(x, np.ones(nclusters))
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)
    varym = np.outer(np.ones(npts), vary)
    pcm = np.outer(np.ones(npts), pc)

    # E-step: likelihood of each point under each Gaussian (diagonal covariance)
    pvgc = (1/np.sqrt(2*np.pi*varxm)) * \
           np.exp(-(xm-muxm)**2/(2*varxm)) * \
           (1/np.sqrt(2*np.pi*varym)) * \
           np.exp(-(ym-muym)**2/(2*varym))

    # responsibilities p(c|x,y): joint p(x,y,c) normalized over the clusters
    pvc = pvgc * pcm
    pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))

    # M-step: re-estimate mixing weights, means, and variances
    # (the added 0.1 keeps variances from collapsing onto single points)
    pc = np.sum(pcgv, 0) / npts
    mux = np.sum(xm * pcgv, 0) / (npts * pc)
    muy = np.sum(ym * pcgv, 0) / (npts * pc)
    varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, 0) / (npts * pc)
    vary = 0.1 + np.sum((ym - muym)**2 * pcgv, 0) / (npts * pc)

#
# plot after iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.', alpha=0.4)
plt.errorbar(mux, muy,
             xerr=np.sqrt(varx),
             yerr=np.sqrt(vary),
             fmt='r.', markersize=15)
plt.autoscale()
plt.title('after iteration')
plt.show()

#
# plot distribution
#
xplot = np.linspace(np.min(x), np.max(x), nplot)
yplot = np.linspace(np.min(y), np.max(y), nplot)
(X, Y) = np.meshgrid(xplot, yplot)

p = np.zeros((nplot, nplot))
for c in range(nclusters):
    p += np.exp(-(X-mux[c])**2/(2*varx[c]))/np.sqrt(2*np.pi*varx[c]) * \
         np.exp(-(Y-muy[c])**2/(2*vary[c]))/np.sqrt(2*np.pi*vary[c]) * pc[c]

fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
ax.plot_surface(X, Y, p)
plt.title('probability distribution')
plt.show()
[Figures: GMM fit before iteration, after iteration, and the 3D probability distribution surface]
Interpretation:


Before Iteration:

The initial cluster means and variances are placed randomly.

The error bars (one standard deviation along each axis) are large, showing high uncertainty.

The model does not yet reflect the data distribution.

After Iteration:

Cluster centers move toward dense regions of the data.

Variances shrink and fit the local concentrations of points more closely.

This indicates that the EM algorithm has learned a sensible fit.

3D Probability Distribution:

The surface shows the estimated joint probability density of (normalized) area and price.

Peaks mark regions where housing observations are most likely.

The smooth surface reflects the overlapping Gaussian components; a scikit-learn cross-check is sketched below.
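As a cross-check, this minimal sketch (again assuming scikit-learn, and reusing the normalized x and y arrays defined in the cell above) fits a diagonal-covariance Gaussian mixture to the same data; its weights and means should land close to pc, mux, and muy, and score_samples gives the same kind of density that the surface plot shows.

import numpy as np
from sklearn.mixture import GaussianMixture

data = np.column_stack((x, y))   # normalized area/price columns from the cell above
gmm = GaussianMixture(n_components=3, covariance_type="diag", random_state=0)
gmm.fit(data)

print("mixing weights:", gmm.weights_)
print("component means:", gmm.means_)

# score_samples returns log p(area, price); exponentiate to compare with the surface
grid = np.array([[0.0, 0.0], [2.0, 2.0]])
print("density at sample points:", np.exp(gmm.score_samples(grid)))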
    

Modeling states¶

In [6]:
nclusters = 5
minvar = .01
niterations = 100
np.random.seed(10)
x = df["area"].values
y = df["price"].values

# normalize (important for stability)
x = (x - np.mean(x)) / np.std(x)
y = (y - np.mean(y)) / np.std(y)

npts = len(x)

# sort by area so the fitted curves are plotted left to right
# rather than zigzagging through the unordered data
order = np.argsort(x)
x = x[order]
y = y[order]

nplot = npts
xplot = np.copy(x)

#
# initialize parameters (same as original code)
#
mux = np.min(x) + (np.max(x)-np.min(x)) * np.random.rand(nclusters)
beta = 0.1 * np.random.rand(3, nclusters)
varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters

#
# plot before iteration
#
pxgc = np.exp(-(np.outer(x,np.ones(nclusters)) 
  -np.outer(np.ones(npts),mux))**2
  /(2*np.outer(np.ones(npts),varx))) \
  /np.sqrt(2*np.pi*np.outer(np.ones(npts),varx))

pygxc = np.exp(-(np.outer(y,np.ones(nclusters)) 
  -(np.outer(np.ones(npts),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:])))**2
  /(2*np.outer(np.ones(npts),vary))) \
  /np.sqrt(2*np.pi*np.outer(np.ones(npts),vary))

pxc = pxgc*np.outer(np.ones(npts),pc)
pxyc = pygxc*pxc
pcgxy = pxyc/np.outer(np.sum(pxyc,1),np.ones(nclusters))

plt.plot(x,y,'.',alpha=0.4)
yplot = np.sum((np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))*pxc,1)/np.sum(pxc,1)

plt.plot(xplot,yplot)
yvarplot = np.sum((vary+(np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))**2)*pxc,1)/np.sum(pxc,1)-yplot**2

ystdplot = np.sqrt(yvarplot)
plt.plot(xplot,yplot+ystdplot,'k--')
plt.plot(xplot,yplot-ystdplot,'k--')

plt.title('function clusters before iteration')
plt.show()

#
# E–M iteration
#
for step in range(niterations):

   pxgc = np.exp(-(np.outer(x,np.ones(nclusters)) 
      -np.outer(np.ones(npts),mux))**2
      /(2*np.outer(np.ones(npts),varx))) \
      /np.sqrt(2*np.pi*np.outer(np.ones(npts),varx))

   pygxc = np.exp(-(np.outer(y,np.ones(nclusters)) 
      -(np.outer(np.ones(npts),beta[0,:])
      +np.outer(x,beta[1,:])
      +np.outer(x*x,beta[2,:])))**2
      /(2*np.outer(np.ones(npts),vary))) \
      /np.sqrt(2*np.pi*np.outer(np.ones(npts),vary))

   # E-step: responsibilities p(c|x,y) from the joint likelihood
   pxc = pxgc*np.outer(np.ones(npts),pc)
   pxyc = pygxc*pxc
   pcgxy = pxyc/np.outer(np.sum(pxyc,1),np.ones(nclusters))

   # M-step: update cluster weights and the input-space means/variances
   pc = np.sum(pcgxy,0)/npts
   mux = np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc)
   varx = minvar + np.sum((np.outer(x,np.ones(nclusters))
      -np.outer(np.ones(npts),mux))**2*pcgxy,0)/(npts*pc)

   # per-cluster weighted least squares for the quadratic coefficients:
   # solve B @ beta = a, built from responsibility-weighted moments of x and y
   a = np.array([
      np.sum(np.outer(y,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(y*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(y*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)
   ])

   B = np.array([[
      np.ones(nclusters),
      np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)],
      [
      np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)],
      [
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)]
   ])

   for c in range(nclusters):
      beta[:,c] = np.linalg.inv(B[:,:,c]) @ a[:,c]

   vary = minvar + np.sum((np.outer(y,np.ones(nclusters))
      -(np.outer(np.ones(npts),beta[0,:])
      +np.outer(x,beta[1,:])
      +np.outer(x*x,beta[2,:])))**2
      *pcgxy,0)/(npts*pc)

#
# plot after iteration
#
plt.plot(x,y,'.',alpha=0.4)
yplot = np.sum((np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))*pxc,1)/np.sum(pxc,1)

plt.plot(xplot,yplot)
yvarplot = np.sum((vary+(np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))**2)*pxc,1)/np.sum(pxc,1)-yplot**2

ystdplot = np.sqrt(yvarplot)
plt.plot(xplot,yplot+ystdplot,'k--')
plt.plot(xplot,yplot-ystdplot,'k--')

plt.title('function clusters after iteration')
plt.show()
[Figures: function clusters before and after iteration]
Interpretation:

Before Iteration:

The quadratic curves are poorly aligned with the data.

Wide uncertainty bands show that the initial model explains little of the variation.

Cluster responsibilities are nearly uniform.

After Iteration:

Multiple quadratic functions capture different trends in the data.

The combined curve fits the nonlinear area–price relationship better than a single regression would.

The confidence bands become narrower, indicating improved certainty.

Each cluster models a different price–area behavior pattern; a sketch of the combined prediction rule is given below.
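To make the prediction rule explicit: the fitted curve above is the responsibility-weighted average of the per-cluster quadratics, yhat(x) = sum_c f_c(x) p(x|c) p(c) / sum_c p(x|c) p(c). The following is a minimal sketch of that rule as a reusable function; the helper name predict_cwm and its signature are illustrative, not part of the original code.

import numpy as np

def predict_cwm(xq, mux, varx, pc, beta):
    """Predict y at query points xq from the fitted cluster-weighted model.
    Each cluster c contributes f_c(x) = beta[0,c] + beta[1,c]*x + beta[2,c]*x**2."""
    xq = np.atleast_1d(xq).astype(float)
    # p(x|c) for every query point and cluster, as in the plotting code above
    pxgc = np.exp(-(xq[:, None] - mux[None, :])**2 / (2 * varx[None, :])) \
           / np.sqrt(2 * np.pi * varx[None, :])
    w = pxgc * pc[None, :]                                   # p(x, c)
    f = beta[0, :] + np.outer(xq, beta[1, :]) + np.outer(xq**2, beta[2, :])
    return np.sum(f * w, axis=1) / np.sum(w, axis=1)         # weighted average of f_c

# example: predicted (normalized) price for a house of average (normalized) area
# print(predict_cwm(0.0, mux, varx, pc, beta))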