[Karma Tshomo] - Fab Futures - Data Science

Voronoi tessellation¶

In [1]:
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
import pandas as pd


# k-means parameters

nsteps = 1000
momentum = 0.   # accepted by kmeans() for compatibility, but not used in this implementation
np.random.seed(10)

df = pd.read_csv("datasets/Housing.csv")
df.head()

# Select two numerical features
x = df["area"].values
y = df["price"].values


# k-means function (UNCHANGED)

def kmeans(x, y, momentum, nclusters):

    indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]

    for step in range(nsteps):

        X = np.outer(x, np.ones(len(mux)))
        Y = np.outer(y, np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)), mux)
        Muy = np.outer(np.ones(len(x)), muy)

        distances = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
        mins = np.argmin(distances, axis=1)

        for i in range(len(mux)):
            index = np.where(mins == i)
            mux[i] = np.mean(x[index])
            muy[i] = np.mean(y[index])

    total_distance = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        total_distance += np.sum(
            np.sqrt((x[index] - mux[i])**2 + (y[index] - muy[i])**2)
        )

    return mux, muy, total_distance


def plot_kmeans(x, y, mux, muy):
    plt.figure()
    plt.plot(x, y, '.', alpha=0.4)
    plt.plot(mux, muy, 'r.', markersize=20)
    plt.xlabel("Area")
    plt.ylabel("Price")
    plt.title(f"{len(mux)} clusters")
    plt.show()


def plot_voronoi(x, y, mux, muy):
    fig, ax = plt.subplots()
    ax.plot(x, y, '.', alpha=0.4)
    vor = Voronoi(np.stack((mux, muy), axis=1))
    # draw the Voronoi cells on the same axes as the data
    # (without ax=, voronoi_plot_2d opens a separate figure)
    voronoi_plot_2d(vor, ax=ax, show_vertices=False, show_points=True)
    ax.set_xlabel("Area")
    ax.set_ylabel("Price")
    ax.set_title(f"{len(mux)} clusters")
    plt.show()


#
# RUN K-MEANS FOR DIFFERENT CLUSTERS
#
distances = np.zeros(5)

mux, muy, distances[0] = kmeans(x, y, momentum, 1)
plot_kmeans(x, y, mux, muy)

mux, muy, distances[1] = kmeans(x, y, momentum, 2)
plot_kmeans(x, y, mux, muy)

mux, muy, distances[2] = kmeans(x, y, momentum, 3)
plot_voronoi(x, y, mux, muy)

mux, muy, distances[3] = kmeans(x, y, momentum, 4)
plot_voronoi(x, y, mux, muy)

mux, muy, distances[4] = kmeans(x, y, momentum, 5)
plot_voronoi(x, y, mux, muy)

#
# ELBOW PLOT
#
plt.figure()
plt.plot(np.arange(1, 6), distances, 'o-')
plt.xlabel("Number of clusters")
plt.ylabel("Total distance to clusters")
plt.title("Elbow Method for Housing Data")
plt.show()
[Figures: k-means cluster plots for k = 1 and 2, Voronoi partitions for k = 3, 4, and 5, and the elbow plot]

Interpretation:

K = 1 cluster:

All housing data points are assigned to a single cluster.

One centroid represents the entire dataset.

The total distance is very high, meaning the structure of the data is poorly captured.

K = 2 clusters:

The data is split into two broad groups based on area and price.

Points closer to each centroid belong to that cluster.

Some separation appears, but clusters are still too coarse.

K = 3 clusters (Voronoi plot):

The dataset is partitioned into three regions using Voronoi boundaries.

Each region contains points closest to one centroid.

This matches visible groupings in the housing data better.

K = 4 clusters:

More refined segmentation of the data.

Voronoi regions become smaller, capturing local structure.

Some clusters start to overlap due to similar prices or areas.

K = 5 clusters:

Very fine partitioning of the dataset.

Some clusters contain only a small number of points.

This indicates possible over-clustering.

Elbow Plot:

The total distance to clusters decreases as the number of clusters increases.

The rate of decrease slows after 3 clusters.

This suggests that 3 clusters is a reasonable choice for the housing data; a quick cross-check with scikit-learn is sketched below.
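As a cross-check on this reading, the following minimal sketch (it assumes scikit-learn is installed, which the rest of this notebook does not use) recomputes the elbow curve with sklearn's KMeans and prints the relative drop in inertia for each added cluster.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

df_chk = pd.read_csv("datasets/Housing.csv")
X = df_chk[["area", "price"]].values.astype(float)

inertias = []
for k in range(1, 6):
    km = KMeans(n_clusters=k, n_init=10, random_state=10).fit(X)
    # inertia_ is the sum of squared distances to the nearest centroid, so it
    # plays the same role as total_distance above (squared rather than summed sqrt)
    inertias.append(km.inertia_)

# relative improvement from adding one more cluster; the "elbow" is where
# this drops off sharply
drops = -np.diff(inertias) / np.array(inertias[:-1])
for k, d in zip(range(2, 6), drops):
    print(f"k={k}: inertia reduced by {100*d:.1f}% vs k={k-1}")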

Kernel Density Estimation¶

In [3]:
import time

#
# Gaussian Mixture Model parameters
#
nclusters = 3
nsteps = 25
nplot = 100
np.random.seed(0)

#
# load YOUR dataset (two numerical variables)
#
df = pd.read_csv("datasets/Housing.csv")

x = df["area"].values
y = df["price"].values

# normalize (important for GMM stability)
x = (x - np.mean(x)) / np.std(x)
y = (y - np.mean(y)) / np.std(y)

npts = len(x)

#
# choose starting points and initialize
#
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]

varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters

#
# plot before iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.', alpha=0.4)
plt.errorbar(mux, muy,
             xerr=np.sqrt(varx),
             yerr=np.sqrt(vary),
             fmt='r.', markersize=15)
plt.autoscale()
plt.title('before iteration')
plt.show()

#
# do E-M iterations
#
for i in range(nsteps):

    # broadcast data and current parameters to (npts, nclusters) arrays
    xm = np.outer(x, np.ones(nclusters))
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)
    varym = np.outer(np.ones(npts), vary)
    pcm = np.outer(np.ones(npts), pc)

    # E-step: likelihood of each point under each Gaussian (diagonal covariance)
    pvgc = (1/np.sqrt(2*np.pi*varxm)) * \
           np.exp(-(xm-muxm)**2/(2*varxm)) * \
           (1/np.sqrt(2*np.pi*varym)) * \
           np.exp(-(ym-muym)**2/(2*varym))

    # responsibilities p(c|x,y): joint p(x,y,c) normalized over the clusters
    pvc = pvgc * pcm
    pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))

    # M-step: re-estimate mixing weights, means, and variances
    # (the added 0.1 keeps variances from collapsing onto single points)
    pc = np.sum(pcgv, 0) / npts
    mux = np.sum(xm * pcgv, 0) / (npts * pc)
    muy = np.sum(ym * pcgv, 0) / (npts * pc)
    varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, 0) / (npts * pc)
    vary = 0.1 + np.sum((ym - muym)**2 * pcgv, 0) / (npts * pc)

#
# plot after iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.', alpha=0.4)
plt.errorbar(mux, muy,
             xerr=np.sqrt(varx),
             yerr=np.sqrt(vary),
             fmt='r.', markersize=15)
plt.autoscale()
plt.title('after iteration')
plt.show()

#
# plot distribution
#
xplot = np.linspace(np.min(x), np.max(x), nplot)
yplot = np.linspace(np.min(y), np.max(y), nplot)
(X, Y) = np.meshgrid(xplot, yplot)

p = np.zeros((nplot, nplot))
for c in range(nclusters):
    p += np.exp(-(X-mux[c])**2/(2*varx[c]))/np.sqrt(2*np.pi*varx[c]) * \
         np.exp(-(Y-muy[c])**2/(2*vary[c]))/np.sqrt(2*np.pi*vary[c]) * pc[c]

fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
ax.plot_surface(X, Y, p)
plt.title('probability distribution')
plt.show()
[Figures: GMM fit before iteration, after iteration, and the 3D probability distribution surface]
Interpretation:


Before Iteration:

The initial cluster means and variances are placed randomly.

The error bars (one standard deviation along each axis) are large, showing high uncertainty.

The model does not yet reflect the data distribution.

After Iteration:

Cluster centers move toward dense regions of the data.

Variances shrink and fit the local concentrations of points more closely.

This indicates that the EM algorithm has learned a sensible fit.

3D Probability Distribution:

The surface shows the estimated joint probability density of (normalized) area and price.

Peaks mark regions where housing observations are most likely.

The smooth surface reflects the overlapping Gaussian components; a scikit-learn cross-check is sketched below.
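As a cross-check, this minimal sketch (again assuming scikit-learn, and reusing the normalized x and y arrays defined in the cell above) fits a diagonal-covariance Gaussian mixture to the same data; its weights and means should land close to pc, mux, and muy, and score_samples gives the same kind of density that the surface plot shows.

import numpy as np
from sklearn.mixture import GaussianMixture

data = np.column_stack((x, y))   # normalized area/price columns from the cell above
gmm = GaussianMixture(n_components=3, covariance_type="diag", random_state=0)
gmm.fit(data)

print("mixing weights:", gmm.weights_)
print("component means:", gmm.means_)

# score_samples returns log p(area, price); exponentiate to compare with the surface
grid = np.array([[0.0, 0.0], [2.0, 2.0]])
print("density at sample points:", np.exp(gmm.score_samples(grid)))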
    

Modeling states¶

In [6]:
nclusters = 5
minvar = .01
niterations = 100
np.random.seed(10)
x = df["area"].values
y = df["price"].values

# normalize (important for stability)
x = (x - np.mean(x)) / np.std(x)
y = (y - np.mean(y)) / np.std(y)

npts = len(x)

# sort by area so the fitted curves are plotted left to right
# rather than zigzagging through the unordered data
order = np.argsort(x)
x = x[order]
y = y[order]

nplot = npts
xplot = np.copy(x)

#
# initialize parameters (same as original code)
#
mux = np.min(x) + (np.max(x)-np.min(x)) * np.random.rand(nclusters)
beta = 0.1 * np.random.rand(3, nclusters)
varx = np.ones(nclusters)
vary = np.ones(nclusters)
pc = np.ones(nclusters) / nclusters

#
# plot before iteration
#
pxgc = np.exp(-(np.outer(x,np.ones(nclusters)) 
  -np.outer(np.ones(npts),mux))**2
  /(2*np.outer(np.ones(npts),varx))) \
  /np.sqrt(2*np.pi*np.outer(np.ones(npts),varx))

pygxc = np.exp(-(np.outer(y,np.ones(nclusters)) 
  -(np.outer(np.ones(npts),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:])))**2
  /(2*np.outer(np.ones(npts),vary))) \
  /np.sqrt(2*np.pi*np.outer(np.ones(npts),vary))

pxc = pxgc*np.outer(np.ones(npts),pc)
pxyc = pygxc*pxc
pcgxy = pxyc/np.outer(np.sum(pxyc,1),np.ones(nclusters))

plt.plot(x,y,'.',alpha=0.4)
yplot = np.sum((np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))*pxc,1)/np.sum(pxc,1)

plt.plot(xplot,yplot)
yvarplot = np.sum((vary+(np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))**2)*pxc,1)/np.sum(pxc,1)-yplot**2

ystdplot = np.sqrt(yvarplot)
plt.plot(xplot,yplot+ystdplot,'k--')
plt.plot(xplot,yplot-ystdplot,'k--')

plt.title('function clusters before iteration')
plt.show()

#
# E–M iteration
#
for step in range(niterations):

   pxgc = np.exp(-(np.outer(x,np.ones(nclusters)) 
      -np.outer(np.ones(npts),mux))**2
      /(2*np.outer(np.ones(npts),varx))) \
      /np.sqrt(2*np.pi*np.outer(np.ones(npts),varx))

   pygxc = np.exp(-(np.outer(y,np.ones(nclusters)) 
      -(np.outer(np.ones(npts),beta[0,:])
      +np.outer(x,beta[1,:])
      +np.outer(x*x,beta[2,:])))**2
      /(2*np.outer(np.ones(npts),vary))) \
      /np.sqrt(2*np.pi*np.outer(np.ones(npts),vary))

   # E-step: responsibilities p(c|x,y) from the joint likelihood
   pxc = pxgc*np.outer(np.ones(npts),pc)
   pxyc = pygxc*pxc
   pcgxy = pxyc/np.outer(np.sum(pxyc,1),np.ones(nclusters))

   # M-step: update cluster weights and the input-space means/variances
   pc = np.sum(pcgxy,0)/npts
   mux = np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc)
   varx = minvar + np.sum((np.outer(x,np.ones(nclusters))
      -np.outer(np.ones(npts),mux))**2*pcgxy,0)/(npts*pc)

   # per-cluster weighted least squares for the quadratic coefficients:
   # solve B @ beta = a, built from responsibility-weighted moments of x and y
   a = np.array([
      np.sum(np.outer(y,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(y*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(y*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)
   ])

   B = np.array([[
      np.ones(nclusters),
      np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)],
      [
      np.sum(np.outer(x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)],
      [
      np.sum(np.outer(x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc),
      np.sum(np.outer(x*x*x*x,np.ones(nclusters))*pcgxy,0)/(npts*pc)]
   ])

   for c in range(nclusters):
      beta[:,c] = np.linalg.inv(B[:,:,c]) @ a[:,c]

   vary = minvar + np.sum((np.outer(y,np.ones(nclusters))
      -(np.outer(np.ones(npts),beta[0,:])
      +np.outer(x,beta[1,:])
      +np.outer(x*x,beta[2,:])))**2
      *pcgxy,0)/(npts*pc)

#
# plot after iteration
#
plt.plot(x,y,'.',alpha=0.4)
yplot = np.sum((np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))*pxc,1)/np.sum(pxc,1)

plt.plot(xplot,yplot)
yvarplot = np.sum((vary+(np.outer(np.ones(nplot),beta[0,:])
  +np.outer(x,beta[1,:])
  +np.outer(x*x,beta[2,:]))**2)*pxc,1)/np.sum(pxc,1)-yplot**2

ystdplot = np.sqrt(yvarplot)
plt.plot(xplot,yplot+ystdplot,'k--')
plt.plot(xplot,yplot-ystdplot,'k--')

plt.title('function clusters after iteration')
plt.show()
[Figures: function clusters before and after iteration]
Interpretation:

Before Iteration:

The quadratic curves are poorly aligned with the data.

Wide uncertainty bands show that the initial model explains little of the variation.

Cluster responsibilities are nearly uniform.

After Iteration:

Multiple quadratic functions capture different trends in the data.

The combined curve fits the nonlinear area–price relationship better than a single regression would.

The confidence bands become narrower, indicating improved certainty.

Each cluster models a different price–area behavior pattern; a sketch of the combined prediction rule is given below.
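To make the prediction rule explicit: the fitted curve above is the responsibility-weighted average of the per-cluster quadratics, yhat(x) = sum_c f_c(x) p(x|c) p(c) / sum_c p(x|c) p(c). The following is a minimal sketch of that rule as a reusable function; the helper name predict_cwm and its signature are illustrative, not part of the original code.

import numpy as np

def predict_cwm(xq, mux, varx, pc, beta):
    """Predict y at query points xq from the fitted cluster-weighted model.
    Each cluster c contributes f_c(x) = beta[0,c] + beta[1,c]*x + beta[2,c]*x**2."""
    xq = np.atleast_1d(xq).astype(float)
    # p(x|c) for every query point and cluster, as in the plotting code above
    pxgc = np.exp(-(xq[:, None] - mux[None, :])**2 / (2 * varx[None, :])) \
           / np.sqrt(2 * np.pi * varx[None, :])
    w = pxgc * pc[None, :]                                   # p(x, c)
    f = beta[0, :] + np.outer(xq, beta[1, :]) + np.outer(xq**2, beta[2, :])
    return np.sum(f * w, axis=1) / np.sum(w, axis=1)         # weighted average of f_c

# example: predicted (normalized) price for a house of average (normalized) area
# print(predict_cwm(0.0, mux, varx, pc, beta))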