Data Science - Week 3: Density Estimation Assignment¶
Dataset: Iris Flower Dataset (150 samples, 3 species)
1. Load and Explore Data¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from scipy.spatial import Voronoi, voronoi_plot_2d
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings('ignore')
# Load Iris dataset
iris = load_iris()
X = iris.data[:, :2] # Use first 2 features (sepal length, sepal width)
y = iris.target # True labels (for comparison)
feature_names = iris.feature_names[:2]
target_names = iris.target_names
print("Iris Dataset:")
print(f" Total samples: {len(X)}")
print(f" Features: {feature_names[0]}, {feature_names[1]}")
print(f" Classes: {len(target_names)} ({', '.join(target_names)})")
print(f"\n2D Data shape: {X.shape}")
print(f" {feature_names[0]} range: {X[:, 0].min():.2f} - {X[:, 0].max():.2f}")
print(f" {feature_names[1]} range: {X[:, 1].min():.2f} - {X[:, 1].max():.2f}")
print(f"\nClass distribution:")
for i, name in enumerate(target_names):
    count = np.sum(y == i)
    print(f" {name}: {count} samples")
Iris Dataset:
 Total samples: 150
 Features: sepal length (cm), sepal width (cm)
 Classes: 3 (setosa, versicolor, virginica)
2D Data shape: (150, 2)
 sepal length (cm) range: 4.30 - 7.90
 sepal width (cm) range: 2.00 - 4.40
Class distribution:
 setosa: 50 samples
 versicolor: 50 samples
 virginica: 50 samples
2. Expectation-Maximization (E-M) Algorithm¶
The E-M algorithm is a fundamental technique for density estimation:
- Alternating updates: Use the current model to estimate the latent (hidden) variables, then use the latent variables to update the model
- Iteration: Maximizes likelihood through iterative updates
- Applications: k-means clustering, Gaussian Mixture Models (GMM)
E-M Steps:¶
- Expectation (E-step): Estimate latent variables given current model parameters
- Maximization (M-step): Update model parameters to maximize likelihood given estimated latent variables
- Repeat: Until convergence (a toy numerical sketch of this loop follows below)
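To make the loop concrete, here is a minimal, self-contained sketch of E-M for a toy 1D two-component Gaussian mixture; the data and the variable names (mu, sigma, pi_m, resp) are illustrative only and not part of the assignment code.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
# Toy 1D data drawn from two Gaussians
x = np.concatenate([rng.normal(-2, 1, 100), rng.normal(3, 1, 100)])

# Initial guesses for the two components
mu = np.array([-1.0, 1.0])     # means
sigma = np.array([1.0, 1.0])   # standard deviations
pi_m = np.array([0.5, 0.5])    # mixing weights p(c_m)

for _ in range(50):
    # E-step: responsibilities p(c_m | x_n), shape (N, 2)
    dens = np.stack([pi_m[m] * norm.pdf(x, mu[m], sigma[m]) for m in range(2)], axis=1)
    resp = dens / dens.sum(axis=1, keepdims=True)

    # M-step: update parameters given the responsibilities
    Nm = resp.sum(axis=0)                   # effective counts per component
    mu = (resp * x[:, None]).sum(axis=0) / Nm
    sigma = np.sqrt((resp * (x[:, None] - mu) ** 2).sum(axis=0) / Nm)
    pi_m = Nm / len(x)

print("means:", mu, "stds:", sigma, "weights:", pi_m)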
3. Kernel Density Estimation (KDE)¶
KDE provides a smooth estimate of the probability density function using Gaussian kernels. Unlike histograms, KDE gives a continuous density estimate.
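For reference, gaussian_kde uses a Gaussian kernel whose bandwidth matrix $H$ is estimated from the data (Scott's rule by default), so in 2D the estimate has the form:
$$\hat{p}(\vec{x}) = \frac{1}{N}\sum_{n=1}^{N} \frac{1}{2\pi\sqrt{|H|}}\, e^{-\frac{1}{2}(\vec{x}-\vec{x}_n)^\top H^{-1}(\vec{x}-\vec{x}_n)}$$
Here $N = 150$ and the $\vec{x}_n$ are the 2D sepal measurements.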
# 2D Kernel Density Estimation
kde = gaussian_kde(X.T)
# Create grid for visualization
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]
density = kde(grid_points.T).reshape(xx.shape)
# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Scatter plot with true labels
scatter = axes[0].scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', alpha=0.7, s=30, edgecolors='black', linewidths=0.5)
axes[0].set_xlabel(feature_names[0])
axes[0].set_ylabel(feature_names[1])
axes[0].set_title('Iris Data Points (True Labels)')
axes[0].legend(handles=scatter.legend_elements()[0], labels=target_names.tolist(), title='Species')
axes[0].grid(True, alpha=0.3)
# KDE contour plot
contour = axes[1].contourf(xx, yy, density, levels=20, cmap='viridis', alpha=0.6)
axes[1].scatter(X[:, 0], X[:, 1], alpha=0.7, s=30, c='white', edgecolors='black', linewidths=0.5)
plt.colorbar(contour, ax=axes[1], label='Probability Density')
axes[1].set_xlabel(feature_names[0])
axes[1].set_ylabel(feature_names[1])
axes[1].set_title('Kernel Density Estimation (KDE)')
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("KDE provides a smooth continuous estimate of the probability density.")
print("Note: KDE shows overall density without using class labels.")
KDE provides a smooth continuous estimate of the probability density.
Note: KDE shows overall density without using class labels.
4. k-means Clustering¶
k-means is an E-M-style clustering algorithm that groups similar data points through hard assignment; a minimal from-scratch sketch follows the two lists below.
Algorithm:¶
- Initialize: Choose k anchor points (centroids) randomly from data
- E-step: Assign each data point to the closest centroid
- M-step: Update each centroid to the mean of its assigned points
- Repeat: Until centroids converge
Voronoi Tessellation:¶
- Partitions space into regions closest to each anchor point
- Each region contains all points closer to that anchor than any other
- Creates hard boundaries between clusters
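Before the scikit-learn version below, a rough from-scratch sketch of the E/M loop (assuming X from the loading cell is in scope; no empty-cluster handling, for brevity):
import numpy as np

# Minimal k-means loop on the 2D iris features
rng = np.random.default_rng(0)
centroids = X[rng.choice(len(X), size=3, replace=False)]  # initialize from data points

for _ in range(100):
    # E-step: hard-assign each point to its nearest centroid; this nearest-centroid
    # rule is exactly the Voronoi partition induced by the centroids
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    assign = dists.argmin(axis=1)
    # M-step: move each centroid to the mean of its assigned points
    new_centroids = np.array([X[assign == k].mean(axis=0) for k in range(3)])
    if np.allclose(new_centroids, centroids):
        break
    centroids = new_centroids

print("centroids:\n", centroids)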
# k-means clustering with iteration visualization
# Use k=3 to match the 3 iris species
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10, max_iter=1)
# Early stage: with max_iter=1, these "initial" centroids are the centroids after a single update
kmeans.fit(X)
initial_centroids = kmeans.cluster_centers_.copy()
initial_labels = kmeans.labels_
# Full run: show converged centroids
kmeans_full = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans_full.fit(X)
final_centroids = kmeans_full.cluster_centers_
final_labels = kmeans_full.labels_
print(f"k-means clustering (k=3):")
print(f" Initial centroids:\n{initial_centroids}")
print(f"\n Final centroids:\n{final_centroids}")
print(f"\n Inertia: {kmeans_full.inertia_:.2f} (lower is better)")
print(f" Cluster sizes: {np.bincount(final_labels)}")
print(f" Iterations to converge: {kmeans_full.n_iter_}")
# Compare with true labels
from sklearn.metrics import adjusted_rand_score
ari_score = adjusted_rand_score(y, final_labels)
print(f"\n Adjusted Rand Index (vs true labels): {ari_score:.3f}")
print(f" (1.0 = perfect match, 0.0 = random)")
k-means clustering (k=3):
Initial centroids:
[[5.01454545 3.35454545]
[6.86428571 3.1047619 ]
[5.89433962 2.71132075]]
Final centroids:
[[6.81276596 3.07446809]
[5.77358491 2.69245283]
[5.006 3.428 ]]
Inertia: 37.05 (lower is better)
Cluster sizes: [47 53 50]
Iterations to converge: 8
Adjusted Rand Index (vs true labels): 0.601
(1.0 = perfect match, 0.0 = random)
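As a quick sanity check of the inertia value above, it can be recomputed by hand as the sum of squared distances from each point to its assigned centroid (a minimal sketch, assuming X, final_labels, and kmeans_full from the cells above are in scope):
# Recompute inertia by hand: sum of squared distances to assigned centroids
diffs = X - kmeans_full.cluster_centers_[final_labels]
manual_inertia = np.sum(diffs ** 2)
print(f"manual inertia: {manual_inertia:.2f}  (should match kmeans_full.inertia_)")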
# Visualize k-means before and after iteration
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Before iteration
scatter1 = axes[0].scatter(X[:, 0], X[:, 1], c=initial_labels, cmap='viridis', alpha=0.7, s=30, edgecolors='black', linewidths=0.5)
axes[0].scatter(initial_centroids[:, 0], initial_centroids[:, 1],
c='red', marker='x', s=200, linewidths=3, label='Initial centroids')
vor_initial = Voronoi(initial_centroids)
voronoi_plot_2d(vor_initial, ax=axes[0], show_points=False, show_vertices=False)
axes[0].set_xlabel(feature_names[0])
axes[0].set_ylabel(feature_names[1])
axes[0].set_title('Early k-means (after 1 iteration)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# After iteration
scatter2 = axes[1].scatter(X[:, 0], X[:, 1], c=final_labels, cmap='viridis', alpha=0.7, s=30, edgecolors='black', linewidths=0.5)
axes[1].scatter(final_centroids[:, 0], final_centroids[:, 1],
c='red', marker='x', s=200, linewidths=3, label='Final centroids')
vor_final = Voronoi(final_centroids)
voronoi_plot_2d(vor_final, ax=axes[1], show_points=False, show_vertices=False)
axes[1].set_xlabel(feature_names[0])
axes[1].set_ylabel(feature_names[1])
axes[1].set_title(f'After k-means iteration (converged in {kmeans_full.n_iter_} iterations)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nKey observations:")
print(" - Voronoi tessellation shows hard boundaries between clusters")
print(" - Each point belongs to exactly one cluster (hard assignment)")
print(" - Centroids move to minimize within-cluster variance (inertia)")
print(" - E-step: assign points to nearest centroid")
print(" - M-step: update centroids to mean of assigned points")
Key observations:
 - Voronoi tessellation shows hard boundaries between clusters
 - Each point belongs to exactly one cluster (hard assignment)
 - Centroids move to minimize within-cluster variance (inertia)
 - E-step: assign points to nearest centroid
 - M-step: update centroids to mean of assigned points
5. Gaussian Mixture Model (GMM)¶
GMM uses soft E-M clustering with Gaussian probability distributions. Unlike k-means, GMM provides probability estimates for cluster membership.
Mathematical Formulation:¶
In $D$ dimensions, a mixture model can be written as:
$$ \begin{eqnarray} p(\vec{x}) &=& \sum_{m=1}^M p(\vec{x},c_m) \\ &=& \sum_{m=1}^M p(\vec{x}|c_m)~p(c_m) \\ &=& \sum_{m=1}^M \prod_{d=1}^D \frac{1}{\sqrt{2\pi\sigma_{m,d}^2}} e^{-(x_d-\mu_{m,d})^2/2\sigma_{m,d}^2}~p(c_m) \end{eqnarray} $$
Where:
- $c_m$ refers to the $m$th Gaussian component
- $\vec{\mu}_m$ is the mean vector
- $\vec{\sigma}_m$ is the vector of per-dimension standard deviations $\sigma_{m,d}$ (the factorized form above assumes a diagonal covariance)
- $p(c_m)$ is the mixing weight (prior probability)
E-M Updates:¶
E-step: Compute responsibility (posterior probability) of each component: $$p(c_m|\vec{x}_n) = \frac{p(\vec{x}_n|c_m)~p(c_m)}{\sum_{j=1}^M p(\vec{x}_n|c_j)~p(c_j)}$$
M-step: Update parameters: $$p(c_m) = \frac{1}{N}\sum_{n=1}^N p(c_m|\vec{x}_n), \qquad \vec{\mu}_m = \frac{1}{N\,p(c_m)} \sum_{n=1}^N \vec{x}_n~p(c_m|\vec{x}_n)$$ with the variances updated analogously. Each cluster is a Gaussian with its own mean and covariance.
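As an illustration of one E/M update on the 2D data (the starting values means0, covs0, weights0 are arbitrary guesses for this sketch, not fitted parameters; assumes X from the loading cell is in scope):
import numpy as np
from scipy.stats import multivariate_normal

# Illustrative initial guesses for M = 3 components
means0 = np.array([[5.0, 3.4], [5.9, 2.8], [6.6, 3.0]])
covs0 = np.array([np.eye(2) * 0.2 for _ in range(3)])
weights0 = np.full(3, 1 / 3)

# E-step: responsibilities p(c_m | x_n) for every point, shape (N, 3)
likes = np.stack([weights0[m] * multivariate_normal.pdf(X, means0[m], covs0[m])
                  for m in range(3)], axis=1)
resp = likes / likes.sum(axis=1, keepdims=True)

# M-step: weighted-mean update of the component means and mixing weights
Nm = resp.sum(axis=0)                  # N * p(c_m)
new_means = (resp.T @ X) / Nm[:, None]
new_weights = Nm / len(X)
print("updated means:\n", new_means)
print("updated weights:", new_weights)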
colors = ['red', 'green', 'blue', 'orange', 'purple']
# Gaussian Mixture Model
n_components = 3
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X)
gmm_labels = gmm.predict(X)
gmm_probs = gmm.predict_proba(X)
# Create grid for GMM density visualization
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
grid_points = np.vstack([xx.ravel(), yy.ravel()]).T
gmm_density = np.exp(gmm.score_samples(grid_points)).reshape(xx.shape)
# Plot GMM results
plt.figure(figsize=(14, 5))
# Plot 1: GMM clusters with probabilities
plt.subplot(1, 2, 1)
for i in range(n_components):
    mask = gmm_labels == i
    plt.scatter(X[mask, 0], X[mask, 1], alpha=0.5, s=20,
                c=colors[i % len(colors)], label=f'Component {i+1}')
# Plot means
means = gmm.means_
plt.scatter(means[:, 0], means[:, 1], c='black', marker='x',
s=200, linewidths=3, label='GMM Means')
# Plot covariance ellipses (2-standard-deviation contours)
from matplotlib.patches import Ellipse
for i in range(n_components):
    mean = means[i]
    cov = gmm.covariances_[i]
    eigenvals, eigenvecs = np.linalg.eigh(cov)
    angle = np.degrees(np.arctan2(eigenvecs[1, 0], eigenvecs[0, 0]))
    width, height = 2 * 2 * np.sqrt(eigenvals)  # full axis lengths at 2 standard deviations
    ellipse = Ellipse(mean, width, height, angle=angle,
                      fill=False, linestyle='--', linewidth=2, color='black')
    plt.gca().add_patch(ellipse)
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title(f'Gaussian Mixture Model (k={n_components})')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: GMM density
plt.subplot(1, 2, 2)
plt.contourf(xx, yy, gmm_density, levels=20, cmap='viridis')
plt.colorbar(label='Density')
plt.scatter(X[:, 0], X[:, 1], alpha=0.2, s=5, c='white', edgecolors='black')
plt.scatter(means[:, 0], means[:, 1], c='red', marker='x',
s=200, linewidths=3, label='GMM Means')
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title('GMM Probability Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"GMM Component Weights: {gmm.weights_}")
print(f"\nGMM Means:")
for i, mean in enumerate(means):
print(f" Component {i+1}: ({mean[0]:.2f}, {mean[1]:.2f}), Weight: {gmm.weights_[i]:.3f}")
GMM Component Weights: [0.29474143 0.38081285 0.32444572]
GMM Means:
 Component 1: (6.68, 3.03), Weight: 0.295
 Component 2: (5.90, 2.74), Weight: 0.381
 Component 3: (5.02, 3.45), Weight: 0.324
6. Comparison: k-means vs GMM¶
Compare hard clustering (k-means) with soft clustering (GMM).
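As a quick illustrative check of the difference (assuming the cells above defining kmeans_full and gmm have been run; the sample index 70 is arbitrary):
# Hard vs soft assignment for one sample
i = 70
print("point:", X[i])
print("k-means hard label:", kmeans_full.predict(X[i:i+1])[0])
print("GMM soft probabilities:", np.round(gmm.predict_proba(X[i:i+1])[0], 3))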
labels = final_labels
centers = final_centroids
n_clusters = 3
colors = ['red', 'green', 'blue', 'orange', 'purple']
# Compare k-means and GMM
plt.figure(figsize=(14, 5))
# Plot 1: k-means (hard boundaries)
plt.subplot(1, 2, 1)
for i in range(n_clusters):
    mask = labels == i
    plt.scatter(X[mask, 0], X[mask, 1], alpha=0.5, s=20,
                c=colors[i % len(colors)], label=f'Cluster {i+1}')
plt.scatter(centers[:, 0], centers[:, 1], c='black', marker='x',
s=200, linewidths=3)
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title('k-means: Hard Clustering')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: GMM (soft boundaries)
plt.subplot(1, 2, 2)
# Color points by maximum probability
max_probs = np.max(gmm_probs, axis=1)
scatter = plt.scatter(X[:, 0], X[:, 1], c=max_probs, cmap='viridis',
alpha=0.6, s=20, edgecolors='black', linewidths=0.5)
plt.colorbar(scatter, label='Max Probability')
plt.scatter(means[:, 0], means[:, 1], c='red', marker='x',
s=200, linewidths=3, label='GMM Means')
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title('GMM: Soft Clustering (Probability-based)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Key Differences:")
print(" k-means: Hard boundaries, each point belongs to one cluster")
print(" GMM: Soft boundaries, each point has probabilities for all clusters")
Key Differences:
 k-means: Hard boundaries, each point belongs to one cluster
 GMM: Soft boundaries, each point has probabilities for all clusters
7. Summary¶
Key Findings:¶
KDE: Smooth continuous density estimate using Gaussian kernels
k-means: Hard clustering with Voronoi tessellation (ARI: 0.601, 8 iterations)
GMM: Soft clustering with Gaussian distributions (weights: [0.295, 0.381, 0.324])
Comparison: k-means uses hard boundaries; GMM provides probabilistic assignments
My takeaway: On the Iris dataset (sepal length and sepal width, 2D), we visualized the overall density with KDE, then compared hard clustering with k-means (Voronoi tessellation, ARI = 0.601) against probabilistic clustering with a GMM (weights ≈ [0.295, 0.381, 0.324]). I initially hit a few execution errors (missing imports and variables used before being defined); after running the cells in order and defining the missing variables, I verified that all notebook cells run without errors.