import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d

# Load the marks sheet and strip stray whitespace from the header names
df = pd.read_csv('datasets/viii_2023.csv')
df.columns = df.columns.str.strip()

# Clean: drop rows whose first column is empty, and summary rows whose first
# column contains the marker 'S.M='. The marker is matched literally
# (regex=False); as a regex the '.' would match any character, so values such
# as 'SxM=' would also have been discarded.
first_col = df.iloc[:, 0]
has_value = first_col.notna()
is_summary = first_col.astype(str).str.contains('S.M=', regex=False, na=False)
df_clean = df[has_value & ~is_summary].copy()

# The first column is referred to as 'Name' from here on
df_clean = df_clean.rename(columns={df_clean.columns[0]: 'Name'})

# The two subject columns that span the 2D plane (e.g., Maths and Science)
subject1, subject2 = 'Maths', 'Science'

# One (subject1, subject2) pair per row; rows missing either value are dropped
subject_marks = df_clean[[subject1, subject2]].dropna()
points = subject_marks.to_numpy()

# Voronoi tessellation of the mark pairs
vor = Voronoi(points)

# Plot
# voronoi_plot_2d opens a brand-new figure when it is not handed an `ax`,
# which leaves a separately created 10x8 figure empty. Create the axes first
# and draw everything onto them so the requested size is actually used.
fig, ax = plt.subplots(figsize=(10, 8))
voronoi_plot_2d(vor, ax=ax, show_vertices=False, line_colors='blue', line_width=2, point_size=10)
ax.scatter(points[:, 0], points[:, 1], c='red', s=50, label='Students')

# Calculate and show region areas (inverse density).
# Only bounded regions are measured: a region containing vertex index -1
# extends to infinity and has no finite area, so the statistics below
# describe the bounded cells only.
areas = []
for region_idx in vor.point_region:
    # -1 marks an input point with no associated region; indexing
    # vor.regions[-1] would silently pick up the last region instead.
    if region_idx == -1:
        continue
    region = vor.regions[region_idx]
    if len(region) == 0 or -1 in region:
        continue
    # Shoelace formula over the polygon's vertices (closed cyclically)
    polygon = vor.vertices[region]
    xs, ys = polygon[:, 0], polygon[:, 1]
    signed_twice_area = np.dot(xs, np.roll(ys, -1)) - np.dot(np.roll(xs, -1), ys)
    areas.append(abs(signed_twice_area) / 2)

if areas:
    mean_area = np.mean(areas)
    ax.set_title(f'Voronoi Tessellation of {subject1} vs {subject2}\nAvg region area: {mean_area:.1f}')
    print(f"Average Voronoi region area: {mean_area:.1f}")
    print(f"Region area variance: {np.var(areas):.1f}")
    print(f"Density ~ {1/mean_area:.4f} points per unit area")

ax.set_xlabel(subject1)
ax.set_ylabel(subject2)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

Average Voronoi region area: 91.6
Region area variance: 42046.2
Density ~ 0.0109 points per unit area

<Figure size 1000x800 with 0 Axes>

week06: Density estimation

Lesson on:

Explanation: