< Home
Week 6: Density Estimation¶
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi,voronoi_plot_2d
import numpy as np
import time
#
# k-means parameters
#
npts = 1000 #changed 1000 to 100
nsteps = 1 #chaged 1000 to 10
momentum = 0.
xs = [0,2,10,5] #added one more center
ys = [0,10,3,5] #added one more center
np.random.seed(10)
#
# generate random data
#
x = np.array([])
y = np.array([])
for i in range(len(xs)):
x = np.append(x,np.random.normal(loc=xs[i],scale=1,size=npts))
y = np.append(y,np.random.normal(loc=ys[i],scale=1,size=npts))
def kmeans(x,y,momentum,nclusters):
#
# choose starting points
#
indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
#
X = np.outer(x,np.ones(len(mux)))
Y = np.outer(y,np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)),mux)
Muy = np.outer(np.ones(len(x)),muy)
distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
mins = np.argmin(distances,axis=1)
#
# update means
#
for i in range(len(mux)):
index = np.where(mins == i)
mux[i] = np.sum(x[index])/len(index[0])
muy[i] = np.sum(y[index])/len(index[0])
#
# find distances
#
distances = 0
for i in range(len(mux)):
index = np.where(mins == i)
distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
return mux,muy,distances
def plot_kmeans(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
plt.plot(mux,muy,'r.',markersize=20)
plt.xlim(xmin,xmax)
plt.ylim(xmin,xmax)
plt.title(f"{len(mux)} clusters")
plt.show()
def plot_Voronoi(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.xlim(xmin,xmax)
plt.ylim(xmin,xmax)
plt.title(f"{len(mux)} clusters")
plt.show()
distances = np.zeros(5)
mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)
fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
I tried to understand this code. The code is doing following steps.
- Determine first center of gravity position appropriately.
- clustering
- calculate the distance from each points to center of gravity.
- Cluster each point to the nearest center of gravity.
- Calculate the new center of gravity for each cluster.
Through this process, we can find the exact center of gravity for each cluster.
Additionaly, I tried to change the "nsteps" number of iteration 1000 to 1.
Then I got a result bellow
It can not reach to the result. Those centers are showing wrong centers.
2.Try to use k-mean method for my picked up data.¶
I picked up the olympic data. That contains the personal information who took the medal in olympic. I will try to make clustering group through the height and weight.
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
import time
# --- 1. データ読み込みと前処理 ---
def load_olympic_data(filename='datasets/olympic_athlete_events.csv'):
df = pd.read_csv(filename)
# 重複データの削除(同じ選手が複数のメダルを取っている場合などを考慮しIDで一意にする)
df_unique = df.drop_duplicates(subset='ID')
# 身長(Height)と体重(Weight)の欠損値(NaN)を除去
df_clean = df_unique.dropna(subset=['Height', 'Weight'])
# 計算負荷軽減のため、データをランダムにサンプリング(例: 2000件)
# 全データを使うと、この手書きk-meansの実装では計算時間がかかりすぎる可能性があります
if len(df_clean) > 2000:
df_sample = df_clean.sample(n=2000, random_state=42)
else:
df_sample = df_clean
return df_sample['Height'].values, df_sample['Weight'].values
# データをセット (x:身長, y:体重)
x, y = load_olympic_data()
# --- 2. ユーザー提供のk-meansロジック ---
## k-means parameters
nsteps = 100
momentum = 0.
def kmeans(x, y, momentum, nclusters):
#
# choose starting points (randomly from data)
#
indices = np.random.choice(len(x), nclusters, replace=False)
mux = x[indices]
muy = y[indices]
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
X = np.outer(x, np.ones(len(mux)))
Y = np.outer(y, np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)), mux)
Muy = np.outer(np.ones(len(x)), muy)
distances_matrix = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
mins = np.argmin(distances_matrix, axis=1)
#
# update means
#
for k in range(len(mux)):
index = np.where(mins == k)
if len(index[0]) > 0:
mux[k] = np.sum(x[index]) / len(index[0])
muy[k] = np.sum(y[index]) / len(index[0])
else:
rand_idx = np.random.randint(len(x))
mux[k] = x[rand_idx]
muy[k] = y[rand_idx]
#
# find total distances
#
total_distance = 0
for k in range(len(mux)):
index = np.where(mins == k)
if len(index[0]) > 0:
total_distance += np.sum(np.sqrt((x[index] - mux[k])**2 + (y[index] - muy[k])**2))
return mux, muy, total_distance
def plot_kmeans(x, y, mux, muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(x, y, '.', alpha=0.3, label='Athletes')
plt.plot(mux, muy, 'r.', markersize=20, label='Centroids')
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.title(f"K-means Clustering: {len(mux)} clusters")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
def plot_Voronoi(x, y, mux, muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(x, y, '.', alpha=0.3)
# make Voronoi line
if len(mux) >= 3:
try:
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
except Exception as e:
print(f"Voronoi描画エラー: {e}")
plt.plot(mux, muy, 'r.', markersize=20) # エラー時は点だけ描画
else:
plt.plot(mux, muy, 'r.', markersize=20)
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.xlim(xmin - 5, xmax + 5) # 余白を追加
plt.ylim(ymin - 5, ymax + 5)
plt.title(f"Voronoi Diagram: {len(mux)} clusters")
plt.show()
# --- 3. execution ---
# Cluster 1
print("Running k=1...")
mux, muy, distances[0] = kmeans(x, y, momentum, 1)
plot_kmeans(x, y, mux, muy)
# Cluster 2
print("Running k=2...")
mux, muy, distances[1] = kmeans(x, y, momentum, 2)
plot_kmeans(x, y, mux, muy)
# Cluster 3
print("Running k=3...")
mux, muy, distances[2] = kmeans(x, y, momentum, 3)
plot_Voronoi(x, y, mux, muy)
# Cluster 4
print("Running k=4...")
mux, muy, distances[3] = kmeans(x, y, momentum, 4)
plot_Voronoi(x, y, mux, muy)
# Cluster 5
print("Running k=5...")
mux, muy, distances[4] = kmeans(x, y, momentum, 5)
plot_Voronoi(x, y, mux, muy)
# エルボー法のプロット
fig, ax = plt.subplots()
plt.plot(np.arange(1, 6), distances, 'o-')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.title('Elbow Method')
plt.grid(True)
plt.show()
Running k=1...
Running k=2...
Running k=3...
Running k=4...
Running k=5...
Analisys and conclusion¶
I made chart that showing the relationship between height and weght for olympic medalist. But I couldn't make more than 2 clusters for this data. Because the datas between height and weght has strong correlation.
Additionaly...¶
So I tried to ask Gemini to make this assignment more meaningful. "Please suggest an idea how can I make k-mean experiment more clear." Then I got the code bellow.
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
# --- 1. Load Data (Modified to include labels) ---
def load_specialists_with_labels(filename='datasets/olympic_athlete_events.csv'):
try:
df = pd.read_csv(filename)
except FileNotFoundError:
print("File not found. Running in demo mode.")
return None, None, None
df_unique = df.drop_duplicates(subset='ID')
df_men = df_unique[df_unique['Sex'] == 'M']
# Extract 3 specific sports
gym = df_men[df_men['Sport'] == 'Gymnastics']
bball = df_men[df_men['Sport'] == 'Basketball']
shotput = df_men[df_men['Event'].str.contains('Shot Put', na=False)]
# Sampling
n_samples = 300
def get_samples(d):
d = d.dropna(subset=['Height', 'Weight'])
if len(d) > n_samples:
return d.sample(n=n_samples, random_state=42)
return d
gym = get_samples(gym)
bball = get_samples(bball)
shotput = get_samples(shotput)
# Combine data
df_final = pd.concat([gym, bball, shotput])
# ★ Change: Return labels (sport names) as well
return df_final['Height'].values, df_final['Weight'].values, df_final['Sport'].values
# Load data
x, y, labels = load_specialists_with_labels()
# Generate demo data (if file is missing)
if x is None:
n = 200
x1, y1 = np.random.normal(165, 5, n), np.random.normal(60, 5, n)
x2, y2 = np.random.normal(200, 6, n), np.random.normal(95, 8, n)
x3, y3 = np.random.normal(185, 6, n), np.random.normal(120, 10, n)
x = np.concatenate([x1, x2, x3])
y = np.concatenate([y1, y2, y3])
# Create dummy labels
labels = np.array(['Gymnastics']*n + ['Basketball']*n + ['Shot Put']*n)
# --- 2. k-means Logic (No changes) ---
nsteps = 100
def kmeans(x, y, nclusters):
indices = np.random.choice(len(x), nclusters, replace=False)
mux = x[indices]
muy = y[indices]
for i in range(nsteps):
X = np.outer(x, np.ones(len(mux)))
Y = np.outer(y, np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)), mux)
Muy = np.outer(np.ones(len(x)), muy)
distances_matrix = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
mins = np.argmin(distances_matrix, axis=1)
for k in range(len(mux)):
index = np.where(mins == k)
if len(index[0]) > 0:
mux[k] = np.sum(x[index]) / len(index[0])
muy[k] = np.sum(y[index]) / len(index[0])
else:
rand_idx = np.random.randint(len(x))
mux[k] = x[rand_idx]
muy[k] = y[rand_idx]
return mux, muy
# --- 3. Plotting (Added color-coding) ---
def plot_colored_clusters(x, y, labels, mux, muy):
xmin, xmax = np.min(x), np.max(x)
ymin, ymax = np.min(y), np.max(y)
fig, ax = plt.subplots(figsize=(12, 9))
# ★ Change: Plot with different colors for each label
# Define 3 colors
unique_labels = np.unique(labels)
colors = ['blue', 'orange', 'green']
for i, label in enumerate(unique_labels):
# Mask to extract data for a specific sport
mask = (labels == label)
# Display adjustment if Shot Put is listed under 'Athletics'
display_label = "Shot Put" if "Athletics" in label else label
ax.scatter(x[mask], y[mask], label=display_label, alpha=0.6, s=30, c=colors[i % len(colors)])
# Voronoi (Boundaries)
if len(mux) >= 3:
try:
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False,
line_colors='black', line_width=2, line_alpha=0.8)
except Exception:
pass
# Centroids
plt.plot(mux, muy, 'rX', markersize=25, markeredgecolor='black', label='Centroids', zorder=10)
plt.xlabel('Height (cm)', fontsize=12)
plt.ylabel('Weight (kg)', fontsize=12)
plt.title("Clustering Result vs Actual Sports", fontsize=15)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()
# --- Execution ---
print("Running clustering and validation...")
mux, muy = kmeans(x, y, 3)
plot_colored_clusters(x, y, labels, mux, muy)
Running clustering and validation...
Gemini suggested that I use the data focusing on three sports, Shot Put, Basket ball, Gymnastics. have These three athletes have distinctive physiques.
Analysis:¶
I could get better result. We were able to cluster differences in body type across the three sports.