import matplotlib.pyplot as plt
from scipy.spatial import Voronoi,voronoi_plot_2d
import numpy as np
import time
#
# k-means parameters
#
npts = 1000 #changed 1000 to 100
nsteps = 1 #chaged 1000 to 10
momentum = 0.
xs = [0,2,10,5] #added one more center
ys = [0,10,3,5] #added one more center
np.random.seed(10)
#
# generate random data
#
x = np.array([])
y = np.array([])
for i in range(len(xs)):
    x = np.append(x,np.random.normal(loc=xs[i],scale=1,size=npts))
    y = np.append(y,np.random.normal(loc=ys[i],scale=1,size=npts))

def kmeans(x,y,momentum,nclusters):
    #
    # choose starting points
    #
    indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]
    #
    # do k-means iteration
    #
    for i in range(nsteps):
        #
        # find closest points
        #
        X = np.outer(x,np.ones(len(mux)))
        Y = np.outer(y,np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)),mux)
        Muy = np.outer(np.ones(len(x)),muy)
        distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
        mins = np.argmin(distances,axis=1)
        #
        # update means
        #
        for i in range(len(mux)):
            index = np.where(mins == i)
            mux[i] = np.sum(x[index])/len(index[0])
            muy[i] = np.sum(y[index])/len(index[0])
    #
    # find distances
    #
    distances = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
    return mux,muy,distances

def plot_kmeans(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    plt.plot(mux,muy,'r.',markersize=20)
    plt.xlim(xmin,xmax)
    plt.ylim(xmin,xmax)
    plt.title(f"{len(mux)} clusters")
    plt.show()

def plot_Voronoi(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    vor = Voronoi(np.stack((mux,muy),axis=1))
    voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
    plt.xlim(xmin,xmax)
    plt.ylim(xmin,xmax)
    plt.title(f"{len(mux)} clusters")
    plt.show()

distances = np.zeros(5)

mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)

fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np
import time

# --- 1. データ読み込みと前処理 ---
def load_olympic_data(filename='datasets/olympic_athlete_events.csv'):
    df = pd.read_csv(filename)


    # 重複データの削除（同じ選手が複数のメダルを取っている場合などを考慮しIDで一意にする）
    df_unique = df.drop_duplicates(subset='ID')
    
    # 身長(Height)と体重(Weight)の欠損値(NaN)を除去
    df_clean = df_unique.dropna(subset=['Height', 'Weight'])
    
    # 計算負荷軽減のため、データをランダムにサンプリング（例: 2000件）
    # 全データを使うと、この手書きk-meansの実装では計算時間がかかりすぎる可能性があります
    if len(df_clean) > 2000:
        df_sample = df_clean.sample(n=2000, random_state=42)
    else:
        df_sample = df_clean

    return df_sample['Height'].values, df_sample['Weight'].values

# データをセット (x:身長, y:体重)
x, y = load_olympic_data()

# --- 2. ユーザー提供のk-meansロジック ---

## k-means parameters
nsteps = 100 
momentum = 0. 

def kmeans(x, y, momentum, nclusters):
    #
    # choose starting points (randomly from data)
    #
    indices = np.random.choice(len(x), nclusters, replace=False)
    mux = x[indices]
    muy = y[indices]
    
    #
    # do k-means iteration
    #
    for i in range(nsteps):
        #
        # find closest points
        X = np.outer(x, np.ones(len(mux)))
        Y = np.outer(y, np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)), mux)
        Muy = np.outer(np.ones(len(x)), muy)
        
        distances_matrix = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
        mins = np.argmin(distances_matrix, axis=1)
        
        #
        # update means
        #
        for k in range(len(mux)):
            index = np.where(mins == k)
            if len(index[0]) > 0: 
                mux[k] = np.sum(x[index]) / len(index[0])
                muy[k] = np.sum(y[index]) / len(index[0])
            else:
                rand_idx = np.random.randint(len(x))
                mux[k] = x[rand_idx]
                muy[k] = y[rand_idx]

    #
    # find total distances
    #
    total_distance = 0
    for k in range(len(mux)):
        index = np.where(mins == k)
        if len(index[0]) > 0:
            total_distance += np.sum(np.sqrt((x[index] - mux[k])**2 + (y[index] - muy[k])**2))
            
    return mux, muy, total_distance

def plot_kmeans(x, y, mux, muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig, ax = plt.subplots(figsize=(8, 6))
    
    plt.plot(x, y, '.', alpha=0.3, label='Athletes')
    plt.plot(mux, muy, 'r.', markersize=20, label='Centroids')
    
    plt.xlabel('Height (cm)')
    plt.ylabel('Weight (kg)')
    plt.title(f"K-means Clustering: {len(mux)} clusters")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

def plot_Voronoi(x, y, mux, muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig, ax = plt.subplots(figsize=(8, 6))
    
    plt.plot(x, y, '.', alpha=0.3)
    
    # make Voronoi line
    if len(mux) >= 3:
        try:
            vor = Voronoi(np.stack((mux, muy), axis=1))
            voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
        except Exception as e:
            print(f"Voronoi描画エラー: {e}")
            plt.plot(mux, muy, 'r.', markersize=20) # エラー時は点だけ描画
    else:
        plt.plot(mux, muy, 'r.', markersize=20)

    plt.xlabel('Height (cm)')
    plt.ylabel('Weight (kg)')
    plt.xlim(xmin - 5, xmax + 5) # 余白を追加
    plt.ylim(ymin - 5, ymax + 5)
    plt.title(f"Voronoi Diagram: {len(mux)} clusters")
    plt.show()

# --- 3. execution ---


# Cluster 1
print("Running k=1...")
mux, muy, distances[0] = kmeans(x, y, momentum, 1)
plot_kmeans(x, y, mux, muy)

# Cluster 2
print("Running k=2...")
mux, muy, distances[1] = kmeans(x, y, momentum, 2)
plot_kmeans(x, y, mux, muy)

# Cluster 3
print("Running k=3...")
mux, muy, distances[2] = kmeans(x, y, momentum, 3)
plot_Voronoi(x, y, mux, muy)

# Cluster 4
print("Running k=4...")
mux, muy, distances[3] = kmeans(x, y, momentum, 4)
plot_Voronoi(x, y, mux, muy)

# Cluster 5
print("Running k=5...")
mux, muy, distances[4] = kmeans(x, y, momentum, 5)
plot_Voronoi(x, y, mux, muy)

# エルボー法のプロット
fig, ax = plt.subplots()
plt.plot(np.arange(1, 6), distances, 'o-')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.title('Elbow Method')
plt.grid(True)
plt.show()

Running k=1...

Running k=2...

Running k=3...

Running k=4...

Running k=5...

import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import numpy as np

# --- 1. Load Data (Modified to include labels) ---
def load_specialists_with_labels(filename='datasets/olympic_athlete_events.csv'):
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print("File not found. Running in demo mode.")
        return None, None, None

    df_unique = df.drop_duplicates(subset='ID')
    df_men = df_unique[df_unique['Sex'] == 'M']
    
    # Extract 3 specific sports
    gym = df_men[df_men['Sport'] == 'Gymnastics']
    bball = df_men[df_men['Sport'] == 'Basketball']
    shotput = df_men[df_men['Event'].str.contains('Shot Put', na=False)]
    
    # Sampling
    n_samples = 300
    def get_samples(d):
        d = d.dropna(subset=['Height', 'Weight'])
        if len(d) > n_samples:
            return d.sample(n=n_samples, random_state=42)
        return d

    gym = get_samples(gym)
    bball = get_samples(bball)
    shotput = get_samples(shotput)
    
    # Combine data
    df_final = pd.concat([gym, bball, shotput])
    
    # ★ Change: Return labels (sport names) as well
    return df_final['Height'].values, df_final['Weight'].values, df_final['Sport'].values

# Load data
x, y, labels = load_specialists_with_labels()

# Generate demo data (if file is missing)
if x is None:
    n = 200
    x1, y1 = np.random.normal(165, 5, n), np.random.normal(60, 5, n)
    x2, y2 = np.random.normal(200, 6, n), np.random.normal(95, 8, n)
    x3, y3 = np.random.normal(185, 6, n), np.random.normal(120, 10, n)
    x = np.concatenate([x1, x2, x3])
    y = np.concatenate([y1, y2, y3])
    # Create dummy labels
    labels = np.array(['Gymnastics']*n + ['Basketball']*n + ['Shot Put']*n)

# --- 2. k-means Logic (No changes) ---
nsteps = 100
def kmeans(x, y, nclusters):
    indices = np.random.choice(len(x), nclusters, replace=False)
    mux = x[indices]
    muy = y[indices]
    for i in range(nsteps):
        X = np.outer(x, np.ones(len(mux)))
        Y = np.outer(y, np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)), mux)
        Muy = np.outer(np.ones(len(x)), muy)
        distances_matrix = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
        mins = np.argmin(distances_matrix, axis=1)
        for k in range(len(mux)):
            index = np.where(mins == k)
            if len(index[0]) > 0:
                mux[k] = np.sum(x[index]) / len(index[0])
                muy[k] = np.sum(y[index]) / len(index[0])
            else:
                rand_idx = np.random.randint(len(x))
                mux[k] = x[rand_idx]
                muy[k] = y[rand_idx]
    return mux, muy

# --- 3. Plotting (Added color-coding) ---
def plot_colored_clusters(x, y, labels, mux, muy):
    xmin, xmax = np.min(x), np.max(x)
    ymin, ymax = np.min(y), np.max(y)
    
    fig, ax = plt.subplots(figsize=(12, 9))
    
    # ★ Change: Plot with different colors for each label
    # Define 3 colors
    unique_labels = np.unique(labels)
    colors = ['blue', 'orange', 'green'] 
    
    for i, label in enumerate(unique_labels):
        # Mask to extract data for a specific sport
        mask = (labels == label)
        # Display adjustment if Shot Put is listed under 'Athletics'
        display_label = "Shot Put" if "Athletics" in label else label
        
        ax.scatter(x[mask], y[mask], label=display_label, alpha=0.6, s=30, c=colors[i % len(colors)])
    
    # Voronoi (Boundaries)
    if len(mux) >= 3:
        try:
            vor = Voronoi(np.stack((mux, muy), axis=1))
            voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False, 
                           line_colors='black', line_width=2, line_alpha=0.8)
        except Exception:
            pass

    # Centroids
    plt.plot(mux, muy, 'rX', markersize=25, markeredgecolor='black', label='Centroids', zorder=10)

    plt.xlabel('Height (cm)', fontsize=12)
    plt.ylabel('Weight (kg)', fontsize=12)
    plt.title("Clustering Result vs Actual Sports", fontsize=15)
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.show()

# --- Execution ---
print("Running clustering and validation...")
mux, muy = kmeans(x, y, 3)
plot_colored_clusters(x, y, labels, mux, muy)

Running clustering and validation...

Week 6: Density Estimation¶

About this week¶

Assignment¶

1.Play with K-mean example¶

2.Try to use k-mean method for my picked up data.¶

Analisys and conclusion¶

Additionaly...¶

Analysis:¶