[Your-Name-Here] - Fab Futures - Data Science
Home About

< Home

Week3-2: Density Estimation¶

I would like to see what kinds of developping countries are good for setting up FabLab using HDI.

Clustering by k-means¶

Clustering for HDI x Lab No.¶

Since I had no idea, I just asked to chat GPT first to try Neil's code with my data.

In [18]:
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry  #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np  #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #df = data flame = read dataset path "path_hdr25" defined above with panda 
df_hdr25.fillna(0, inplace=True) #replace N/A to 0 
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMarge
def alpha2_to_alpha3(code2):
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Marge HDI with Lab(Number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
#Encoding for HDI x Lab Kind
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
#Encoding for HDI x Lab Number list
df_labno_hdi['country_encoded'] = encoded_country
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
In [19]:
#
# k-means parameters
#
npts = 1000
nsteps = 1000
momentum = 0.
    
# 欠損を除外
df_lab_hdi = df_lab_hdi.replace(0, np.nan)

# 必要な列が存在するか確認(念のため)
assert 'hdi_2023' in df_lab_hdi.columns
assert 'lab_count' in df_lab_hdi.columns

# データ抽出
x = df_lab_hdi['hdi_2023'].values
y = df_lab_hdi['lab_count'].values

# 欠損除去
mask = (~np.isnan(x)) & (~np.isnan(y))
x = x[mask]
y = y[mask]

# log変換(FabLab数は偏りが大きいので必須)
y = np.log1p(y)  # log(1 + lab_count)

from sklearn.preprocessing import StandardScaler
xy = np.stack((x, y), axis=1)
xy = StandardScaler().fit_transform(xy)

x = xy[:,0]
y = xy[:,1]

def kmeans(x,y,momentum,nclusters):
    #
    # choose starting points
    #
    indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]
    #
    # do k-means iteration
    #
    for i in range(nsteps):
        #
        # find closest points
        #
        X = np.outer(x,np.ones(len(mux)))
        Y = np.outer(y,np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)),mux)
        Muy = np.outer(np.ones(len(x)),muy)
        distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
        mins = np.argmin(distances,axis=1)
        #
        # update means
        #
        for i in range(len(mux)):
            index = np.where(mins == i)
            if len(index[0]) == 0:
                # 空クラスタ → ランダムに再配置
                j = np.random.randint(0, len(x))
                mux[i] = x[j]
                muy[i] = y[j]
            else:
                mux[i] = np.mean(x[index])
                muy[i] = np.mean(y[index])
    #
    # find distances
    #
    distances = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
    return mux,muy,distances

def plot_kmeans(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    plt.plot(mux,muy,'r.',markersize=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin, ymax)
#    plt.xlim(-3, 3)
#    plt.ylim(-3, 3)
    plt.title(f"{len(mux)} clusters")
    plt.show()

def plot_Voronoi(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    vor = Voronoi(np.stack((mux,muy),axis=1))
    voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin, ymax)
#    plt.xlim(-3, 3)
#    plt.ylim(-3, 3)
    plt.title(f"{len(mux)} clusters")
    plt.show()

distances = np.zeros(5)

mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)

fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 9
      6 momentum = 0.
      8 # 欠損を除外
----> 9 df_lab_hdi = df_lab_hdi.replace(0, np.nan)
     11 # 必要な列が存在するか確認(念のため)
     12 assert 'hdi_2023' in df_lab_hdi.columns

NameError: name 'df_lab_hdi' is not defined

Clustering for HDI x Lab Kind¶

I also tried same with Lab Kind data.

In [ ]:
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#
# k-means parameters
#
npts = 1000
nsteps = 1000
momentum = 0.
    
# 欠損を除外
df_lab_hdi = df_lab_hdi.replace(0, np.nan)

# 必要な列が存在するか確認(念のため)
assert 'hdi_2023' in df_kind_hdi.columns
assert 'kind_count' in df_kind_hdi.columns

# データ抽出
x = df_kind_hdi['hdi_2023'].values
y = df_kind_hdi['kind_count'].values

# 欠損除去
mask = (~np.isnan(x)) & (~np.isnan(y))
x = x[mask]
y = y[mask]

# log変換(FabLab数は偏りが大きいので必須)
y = np.log1p(y)  # log(1 + lab_count)

from sklearn.preprocessing import StandardScaler
xy = np.stack((x, y), axis=1)
xy = StandardScaler().fit_transform(xy)

x = xy[:,0]
y = xy[:,1]

def kmeans(x,y,momentum,nclusters):
    #
    # choose starting points
    #
    indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]
    #
    # do k-means iteration
    #
    for i in range(nsteps):
        #
        # find closest points
        #
        X = np.outer(x,np.ones(len(mux)))
        Y = np.outer(y,np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)),mux)
        Muy = np.outer(np.ones(len(x)),muy)
        distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
        mins = np.argmin(distances,axis=1)
        #
        # update means
        #
        for i in range(len(mux)):
            index = np.where(mins == i)
            if len(index[0]) == 0:
                # 空クラスタ → ランダムに再配置
                j = np.random.randint(0, len(x))
                mux[i] = x[j]
                muy[i] = y[j]
            else:
                mux[i] = np.mean(x[index])
                muy[i] = np.mean(y[index])
    #
    # find distances
    #
    distances = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
    return mux,muy,distances

def plot_kmeans(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    plt.plot(mux,muy,'r.',markersize=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin, ymax)
#    plt.xlim(-3, 3)
#    plt.ylim(-3, 3)
    plt.title(f"{len(mux)} clusters")
    plt.show()

def plot_Voronoi(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    vor = Voronoi(np.stack((mux,muy),axis=1))
    voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin, ymax)
#    plt.xlim(-3, 3)
#    plt.ylim(-3, 3)
    plt.title(f"{len(mux)} clusters")
    plt.show()

distances = np.zeros(5)

mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)

fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()

From Probability class, it seems these educational indicators appear to be correlated with Lab Nos, so I will also try clustering on them.

  • Expected Years of Schooling(eys)
  • Mean Years of Schooling(mys)
  • Population with at least some secondary education(se f/m)

I asked chat GPT to how to imprement this.

In [22]:
features_edu = [
    'eys_2023',
    'mys_2023',
    'se_f_2023',
    'se_m_2023'
]

df_edu = (
    df_labno_hdi[features_edu + ['lab_count']]
    .replace(0, np.nan)
    .dropna(subset=features_edu)
)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(df_edu[features_edu])

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0, n_init=20)
df_edu['edu_cluster'] = kmeans.fit_predict(X)

cluster_profile = (
    df_edu
    .groupby('edu_cluster')[features_edu + ['lab_count']]
    .mean()
)

cluster_profile

import matplotlib.pyplot as plt

#eys × mys
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
    d = df_edu[df_edu['edu_cluster'] == c]
    plt.scatter(d['eys_2023'], d['mys_2023'], label=f'Cluster {c}', alpha=0.7)

plt.xlabel('Expected Years of Schooling (eys)')
plt.ylabel('Mean Years of Schooling (mys)')
plt.title('K-means clustering (eys vs mys)')
plt.legend()
plt.show()

#se_f × se_m
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
    d = df_edu[df_edu['edu_cluster'] == c]
    plt.scatter(d['se_f_2023'], d['se_m_2023'], label=f'Cluster {c}', alpha=0.7)

plt.xlabel('Secondary education (female)')
plt.ylabel('Secondary education (male)')
plt.title('K-means clustering (se_f vs se_m)')
plt.legend()
plt.show()

#eys × se_f
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
    d = df_edu[df_edu['edu_cluster'] == c]
    plt.scatter(d['eys_2023'], d['se_f_2023'], label=f'Cluster {c}', alpha=0.7)

plt.xlabel('Expected Years of Schooling')
plt.ylabel('Secondary education (female)')
plt.title('K-means clustering (eys vs se_f)')
plt.legend()
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

I checked what countries are in each cluster.
I also asked ChatGPT about this.
Cluster 0 tends to have countries with higher level of education, while Cluster 1 tends to have countries with lower level of education.

In [26]:
#各クラスタに属する国の 平均値:edu_clusterごとに国をまとめて、各指標の平均値を計算
cluster_profile = (
    df_edu
    .groupby('edu_cluster')
    .mean()
)

cluster_profile
Out[26]:
eys_2023 mys_2023 se_f_2023 se_m_2023 lab_count
edu_cluster
0 16.395650 12.333613 91.137699 93.359582 28.508475
1 9.963002 4.362760 17.410367 28.770754 3.172414
2 13.708520 9.054664 60.184097 63.767110 19.636364

With ChatGPT's suggestion, I tried to check the no of lab for each cluster

In [23]:
import seaborn as sns

sns.boxplot(
    x='edu_cluster',
    y='lab_count',
    data=df_edu
)
plt.xlabel('Education cluster')
plt.ylabel('Number of FabLabs')
plt.title('FabLab count by education cluster')
plt.show()
No description has been provided for this image