< Home
Week3-2: Density Estimation¶
I would like to see what kinds of developping countries are good for setting up FabLab using HDI.
In [18]:
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #df = data flame = read dataset path "path_hdr25" defined above with panda
df_hdr25.fillna(0, inplace=True) #replace N/A to 0
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMarge
def alpha2_to_alpha3(code2):
country = pycountry.countries.get(alpha_2=code2.upper())
return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Marge HDI with Lab(Number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
#Encoding for HDI x Lab Kind
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
#Encoding for HDI x Lab Number list
df_labno_hdi['country_encoded'] = encoded_country
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
In [19]:
#
# k-means parameters
#
npts = 1000
nsteps = 1000
momentum = 0.
# 欠損を除外
df_lab_hdi = df_lab_hdi.replace(0, np.nan)
# 必要な列が存在するか確認(念のため)
assert 'hdi_2023' in df_lab_hdi.columns
assert 'lab_count' in df_lab_hdi.columns
# データ抽出
x = df_lab_hdi['hdi_2023'].values
y = df_lab_hdi['lab_count'].values
# 欠損除去
mask = (~np.isnan(x)) & (~np.isnan(y))
x = x[mask]
y = y[mask]
# log変換(FabLab数は偏りが大きいので必須)
y = np.log1p(y) # log(1 + lab_count)
from sklearn.preprocessing import StandardScaler
xy = np.stack((x, y), axis=1)
xy = StandardScaler().fit_transform(xy)
x = xy[:,0]
y = xy[:,1]
def kmeans(x,y,momentum,nclusters):
#
# choose starting points
#
indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
#
X = np.outer(x,np.ones(len(mux)))
Y = np.outer(y,np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)),mux)
Muy = np.outer(np.ones(len(x)),muy)
distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
mins = np.argmin(distances,axis=1)
#
# update means
#
for i in range(len(mux)):
index = np.where(mins == i)
if len(index[0]) == 0:
# 空クラスタ → ランダムに再配置
j = np.random.randint(0, len(x))
mux[i] = x[j]
muy[i] = y[j]
else:
mux[i] = np.mean(x[index])
muy[i] = np.mean(y[index])
#
# find distances
#
distances = 0
for i in range(len(mux)):
index = np.where(mins == i)
distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
return mux,muy,distances
def plot_kmeans(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
plt.plot(mux,muy,'r.',markersize=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin, ymax)
# plt.xlim(-3, 3)
# plt.ylim(-3, 3)
plt.title(f"{len(mux)} clusters")
plt.show()
def plot_Voronoi(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin, ymax)
# plt.xlim(-3, 3)
# plt.ylim(-3, 3)
plt.title(f"{len(mux)} clusters")
plt.show()
distances = np.zeros(5)
mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)
fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[19], line 9 6 momentum = 0. 8 # 欠損を除外 ----> 9 df_lab_hdi = df_lab_hdi.replace(0, np.nan) 11 # 必要な列が存在するか確認(念のため) 12 assert 'hdi_2023' in df_lab_hdi.columns NameError: name 'df_lab_hdi' is not defined
Clustering for HDI x Lab Kind¶
I also tried same with Lab Kind data.
In [ ]:
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#
# k-means parameters
#
npts = 1000
nsteps = 1000
momentum = 0.
# 欠損を除外
df_lab_hdi = df_lab_hdi.replace(0, np.nan)
# 必要な列が存在するか確認(念のため)
assert 'hdi_2023' in df_kind_hdi.columns
assert 'kind_count' in df_kind_hdi.columns
# データ抽出
x = df_kind_hdi['hdi_2023'].values
y = df_kind_hdi['kind_count'].values
# 欠損除去
mask = (~np.isnan(x)) & (~np.isnan(y))
x = x[mask]
y = y[mask]
# log変換(FabLab数は偏りが大きいので必須)
y = np.log1p(y) # log(1 + lab_count)
from sklearn.preprocessing import StandardScaler
xy = np.stack((x, y), axis=1)
xy = StandardScaler().fit_transform(xy)
x = xy[:,0]
y = xy[:,1]
def kmeans(x,y,momentum,nclusters):
#
# choose starting points
#
indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
#
X = np.outer(x,np.ones(len(mux)))
Y = np.outer(y,np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)),mux)
Muy = np.outer(np.ones(len(x)),muy)
distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
mins = np.argmin(distances,axis=1)
#
# update means
#
for i in range(len(mux)):
index = np.where(mins == i)
if len(index[0]) == 0:
# 空クラスタ → ランダムに再配置
j = np.random.randint(0, len(x))
mux[i] = x[j]
muy[i] = y[j]
else:
mux[i] = np.mean(x[index])
muy[i] = np.mean(y[index])
#
# find distances
#
distances = 0
for i in range(len(mux)):
index = np.where(mins == i)
distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
return mux,muy,distances
def plot_kmeans(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
plt.plot(mux,muy,'r.',markersize=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin, ymax)
# plt.xlim(-3, 3)
# plt.ylim(-3, 3)
plt.title(f"{len(mux)} clusters")
plt.show()
def plot_Voronoi(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin, ymax)
# plt.xlim(-3, 3)
# plt.ylim(-3, 3)
plt.title(f"{len(mux)} clusters")
plt.show()
distances = np.zeros(5)
mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)
fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
From Probability class, it seems these educational indicators appear to be correlated with Lab Nos, so I will also try clustering on them.
- Expected Years of Schooling(eys)
- Mean Years of Schooling(mys)
- Population with at least some secondary education(se f/m)
I asked chat GPT to how to imprement this.
In [22]:
features_edu = [
'eys_2023',
'mys_2023',
'se_f_2023',
'se_m_2023'
]
df_edu = (
df_labno_hdi[features_edu + ['lab_count']]
.replace(0, np.nan)
.dropna(subset=features_edu)
)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(df_edu[features_edu])
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0, n_init=20)
df_edu['edu_cluster'] = kmeans.fit_predict(X)
cluster_profile = (
df_edu
.groupby('edu_cluster')[features_edu + ['lab_count']]
.mean()
)
cluster_profile
import matplotlib.pyplot as plt
#eys × mys
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
d = df_edu[df_edu['edu_cluster'] == c]
plt.scatter(d['eys_2023'], d['mys_2023'], label=f'Cluster {c}', alpha=0.7)
plt.xlabel('Expected Years of Schooling (eys)')
plt.ylabel('Mean Years of Schooling (mys)')
plt.title('K-means clustering (eys vs mys)')
plt.legend()
plt.show()
#se_f × se_m
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
d = df_edu[df_edu['edu_cluster'] == c]
plt.scatter(d['se_f_2023'], d['se_m_2023'], label=f'Cluster {c}', alpha=0.7)
plt.xlabel('Secondary education (female)')
plt.ylabel('Secondary education (male)')
plt.title('K-means clustering (se_f vs se_m)')
plt.legend()
plt.show()
#eys × se_f
plt.figure(figsize=(5,5))
for c in sorted(df_edu['edu_cluster'].unique()):
d = df_edu[df_edu['edu_cluster'] == c]
plt.scatter(d['eys_2023'], d['se_f_2023'], label=f'Cluster {c}', alpha=0.7)
plt.xlabel('Expected Years of Schooling')
plt.ylabel('Secondary education (female)')
plt.title('K-means clustering (eys vs se_f)')
plt.legend()
plt.show()
I checked what countries are in each cluster.
I also asked ChatGPT about this.
Cluster 0 tends to have countries with higher level of education, while Cluster 1 tends to have countries with lower level of education.
In [26]:
#各クラスタに属する国の 平均値:edu_clusterごとに国をまとめて、各指標の平均値を計算
cluster_profile = (
df_edu
.groupby('edu_cluster')
.mean()
)
cluster_profile
Out[26]:
| eys_2023 | mys_2023 | se_f_2023 | se_m_2023 | lab_count | |
|---|---|---|---|---|---|
| edu_cluster | |||||
| 0 | 16.395650 | 12.333613 | 91.137699 | 93.359582 | 28.508475 |
| 1 | 9.963002 | 4.362760 | 17.410367 | 28.770754 | 3.172414 |
| 2 | 13.708520 | 9.054664 | 60.184097 | 63.767110 | 19.636364 |
With ChatGPT's suggestion, I tried to check the no of lab for each cluster
In [23]:
import seaborn as sns
sns.boxplot(
x='edu_cluster',
y='lab_count',
data=df_edu
)
plt.xlabel('Education cluster')
plt.ylabel('Number of FabLabs')
plt.title('FabLab count by education cluster')
plt.show()