Week4-1: Transforms¶
I would like to use HDI to see which kinds of developing countries are good candidates for setting up a FabLab. In addition to the lecture, and based on Tsuchiya-san's explanation, I decided to use PCA to identify the most important indicators within HDI.
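As a reminder of why PCA helps here: after standardizing the indicators, the loadings (the entries of `pca.components_`) describe how much each original indicator contributes to each principal component, so indicators with large absolute loadings on the first components are the influential ones. A minimal sketch of that idea on synthetic data (not the HDI dataset):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
a = rng.normal(size=100)
# three hypothetical indicators; the first two are strongly correlated
X = np.column_stack([a, a + 0.1 * rng.normal(size=100), rng.normal(size=100)])

pca = PCA().fit(StandardScaler().fit_transform(X))
print(pca.components_[0])              # PC1 loadings: large for the correlated pair
print(pca.explained_variance_ratio_)   # PC1 carries most of the variance
```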
PCA with HDI indicators in 2023¶
In [3]:
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry # install the "pycountry" library
import pandas as pd # import the pandas library
import numpy as np # import NumPy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') # read the dataset at "path_hdr25" (defined above) into a data frame
df_hdr25.fillna(0, inplace=True) # replace NaN with 0
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMerge: convert ISO alpha-2 codes to alpha-3
def alpha2_to_alpha3(code2):
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None

df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo: number of labs per country
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind: number of labs per country and kind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Merge HDI with Lab (number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')
#Merge HDI with Lab (by kind)
df_kind_hdi = pd.merge(df_kindcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#Encoding for the HDI x lab-number list
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_labno_hdi['country_encoded'] = encoded_country
#Encoding for the HDI x lab-kind list
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
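Before relying on the merged frames, it may be worth checking where the join loses data: `alpha2_to_alpha3` returns None for codes pycountry does not know, and the left merge leaves NaN in the HDI columns for countries that have no row in hdr25.csv. A small sanity-check sketch using the variables defined above:

```python
# Labs whose alpha-2 code pycountry could not map (ccd3 is None/NaN)
unmapped = df_lablist.loc[df_lablist['ccd3'].isna(), 'country_code'].unique()
print("unmapped alpha-2 codes:", unmapped)

# Lab countries with no match in hdr25.csv (left merge left NaN in 'iso3')
no_hdi = df_labno_hdi.loc[df_labno_hdi['iso3'].isna(), 'ccd3'].unique()
print("lab countries without HDI data:", no_hdi)
```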
Choose related indicators.¶
In [16]:
cols = df_labno_hdi.columns
cols2023 = []
for item in cols:
    if '2023' in item:
        cols2023.append(item)
#cols2023
#[c for c in cols2023 if 'rank' in c.lower()]
cols2023 = [c for c in cols2023 if 'rank' not in c.lower()]
cols2023 = [c for c in cols2023 if 'hdi' not in c.lower()]
In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
cols_pca = cols2023
df_pca = df_labno_hdi[cols_pca].dropna()
# Standardize (zero mean, unit variance)
scaler = StandardScaler()
Xscale = scaler.fit_transform(df_pca)
pca = PCA()
Xpca = pca.fit_transform(Xscale)
plt.plot(pca.explained_variance_ratio_, 'o-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA explained variance (HDI + lab_count)')
plt.show()
loadings = pd.DataFrame(
    pca.components_.T,
    index=cols_pca,
    columns=[f'PC{i+1}' for i in range(len(cols_pca))]
)
loadings
Out[17]:
| | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC22 | PC23 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| le_2023 | 0.211800 | -0.054955 | 0.006493 | -0.178181 | 0.088710 | 0.125428 | -0.209398 | -0.040073 | -0.050286 | -0.068270 | ... | 0.072414 | -0.054780 | -0.002740 | 0.027417 | 0.011697 | -0.029523 | -0.003097 | -0.021003 | -0.267493 | 0.767548 |
| eys_2023 | 0.199153 | 0.141232 | -0.052108 | -0.127472 | 0.175753 | -0.058357 | -0.092404 | 0.358083 | 0.130728 | 0.086997 | ... | 0.061719 | 0.015184 | 0.001479 | -0.010426 | 0.027925 | 0.710214 | 0.375476 | -0.022960 | 0.097672 | 0.058061 |
| mys_2023 | 0.214473 | 0.006789 | -0.102758 | 0.142446 | -0.003577 | -0.155231 | 0.080404 | 0.056790 | -0.191360 | 0.007087 | ... | 0.031260 | 0.065197 | -0.017072 | 0.019097 | 0.021783 | 0.124301 | -0.411460 | 0.298799 | 0.584668 | 0.215839 |
| gnipc_2023 | 0.191551 | 0.085989 | 0.311239 | -0.156832 | -0.089751 | -0.144176 | 0.042655 | -0.105132 | 0.006519 | -0.151343 | ... | -0.070319 | -0.088483 | -0.130204 | 0.116873 | -0.006921 | -0.346723 | 0.577168 | 0.174528 | 0.356150 | 0.117540 |
| gdi_group_2023 | -0.185174 | -0.121411 | 0.104656 | -0.259199 | 0.012978 | -0.116093 | 0.153079 | 0.263856 | -0.167125 | 0.020977 | ... | 0.002893 | 0.032880 | 0.030471 | 0.005649 | 0.040057 | 0.004752 | -0.007849 | 0.007023 | -0.001145 | 0.000579 |
| gdi_2023 | 0.172972 | 0.101036 | -0.092019 | 0.351664 | 0.045146 | 0.049023 | -0.232165 | -0.299690 | 0.150610 | 0.077034 | ... | -0.090648 | -0.228916 | -0.023554 | -0.253552 | 0.036217 | -0.015029 | -0.010701 | -0.004781 | 0.000700 | 0.003182 |
| le_f_2023 | 0.214184 | -0.057746 | -0.046537 | -0.134580 | 0.082264 | 0.131882 | -0.207145 | -0.081447 | -0.052691 | -0.082047 | ... | 0.381496 | -0.220258 | 0.075606 | 0.459603 | 0.123931 | 0.010116 | 0.002004 | 0.029977 | 0.111404 | -0.399892 |
| eys_f_2023 | 0.204225 | 0.117513 | -0.049402 | -0.094016 | 0.161991 | -0.034451 | -0.121027 | 0.284088 | 0.123326 | 0.117720 | ... | 0.335340 | 0.028528 | 0.150843 | 0.187961 | -0.023761 | -0.373693 | -0.201076 | 0.010092 | -0.066587 | -0.027507 |
| mys_f_2023 | 0.216016 | -0.016660 | -0.085728 | 0.162607 | -0.021453 | -0.114133 | 0.055653 | 0.014378 | -0.173312 | 0.029969 | ... | -0.119614 | 0.431618 | 0.067314 | 0.284097 | -0.429098 | -0.041008 | 0.210853 | -0.130655 | -0.277633 | -0.112213 |
| gni_pc_f_2023 | 0.188344 | 0.166816 | 0.244324 | -0.144821 | -0.087688 | -0.192127 | 0.043147 | -0.139370 | 0.099030 | -0.251464 | ... | 0.328864 | 0.289853 | 0.448030 | -0.359057 | 0.054396 | 0.075061 | -0.158537 | -0.067321 | -0.104746 | -0.037604 |
| le_m_2023 | 0.204494 | -0.054475 | 0.065139 | -0.221224 | 0.095273 | 0.119134 | -0.205319 | 0.002865 | -0.054447 | -0.053656 | ... | -0.235828 | 0.091328 | -0.073948 | -0.421900 | -0.066270 | 0.006675 | -0.006167 | 0.004656 | 0.150428 | -0.377984 |
| eys_m_2023 | 0.187834 | 0.161909 | -0.034279 | -0.162293 | 0.185281 | -0.091832 | -0.055358 | 0.439334 | 0.130307 | 0.055079 | ... | -0.327149 | -0.025556 | -0.116786 | -0.142105 | 0.002612 | -0.346965 | -0.170464 | 0.010135 | -0.035148 | -0.029404 |
| mys_m_2023 | 0.209152 | 0.024037 | -0.105907 | 0.122087 | 0.011932 | -0.202141 | 0.102435 | 0.100470 | -0.215926 | -0.013003 | ... | 0.048938 | -0.345747 | -0.018386 | -0.211191 | 0.372756 | -0.082446 | 0.208608 | -0.175804 | -0.298061 | -0.099397 |
| gni_pc_m_2023 | 0.192575 | 0.055091 | 0.308025 | -0.173328 | -0.085467 | -0.134957 | 0.045311 | -0.084388 | -0.023452 | -0.134691 | ... | -0.249247 | -0.247305 | -0.339802 | 0.279940 | -0.047213 | 0.278223 | -0.413045 | -0.107376 | -0.249491 | -0.080412 |
| coef_ineq_2023 | -0.205367 | 0.193879 | 0.040937 | -0.096187 | 0.155514 | -0.165244 | -0.140803 | -0.053936 | -0.161983 | 0.044801 | ... | 0.307971 | 0.007371 | -0.334281 | -0.117718 | -0.219805 | -0.060613 | -0.028691 | -0.631231 | 0.262905 | 0.064554 |
| loss_2023 | -0.203884 | 0.196734 | 0.034615 | -0.095790 | 0.163762 | -0.168585 | -0.152879 | -0.063007 | -0.176856 | 0.056773 | ... | 0.269453 | 0.041571 | -0.310835 | -0.156941 | -0.139635 | 0.032893 | 0.039637 | 0.640933 | -0.313486 | -0.087280 |
| ineq_le_2023 | -0.213720 | 0.099523 | 0.110893 | 0.055542 | -0.078768 | -0.136643 | 0.158494 | 0.063754 | 0.041715 | 0.001256 | ... | 0.101555 | -0.295551 | 0.161374 | 0.034672 | 0.202189 | 0.002004 | 0.008778 | -0.001407 | -0.007282 | 0.007519 |
| ineq_edu_2023 | -0.202323 | 0.023040 | 0.145253 | -0.214685 | 0.077493 | -0.008584 | -0.104349 | 0.030095 | -0.001373 | 0.065298 | ... | -0.304065 | 0.088321 | 0.323305 | 0.176316 | 0.177154 | 0.008218 | -0.004761 | -0.008746 | 0.022703 | 0.011082 |
| ineq_inc_2023 | -0.151130 | 0.284860 | -0.083913 | -0.011183 | 0.286608 | -0.213959 | -0.334815 | -0.223849 | -0.357488 | 0.095566 | ... | -0.269799 | 0.025173 | 0.300998 | 0.133351 | 0.173642 | 0.019662 | -0.003998 | -0.025387 | 0.033826 | 0.013032 |
| gii_2023 | -0.214163 | -0.058405 | 0.042103 | 0.070494 | 0.030231 | -0.032609 | -0.091483 | 0.104190 | -0.063198 | 0.156086 | ... | 0.078698 | -0.064246 | -0.085838 | 0.069039 | -0.018941 | -0.004886 | -0.014632 | 0.001773 | 0.000314 | -0.003756 |
| mmr_2023 | -0.183510 | 0.135600 | 0.186668 | 0.078934 | -0.079896 | -0.257394 | 0.203237 | 0.219475 | 0.118925 | -0.077244 | ... | -0.002338 | -0.030935 | 0.018976 | -0.009940 | 0.000143 | -0.002311 | -0.008775 | 0.007249 | 0.003992 | 0.000823 |
| abr_2023 | -0.196339 | 0.169508 | 0.070765 | 0.133710 | -0.087780 | -0.022122 | -0.049254 | 0.027478 | -0.089443 | 0.036309 | ... | -0.035396 | 0.058323 | 0.013913 | 0.001315 | -0.013755 | -0.005584 | -0.000556 | 0.004609 | -0.000464 | 0.000634 |
| se_f_2023 | 0.207002 | -0.011452 | -0.091240 | 0.171110 | 0.008456 | -0.173565 | 0.152437 | 0.011241 | -0.208683 | 0.116087 | ... | 0.030055 | 0.405034 | -0.288274 | 0.071368 | 0.528742 | -0.029021 | -0.002613 | -0.045493 | -0.025512 | -0.009393 |
| se_m_2023 | 0.196818 | 0.002286 | -0.106410 | 0.138410 | 0.061998 | -0.266237 | 0.195780 | 0.068459 | -0.259803 | 0.107390 | ... | -0.040579 | -0.372078 | 0.287526 | -0.052573 | -0.440338 | 0.019849 | 0.000717 | 0.040736 | 0.023116 | 0.005792 |
| pr_f_2023 | 0.051320 | 0.477575 | -0.062820 | -0.145855 | -0.121967 | 0.342388 | 0.188762 | -0.028203 | 0.119573 | 0.554718 | ... | 0.000298 | -0.038520 | 0.013267 | 0.002095 | 0.013645 | -0.002242 | 0.000325 | -0.004334 | -0.002242 | -0.003434 |
| pr_m_2023 | -0.050419 | -0.434407 | 0.154754 | 0.205723 | 0.169486 | -0.310497 | -0.326334 | 0.095291 | 0.380100 | 0.287414 | ... | 0.016607 | 0.002973 | 0.014322 | -0.000762 | 0.019518 | 0.001530 | 0.005550 | -0.003564 | 0.000059 | -0.002595 |
| lfpr_f_2023 | 0.052178 | 0.427480 | 0.215121 | 0.440462 | 0.072315 | -0.023562 | -0.051331 | 0.040155 | 0.303265 | -0.145436 | ... | -0.017974 | 0.092558 | -0.055638 | 0.179293 | -0.035640 | 0.018610 | -0.006549 | 0.012702 | -0.001853 | -0.005622 |
| lfpr_m_2023 | -0.016457 | -0.004394 | 0.411251 | 0.329300 | 0.238040 | 0.488852 | -0.075762 | 0.329465 | -0.361805 | -0.203493 | ... | 0.027442 | -0.030683 | 0.054720 | -0.064918 | 0.021354 | -0.000239 | -0.000577 | -0.005976 | -0.004564 | 0.001873 |
| co2_prod_2023 | 0.126752 | -0.205013 | 0.396698 | 0.061701 | -0.075351 | 0.091620 | 0.118572 | -0.044025 | -0.181995 | 0.526929 | ... | 0.112939 | 0.044967 | 0.051364 | -0.072707 | -0.018676 | 0.014877 | -0.013233 | -0.007028 | -0.004834 | -0.000051 |
| mf_2023 | 0.153236 | 0.023010 | 0.424826 | -0.066648 | -0.028695 | -0.128031 | -0.086131 | -0.230458 | -0.023354 | 0.207786 | ... | -0.044999 | -0.012890 | 0.002123 | 0.014181 | 0.012104 | 0.003351 | -0.010162 | -0.000140 | -0.004417 | 0.001626 |
| pop_total_2023 | -0.018477 | -0.077157 | 0.038714 | -0.046263 | 0.763339 | 0.053504 | 0.514550 | -0.278595 | 0.169128 | -0.022413 | ... | -0.006534 | 0.011732 | -0.016792 | 0.017295 | 0.005762 | -0.003451 | 0.001148 | 0.000565 | -0.000762 | 0.000693 |
31 rows × 31 columns
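The full 31 × 31 loadings table is hard to scan; a heatmap of the first few components makes the sign and magnitude of each loading visible at a glance. A sketch reusing the `loadings` frame from above:

```python
import matplotlib.pyplot as plt

sub = loadings[[f'PC{i+1}' for i in range(5)]]   # first five components
fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(sub.values, cmap='coolwarm', vmin=-0.5, vmax=0.5, aspect='auto')
ax.set_xticks(range(sub.shape[1]))
ax.set_xticklabels(sub.columns)
ax.set_yticks(range(sub.shape[0]))
ax.set_yticklabels(sub.index)
fig.colorbar(im, ax=ax, label='loading')
ax.set_title('Loadings of PC1-PC5')
plt.show()
```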
In [18]:
# PC1 loadings, sorted by absolute contribution
loadings['PC1'].abs().sort_values(ascending=False)
Out[18]:
mys_f_2023        0.216016
mys_2023          0.214473
le_f_2023         0.214184
gii_2023          0.214163
ineq_le_2023      0.213720
le_2023           0.211800
mys_m_2023        0.209152
se_f_2023         0.207002
coef_ineq_2023    0.205367
le_m_2023         0.204494
eys_f_2023        0.204225
loss_2023         0.203884
ineq_edu_2023     0.202323
eys_2023          0.199153
se_m_2023         0.196818
abr_2023          0.196339
gni_pc_m_2023     0.192575
gnipc_2023        0.191551
gni_pc_f_2023     0.188344
eys_m_2023        0.187834
gdi_group_2023    0.185174
mmr_2023          0.183510
gdi_2023          0.172972
mf_2023           0.153236
ineq_inc_2023     0.151130
co2_prod_2023     0.126752
lfpr_f_2023       0.052178
pr_f_2023         0.051320
pr_m_2023         0.050419
pop_total_2023    0.018477
lfpr_m_2023       0.016457
Name: PC1, dtype: float64
In [19]:
loadings['PC2'].abs().sort_values(ascending=False)
Out[19]:
pr_f_2023         0.477575
pr_m_2023         0.434407
lfpr_f_2023       0.427480
ineq_inc_2023     0.284860
co2_prod_2023     0.205013
loss_2023         0.196734
coef_ineq_2023    0.193879
abr_2023          0.169508
gni_pc_f_2023     0.166816
eys_m_2023        0.161909
eys_2023          0.141232
mmr_2023          0.135600
gdi_group_2023    0.121411
eys_f_2023        0.117513
gdi_2023          0.101036
ineq_le_2023      0.099523
gnipc_2023        0.085989
pop_total_2023    0.077157
gii_2023          0.058405
le_f_2023         0.057746
gni_pc_m_2023     0.055091
le_2023           0.054955
le_m_2023         0.054475
mys_m_2023        0.024037
ineq_edu_2023     0.023040
mf_2023           0.023010
mys_f_2023        0.016660
se_f_2023         0.011452
mys_2023          0.006789
lfpr_m_2023       0.004394
se_m_2023         0.002286
Name: PC2, dtype: float64
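To decide how many components are actually worth interpreting, the cumulative explained variance is a useful complement to the scree plot. A short sketch reusing the fitted `pca` object from above:

```python
import numpy as np

cum = np.cumsum(pca.explained_variance_ratio_)
n90 = int(np.searchsorted(cum, 0.90)) + 1   # first count reaching ~90%
print(f"{n90} components explain {cum[n90 - 1]:.1%} of the variance")
```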
Below is test code written with ChatGPT.
In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
np.set_printoptions(precision=1)
#
# Select the columns to use for PCA
#
# Convert to tidy (long) format
value_vars = [c for c in df_hdr25.columns if c[-4:].isdigit()]
df_long = df_hdr25.melt(
    id_vars=["country"],
    value_vars=value_vars,
    var_name="variable",
    value_name="value"
)
df_long["year"] = df_long["variable"].str[-4:].astype(int)
df_long["indicator"] = df_long["variable"].str[:-5]
# Pivot into a country × year × indicator matrix
df_wide = df_long.pivot_table(
    index=["country", "year"],
    columns="indicator",
    values="value"
).dropna()
# PCA (quick pass with scikit-learn's scaler; the standardization is redone by hand below)
X = df_wide.values
X_scaled = StandardScaler().fit_transform(X)
Xpca = PCA(n_components=10).fit_transform(X_scaled)
#
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
#
print(f"data mean: {np.mean(X):.2f}, variance: {np.var(X):.2f}")
X = X-np.mean(X,axis=0)
std = np.std(X,axis=0)
Xscale = X/np.where(std > 0,std,1)
print(f"standardized data mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")
#
# do 5 component PCA
#
pca = PCA(n_components=5) # PCA was imported directly above, so no sklearn. prefix is needed
pca.fit(Xscale)
Xpca = pca.transform(Xscale)
plt.plot(pca.explained_variance_, 'o')
plt.xlabel('PCA component')
plt.ylabel('explained variance')
plt.title('HDI Lab PCA')
plt.show()
#
# plot vs first two PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1], s=3)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("HDI Lab PCA (PC1 vs PC2)")
plt.show()
data mean: 1783.14, variance: 88982502.68
standardized data mean: 0.00, variance: 1.00
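To see which countries drive the spread along PC1, the scatter can be annotated with country names taken from the `df_wide` index (its rows are (country, year) pairs, in the same order as the rows of `Xpca`). A sketch that labels only the ten most extreme PC1 scores:

```python
# Row order of Xpca matches df_wide, so index level 'country' labels each point
countries = df_wide.index.get_level_values('country')
plt.scatter(Xpca[:, 0], Xpca[:, 1], s=3)
for i in np.argsort(np.abs(Xpca[:, 0]))[-10:]:   # ten most extreme PC1 scores
    plt.annotate(countries[i], (Xpca[i, 0], Xpca[i, 1]), fontsize=7)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("HDI Lab PCA with country labels")
plt.show()
```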