Presentation
Using the Human Development Index (HDI), I would like to see which kinds of developing countries are good candidates for setting up a FabLab.
First, read the data.
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry  # install the "pycountry" library
import pandas as pd               # pandas
import numpy as np                # NumPy
import pycountry                  # ISO country-code lookups
import requests
import json
import matplotlib.pyplot as plt

# HDI: read the dataset at "path_hdr25" (defined above) into a data frame
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore')
df_hdr25.fillna(0, inplace=True)  # replace NaN with 0

# Lab list: fetch all registered labs from the fablabs.io API
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)

# Country-code merge: convert ISO alpha-2 codes to alpha-3
def alpha2_to_alpha3(code2):
    if not isinstance(code2, str):  # guard against missing country codes
        return None
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None

df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)

# Count labs per country
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())

# Count labs per country and kind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())

# Merge HDI with labs (by number)
df_labno_hdi = pd.merge(df_labcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')

# Merge HDI with labs (by kind)
df_kind_hdi = pd.merge(df_kindcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')

# Encoding
from sklearn.preprocessing import LabelEncoder

# Encoding for the HDI x lab-number list
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_labno_hdi['country_encoded'] = encoded_country

# Encoding for the HDI x lab-kind list
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
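As a quick aside (my addition, not in the original notebook), it may be worth checking how many labs failed to map to an alpha-3 code, and how many lab countries found no matching HDI row:

# Sanity-check sketch (added): count labs whose country code could not be
# converted, and lab countries with no matching row in the HDI table
print("labs without an ISO3 code:", df_lablist['ccd3'].isna().sum())
print("lab countries without HDI data:", df_labno_hdi['iso3'].isna().sum())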
Select Data for analysis
Below is the list of the indicators in HDI.
| Full name | Short name | Time series |
| ------------- | ------------- | ----- |
| ISO3 | iso3 | - |
| HDR Country Name | country | - |
| Human Development Groups | hdicode | - |
| UNDP Developing Regions | region | - |
| HDI | | |
| HDI Rank | hdi_rank | 2023 |
| Human Development Index (value) | hdi | 1990-2023 |
| Life Expectancy at Birth (years) | le | 1990-2023 |
| Expected Years of Schooling (years) | eys | 1990-2023 |
| Mean Years of Schooling (years) | mys | 1990-2023 |
| Gross National Income Per Capita (2021 PPP$) | gnipc | 1990-2023 |
| GDI | | |
| GDI Group | gdi_group | 2023 |
| Gender Development Index (value) | gdi | 1990-2023 |
| HDI female (value) | hdi_f | 1990-2023 |
| Life Expectancy at Birth, female (years) | le_f | 1990-2023 |
| Expected Years of Schooling, female (years) | eys_f | 1990-2023 |
| Mean Years of Schooling, female (years) | mys_f | 1990-2023 |
| Gross National Income Per Capita, female (2021 PPP$) | gni_pc_f | 1990-2023 |
| HDI male (value) | hdi_m | 1990-2023 |
| Life Expectancy at Birth, male (years) | le_m | 1990-2023 |
| Expected Years of Schooling, male (years) | eys_m | 1990-2023 |
| Mean Years of Schooling, male (years) | mys_m | 1990-2023 |
| Gross National Income Per Capita, male (2021 PPP$) | gni_pc_m | 1990-2023 |
| IHDI | | |
| Inequality-adjusted Human Development Index (value) | ihdi | 2010-2023 |
| Coefficient of human inequality | coef_ineq | 2010-2023 |
| Overall loss (%) | loss | 2010-2023 |
| Inequality in life expectancy | ineq_le | 2010-2023 |
| Inequality in education | ineq_edu | 2010-2023 |
| Inequality in income | ineq_inc | 2010-2021, 2022-2023=2022 |
| GII | | |
| GII Rank | gii_rank | 2023 |
| Gender Inequality Index (value) | gii | 1990-2023 |
| Maternal Mortality Ratio (deaths per 100,000 live births) | mmr | 1990-2019, 2020-2023=2020 |
| Adolescent Birth Rate (births per 1,000 women ages 15-19) | abr | 1990-2023 |
| Population with at least some secondary education, female (% ages 25 and older) | se_f | 1990-2023 |
| Population with at least some secondary education, male (% ages 25 and older) | se_m | 1990-2023 |
| Share of seats in parliament, female (% held by women) | pr_f | 1990-2023 |
| Share of seats in parliament, male (% held by men) | pr_m | 1990-2023 |
| Labour force participation rate, female (% ages 15 and older) | lfpr_f | 1990-2023 |
| Labour force participation rate, male (% ages 15 and older) | lfpr_m | 1990-2023 |
| PHDI | | |
| Difference from HDI rank | rankdiff_hdi_phdi | 2023 |
| Planetary pressures–adjusted Human Development Index (value) | phdi | 1990-2023 |
| Difference from HDI value (%) | diff_hdi_phdi | 1990-2023 |
| Carbon dioxide emissions per capita (production) (tonnes) | co2_prod | 1990-2023 |
| Material footprint per capita (tonnes) | mf | 1990-2023 |
| Additional Indicator | | |
| Population, total (millions) | pop_total | 1990-2023 |
cols = df_labno_hdi.columns
cols2023 = []
for item in cols:
    if '2023' in item:
        cols2023.append(item)
#cols2023
#[c for c in cols2023 if 'rank' in c.lower()]
# drop the rank columns and the composite HDI columns, keeping only raw indicators
cols2023 = [c for c in cols2023 if 'rank' not in c.lower()]
cols2023 = [c for c in cols2023 if 'hdi' not in c.lower()]
cols2023
['le_2023', 'eys_2023', 'mys_2023', 'gnipc_2023', 'gdi_group_2023', 'gdi_2023', 'le_f_2023', 'eys_f_2023', 'mys_f_2023', 'gni_pc_f_2023', 'le_m_2023', 'eys_m_2023', 'mys_m_2023', 'gni_pc_m_2023', 'coef_ineq_2023', 'loss_2023', 'ineq_le_2023', 'ineq_edu_2023', 'ineq_inc_2023', 'gii_2023', 'mmr_2023', 'abr_2023', 'se_f_2023', 'se_m_2023', 'pr_f_2023', 'pr_m_2023', 'lfpr_f_2023', 'lfpr_m_2023', 'co2_prod_2023', 'mf_2023', 'pop_total_2023']
Spearman
First, I would like to know the relationship between the number of FabLabs and the HDI indicators.
From the first heatmap (p-values), it seems:
- At least "Share of seats in parliament" (pr_f / pr_m) and "Labour force participation rate" (lfpr_f / lfpr_m) do not matter much for the number of FabLabs
From the second heatmap (coefficients), it seems:
- All of the basic HDI indicators appear to be roughly equally correlated with the number of FabLabs
- The inequality-related indicators appear to be roughly equally negatively correlated with the number of FabLabs
- Gender differences don't seem to have much of an impact
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns

cols = ['lab_count'] + cols2023
df_fil = df_labno_hdi[cols].dropna()

# Pairwise Spearman correlations and p-values (symmetric, so fill both halves)
cormatrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
p_val_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
for colx in cols:
    for coly in cols:
        if colx == coly:
            cormatrix.loc[colx, coly] = 1.0
            p_val_matrix.loc[colx, coly] = 1.0
        elif pd.isna(cormatrix.loc[colx, coly]):
            corr_coef, p_value = spearmanr(df_fil[colx], df_fil[coly])
            cormatrix.loc[colx, coly] = corr_coef
            cormatrix.loc[coly, colx] = corr_coef
            p_val_matrix.loc[colx, coly] = p_value
            p_val_matrix.loc[coly, colx] = p_value

plt.figure(figsize=(10, 8))
sns.heatmap(p_val_matrix, annot=True, annot_kws={"size": 6}, cmap="binary_r", vmin=0.0, vmax=0.05)
plt.title("p-values of HDI indicators correlated with the number of labs")
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 8))
sns.heatmap(cormatrix, annot=True, annot_kws={"size": 6}, cmap="coolwarm", vmin=-1.0, vmax=1.0)
plt.title("Spearman coefficients of HDI indicators correlated with the number of labs")
plt.tight_layout()
plt.show()
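Rather than eyeballing the heatmaps, the lab_count row can also be pulled out directly; this small cell is my addition, reusing the matrices computed above:

# Added sketch: indicators significantly correlated with lab_count (p < 0.05),
# sorted by the absolute Spearman coefficient. The diagonal p-value was set
# to 1.0 above, so lab_count itself drops out automatically.
sig = p_val_matrix['lab_count'] < 0.05
cormatrix.loc[sig, 'lab_count'].sort_values(key=lambda s: s.abs(), ascending=False)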
PCA
I would like to know which indicators are especially important within HDI.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

cols_pca = cols2023
df_pca = df_labno_hdi[cols_pca].dropna()

# Standardize
scaler = StandardScaler()
Xscale = scaler.fit_transform(df_pca)

pca = PCA()
Xpca = pca.fit_transform(Xscale)

plt.plot(pca.explained_variance_ratio_, 'o-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA explained variance (HDI indicators, 2023)')
plt.show()

loadings = pd.DataFrame(
    pca.components_.T,
    index=cols_pca,
    columns=[f'PC{i+1}' for i in range(len(cols_pca))]
)
loadings
| | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC22 | PC23 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| le_2023 | 0.211800 | -0.054955 | 0.006493 | -0.178181 | 0.088710 | 0.125428 | -0.209398 | -0.040073 | -0.050286 | -0.068270 | ... | 0.072414 | -0.054780 | -0.002740 | 0.027417 | 0.011697 | -0.029523 | -0.003097 | -0.021003 | -0.267493 | 0.767548 |
| eys_2023 | 0.199153 | 0.141232 | -0.052108 | -0.127472 | 0.175753 | -0.058357 | -0.092404 | 0.358083 | 0.130728 | 0.086997 | ... | 0.061719 | 0.015184 | 0.001479 | -0.010426 | 0.027925 | 0.710214 | 0.375476 | -0.022960 | 0.097672 | 0.058061 |
| mys_2023 | 0.214473 | 0.006789 | -0.102758 | 0.142446 | -0.003577 | -0.155231 | 0.080404 | 0.056790 | -0.191360 | 0.007087 | ... | 0.031260 | 0.065197 | -0.017072 | 0.019097 | 0.021783 | 0.124301 | -0.411460 | 0.298799 | 0.584668 | 0.215839 |
| gnipc_2023 | 0.191551 | 0.085989 | 0.311239 | -0.156832 | -0.089751 | -0.144176 | 0.042655 | -0.105132 | 0.006519 | -0.151343 | ... | -0.070319 | -0.088483 | -0.130204 | 0.116873 | -0.006921 | -0.346723 | 0.577168 | 0.174528 | 0.356150 | 0.117540 |
| gdi_group_2023 | -0.185174 | -0.121411 | 0.104656 | -0.259199 | 0.012978 | -0.116093 | 0.153079 | 0.263856 | -0.167125 | 0.020977 | ... | 0.002893 | 0.032880 | 0.030471 | 0.005649 | 0.040057 | 0.004752 | -0.007849 | 0.007023 | -0.001145 | 0.000579 |
| gdi_2023 | 0.172972 | 0.101036 | -0.092019 | 0.351664 | 0.045146 | 0.049023 | -0.232165 | -0.299690 | 0.150610 | 0.077034 | ... | -0.090648 | -0.228916 | -0.023554 | -0.253552 | 0.036217 | -0.015029 | -0.010701 | -0.004781 | 0.000700 | 0.003182 |
| le_f_2023 | 0.214184 | -0.057746 | -0.046537 | -0.134580 | 0.082264 | 0.131882 | -0.207145 | -0.081447 | -0.052691 | -0.082047 | ... | 0.381496 | -0.220258 | 0.075606 | 0.459603 | 0.123931 | 0.010116 | 0.002004 | 0.029977 | 0.111404 | -0.399892 |
| eys_f_2023 | 0.204225 | 0.117513 | -0.049402 | -0.094016 | 0.161991 | -0.034451 | -0.121027 | 0.284088 | 0.123326 | 0.117720 | ... | 0.335340 | 0.028528 | 0.150843 | 0.187961 | -0.023761 | -0.373693 | -0.201076 | 0.010092 | -0.066587 | -0.027507 |
| mys_f_2023 | 0.216016 | -0.016660 | -0.085728 | 0.162607 | -0.021453 | -0.114133 | 0.055653 | 0.014378 | -0.173312 | 0.029969 | ... | -0.119614 | 0.431618 | 0.067314 | 0.284097 | -0.429098 | -0.041008 | 0.210853 | -0.130655 | -0.277633 | -0.112213 |
| gni_pc_f_2023 | 0.188344 | 0.166816 | 0.244324 | -0.144821 | -0.087688 | -0.192127 | 0.043147 | -0.139370 | 0.099030 | -0.251464 | ... | 0.328864 | 0.289853 | 0.448030 | -0.359057 | 0.054396 | 0.075061 | -0.158537 | -0.067321 | -0.104746 | -0.037604 |
| le_m_2023 | 0.204494 | -0.054475 | 0.065139 | -0.221224 | 0.095273 | 0.119134 | -0.205319 | 0.002865 | -0.054447 | -0.053656 | ... | -0.235828 | 0.091328 | -0.073948 | -0.421900 | -0.066270 | 0.006675 | -0.006167 | 0.004656 | 0.150428 | -0.377984 |
| eys_m_2023 | 0.187834 | 0.161909 | -0.034279 | -0.162293 | 0.185281 | -0.091832 | -0.055358 | 0.439334 | 0.130307 | 0.055079 | ... | -0.327149 | -0.025556 | -0.116786 | -0.142105 | 0.002612 | -0.346965 | -0.170464 | 0.010135 | -0.035148 | -0.029404 |
| mys_m_2023 | 0.209152 | 0.024037 | -0.105907 | 0.122087 | 0.011932 | -0.202141 | 0.102435 | 0.100470 | -0.215926 | -0.013003 | ... | 0.048938 | -0.345747 | -0.018386 | -0.211191 | 0.372756 | -0.082446 | 0.208608 | -0.175804 | -0.298061 | -0.099397 |
| gni_pc_m_2023 | 0.192575 | 0.055091 | 0.308025 | -0.173328 | -0.085467 | -0.134957 | 0.045311 | -0.084388 | -0.023452 | -0.134691 | ... | -0.249247 | -0.247305 | -0.339802 | 0.279940 | -0.047213 | 0.278223 | -0.413045 | -0.107376 | -0.249491 | -0.080412 |
| coef_ineq_2023 | -0.205367 | 0.193879 | 0.040937 | -0.096187 | 0.155514 | -0.165244 | -0.140803 | -0.053936 | -0.161983 | 0.044801 | ... | 0.307971 | 0.007371 | -0.334281 | -0.117718 | -0.219805 | -0.060613 | -0.028691 | -0.631231 | 0.262905 | 0.064554 |
| loss_2023 | -0.203884 | 0.196734 | 0.034615 | -0.095790 | 0.163762 | -0.168585 | -0.152879 | -0.063007 | -0.176856 | 0.056773 | ... | 0.269453 | 0.041571 | -0.310835 | -0.156941 | -0.139635 | 0.032893 | 0.039637 | 0.640933 | -0.313486 | -0.087280 |
| ineq_le_2023 | -0.213720 | 0.099523 | 0.110893 | 0.055542 | -0.078768 | -0.136643 | 0.158494 | 0.063754 | 0.041715 | 0.001256 | ... | 0.101555 | -0.295551 | 0.161374 | 0.034672 | 0.202189 | 0.002004 | 0.008778 | -0.001407 | -0.007282 | 0.007519 |
| ineq_edu_2023 | -0.202323 | 0.023040 | 0.145253 | -0.214685 | 0.077493 | -0.008584 | -0.104349 | 0.030095 | -0.001373 | 0.065298 | ... | -0.304065 | 0.088321 | 0.323305 | 0.176316 | 0.177154 | 0.008218 | -0.004761 | -0.008746 | 0.022703 | 0.011082 |
| ineq_inc_2023 | -0.151130 | 0.284860 | -0.083913 | -0.011183 | 0.286608 | -0.213959 | -0.334815 | -0.223849 | -0.357488 | 0.095566 | ... | -0.269799 | 0.025173 | 0.300998 | 0.133351 | 0.173642 | 0.019662 | -0.003998 | -0.025387 | 0.033826 | 0.013032 |
| gii_2023 | -0.214163 | -0.058405 | 0.042103 | 0.070494 | 0.030231 | -0.032609 | -0.091483 | 0.104190 | -0.063198 | 0.156086 | ... | 0.078698 | -0.064246 | -0.085838 | 0.069039 | -0.018941 | -0.004886 | -0.014632 | 0.001773 | 0.000314 | -0.003756 |
| mmr_2023 | -0.183510 | 0.135600 | 0.186668 | 0.078934 | -0.079896 | -0.257394 | 0.203237 | 0.219475 | 0.118925 | -0.077244 | ... | -0.002338 | -0.030935 | 0.018976 | -0.009940 | 0.000143 | -0.002311 | -0.008775 | 0.007249 | 0.003992 | 0.000823 |
| abr_2023 | -0.196339 | 0.169508 | 0.070765 | 0.133710 | -0.087780 | -0.022122 | -0.049254 | 0.027478 | -0.089443 | 0.036309 | ... | -0.035396 | 0.058323 | 0.013913 | 0.001315 | -0.013755 | -0.005584 | -0.000556 | 0.004609 | -0.000464 | 0.000634 |
| se_f_2023 | 0.207002 | -0.011452 | -0.091240 | 0.171110 | 0.008456 | -0.173565 | 0.152437 | 0.011241 | -0.208683 | 0.116087 | ... | 0.030055 | 0.405034 | -0.288274 | 0.071368 | 0.528742 | -0.029021 | -0.002613 | -0.045493 | -0.025512 | -0.009393 |
| se_m_2023 | 0.196818 | 0.002286 | -0.106410 | 0.138410 | 0.061998 | -0.266237 | 0.195780 | 0.068459 | -0.259803 | 0.107390 | ... | -0.040579 | -0.372078 | 0.287526 | -0.052573 | -0.440338 | 0.019849 | 0.000717 | 0.040736 | 0.023116 | 0.005792 |
| pr_f_2023 | 0.051320 | 0.477575 | -0.062820 | -0.145855 | -0.121967 | 0.342388 | 0.188762 | -0.028203 | 0.119573 | 0.554718 | ... | 0.000298 | -0.038520 | 0.013267 | 0.002095 | 0.013645 | -0.002242 | 0.000325 | -0.004334 | -0.002242 | -0.003434 |
| pr_m_2023 | -0.050419 | -0.434407 | 0.154754 | 0.205723 | 0.169486 | -0.310497 | -0.326334 | 0.095291 | 0.380100 | 0.287414 | ... | 0.016607 | 0.002973 | 0.014322 | -0.000762 | 0.019518 | 0.001530 | 0.005550 | -0.003564 | 0.000059 | -0.002595 |
| lfpr_f_2023 | 0.052178 | 0.427480 | 0.215121 | 0.440462 | 0.072315 | -0.023562 | -0.051331 | 0.040155 | 0.303265 | -0.145436 | ... | -0.017974 | 0.092558 | -0.055638 | 0.179293 | -0.035640 | 0.018610 | -0.006549 | 0.012702 | -0.001853 | -0.005622 |
| lfpr_m_2023 | -0.016457 | -0.004394 | 0.411251 | 0.329300 | 0.238040 | 0.488852 | -0.075762 | 0.329465 | -0.361805 | -0.203493 | ... | 0.027442 | -0.030683 | 0.054720 | -0.064918 | 0.021354 | -0.000239 | -0.000577 | -0.005976 | -0.004564 | 0.001873 |
| co2_prod_2023 | 0.126752 | -0.205013 | 0.396698 | 0.061701 | -0.075351 | 0.091620 | 0.118572 | -0.044025 | -0.181995 | 0.526929 | ... | 0.112939 | 0.044967 | 0.051364 | -0.072707 | -0.018676 | 0.014877 | -0.013233 | -0.007028 | -0.004834 | -0.000051 |
| mf_2023 | 0.153236 | 0.023010 | 0.424826 | -0.066648 | -0.028695 | -0.128031 | -0.086131 | -0.230458 | -0.023354 | 0.207786 | ... | -0.044999 | -0.012890 | 0.002123 | 0.014181 | 0.012104 | 0.003351 | -0.010162 | -0.000140 | -0.004417 | 0.001626 |
| pop_total_2023 | -0.018477 | -0.077157 | 0.038714 | -0.046263 | 0.763339 | 0.053504 | 0.514550 | -0.278595 | 0.169128 | -0.022413 | ... | -0.006534 | 0.011732 | -0.016792 | 0.017295 | 0.005762 | -0.003451 | 0.001148 | 0.000565 | -0.000762 | 0.000693 |
31 rows × 31 columns
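To judge how many components are worth interpreting before diving into the loadings, a cumulative explained-variance check helps; this small cell is my addition, reusing the fitted pca object from above:

# Added sketch: cumulative explained variance of the leading components
cumvar = np.cumsum(pca.explained_variance_ratio_)
for i, v in enumerate(cumvar[:5], start=1):
    print(f"PC1..PC{i}: {v:.1%} of total variance")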
What are PC1 and PC2?
(I also asked ChatGPT how to read this output and how to display it.)
It seems:

Indicators with a large positive PC1 loading:
- Mean Years of Schooling (mys / mys_f / mys_m)
- Life Expectancy at Birth (le / le_f / le_m)
- Expected Years of Schooling (eys / eys_f / eys_m)
- Gross National Income Per Capita (gnipc / gni_pc_f / gni_pc_m)
- Population with at least some secondary education (se_f / se_m)

Indicators with a large negative PC1 loading:
- Gender Inequality Index (gii)
- the various inequality indicators (coef_ineq / ineq_le / ineq_edu / ineq_inc)
- overall HDI loss (loss)

PC1 = Overall Human Development and Social Progress

Indicators with a large PC2 loading:
- Share of seats in parliament (pr_f / pr_m)
- Labour force participation rate (lfpr_f / lfpr_m)
- Inequality in income (ineq_inc)
- Carbon dioxide emissions per capita (co2_prod)

PC2 = Social Structure, Gender, and Institutional Characteristics Axis
summary = pd.DataFrame({
    'PC1_loading': loadings['PC1'],
    'PC1_abs': loadings['PC1'].abs(),
    'PC2_loading': loadings['PC2'],
    'PC2_abs': loadings['PC2'].abs()
})
summary_sorted = summary.sort_values(
    by=['PC1_abs', 'PC2_abs'],
    ascending=False
)
summary_sorted
| | PC1_loading | PC1_abs | PC2_loading | PC2_abs |
|---|---|---|---|---|
| mys_f_2023 | 0.216016 | 0.216016 | -0.016660 | 0.016660 |
| mys_2023 | 0.214473 | 0.214473 | 0.006789 | 0.006789 |
| le_f_2023 | 0.214184 | 0.214184 | -0.057746 | 0.057746 |
| gii_2023 | -0.214163 | 0.214163 | -0.058405 | 0.058405 |
| ineq_le_2023 | -0.213720 | 0.213720 | 0.099523 | 0.099523 |
| le_2023 | 0.211800 | 0.211800 | -0.054955 | 0.054955 |
| mys_m_2023 | 0.209152 | 0.209152 | 0.024037 | 0.024037 |
| se_f_2023 | 0.207002 | 0.207002 | -0.011452 | 0.011452 |
| coef_ineq_2023 | -0.205367 | 0.205367 | 0.193879 | 0.193879 |
| le_m_2023 | 0.204494 | 0.204494 | -0.054475 | 0.054475 |
| eys_f_2023 | 0.204225 | 0.204225 | 0.117513 | 0.117513 |
| loss_2023 | -0.203884 | 0.203884 | 0.196734 | 0.196734 |
| ineq_edu_2023 | -0.202323 | 0.202323 | 0.023040 | 0.023040 |
| eys_2023 | 0.199153 | 0.199153 | 0.141232 | 0.141232 |
| se_m_2023 | 0.196818 | 0.196818 | 0.002286 | 0.002286 |
| abr_2023 | -0.196339 | 0.196339 | 0.169508 | 0.169508 |
| gni_pc_m_2023 | 0.192575 | 0.192575 | 0.055091 | 0.055091 |
| gnipc_2023 | 0.191551 | 0.191551 | 0.085989 | 0.085989 |
| gni_pc_f_2023 | 0.188344 | 0.188344 | 0.166816 | 0.166816 |
| eys_m_2023 | 0.187834 | 0.187834 | 0.161909 | 0.161909 |
| gdi_group_2023 | -0.185174 | 0.185174 | -0.121411 | 0.121411 |
| mmr_2023 | -0.183510 | 0.183510 | 0.135600 | 0.135600 |
| gdi_2023 | 0.172972 | 0.172972 | 0.101036 | 0.101036 |
| mf_2023 | 0.153236 | 0.153236 | 0.023010 | 0.023010 |
| ineq_inc_2023 | -0.151130 | 0.151130 | 0.284860 | 0.284860 |
| co2_prod_2023 | 0.126752 | 0.126752 | -0.205013 | 0.205013 |
| lfpr_f_2023 | 0.052178 | 0.052178 | 0.427480 | 0.427480 |
| pr_f_2023 | 0.051320 | 0.051320 | 0.477575 | 0.477575 |
| pr_m_2023 | -0.050419 | 0.050419 | -0.434407 | 0.434407 |
| pop_total_2023 | -0.018477 | 0.018477 | -0.077157 | 0.077157 |
| lfpr_m_2023 | -0.016457 | 0.016457 | -0.004394 | 0.004394 |
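A loading plot can make the same pattern visible at a glance: the development indicators should line up along PC1, and the parliament and labour-force indicators along PC2. The plot below is my addition, using the loadings frame computed above:

# Added sketch: scatter each indicator by its PC1/PC2 loadings
plt.figure(figsize=(8, 8))
plt.scatter(loadings['PC1'], loadings['PC2'], s=10)
for name, row in loadings.iterrows():
    plt.annotate(name, (row['PC1'], row['PC2']), fontsize=6)
plt.axhline(0, color='gray', linewidth=0.5)
plt.axvline(0, color='gray', linewidth=0.5)
plt.xlabel('PC1 loading')
plt.ylabel('PC2 loading')
plt.title('Indicator loadings on PC1 × PC2')
plt.show()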
K-means Clustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Source data for the PCA
X = df_labno_hdi[cols2023].dropna()

# Standardize
X_std = StandardScaler().fit_transform(X)

# PCA (PC1, PC2)
pca = PCA(n_components=2)
PC = pca.fit_transform(X_std)

# Put PC1 and PC2 into an explicit DataFrame
df_pca = pd.DataFrame(
    PC,
    columns=['PC1', 'PC2'],
    index=X.index
)
df_pca.head()
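The choice of n_clusters=3 below is a judgment call; as a sanity check (my addition), an elbow plot of the K-means inertia can show whether three clusters is a reasonable number:

# Added sketch: elbow check for the number of K-means clusters
from sklearn.cluster import KMeans
inertias = []
for k in range(1, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(df_pca[['PC1', 'PC2']])
    inertias.append(km.inertia_)
plt.plot(range(1, 9), inertias, 'o-')
plt.xlabel('k (number of clusters)')
plt.ylabel('Inertia')
plt.title('Elbow check for K-means')
plt.show()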
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
df_pca['cluster'] = kmeans.fit_predict(df_pca[['PC1', 'PC2']])
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
for c in sorted(df_pca['cluster'].unique()):
    subset = df_pca[df_pca['cluster'] == c]
    plt.scatter(
        subset['PC1'],
        subset['PC2'],
        label=f'Cluster {c}',
        alpha=0.7
    )
plt.axhline(0, color='gray', linewidth=0.5)
plt.axvline(0, color='gray', linewidth=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('K-means clustering on PC1 × PC2')
plt.legend()
plt.show()
cluster_summary = (
    df_pca
    .groupby('cluster')[['PC1', 'PC2']]
    .mean()
)
cluster_summary
| cluster | PC1 | PC2 |
|---|---|---|
| 0 | -1.054291 | -0.738622 |
| 1 | 3.779757 | 0.186801 |
| 2 | -6.750372 | 0.720682 |
hdi_cols = [
    'le_2023', 'gni_pc_f_2023', 'gni_pc_m_2023', 'gii_2023',
    'mys_2023', 'eys_2023', 'se_f_2023', 'se_m_2023'
]
cluster_hdi = (
    df_labno_hdi
    .join(df_pca['cluster'])
    .groupby('cluster')[hdi_cols]
    .mean()
)
cluster_hdi
| cluster | le_2023 | gni_pc_f_2023 | gni_pc_m_2023 | gii_2023 | mys_2023 | eys_2023 | se_f_2023 | se_m_2023 |
|---|---|---|---|---|---|---|---|---|
| 0.0 | 73.585140 | 9354.713982 | 18257.990229 | 0.354953 | 8.946904 | 13.194642 | 58.170347 | 61.481492 |
| 1.0 | 80.067339 | 38069.804750 | 59809.293607 | 0.109177 | 12.125226 | 16.528256 | 88.319294 | 89.972756 |
| 2.0 | 63.848536 | 2852.471504 | 4994.721266 | 0.560929 | 4.544651 | 10.023278 | 19.750143 | 33.082322 |
Cluster 1 seems to have high levels of education and income, while Cluster 2 seems to have the lowest.
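To see which countries fall into each cluster, the country names can be pulled back in from df_labno_hdi; this lookup is my addition:

# Added sketch: show a few example countries per cluster
(df_labno_hdi
    .join(df_pca['cluster'])
    .dropna(subset=['cluster'])
    .groupby('cluster')['country']
    .apply(lambda s: ', '.join(s.astype(str).head(5))))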
# Join the PC data with the lab counts
df_plot = df_pca.join(df_labno_hdi['lab_count'])

plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    df_plot['PC1'],
    df_plot['PC2'],
    c=df_plot['cluster'],
    s=df_plot['lab_count'] * 3,  # scale the marker size
    alpha=0.7
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Clusters on PC1×PC2 (size = Lab count)')
plt.colorbar(scatter, label='Cluster')
plt.show()
This graph suggests that countries with higher educational standards have more FabLabs.
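That impression can be quantified with a quick rank correlation between PC1 (the development axis) and the lab count; this one-off check is my addition:

# Added sketch: rank correlation between the development axis and lab count
from scipy.stats import spearmanr
rho, p = spearmanr(df_plot['PC1'], df_plot['lab_count'])
print(f"Spearman rho = {rho:.2f}, p = {p:.3g}")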
Additional analysis: the relationship between clusters and lab kind
# Attach the cluster labels, keyed on country_encoded
# (note: this assumes the country_encoded codes line up with df_pca's row index)
df_kind_plot = df_kind_hdi.join(
    df_pca['cluster'],
    on='country_encoded'
)
#df_kind_plot.head()
df_kind_cluster = (
    df_kind_plot
    .groupby(['cluster', 'kind_name'])
    .agg(kind_count=('kind_count', 'sum'))
    .reset_index()
)
df_kind_cluster.head()
| | cluster | kind_name | kind_count |
|---|---|---|---|
| 0 | 0.0 | fab_lab | 517 |
| 1 | 0.0 | mini_fab_lab | 94 |
| 2 | 0.0 | mobile | 8 |
| 3 | 1.0 | fab_lab | 1149 |
| 4 | 1.0 | mini_fab_lab | 169 |
The charts below were suggested by ChatGPT.
import seaborn as sns
import matplotlib.pyplot as plt

# cluster × kind pivot
heat = df_kind_cluster.pivot_table(
    index='cluster',
    columns='kind_name',
    values='kind_count',
    aggfunc='sum',
    fill_value=0
)
plt.figure(figsize=(10, 5))
sns.heatmap(
    heat,
    cmap='Blues',
    annot=True,
    fmt='d'
)
plt.title('Lab kind distribution by cluster')
plt.xlabel('Lab kind')
plt.ylabel('Cluster')
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np

# Pivot table of cluster × kind
radar_df = df_kind_cluster.pivot_table(
    index='cluster',
    columns='kind_name',
    values='kind_count',
    aggfunc='sum',
    fill_value=0
)
# Normalize each cluster's counts to ratios so the clusters are comparable
radar_ratio = radar_df.div(radar_df.sum(axis=1), axis=0)
#radar_ratio
import matplotlib.pyplot as plt

labels = radar_ratio.columns.tolist()
num_vars = len(labels)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]  # close the polygon

plt.figure(figsize=(7, 7))
ax = plt.subplot(111, polar=True)
for cluster in radar_ratio.index:
    values = radar_ratio.loc[cluster].tolist()
    values += values[:1]  # close the polygon
    ax.plot(angles, values, label=f'Cluster {cluster}')
    ax.fill(angles, values, alpha=0.15)
ax.set_thetagrids(np.degrees(angles[:-1]), labels)
ax.set_title('Cluster characteristics by Lab kind', y=1.08)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()
The plain fab_lab kind dominates the counts in every cluster, so there still seems to be too much numerical bias for this kind-level analysis to say much.
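A natural follow-up to reduce that bias would be to normalize lab counts by population; the closing sketch below is my addition, using the pop_total_2023 column (population in millions):

# Added sketch: labs per million inhabitants, as a less size-biased measure
df_norm = df_labno_hdi[df_labno_hdi['pop_total_2023'] > 0].copy()
df_norm['labs_per_million'] = df_norm['lab_count'] / df_norm['pop_total_2023']
df_norm[['country', 'lab_count', 'labs_per_million']].sort_values(
    'labs_per_million', ascending=False).head(10)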