[Your-Name-Here] - Fab Futures - Data Science

Week4-1: Transforms

I would like to see which kinds of developing countries are good candidates for setting up a FabLab, using the HDI. In addition to the lecture, and based on Tsuchiya-san's explanation, I decided to use PCA to identify the important indicators within the HDI.

PCA with HDI indicators in 2023

In [3]:
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry  # install the "pycountry" library
import pandas as pd  # import pandas
import numpy as np   # import NumPy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore')  # df = data frame; read the dataset at "path_hdr25" defined above with pandas
df_hdr25.fillna(0, inplace=True)  # replace NaN with 0
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMerge
def alpha2_to_alpha3(code2):
    if not code2:  # guard against missing country codes
        return None
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Merge HDI with Lab(Number); df_labcount (grouped by ccd3) was computed above
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Merge HDI with Lab(by kind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
#Encoding for HDI x Lab Number list
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_labno_hdi['country_encoded'] = encoded_country
#Encoding for HDI x Lab Kind list
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
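One caveat with the cell above: a single `LabelEncoder` instance is reused across columns, so after the last `fit_transform` only that column's encoding can still be inverted. A minimal sketch (over a hypothetical toy frame, not the real data) that keeps one fitted encoder per column:

```python
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# hypothetical toy frame standing in for df_kind_hdi
df = pd.DataFrame({
    "country": ["Japan", "Kenya", "Japan"],
    "kind_name": ["fab_lab", "mini_fab_lab", "fab_lab"],
})

encoders = {}  # one fitted encoder per column, so inverse_transform stays valid later
for col in ["country", "kind_name"]:
    enc = LabelEncoder()
    df[col + "_encoded"] = enc.fit_transform(df[col])
    encoders[col] = enc

# round trip: decoding recovers the original labels for any column
restored = encoders["country"].inverse_transform(df["country_encoded"])
```

With this pattern, decoding the `kind_name` column later does not clobber the country mapping.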

Choose the related indicators.

In [16]:
cols = df_labno_hdi.columns
cols2023 = []
for item in cols:
    if '2023' in item:
        cols2023.append(item)
# drop rank and HDI composite columns
cols2023 = [c for c in cols2023 if 'rank' not in c.lower()]
cols2023 = [c for c in cols2023 if 'hdi' not in c.lower()]
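The filtering steps above can also be collapsed into a single list comprehension. A sketch over hypothetical column names (the real ones come from `df_labno_hdi.columns`):

```python
# hypothetical column names resembling df_labno_hdi.columns
cols = ["country", "hdi_2023", "le_2023", "hdi_rank_2023", "mys_2023", "le_2010"]

# keep 2023 columns, dropping rank and HDI composite columns in one pass
cols2023 = [c for c in cols
            if "2023" in c and "rank" not in c.lower() and "hdi" not in c.lower()]
# → ["le_2023", "mys_2023"]
```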
In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

cols_pca = cols2023

df_pca = df_labno_hdi[cols_pca].dropna()

# standardize (zero mean, unit variance)
scaler = StandardScaler()
Xscale = scaler.fit_transform(df_pca)

pca = PCA()
Xpca = pca.fit_transform(Xscale)

plt.plot(pca.explained_variance_ratio_, 'o-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA explained variance (HDI + lab_count)')
plt.show()

loadings = pd.DataFrame(
    pca.components_.T,
    index=cols_pca,
    columns=[f'PC{i+1}' for i in range(len(cols_pca))]
)

loadings
[Figure: PCA explained variance (HDI + lab_count)]
Out[17]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31
le_2023 0.211800 -0.054955 0.006493 -0.178181 0.088710 0.125428 -0.209398 -0.040073 -0.050286 -0.068270 ... 0.072414 -0.054780 -0.002740 0.027417 0.011697 -0.029523 -0.003097 -0.021003 -0.267493 0.767548
eys_2023 0.199153 0.141232 -0.052108 -0.127472 0.175753 -0.058357 -0.092404 0.358083 0.130728 0.086997 ... 0.061719 0.015184 0.001479 -0.010426 0.027925 0.710214 0.375476 -0.022960 0.097672 0.058061
mys_2023 0.214473 0.006789 -0.102758 0.142446 -0.003577 -0.155231 0.080404 0.056790 -0.191360 0.007087 ... 0.031260 0.065197 -0.017072 0.019097 0.021783 0.124301 -0.411460 0.298799 0.584668 0.215839
gnipc_2023 0.191551 0.085989 0.311239 -0.156832 -0.089751 -0.144176 0.042655 -0.105132 0.006519 -0.151343 ... -0.070319 -0.088483 -0.130204 0.116873 -0.006921 -0.346723 0.577168 0.174528 0.356150 0.117540
gdi_group_2023 -0.185174 -0.121411 0.104656 -0.259199 0.012978 -0.116093 0.153079 0.263856 -0.167125 0.020977 ... 0.002893 0.032880 0.030471 0.005649 0.040057 0.004752 -0.007849 0.007023 -0.001145 0.000579
gdi_2023 0.172972 0.101036 -0.092019 0.351664 0.045146 0.049023 -0.232165 -0.299690 0.150610 0.077034 ... -0.090648 -0.228916 -0.023554 -0.253552 0.036217 -0.015029 -0.010701 -0.004781 0.000700 0.003182
le_f_2023 0.214184 -0.057746 -0.046537 -0.134580 0.082264 0.131882 -0.207145 -0.081447 -0.052691 -0.082047 ... 0.381496 -0.220258 0.075606 0.459603 0.123931 0.010116 0.002004 0.029977 0.111404 -0.399892
eys_f_2023 0.204225 0.117513 -0.049402 -0.094016 0.161991 -0.034451 -0.121027 0.284088 0.123326 0.117720 ... 0.335340 0.028528 0.150843 0.187961 -0.023761 -0.373693 -0.201076 0.010092 -0.066587 -0.027507
mys_f_2023 0.216016 -0.016660 -0.085728 0.162607 -0.021453 -0.114133 0.055653 0.014378 -0.173312 0.029969 ... -0.119614 0.431618 0.067314 0.284097 -0.429098 -0.041008 0.210853 -0.130655 -0.277633 -0.112213
gni_pc_f_2023 0.188344 0.166816 0.244324 -0.144821 -0.087688 -0.192127 0.043147 -0.139370 0.099030 -0.251464 ... 0.328864 0.289853 0.448030 -0.359057 0.054396 0.075061 -0.158537 -0.067321 -0.104746 -0.037604
le_m_2023 0.204494 -0.054475 0.065139 -0.221224 0.095273 0.119134 -0.205319 0.002865 -0.054447 -0.053656 ... -0.235828 0.091328 -0.073948 -0.421900 -0.066270 0.006675 -0.006167 0.004656 0.150428 -0.377984
eys_m_2023 0.187834 0.161909 -0.034279 -0.162293 0.185281 -0.091832 -0.055358 0.439334 0.130307 0.055079 ... -0.327149 -0.025556 -0.116786 -0.142105 0.002612 -0.346965 -0.170464 0.010135 -0.035148 -0.029404
mys_m_2023 0.209152 0.024037 -0.105907 0.122087 0.011932 -0.202141 0.102435 0.100470 -0.215926 -0.013003 ... 0.048938 -0.345747 -0.018386 -0.211191 0.372756 -0.082446 0.208608 -0.175804 -0.298061 -0.099397
gni_pc_m_2023 0.192575 0.055091 0.308025 -0.173328 -0.085467 -0.134957 0.045311 -0.084388 -0.023452 -0.134691 ... -0.249247 -0.247305 -0.339802 0.279940 -0.047213 0.278223 -0.413045 -0.107376 -0.249491 -0.080412
coef_ineq_2023 -0.205367 0.193879 0.040937 -0.096187 0.155514 -0.165244 -0.140803 -0.053936 -0.161983 0.044801 ... 0.307971 0.007371 -0.334281 -0.117718 -0.219805 -0.060613 -0.028691 -0.631231 0.262905 0.064554
loss_2023 -0.203884 0.196734 0.034615 -0.095790 0.163762 -0.168585 -0.152879 -0.063007 -0.176856 0.056773 ... 0.269453 0.041571 -0.310835 -0.156941 -0.139635 0.032893 0.039637 0.640933 -0.313486 -0.087280
ineq_le_2023 -0.213720 0.099523 0.110893 0.055542 -0.078768 -0.136643 0.158494 0.063754 0.041715 0.001256 ... 0.101555 -0.295551 0.161374 0.034672 0.202189 0.002004 0.008778 -0.001407 -0.007282 0.007519
ineq_edu_2023 -0.202323 0.023040 0.145253 -0.214685 0.077493 -0.008584 -0.104349 0.030095 -0.001373 0.065298 ... -0.304065 0.088321 0.323305 0.176316 0.177154 0.008218 -0.004761 -0.008746 0.022703 0.011082
ineq_inc_2023 -0.151130 0.284860 -0.083913 -0.011183 0.286608 -0.213959 -0.334815 -0.223849 -0.357488 0.095566 ... -0.269799 0.025173 0.300998 0.133351 0.173642 0.019662 -0.003998 -0.025387 0.033826 0.013032
gii_2023 -0.214163 -0.058405 0.042103 0.070494 0.030231 -0.032609 -0.091483 0.104190 -0.063198 0.156086 ... 0.078698 -0.064246 -0.085838 0.069039 -0.018941 -0.004886 -0.014632 0.001773 0.000314 -0.003756
mmr_2023 -0.183510 0.135600 0.186668 0.078934 -0.079896 -0.257394 0.203237 0.219475 0.118925 -0.077244 ... -0.002338 -0.030935 0.018976 -0.009940 0.000143 -0.002311 -0.008775 0.007249 0.003992 0.000823
abr_2023 -0.196339 0.169508 0.070765 0.133710 -0.087780 -0.022122 -0.049254 0.027478 -0.089443 0.036309 ... -0.035396 0.058323 0.013913 0.001315 -0.013755 -0.005584 -0.000556 0.004609 -0.000464 0.000634
se_f_2023 0.207002 -0.011452 -0.091240 0.171110 0.008456 -0.173565 0.152437 0.011241 -0.208683 0.116087 ... 0.030055 0.405034 -0.288274 0.071368 0.528742 -0.029021 -0.002613 -0.045493 -0.025512 -0.009393
se_m_2023 0.196818 0.002286 -0.106410 0.138410 0.061998 -0.266237 0.195780 0.068459 -0.259803 0.107390 ... -0.040579 -0.372078 0.287526 -0.052573 -0.440338 0.019849 0.000717 0.040736 0.023116 0.005792
pr_f_2023 0.051320 0.477575 -0.062820 -0.145855 -0.121967 0.342388 0.188762 -0.028203 0.119573 0.554718 ... 0.000298 -0.038520 0.013267 0.002095 0.013645 -0.002242 0.000325 -0.004334 -0.002242 -0.003434
pr_m_2023 -0.050419 -0.434407 0.154754 0.205723 0.169486 -0.310497 -0.326334 0.095291 0.380100 0.287414 ... 0.016607 0.002973 0.014322 -0.000762 0.019518 0.001530 0.005550 -0.003564 0.000059 -0.002595
lfpr_f_2023 0.052178 0.427480 0.215121 0.440462 0.072315 -0.023562 -0.051331 0.040155 0.303265 -0.145436 ... -0.017974 0.092558 -0.055638 0.179293 -0.035640 0.018610 -0.006549 0.012702 -0.001853 -0.005622
lfpr_m_2023 -0.016457 -0.004394 0.411251 0.329300 0.238040 0.488852 -0.075762 0.329465 -0.361805 -0.203493 ... 0.027442 -0.030683 0.054720 -0.064918 0.021354 -0.000239 -0.000577 -0.005976 -0.004564 0.001873
co2_prod_2023 0.126752 -0.205013 0.396698 0.061701 -0.075351 0.091620 0.118572 -0.044025 -0.181995 0.526929 ... 0.112939 0.044967 0.051364 -0.072707 -0.018676 0.014877 -0.013233 -0.007028 -0.004834 -0.000051
mf_2023 0.153236 0.023010 0.424826 -0.066648 -0.028695 -0.128031 -0.086131 -0.230458 -0.023354 0.207786 ... -0.044999 -0.012890 0.002123 0.014181 0.012104 0.003351 -0.010162 -0.000140 -0.004417 0.001626
pop_total_2023 -0.018477 -0.077157 0.038714 -0.046263 0.763339 0.053504 0.514550 -0.278595 0.169128 -0.022413 ... -0.006534 0.011732 -0.016792 0.017295 0.005762 -0.003451 0.001148 0.000565 -0.000762 0.000693

31 rows × 31 columns
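Since most of the variance sits in the first few components, a cumulative sum of `explained_variance_ratio_` makes it easy to decide how many components to keep. A self-contained sketch on synthetic data (the real input would be `Xscale` from the cell above):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# synthetic stand-in for the standardized HDI matrix
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))
X[:, 1] = X[:, 0] + 0.1 * rng.normal(size=100)  # one strongly correlated pair

pca = PCA().fit(StandardScaler().fit_transform(X))
cumvar = np.cumsum(pca.explained_variance_ratio_)

# smallest number of components explaining at least 90% of the variance
n90 = int(np.searchsorted(cumvar, 0.90) + 1)
```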

In [18]:
# PC1 loadings, sorted by absolute contribution (largest first)
loadings['PC1'].abs().sort_values(ascending=False)
Out[18]:
mys_f_2023        0.216016
mys_2023          0.214473
le_f_2023         0.214184
gii_2023          0.214163
ineq_le_2023      0.213720
le_2023           0.211800
mys_m_2023        0.209152
se_f_2023         0.207002
coef_ineq_2023    0.205367
le_m_2023         0.204494
eys_f_2023        0.204225
loss_2023         0.203884
ineq_edu_2023     0.202323
eys_2023          0.199153
se_m_2023         0.196818
abr_2023          0.196339
gni_pc_m_2023     0.192575
gnipc_2023        0.191551
gni_pc_f_2023     0.188344
eys_m_2023        0.187834
gdi_group_2023    0.185174
mmr_2023          0.183510
gdi_2023          0.172972
mf_2023           0.153236
ineq_inc_2023     0.151130
co2_prod_2023     0.126752
lfpr_f_2023       0.052178
pr_f_2023         0.051320
pr_m_2023         0.050419
pop_total_2023    0.018477
lfpr_m_2023       0.016457
Name: PC1, dtype: float64
In [19]:
loadings['PC2'].abs().sort_values(ascending=False)
Out[19]:
pr_f_2023         0.477575
pr_m_2023         0.434407
lfpr_f_2023       0.427480
ineq_inc_2023     0.284860
co2_prod_2023     0.205013
loss_2023         0.196734
coef_ineq_2023    0.193879
abr_2023          0.169508
gni_pc_f_2023     0.166816
eys_m_2023        0.161909
eys_2023          0.141232
mmr_2023          0.135600
gdi_group_2023    0.121411
eys_f_2023        0.117513
gdi_2023          0.101036
ineq_le_2023      0.099523
gnipc_2023        0.085989
pop_total_2023    0.077157
gii_2023          0.058405
le_f_2023         0.057746
gni_pc_m_2023     0.055091
le_2023           0.054955
le_m_2023         0.054475
mys_m_2023        0.024037
ineq_edu_2023     0.023040
mf_2023           0.023010
mys_f_2023        0.016660
se_f_2023         0.011452
mys_2023          0.006789
lfpr_m_2023       0.004394
se_m_2023         0.002286
Name: PC2, dtype: float64

Below is test code written with help from ChatGPT.

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
np.set_printoptions(precision=1)
#
# select the columns to use for PCA
#
# convert to tidy (long-format) data
value_vars = [c for c in df_hdr25.columns if c[-4:].isdigit()]

df_long = df_hdr25.melt(
    id_vars=["country"],
    value_vars=value_vars,
    var_name="variable",
    value_name="value"
)

df_long["year"] = df_long["variable"].str[-4:].astype(int)
df_long["indicator"] = df_long["variable"].str[:-5]

# pivot into a country × year × indicator matrix
df_wide = df_long.pivot_table(
    index=["country","year"],
    columns="indicator",
    values="value"
).dropna()


# PCA input matrix (standardized manually below)
X = df_wide.values

#
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
#
print(f"data mean: {np.mean(X):.2f}, variance: {np.var(X):.2f}")
X = X-np.mean(X,axis=0)
std = np.std(X,axis=0)
Xscale = X/np.where(std > 0,std,1)
print(f"standardized data mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")
#
# do 5 component PCA
#
pca = PCA(n_components=5)  # PCA was imported above; bare "sklearn" is not
pca.fit(Xscale)
Xpca = pca.transform(Xscale)
plt.plot(pca.explained_variance_, 'o')
plt.xlabel('PCA component')
plt.ylabel('explained variance')
plt.title('HDI Lab PCA')
plt.show()
#
# plot vs first two PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1], s=3)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("HDI Lab PCA (PC1 vs PC2)")
plt.show()
data mean: 1783.14, variance: 88982502.68
standardized data mean: 0.00, variance: 1.00
[Figure: HDI Lab PCA explained variance]
[Figure: HDI Lab PCA (PC1 vs PC2)]
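The PC1–PC2 scatter shows structure but no country names. To see which countries sit at the extremes of PC1 (which loads fairly evenly on the development indicators, and whose extremes are the cases I most want to inspect for FabLab siting), the rows can be sorted by |PC1| score. A sketch on synthetic data, with hypothetical row labels standing in for the `country` index of `df_wide`:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# synthetic stand-in for df_wide.values (country-year rows × indicators)
rng = np.random.default_rng(1)
X = rng.normal(size=(50, 6))
labels = [f"country_{i}" for i in range(50)]  # hypothetical row labels

Xpca = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))

# the five rows with the largest |PC1| score, i.e. the extremes of the first axis
idx = np.argsort(np.abs(Xpca[:, 0]))[-5:]
extremes = [labels[i] for i in idx]
```

On the real data, annotating just these rows on the scatter keeps the plot readable while naming the interesting countries.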