< Home
Week3-1: Probability¶
I would like to see what kinds of developping countries are good for setting up FabLab using HDI.
Histogram¶
First, I tried histgram referring Neil's and Tshuchiya-san's code
First, read the data.
path_hdr25 = "./datasets/hdr25.csv"
!pip install pycountry #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #df = data flame = read dataset path "path_hdr25" defined above with panda
df_hdr25.fillna(0, inplace=True) #replace N/A to 0
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMarge
def alpha2_to_alpha3(code2):
country = pycountry.countries.get(alpha_2=code2.upper())
return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Marge HDI with Lab(Number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
#Encoding for HDI x Lab Kind
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
#Encoding for HDI x Lab Number list
df_labno_hdi['country_encoded'] = encoded_country
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
Collecting pycountry Using cached pycountry-24.6.1-py3-none-any.whl.metadata (12 kB) Using cached pycountry-24.6.1-py3-none-any.whl (6.3 MB) Installing collected packages: pycountry Successfully installed pycountry-24.6.1
fil_col = ['le_2023','mys_2023','gnipc_2023','le_2022','mys_2022','gnipc_2022','le_2021','mys_2021','gnipc_2021','le_2020','mys_2020','gnipc_2020']
newcols = []
row = []
for col in fil_col:
if len(row) < 3:
row.append(col)
if col == fil_col[-1]:
newcols.append(row)
else:
if col == 'kind_name_encoded' or col == "country_encoded":
print(f"{col} has come here. else")
newcols.append(row)
row = []
row.append(col)
plt.figure(figsize=(10,10))
position = 1
for i in range(4):
for j in range(3):
x = df_labno_hdi[newcols[i][j]]
mean = np.mean(x,0)
stddev = np.std(x,0)
plt.subplot(4,3,position)
plt.hist(x,100,density=True)
plt.plot(x,0*x,'|',ms=20)
plt.title(newcols[i][j])
position = position + 1
# plot gaussian
xi = np.linspace(mean - 3*stddev,mean+3*stddev,100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi,yi,'r')
plt
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
import numpy as np
import matplotlib.pyplot as plt
fil_col = [
'le_2023','mys_2023','gnipc_2023',
'le_2022','mys_2022','gnipc_2022',
'le_2021','mys_2021','gnipc_2021',
'le_2020','mys_2020','gnipc_2020'
]
# --- newcols(3つずつまとめる) ---
newcols = []
row = []
for col in fil_col:
if len(row) < 3:
row.append(col)
if col == fil_col[-1]:
newcols.append(row)
else:
newcols.append(row)
row = [col]
# --- 描画 ---
plt.figure(figsize=(12, 12))
position = 1
for i in range(4):
for j in range(3):
colname = newcols[i][j]
x = df_labno_hdi[colname].dropna().values
mean = np.mean(x)
stddev = np.std(x)
# --- subplot ---
plt.subplot(4, 3, position)
# histogram
bins = max(10, len(x) // 50)
plt.hist(x, bins=bins, density=True)
# vertical sample marks
plt.plot(x, np.zeros_like(x), '|', ms=max(5, len(x) / 20))
# Gaussian curve
xi = np.linspace(mean - 3*stddev, mean + 3*stddev, 200)
yi = np.exp(-(xi - mean)**2 / (2 * stddev**2)) / np.sqrt(2 * np.pi * stddev**2)
plt.plot(xi, yi, 'r')
plt.title(colname)
position += 1
plt.tight_layout()
plt
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
Averaging¶
I also tried Averaging. Same as last time I just followed Neil's and Tsuchiya-san's code.
First result seems strange, but I'm still not quite sure what is the problem here...
fil_col = ['le_2023','mys_2023','gnipc_2023','le_2022','mys_2022','gnipc_2022','le_2021','mys_2021','gnipc_2021','le_2020','mys_2020','gnipc_2020']
newcols = []
row = []
print(len(fil_col))
for col in fil_col:
if len(row) < 3:
row.append(col)
if col == fil_col[-1]:
newcols.append(row)
else:
if col == 'kind_name_encoded' or col == "country_encoded":
print(f"{col} has come here. else")
newcols.append(row)
row = []
row.append(col)
plt.figure(figsize=(10,10))
position = 1
for i in range(4):
for j in range(3):
x = df_kind_hdi[newcols[i][j]]
mean = np.mean(x)
stddev = np.std(x)
trials = 100
points = np.arange(10, len(x), 100)
means = np.zeros((100,len(points)))
for k,N in enumerate(points):
for t in range(trials):
sample = np.random.choice(x,size=N,replace=False)
means[t,k] = np.mean(sample)
plt.subplot(4,3,position)
plt.plot(points,mean+stddev/np.sqrt(points), 'r', label='calculated')
plt.plot(points,mean-stddev/np.sqrt(points),'r')
mean = np.mean(means,0)
stddev = np.std(means,0)
plt.errorbar(points,mean,yerr=stddev,fmt='k-o',capsize=7,label="estimated")
for point in range(len(points)):
plt.plot(np.full(trials,points[point]),means[:,point],'o',markersize=2)
plt.xlabel("number of sample averaged")
plt.ylabel(f"mean estimated of {newcols[i][j]}")
position = position + 1
plt.tight_layout()
plt
12
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
import numpy as np
import matplotlib.pyplot as plt
# 対象となる列(例:newcols[0][0])
target_col = newcols[0][0]
x = df_kind_hdi[target_col].dropna().values
mean_true = np.mean(x)
width = np.std(x)
trials = 100
points = np.arange(10, len(x), 25)
means = np.zeros((trials, len(points)))
# メインループ(下のコード構造)
for p in range(len(points)):
N = points[p]
for t in range(trials):
sample = np.random.choice(x, size=N, replace=False)
means[t, p] = np.mean(sample)
# calculated lines
plt.plot(points, mean_true + width / np.sqrt(points), 'r')
plt.plot(points, mean_true - width / np.sqrt(points), 'r')
# estimated values
mean_est = np.mean(means, axis=0)
std_est = np.std(means, axis=0)
plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7)
# scatter plot
for p in range(len(points)):
plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)
plt.xlabel("number of samples averaged")
plt.ylabel(f"mean estimated of {target_col}")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 12))
position = 1 # サブプロット位置
for i in range(4): # ← 4 行
for j in range(3): # ← 3 列
target_col = newcols[i][j] # 使用する列名
x = df_kind_hdi[target_col].dropna().values
if len(x) < 20:
print(f"⚠ データ数が少ないためスキップ: {target_col}")
continue
# 基本統計量
mean_true = np.mean(x)
width = np.std(x)
trials = 100
points = np.arange(10, len(x), 25) # データ数以内に制限
means = np.zeros((trials, len(points)))
# メインループ(下のコードの構造)
for p in range(len(points)):
N = points[p]
for t in range(trials):
sample = np.random.choice(x, size=N, replace=False)
means[t, p] = np.mean(sample)
# --- プロット ---
plt.subplot(4, 3, position)
# calculated lines
plt.plot(points, mean_true + width / np.sqrt(points), 'r')
plt.plot(points, mean_true - width / np.sqrt(points), 'r')
# estimated values
mean_est = np.mean(means, axis=0)
std_est = np.std(means, axis=0)
plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7)
# scatter points
for p in range(len(points)):
plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)
plt.xlabel("num samples")
plt.ylabel(target_col)
position += 1
plt.tight_layout()
plt.show()
Covariance¶
I also tried to see covariance. Same as above, I followed Neil's and Tsuchiya-san's code.
# HDI × Lab数(欠損除外)
covarsamples = df_labno_hdi[['hdi_2023', 'lab_count']].dropna().values
#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
#
# plot and print
#
plt.figure(figsize=(6,6))
# 散布図
plt.scatter(
covarsamples[:,0],
covarsamples[:,1],
alpha=0.5
)
# 平均点
plt.plot(covarmean[0], covarmean[1], 'ro')
# 共分散の主軸
plt.plot(covarplotx, covarploty, 'r')
plt.xlabel("HDI")
plt.ylabel("Number of FabLabs")
plt.title("HDI vs FabLab Count (with covariance)")
plt.show()
I thought this one is nice to see the relationship between LabNo and HDI, so I tried this with all HDI index in 2023.
Likely a positive correlation
- Human Development Index(HDI)
- Life Expectancy at Birth(LE)
- Expected Years of Schooling(EYS)
- Mean Years of Schooling(MYS)
- Gender Development Index(GDI)
- Inequality-adjusted Human Development Index(IHDI)
- Share of seats in parliament male/female(PR)
- Planetary pressures–adjusted Human Development Index(PHDI)
- Difference from HDI value (DIFF HDI)
- Carbon dioxide emissions per capita (production) (COEF)
- Material footprint per capita(MF)
Likely a negative correlation
- Coefficient of human inequality(COEF INEQ)
- Overall loss(LOSS)
- Inequality in life expectancy(INEQ LE)
- Inequality in eduation(INEQ EDU)
- Gender Inequality Index (GII)
- Labour force participation rate(ages 15 and older)(LFPR)
Additionaly, there was not much difference between the gender-specific indicators and the overall indicators.
Select only 2023¶
cols = df_labno_hdi.columns
cols2023 = []
for item in cols:
if '2023' in item:
cols2023.append(item)
#cols2023
#[c for c in cols2023 if 'rank' in c.lower()]
cols2023 = [c for c in cols2023 if 'rank' not in c.lower()]
cols2023 = [c for c in cols2023 if 'hdi' not in c.lower()]
# Lab数
y_col = 'lab_count'
df_fil = df_labno_hdi[['lab_count'] + cols2023].dropna()
check_conb = []
for col in cols2023:
tmp = df_labno_hdi[[col, y_col]].dropna()
if len(tmp) < 5:
continue # データ少なすぎ防止
corr = np.corrcoef(tmp[col], tmp[y_col])[0,1]
check_conb.append([col, y_col, corr])
nplots = len(check_conb)
ncols = 3
nrows = int(np.ceil(nplots / ncols))
plt.figure(figsize=(6 * ncols, 5 * nrows))
position = 1
for r in check_conb:
a = df_labno_hdi[r[0]].to_numpy()
b = df_labno_hdi[r[1]].to_numpy()
mask = ~np.isnan(a) & ~np.isnan(b)
samples = np.column_stack((a[mask], b[mask]))
# --- variance ---
varmean = np.mean(samples, axis=0)
varstd = np.sqrt(np.var(samples, axis=0))
varplotx = [
varmean[0]-varstd[0], varmean[0]+varstd[0],
None,
varmean[0], varmean[0]
]
varploty = [
varmean[1], varmean[1],
None,
varmean[1]-varstd[1], varmean[1]+varstd[1]
]
# --- covariance ---
covarmean = np.mean(samples, axis=0)
covar = np.cov(samples, rowvar=False)
evalu, evect = np.linalg.eig(covar)
dx0 = evect[0,0] * np.sqrt(evalu[0])
dx1 = evect[1,0] * np.sqrt(evalu[1])
dy0 = evect[0,1] * np.sqrt(evalu[0])
dy1 = evect[1,1] * np.sqrt(evalu[1])
covarplotx = [
covarmean[0]-dx0, covarmean[0]+dx0,
None,
covarmean[0]-dx1, covarmean[0]+dx1
]
covarploty = [
covarmean[1]+dy0, covarmean[1]-dy0,
None,
covarmean[1]+dy1, covarmean[1]-dy1
]
# --- plot ---
plt.subplot(nrows, ncols, position)
plt.scatter(samples[:,0], samples[:,1], alpha=0.4)
plt.plot(varplotx, varploty, 'g', lw=2)
plt.plot(covarplotx, covarploty, 'r', lw=2)
plt.plot(covarmean[0], covarmean[1], 'ro')
plt.xlabel(r[0])
plt.ylabel("Lab count")
plt.title(f"{r[0]} × Lab count (corr={r[2]:.2f})")
position += 1
#plt.tight_layout()
#plt.show()
With CHatGPT's suggestion, I also tried to see which index is most affected to lab No. The code is from ChatGPT.
hdi_cols = cols2023
n = len(hdi_cols)
ncols = 3
nrows = int(np.ceil(n / ncols))
plt.figure(figsize=(15, 4*nrows))
for i, col in enumerate(hdi_cols, start=1):
tmp = df_labno_hdi[[col, 'lab_count']].dropna()
x = tmp[col].values
y = tmp['lab_count'].values
plt.subplot(nrows, ncols, i)
plt.scatter(x, y, alpha=0.4)
plt.xlabel(col)
plt.ylabel("Lab count")
plt.title(col)
plt.tight_layout()
plt.show()
scores = []
for col in hdi_cols:
tmp = df_labno_hdi[[col, 'lab_count']].dropna()
corr = np.corrcoef(tmp[col], tmp['lab_count'])[0,1]
scores.append((col, corr))
scores = sorted(scores, key=lambda x: abs(x[1]), reverse=True)
scores
top_cols = [s[0] for s in scores[:3]]
plt.figure(figsize=(6, 5*len(top_cols)))
pos = 1
for col in top_cols:
tmp = df_labno_hdi[[col, 'lab_count']].dropna()
samples = tmp.values
mean = np.mean(samples, axis=0)
cov = np.cov(samples, rowvar=False)
eigval, eigvec = np.linalg.eig(cov)
dx = eigvec[0,0] * np.sqrt(eigval[0])
dy = eigvec[1,0] * np.sqrt(eigval[0])
plt.subplot(len(top_cols), 1, pos)
plt.scatter(samples[:,0], samples[:,1], alpha=0.4)
plt.plot(mean[0], mean[1], 'ro')
plt.plot(
[mean[0]-dx, mean[0]+dx],
[mean[1]-dy, mean[1]+dy],
'r', lw=2
)
plt.xlabel(col)
plt.ylabel("Lab count")
plt.title(f"{col} (corr={scores[top_cols.index(col)][1]:.2f})")
pos += 1
plt.tight_layout()
plt.show()
Correlation Analysis¶
To see the Correlation Analysis, I also followed Tsuchiya-san's code
- 1st graph
- Close to black: Small p-value (significant)
- Close to white: Large p-value (not significant)
- 2nd graph
- Red: Positive correlation (increases with increase)
- Blue: Negative correlation (decreases with increase)
- White: Almost no correlation
It seems there are not significant relationship...
At reast, "Share of seats in parliament(PR)" and "Labour force participation rate(LFPR)" is not so important for Lab Nos.
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns
cols = ['lab_count'] + cols2023
df_fil = df_labno_hdi[cols].dropna()
cormatrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
p_val_matrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
for colx in cols:
for coly in cols:
if colx == coly:
cormatrix.loc[colx,coly] = 1.0
p_val_matrix.loc[colx,coly] = 1.0
elif(pd.isna(cormatrix.loc[colx,coly])):
corr_coef,p_value = spearmanr(df_fil[colx],df_fil[coly])
cormatrix.loc[colx,coly] = corr_coef
cormatrix.loc[coly,colx] = corr_coef
p_val_matrix.loc[colx,coly] = p_value
p_val_matrix.loc[coly,colx] = p_value
plt.figure(figsize=(10,8))
sns.heatmap(p_val_matrix,annot=True,annot_kws={"size": 6},cmap="binary_r",vmin=0.0,vmax=0.05)
plt.title("p-value of HDI indicators significantly correlated with the number of labs")
plt.tight_layout()
plt.figure(figsize=(10,8))
sns.heatmap(cormatrix,annot=True,annot_kws={"size": 6}, cmap="coolwarm",vmin=-1.0,vmax=1.0)
plt.title("co-efficience of HDI indicators significantly correlated with the number of labs")
plt.tight_layout()