[Your-Name-Here] - Fab Futures - Data Science
Home About

< Home

Week3-1: Probability¶

I would like to see what kinds of developping countries are good for setting up FabLab using HDI.

Histogram¶

First, I tried histgram referring Neil's and Tshuchiya-san's code
First, read the data.

In [13]:
path_hdr25 = "./datasets/hdr25.csv"
!pip install pycountry  #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np  #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #df = data flame = read dataset path "path_hdr25" defined above with panda 
df_hdr25.fillna(0, inplace=True) #replace N/A to 0 
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMarge
def alpha2_to_alpha3(code2):
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Marge HDI with Lab(Number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Encoding
from sklearn.preprocessing import LabelEncoder
#Encoding for HDI x Lab Kind
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
#Encoding for HDI x Lab Number list
df_labno_hdi['country_encoded'] = encoded_country
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
Collecting pycountry
  Using cached pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Using cached pycountry-24.6.1-py3-none-any.whl (6.3 MB)
Installing collected packages: pycountry
Successfully installed pycountry-24.6.1
In [14]:
fil_col = ['le_2023','mys_2023','gnipc_2023','le_2022','mys_2022','gnipc_2022','le_2021','mys_2021','gnipc_2021','le_2020','mys_2020','gnipc_2020']

newcols = []
row = []

for col in fil_col:

    if len(row) < 3:
        row.append(col)
        if col == fil_col[-1]:
            newcols.append(row)
            
    else:
        if col == 'kind_name_encoded' or col == "country_encoded":
            print(f"{col} has come here. else")
        newcols.append(row)
        row = []
        row.append(col)

plt.figure(figsize=(10,10))
position = 1
for i in range(4):
    for j in range(3):
        
        x = df_labno_hdi[newcols[i][j]]
        mean = np.mean(x,0)
        stddev = np.std(x,0)
        plt.subplot(4,3,position)
        plt.hist(x,100,density=True)
        plt.plot(x,0*x,'|',ms=20)
        plt.title(newcols[i][j])
        position = position + 1

        # plot gaussian
        xi = np.linspace(mean - 3*stddev,mean+3*stddev,100)
        yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
        plt.plot(xi,yi,'r')

plt
Out[14]:
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
No description has been provided for this image
In [15]:
import numpy as np
import matplotlib.pyplot as plt

fil_col = [
    'le_2023','mys_2023','gnipc_2023',
    'le_2022','mys_2022','gnipc_2022',
    'le_2021','mys_2021','gnipc_2021',
    'le_2020','mys_2020','gnipc_2020'
]

# --- newcols(3つずつまとめる) ---
newcols = []
row = []

for col in fil_col:
    if len(row) < 3:
        row.append(col)
        if col == fil_col[-1]:
            newcols.append(row)
    else:
        newcols.append(row)
        row = [col]

# --- 描画 ---
plt.figure(figsize=(12, 12))
position = 1

for i in range(4):
    for j in range(3):

        colname = newcols[i][j]
        x = df_labno_hdi[colname].dropna().values

        mean = np.mean(x)
        stddev = np.std(x)

        # --- subplot ---
        plt.subplot(4, 3, position)

        # histogram
        bins = max(10, len(x) // 50)
        plt.hist(x, bins=bins, density=True)

        # vertical sample marks
        plt.plot(x, np.zeros_like(x), '|', ms=max(5, len(x) / 20))

        # Gaussian curve
        xi = np.linspace(mean - 3*stddev, mean + 3*stddev, 200)
        yi = np.exp(-(xi - mean)**2 / (2 * stddev**2)) / np.sqrt(2 * np.pi * stddev**2)
        plt.plot(xi, yi, 'r')

        plt.title(colname)

        position += 1

plt.tight_layout()
plt
Out[15]:
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
No description has been provided for this image

Averaging¶

I also tried Averaging. Same as last time I just followed Neil's and Tsuchiya-san's code.
First result seems strange, but I'm still not quite sure what is the problem here...

In [16]:
fil_col = ['le_2023','mys_2023','gnipc_2023','le_2022','mys_2022','gnipc_2022','le_2021','mys_2021','gnipc_2021','le_2020','mys_2020','gnipc_2020']

newcols = []
row = []
print(len(fil_col))
for col in fil_col:

    if len(row) < 3:
        row.append(col)
        if col == fil_col[-1]:
            newcols.append(row)
            
    else:
        if col == 'kind_name_encoded' or col == "country_encoded":
            print(f"{col} has come here. else")
        newcols.append(row)
        row = []
        row.append(col)

plt.figure(figsize=(10,10))
position = 1
for i in range(4):
    for j in range(3):
        
        x = df_kind_hdi[newcols[i][j]]
        mean = np.mean(x)
        stddev = np.std(x)
        trials = 100
        points = np.arange(10, len(x), 100)
        means = np.zeros((100,len(points)))

        for k,N in enumerate(points):
            for t in range(trials):
                sample = np.random.choice(x,size=N,replace=False)
                means[t,k] = np.mean(sample)

        plt.subplot(4,3,position)
        plt.plot(points,mean+stddev/np.sqrt(points), 'r', label='calculated')
        plt.plot(points,mean-stddev/np.sqrt(points),'r')

        mean = np.mean(means,0)
        stddev = np.std(means,0)
        plt.errorbar(points,mean,yerr=stddev,fmt='k-o',capsize=7,label="estimated")

        for point in range(len(points)):
            plt.plot(np.full(trials,points[point]),means[:,point],'o',markersize=2)

        plt.xlabel("number of sample averaged")
        plt.ylabel(f"mean estimated of {newcols[i][j]}")

        position = position + 1

plt.tight_layout()
plt
12
Out[16]:
<module 'matplotlib.pyplot' from '/opt/conda/lib/python3.13/site-packages/matplotlib/pyplot.py'>
No description has been provided for this image
In [17]:
import numpy as np
import matplotlib.pyplot as plt

# 対象となる列(例:newcols[0][0])
target_col = newcols[0][0]

x = df_kind_hdi[target_col].dropna().values
mean_true = np.mean(x)
width = np.std(x)

trials = 100
points = np.arange(10, len(x), 25)
means = np.zeros((trials, len(points)))

# メインループ(下のコード構造)
for p in range(len(points)):
    N = points[p]
    for t in range(trials):
        sample = np.random.choice(x, size=N, replace=False)
        means[t, p] = np.mean(sample)

# calculated lines
plt.plot(points, mean_true + width / np.sqrt(points), 'r')
plt.plot(points, mean_true - width / np.sqrt(points), 'r')

# estimated values
mean_est = np.mean(means, axis=0)
std_est = np.std(means, axis=0)

plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7)

# scatter plot
for p in range(len(points)):
    plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)

plt.xlabel("number of samples averaged")
plt.ylabel(f"mean estimated of {target_col}")
plt.show()
No description has been provided for this image
In [18]:
import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 12))

position = 1  # サブプロット位置

for i in range(4):          # ← 4 行
    for j in range(3):      # ← 3 列

        target_col = newcols[i][j]          # 使用する列名
        x = df_kind_hdi[target_col].dropna().values
        
        if len(x) < 20:
            print(f"⚠ データ数が少ないためスキップ: {target_col}")
            continue

        # 基本統計量
        mean_true = np.mean(x)
        width = np.std(x)

        trials = 100
        points = np.arange(10, len(x), 25)   # データ数以内に制限
        means = np.zeros((trials, len(points)))

        # メインループ(下のコードの構造)
        for p in range(len(points)):
            N = points[p]
            for t in range(trials):
                sample = np.random.choice(x, size=N, replace=False)
                means[t, p] = np.mean(sample)

        # --- プロット ---
        plt.subplot(4, 3, position)

        # calculated lines
        plt.plot(points, mean_true + width / np.sqrt(points), 'r')
        plt.plot(points, mean_true - width / np.sqrt(points), 'r')

        # estimated values
        mean_est = np.mean(means, axis=0)
        std_est = np.std(means, axis=0)
        plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7)

        # scatter points
        for p in range(len(points)):
            plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)

        plt.xlabel("num samples")
        plt.ylabel(target_col)

        position += 1

plt.tight_layout()
plt.show()
No description has been provided for this image

Covariance¶

I also tried to see covariance. Same as above, I followed Neil's and Tsuchiya-san's code.

In [19]:
# HDI × Lab数(欠損除外)
covarsamples = df_labno_hdi[['hdi_2023', 'lab_count']].dropna().values

#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
#
# plot and print
#
plt.figure(figsize=(6,6))

# 散布図
plt.scatter(
    covarsamples[:,0],
    covarsamples[:,1],
    alpha=0.5
)

# 平均点
plt.plot(covarmean[0], covarmean[1], 'ro')

# 共分散の主軸
plt.plot(covarplotx, covarploty, 'r')

plt.xlabel("HDI")
plt.ylabel("Number of FabLabs")
plt.title("HDI vs FabLab Count (with covariance)")
plt.show()
No description has been provided for this image

I thought this one is nice to see the relationship between LabNo and HDI, so I tried this with all HDI index in 2023.

Likely a positive correlation

  • Human Development Index(HDI)
  • Life Expectancy at Birth(LE)
  • Expected Years of Schooling(EYS)
  • Mean Years of Schooling(MYS)
  • Gender Development Index(GDI)
  • Inequality-adjusted Human Development Index(IHDI)
  • Share of seats in parliament male/female(PR)
  • Planetary pressures–adjusted Human Development Index(PHDI)
  • Difference from HDI value (DIFF HDI)
  • Carbon dioxide emissions per capita (production) (COEF)
  • Material footprint per capita(MF)

Likely a negative correlation

  • Coefficient of human inequality(COEF INEQ)
  • Overall loss(LOSS)
  • Inequality in life expectancy(INEQ LE)
  • Inequality in eduation(INEQ EDU)
  • Gender Inequality Index (GII)
  • Labour force participation rate(ages 15 and older)(LFPR)


Additionaly, there was not much difference between the gender-specific indicators and the overall indicators.

Select only 2023¶

In [20]:
cols = df_labno_hdi.columns
cols2023 = []
for item in cols:
    if '2023' in item:
        cols2023.append(item)
#cols2023
#[c for c in cols2023 if 'rank' in c.lower()]
cols2023 = [c for c in cols2023 if 'rank' not in c.lower()]
cols2023 = [c for c in cols2023 if 'hdi' not in c.lower()]
In [21]:
# Lab数
y_col = 'lab_count'

df_fil = df_labno_hdi[['lab_count'] + cols2023].dropna()

check_conb = []

for col in cols2023:
    tmp = df_labno_hdi[[col, y_col]].dropna()
    
    if len(tmp) < 5:
        continue  # データ少なすぎ防止
    
    corr = np.corrcoef(tmp[col], tmp[y_col])[0,1]
    check_conb.append([col, y_col, corr])

nplots = len(check_conb)
ncols = 3                      
nrows = int(np.ceil(nplots / ncols))

plt.figure(figsize=(6 * ncols, 5 * nrows))
position = 1

for r in check_conb:
    
    a = df_labno_hdi[r[0]].to_numpy()
    b = df_labno_hdi[r[1]].to_numpy()
    
    mask = ~np.isnan(a) & ~np.isnan(b)
    samples = np.column_stack((a[mask], b[mask]))
    
    # --- variance ---
    varmean = np.mean(samples, axis=0)
    varstd = np.sqrt(np.var(samples, axis=0))
    
    varplotx = [
        varmean[0]-varstd[0], varmean[0]+varstd[0],
        None,
        varmean[0], varmean[0]
    ]
    varploty = [
        varmean[1], varmean[1],
        None,
        varmean[1]-varstd[1], varmean[1]+varstd[1]
    ]
    
    # --- covariance ---
    covarmean = np.mean(samples, axis=0)
    covar = np.cov(samples, rowvar=False)
    evalu, evect = np.linalg.eig(covar)
    
    dx0 = evect[0,0] * np.sqrt(evalu[0])
    dx1 = evect[1,0] * np.sqrt(evalu[1])
    dy0 = evect[0,1] * np.sqrt(evalu[0])
    dy1 = evect[1,1] * np.sqrt(evalu[1])
    
    covarplotx = [
        covarmean[0]-dx0, covarmean[0]+dx0,
        None,
        covarmean[0]-dx1, covarmean[0]+dx1
    ]
    covarploty = [
        covarmean[1]+dy0, covarmean[1]-dy0,
        None,
        covarmean[1]+dy1, covarmean[1]-dy1
    ]
    
    # --- plot ---
    plt.subplot(nrows, ncols, position)
    plt.scatter(samples[:,0], samples[:,1], alpha=0.4)
    plt.plot(varplotx, varploty, 'g', lw=2)
    plt.plot(covarplotx, covarploty, 'r', lw=2)
    plt.plot(covarmean[0], covarmean[1], 'ro')
    
    plt.xlabel(r[0])
    plt.ylabel("Lab count")
    plt.title(f"{r[0]} × Lab count (corr={r[2]:.2f})")
    
    position += 1

#plt.tight_layout()
#plt.show()
No description has been provided for this image

With CHatGPT's suggestion, I also tried to see which index is most affected to lab No. The code is from ChatGPT.

In [22]:
hdi_cols = cols2023

n = len(hdi_cols)
ncols = 3
nrows = int(np.ceil(n / ncols))

plt.figure(figsize=(15, 4*nrows))

for i, col in enumerate(hdi_cols, start=1):

    tmp = df_labno_hdi[[col, 'lab_count']].dropna()

    x = tmp[col].values
    y = tmp['lab_count'].values

    plt.subplot(nrows, ncols, i)
    plt.scatter(x, y, alpha=0.4)

    plt.xlabel(col)
    plt.ylabel("Lab count")
    plt.title(col)

plt.tight_layout()
plt.show()

scores = []

for col in hdi_cols:
    tmp = df_labno_hdi[[col, 'lab_count']].dropna()
    corr = np.corrcoef(tmp[col], tmp['lab_count'])[0,1]
    scores.append((col, corr))

scores = sorted(scores, key=lambda x: abs(x[1]), reverse=True)

scores

top_cols = [s[0] for s in scores[:3]]

plt.figure(figsize=(6, 5*len(top_cols)))
pos = 1

for col in top_cols:

    tmp = df_labno_hdi[[col, 'lab_count']].dropna()
    samples = tmp.values

    mean = np.mean(samples, axis=0)
    cov = np.cov(samples, rowvar=False)
    eigval, eigvec = np.linalg.eig(cov)

    dx = eigvec[0,0] * np.sqrt(eigval[0])
    dy = eigvec[1,0] * np.sqrt(eigval[0])

    plt.subplot(len(top_cols), 1, pos)
    plt.scatter(samples[:,0], samples[:,1], alpha=0.4)
    plt.plot(mean[0], mean[1], 'ro')
    plt.plot(
        [mean[0]-dx, mean[0]+dx],
        [mean[1]-dy, mean[1]+dy],
        'r', lw=2
    )

    plt.xlabel(col)
    plt.ylabel("Lab count")
    plt.title(f"{col} (corr={scores[top_cols.index(col)][1]:.2f})")

    pos += 1

plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image

Correlation Analysis¶

To see the Correlation Analysis, I also followed Tsuchiya-san's code

  • 1st graph
  • Close to black: Small p-value (significant)
  • Close to white: Large p-value (not significant)
  • 2nd graph
  • Red: Positive correlation (increases with increase)
  • Blue: Negative correlation (decreases with increase)
  • White: Almost no correlation
    It seems there are not significant relationship...
    At reast, "Share of seats in parliament(PR)" and "Labour force participation rate(LFPR)" is not so important for Lab Nos.
In [23]:
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns

cols = ['lab_count'] + cols2023
df_fil = df_labno_hdi[cols].dropna()

cormatrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
p_val_matrix = pd.DataFrame(index=cols,columns=cols,dtype=float)

for colx in cols:
    for coly in cols:
        if colx == coly:
            cormatrix.loc[colx,coly] = 1.0
            p_val_matrix.loc[colx,coly] = 1.0
        elif(pd.isna(cormatrix.loc[colx,coly])):
            corr_coef,p_value = spearmanr(df_fil[colx],df_fil[coly])

            cormatrix.loc[colx,coly] = corr_coef
            cormatrix.loc[coly,colx] = corr_coef
            p_val_matrix.loc[colx,coly] = p_value
            p_val_matrix.loc[coly,colx] = p_value

plt.figure(figsize=(10,8))
sns.heatmap(p_val_matrix,annot=True,annot_kws={"size": 6},cmap="binary_r",vmin=0.0,vmax=0.05)
plt.title("p-value of HDI indicators significantly correlated with the number of labs")
plt.tight_layout()

plt.figure(figsize=(10,8))
sns.heatmap(cormatrix,annot=True,annot_kws={"size": 6}, cmap="coolwarm",vmin=-1.0,vmax=1.0)
plt.title("co-efficience of HDI indicators significantly correlated with the number of labs")
plt.tight_layout()
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: