Yosuke Tsuchiya - Fab Futures - Data Science

Key techniques of data analysis¶

Following the class sample code, here I show some key techniques for analysing data effectively.

Reading JSON data via HTTP requests¶

Here is a sample that fetches a JSON dataset via an HTTP request.

In [ ]:
import requests
import json
import pandas as pd

url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)

data = r.json()
df_lablist = pd.DataFrame(data)
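
If the request can fail (network issues, rate limiting), it is safer to check the response status before parsing; a minimal sketch:

In [ ]:
r = requests.get(url, timeout=30) #fail fast instead of hanging on a dead connection
r.raise_for_status()              #raise an HTTPError for 4xx/5xx responses

df_lablist = pd.DataFrame(r.json())
print(df_lablist.shape)           #quick sanity check: rows x columns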

Reading CSV data from local¶

Here is a sample that loads CSV data from your local PC.

In [ ]:
path_hdr25 = "./HDR25.csv" #define dataset path

import pandas as pd #import the pandas library
import numpy as np  #import NumPy

df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #read the dataset at "path_hdr25" defined above into a DataFrame with pandas
df_hdr25.fillna(0, inplace=True) #replace N/A with 0
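
After loading, it is worth a quick inspection to confirm the shape and column types before going further; for example:

In [ ]:
print(df_hdr25.shape)         #rows x columns
print(df_hdr25.dtypes.head()) #data types of the first few columns
df_hdr25.head()               #preview the first rows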
In [ ]:
cols = df_hdr25.columns #full column list; we filter it down to the 2023 columns in a later section

Data Integration Technique¶

If you want to integrate different datasets into one DataFrame, you need to find (or create) key data that connects them. Here, we want to integrate the "fab lab list" data and the "Human Development Report" data. We can find "country_code" in the lab list, so first we extract that column and save it to a separate CSV file.

In [ ]:
country_code = df_lablist['country_code']
country_code.to_csv('country_code.csv')
print(country_code.head(5))
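
Before converting the codes, it can help to see how many distinct values there are and whether any are missing; a quick check using the Series above:

In [ ]:
print(country_code.nunique())             #number of distinct country codes
print(country_code.isna().sum())          #labs with no country code recorded
print(country_code.value_counts().head()) #most common codes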

Checking the HDR data, we can find the "iso3" column, which holds the 3-letter country code.

In [ ]:
print(df_hdr25['iso3'].head(10))

So we need to convert the 2-letter "country_code" in the fab lab list into the 3-letter code, as follows:

In [ ]:
!pip install pycountry
In [ ]:
import pycountry

def alpha2_to_alpha3(code2):
    if not isinstance(code2, str): #guard against missing codes (e.g. NaN)
        return None
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None
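
A quick test of the converter (pycountry maps 'JP' to 'JPN'; unknown codes return None):

In [ ]:
print(alpha2_to_alpha3('jp')) #-> 'JPN'; the upper() call makes the lookup case-insensitive
print(alpha2_to_alpha3('xx')) #-> None for codes pycountry does not know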
In [ ]:
ccd = country_code.to_list()
In [ ]:
ccd3 = []
for c in ccd:
    cc = alpha2_to_alpha3(c)
    ccd3.append(cc)
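
The same conversion can be written as a list comprehension; it is also worth counting the codes that could not be mapped (a small sketch using the names above):

In [ ]:
ccd3 = [alpha2_to_alpha3(c) for c in ccd] #same result as the loop above
print(sum(1 for c in ccd3 if c is None), "codes could not be mapped")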

Add the 'ccd3' column to the lab list data, then "merge" both DataFrames as follows:

In [ ]:
df_lablist['ccd3'] = ccd3
df_lablist.loc[:,['country_code','ccd3']]
df_merge = pd.merge(df_lablist,df_hdr25,left_on='ccd3',right_on='iso3',how="left")
df_merge_tmp = df_merge.query('iso3 == "JPN"')
print(df_merge_tmp.loc[:,['name']])
#pd.merge(world,geo_df_fin,left_on='SOV_A3',right_on='country_code',how='left')
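
Because how="left" keeps every lab even when no HDR row matches, it is worth checking how many labs ended up without HDR data; for example:

In [ ]:
unmatched = df_merge[df_merge['iso3'].isna()] #rows whose ccd3 found no match in the HDR data
print(len(unmatched), "labs have no matching HDR row")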

Choosing the columns to use¶

The HDR data has a lot of columns (1192 in total!) because the yearly time-series index data is stored in a separate column per year. So we will extract only the 2023 HDR data, as follows.

In [ ]:
cols2023 = []
for item in cols:
    if '2023' in item:
        cols2023.append(item)
In [ ]:
cols2023 = ['iso3','country'] + cols2023
df_hdi2023 = df_hdr25.loc[:,cols2023]
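
pandas can also do this selection in one line with filter, which matches column labels by substring; an equivalent alternative to the loop above (df_hdi2023_alt is just an illustrative name):

In [ ]:
df_hdi2023_alt = pd.concat([df_hdr25[['iso3','country']],
                            df_hdr25.filter(like='2023')], axis=1) #keep the key columns plus every *_2023 column
print(df_hdi2023_alt.shape)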
In [ ]:
df_merge_2023 = pd.merge(df_lablist,df_hdi2023,left_on='ccd3',right_on='iso3',how="left")
print(df_merge_2023.loc[:,['name','kind_name','hdi_2023','country']].head(5))

Label Encoding¶

If your dataset contains categorical data, it should be converted into numerical labels; "Label Encoding" is used for this conversion.

In [ ]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded = le.fit_transform(df_merge_2023['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_merge_2023['kind_name_encoded'] = encoded

print('Classes found: ', le.classes_)
print('Mapping: fab_lab, mini_fab_lab, mobile ->', le.transform(['fab_lab', 'mini_fab_lab', 'mobile']))
print('Encoded result: ', encoded)
print('Decoded back: ', decoded)

encoded_country = le.fit_transform(df_merge_2023['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_merge_2023['country_encoded'] = encoded_country

print('Classes found: ', le.classes_)
print('Encoded result: ', encoded_country)
print('Decoded back: ', decoded_country)
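
Note that reusing the same LabelEncoder overwrites its classes_, so after fitting on 'country' the encoder can no longer decode the 'kind_name' labels. Keeping one encoder per column avoids this; a minimal sketch (astype(str) is an assumption here, to make any missing values encode cleanly):

In [ ]:
encoders = {}
for col in ['kind_name', 'country']:
    encoders[col] = LabelEncoder()
    #astype(str) turns NaN into the string 'nan' so LabelEncoder can sort the values
    df_merge_2023[col + '_encoded'] = encoders[col].fit_transform(df_merge_2023[col].astype(str).values)

print(encoders['kind_name'].classes_) #each encoder still remembers its own classes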
In [ ]:
count_result = df_merge_2023.groupby('ccd3',as_index=False)['country_encoded'].count()
count_result.sort_values('country_encoded',ascending=False)

Machine Learning by Scikit-learn¶

Here is an example of machine learning with scikit-learn. It shows how to build a classification model using the MLPClassifier (a multi-layer perceptron) to predict the lab kind.

In [ ]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
import matplotlib.pyplot as plt #needed for the loss-curve plot below
from datetime import datetime as dt
df_merge_2023 = df_merge_2023.fillna(0)
X = df_merge_2023.loc[:,['le_2023', 'eys_2023',
       'mys_2023', 'gnipc_2023', 'gdi_group_2023', 'gdi_2023', 'hdi_f_2023',
       'le_f_2023', 'eys_f_2023', 'mys_f_2023', 'gni_pc_f_2023', 'hdi_m_2023',
       'le_m_2023', 'eys_m_2023', 'mys_m_2023', 'gni_pc_m_2023', 'ihdi_2023',
       'coef_ineq_2023', 'loss_2023', 'ineq_le_2023', 'ineq_edu_2023',
       'ineq_inc_2023', 'gii_rank_2023', 'gii_2023', 'mmr_2023', 'abr_2023',
       'se_f_2023', 'se_m_2023', 'pr_f_2023', 'pr_m_2023', 'lfpr_f_2023',
       'lfpr_m_2023', 'rankdiff_hdi_phdi_2023', 'phdi_2023',
       'diff_hdi_phdi_2023', 'co2_prod_2023', 'mf_2023', 'pop_total_2023',
       'country_encoded']] #the target 'kind_name_encoded' is excluded from the features to avoid label leakage
columns = X.columns
X = X.to_numpy()
print(X.shape)

Y = df_merge_2023['kind_name_encoded'].to_numpy()
Y = np.ravel(Y)
print(Y.shape)

scaler = StandardScaler()
X = scaler.fit_transform(X)
#Y = scaler.fit_transform(Y)

X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size=0.2)

print(X_train.shape, X_test.shape,y_train.shape,y_test.shape)

model = MLPClassifier()

starttime = dt.now()
model.fit(X_train,y_train)
endtime = dt.now()
print("Predict:",model.score(X_test,y_test)," time:", (endtime.timestamp() - starttime.timestamp()))

plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()
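
To see which classes the model confuses, a confusion matrix is more informative than a single accuracy score; a short sketch using scikit-learn's metrics:

In [ ]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred)) #rows: true class, columns: predicted class
print(classification_report(y_test, y_pred, zero_division=0))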

Sample code to plot histograms of all variables in the dataset¶

Based on the class sample code, here is a customized version that plots histograms of all variables.

Caution: do not run this code on the jupyter server; run it on your local PC instead. I also recommend running it only once to save the result figure, then clearing the cell output.

In [ ]:
df_merge_2023.columns
df_fil = df_merge_2023.loc[:,['hdi_rank_2023', 'hdi_2023', 'le_2023', 'eys_2023',
       'mys_2023', 'gnipc_2023', 'gdi_group_2023', 'gdi_2023', 'hdi_f_2023',
       'le_f_2023', 'eys_f_2023', 'mys_f_2023', 'gni_pc_f_2023', 'hdi_m_2023',
       'le_m_2023', 'eys_m_2023', 'mys_m_2023', 'gni_pc_m_2023', 'ihdi_2023',
       'coef_ineq_2023', 'loss_2023', 'ineq_le_2023', 'ineq_edu_2023',
       'ineq_inc_2023', 'gii_rank_2023', 'gii_2023', 'mmr_2023', 'abr_2023',
       'se_f_2023', 'se_m_2023', 'pr_f_2023', 'pr_m_2023', 'lfpr_f_2023',
       'lfpr_m_2023', 'rankdiff_hdi_phdi_2023', 'phdi_2023',
       'diff_hdi_phdi_2023', 'co2_prod_2023', 'mf_2023', 'pop_total_2023',
       'kind_name_encoded', 'country_encoded']]
len(df_fil.columns)
In [ ]:
fil_col = df_fil.columns

#split the column list into rows of 4 for the subplot grid
newcols = [list(fil_col[i:i+4]) for i in range(0, len(fil_col), 4)]

plt.figure(figsize=(45,60))
position = 1
for i in range(len(newcols)):
    for j in range(len(newcols[i])):
        
        x = df_fil[newcols[i][j]]
        mean = np.mean(x)
        stddev = np.std(x)
        plt.subplot(12,4,position)
        plt.hist(x,10,density=True) #normalized histogram with 10 bins
        plt.plot(x,0*x,'|',ms=20)   #rug plot of the raw samples along the x axis
        plt.title(newcols[i][j])
        position = position + 1

        # overlay the gaussian with the same mean and standard deviation
        xi = np.linspace(mean - 3*stddev,mean+3*stddev,100)
        yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
        plt.plot(xi,yi,'r')

#plt.savefig('./notupload/histgrams.png')
plt.tight_layout()
plt.show()
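
For a quick look without the manual grid bookkeeping, pandas can draw all the histograms in one call (assuming every selected column is numeric; this skips the Gaussian overlays):

In [ ]:
df_fil.hist(bins=10, figsize=(45,60)) #one subplot per column, laid out automatically
plt.tight_layout()
plt.show()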

Sample code to plot "averaging" of all variables in the dataset¶

Based on the class sample code, here is a customized version that plots the "averaging" of all variables: for each variable, the mean is estimated from random samples of increasing size N, and the spread of those estimates is compared with the analytic standard error of the mean, stddev/sqrt(N) (the red curves).

Caution: do not run this code on the jupyter server; run it on your local PC instead. I also recommend running it only once to save the result figure, then clearing the cell output.

In [ ]:
print(len(fil_col))

#split the column list into rows of 4 for the subplot grid (same as in the histogram section)
newcols = [list(fil_col[i:i+4]) for i in range(0, len(fil_col), 4)]

plt.figure(figsize=(45,60))
position = 1
for i in range(len(newcols)):
    for j in range(len(newcols[i])):
        
        x = df_fil[newcols[i][j]]
        mean = np.mean(x)
        stddev = np.std(x)
        trials = 100
        points = np.arange(10,1000,100)
        means = np.zeros((100,len(points)))

        for k,N in enumerate(points):
            for t in range(trials):
                sample = np.random.choice(x,size=N,replace=False)
                means[t,k] = np.mean(sample)

        plt.subplot(11,4,position)
        plt.plot(points,mean+stddev/np.sqrt(points), 'r', label='calculated')
        plt.plot(points,mean-stddev/np.sqrt(points),'r')

        mean = np.mean(means,0)
        stddev = np.std(means,0)
        plt.errorbar(points,mean,yerr=stddev,fmt='k-o',capsize=7,label="estimated")

        for point in range(len(points)):
            plt.plot(np.full(trials,points[point]),means[:,point],'o',markersize=2)

        plt.xlabel("number of sample averaged")
        plt.ylabel(f"mean estimated of {newcols[i][j]}")

        position = position + 1

plt.tight_layout()
plt.show()

Statistics: Correlation¶

It is better to check the correlation coefficient between each pair of variables. Here is sample code that uses scipy.stats (the Spearman coefficient) and plots the result with seaborn.

In [ ]:
from scipy.stats import spearmanr
import numpy as np
import seaborn as sns

cols = df_fil.columns

cormatrix = pd.DataFrame(index=cols,columns=cols,dtype=float)
p_val_matrix = pd.DataFrame(index=cols,columns=cols,dtype=float)

for colx in cols:
    for coly in cols:
        if colx == coly:
            cormatrix.loc[colx,coly] = 1.0
            p_val_matrix.loc[colx,coly] = 1.0
        elif(pd.isna(cormatrix.loc[colx,coly])):
            corr_coef,p_value = spearmanr(df_fil[colx],df_fil[coly])

            cormatrix.loc[colx,coly] = corr_coef
            cormatrix.loc[coly,colx] = corr_coef
            p_val_matrix.loc[colx,coly] = p_value
            p_val_matrix.loc[coly,colx] = p_value

plt.figure(figsize=(16,16))
sns.heatmap(p_val_matrix,annot=True,annot_kws={"size": 6},cmap="binary_r",vmin=0.0,vmax=0.05)
plt.title("p-value of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/sp-pval.png')
plt.show()

plt.figure(figsize=(16,16))
sns.heatmap(cormatrix,annot=True,annot_kws={"size": 6}, cmap="coolwarm",vmin=-1.0,vmax=1.0)
plt.title("co-efficience of spearman correlation in all variables")
plt.tight_layout()
#plt.savefig('./notupload/spearman.png')
plt.show()
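
scipy can also compute the whole Spearman matrix in one call when given a 2-D array, which avoids the double loop (the result matches the loop above except for the diagonal p-values, which the loop pins to 1.0):

In [ ]:
rho, pval = spearmanr(df_fil.to_numpy()) #full NxN coefficient and p-value matrices at once
cormatrix_alt = pd.DataFrame(rho, index=cols, columns=cols)
print(cormatrix_alt.iloc[:3,:3])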

Multidimensional Distribution¶

This is much the same as the above: plot the variance and covariance of each pair of variables. Do not run this on the fabcloud jupyter server.

In [ ]:
cols = df_fil.columns

check_conb = []
for colx in cols:
    for coly in cols:
        if colx == coly:
            continue

        p = p_val_matrix.at[colx,coly]

        if p < 0.05:
            #skip this pair if the reversed combination is already stored
            if any(r[0] == coly and r[1] == colx for r in check_conb):
                continue
            coeff = cormatrix.at[colx,coly]
            check_conb.append([colx,coly,p,coeff])
position = 1

#plt.figure(figsize=(30,120))
for r in check_conb:
    
    a = df_fil[r[0]].to_numpy()
    b = df_fil[r[1]].to_numpy()
    
    #a = df_fil['bed_temp_current'].to_numpy()
    #b = df_fil['bed_heater_power'].to_numpy()
    
    samples = np.column_stack((a,b))
    
    varmean = np.mean(samples,axis=0)
    varstd = np.sqrt(np.var(samples,axis=0))
    varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
    varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
    
    covarmean = np.mean(samples,axis=0)
    covar = np.cov(samples,rowvar=False)
    evalu,evect = np.linalg.eig(covar)
    dx0 = evect[0,0] * np.sqrt(evalu[0])
    dx1 = evect[1,0] * np.sqrt(evalu[1])
    dy0 = evect[0,1] * np.sqrt(evalu[0])
    dy1 = evect[1,1] * np.sqrt(evalu[1])
    covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
    covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
    #
    # plot and print
    #

    #plt.figure(figsize=(30,30))
    #plt.subplot(12,3,position)
    
    #plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
    #plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
    #plt.plot(varmean[0],varmean[1],'go')
    #plt.plot(covarmean[0],covarmean[1],'ro')
    #plt.plot(varplotx,varploty,'g',lw=4)
    #plt.plot(covarplotx,covarploty,'r',lw=4)
    #plt.text(2.5,-4,"variance",fontsize=15)
    #plt.text(-1.25,8.5,"covariance",fontsize=15)
    #plt.axis('off')
    #plt.title(f"{r[0]},{r[1]},pval:{r[2]:.4f},coeff:{r[3]:.4f}",fontsize=18)
    #position = position + 1
    
#plt.tight_layout()
#plt.savefig('./covariance_sk.png')
#plt.show()

Mutual Information of All Pairs of Variables¶

Mutual information measures how much knowing one variable reduces the uncertainty about another; here it is estimated from binned (histogram) distributions.

In [ ]:
nbins = 256
# 1-dimensional entropy
def entropy(dist):
    index = np.where(dist > 0) # 0 log(0) = 0
    positives = dist[index]
    return -np.sum(positives*np.log2(positives))

# 2-dimensional entropy
def entropy2(dist):
    indexx,indexy = np.where(dist > 0) # 0 log(0) = 0
    positives = dist[indexx,indexy]
    return -np.sum(positives*np.log2(positives))

# mutual information I(x;y) = H(x) + H(y) - H(x,y)
def information(x,y):
    xhist,xedges = np.histogram(x,nbins)
    xdist = xhist/np.sum(xhist)
    yhist,yedges = np.histogram(y,nbins)
    ydist = yhist/np.sum(yhist)
    xyhist,xedges,yedges = np.histogram2d(x,y,[nbins,nbins])
    xydist = xyhist/np.sum(xyhist)
    Hx = entropy(xdist)
    Hy = entropy(ydist)
    Hxy = entropy2(xydist)

    return Hx+Hy-Hxy
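
As a sanity check: the mutual information of a variable with itself equals its entropy under this binning, while two independent variables should give a much smaller value (finite-sample binning bias keeps it above exactly 0). A small sketch with synthetic data:

In [ ]:
rng = np.random.default_rng(0)
a = rng.normal(size=10000)
b = rng.normal(size=10000) #independent of a
print(f"I(a;a) = {information(a,a):.2f} bits") #equals H(a) for this binning
print(f"I(a;b) = {information(a,b):.2f} bits") #much smaller; not exactly 0 due to binning bias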
In [ ]:
check_conb = []
for colx in cols:
    for coly in cols:
        if colx == coly:
            continue

        p = p_val_matrix.at[colx,coly]

        if p < 0.05:
            #skip this pair if the reversed combination is already stored
            if any(r[0] == coly and r[1] == colx for r in check_conb):
                continue
            coeff = cormatrix.at[colx,coly]
            check_conb.append([colx,coly,p,coeff])

position = 1
np.set_printoptions(precision=4)

results = []
#plt.figure(figsize=(30,120))
for r in check_conb:
    
    x = df_fil[r[0]].to_numpy()
    y = df_fil[r[1]].to_numpy()
    
    samples = np.column_stack((x,y))
    covar = np.cov(samples,rowvar=False)
    #print(f"{npts:.0e} points")
    #print(f"covariance:\n{covar}")

    #plt.subplot(18,3,position)
    I = information(x,y)
    #plt.scatter(x,y)
    #plt.title(f"{r[0]},{r[1]} mutual information: {I:.1f} bits, coeff: {r[3]:.3f}",fontsize=14)

    results.append([r[0],r[1],r[2],r[3],I,covar])


#plt.tight_layout()
#plt.savefig('./notupload/mi.png')
#plt.show()

pd.set_option('display.float_format', '{:.8f}'.format)
im_df = pd.DataFrame(results)
im_df.columns = ['val1','val2','p-val','coeff','IM','covar']
im_df = im_df.sort_values('IM',ascending=False)
im_df.to_csv('./imrank-sk.csv')

im_df = im_df.head(15)
position = 1
plt.figure(figsize=(30,120))
for i,row in im_df.iterrows():
    val1 = row['val1']
    val2 = row['val2']
    im = row['IM']
    coeff = row['coeff']
    covar = row['covar']
    
    x = df_fil[val1].to_numpy()
    y = df_fil[val2].to_numpy()
    plt.subplot(18,3,position)
    plt.scatter(x,y)
    plt.title(f"{val1},{val2},MI:{im:.1f} bits, coeff:{coeff:.3f}, covar:{covar}")

    position = position + 1

plt.tight_layout()
plt.savefig('./images/mi-sk.png')
plt.show()