< Home
Week2-2: Machine Learning¶
I would like to see what kinds of developping countries are good for setting up FabLab using HDI.
In conclusion, in the current FabLab database, the overwhelming majority of lab types are FabLab, so it appears that predicting lab types using HDI may be difficult.
In [84]:
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
#HDI
df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8', encoding_errors='ignore') #df = data flame = read dataset path "path_hdr25" defined above with panda
df_hdr25.fillna(0, inplace=True) #replace N/A to 0
#Lab list
url = 'https://api.fablabs.io/0/labs.json'
r = requests.get(url)
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMarge
def alpha2_to_alpha3(code2):
country = pycountry.countries.get(alpha_2=code2.upper())
return country.alpha_3 if country else None
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)
#CountLabNo
df_labcount = (df_lablist.groupby('ccd3').agg(lab_count=('id', 'count')).reset_index())
#CountLabKind
df_kindcount = (df_lablist.groupby(['ccd3', 'kind_name']).agg(kind_count=('id', 'count')).reset_index())
#Marge HDI with Lab(Number)
df_labcount = df_lablist.groupby('country_code').agg(lab_count=('id', 'count')).reset_index()
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)
df_labno_hdi = pd.merge(df_labcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
#Marge HDI with Lab(bykind)
df_kind_hdi = pd.merge(df_kindcount,df_hdr25,left_on='ccd3',right_on='iso3',how='left')
Encoding¶
(Follow Tsuchiya-san's instruction)
Encode for HDI x Lab Kind list
In [48]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_labno_hdi['country_encoded'] = encoded_country
#print('存在するクラス: ', le.classes_)
#print('エンコード結果: ', encoded_country)
#print('元に戻す: ', decoded_country)
Encode for HDI x Lab Number list
In [51]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
decoded = le.inverse_transform(encoded)
df_kind_hdi['kind_name_encoded'] = encoded
print('存在するクラス: ', le.classes_)
print('変換先: a, b, c ->', le.transform(['fab_lab', 'mini_fab_lab', 'mobile']))
#print('エンコード結果: ', encoded)
#print('元に戻す: ', decoded)
存在するクラス: ['fab_lab' 'mini_fab_lab' 'mobile'] 変換先: a, b, c -> [0 1 2]
In [53]:
encoded_country = le.fit_transform(df_kind_hdi['country'].values)
decoded_country = le.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country
#print('存在するクラス: ', le.classes_)
#print('エンコード結果: ', encoded_country)
#print('元に戻す: ', decoded_country)
存在するクラス: ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia' 'Australia' 'Austria' 'Bahrain' 'Bangladesh' 'Belarus' 'Belgium' 'Benin' 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina' 'Brazil' 'Bulgaria' 'Burkina Faso' 'Cambodia' 'Cameroon' 'Canada' 'Chad' 'Chile' 'China' 'Colombia' 'Congo' 'Congo (Democratic Republic of the)' 'Costa Rica' 'Croatia' "Cte d'Ivoire" 'Cyprus' 'Czechia' 'Denmark' 'Djibouti' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Ethiopia' 'Finland' 'France' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Guatemala' 'Guinea' 'Honduras' 'Hong Kong, China (SAR)' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran (Islamic Republic of)' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Korea (Republic of)' 'Kuwait' 'Kyrgyzstan' 'Latvia' 'Lebanon' 'Libya' 'Lithuania' 'Luxembourg' 'Madagascar' 'Malaysia' 'Mali' 'Malta' 'Mauritania' 'Mexico' 'Moldova (Republic of)' 'Mongolia' 'Montenegro' 'Morocco' 'Myanmar' 'Namibia' 'Nepal' 'Netherlands' 'New Zealand' 'Niger' 'Nigeria' 'North Macedonia' 'Norway' 'Oman' 'Pakistan' 'Palestine, State of' 'Panama' 'Paraguay' 'Peru' 'Philippines' 'Poland' 'Portugal' 'Qatar' 'Romania' 'Russian Federation' 'Rwanda' 'Saudi Arabia' 'Senegal' 'Serbia' 'Sierra Leone' 'Singapore' 'Slovakia' 'Slovenia' 'Somalia' 'South Africa' 'Spain' 'Sri Lanka' 'Suriname' 'Sweden' 'Switzerland' 'Syrian Arab Republic' 'Tanzania (United Republic of)' 'Thailand' 'Togo' 'Trinidad and Tobago' 'Trkiye' 'Tunisia' 'Ukraine' 'United Arab Emirates' 'United Kingdom' 'United States' 'Uruguay' 'Uzbekistan' 'Venezuela (Bolivarian Republic of)' 'Viet Nam' nan]
Confirmation
In [61]:
df_kind_hdi.loc[:,['country','country_encoded','kind_name','kind_name_encoded']].query('kind_name_encoded == 2')
Out[61]:
| country | country_encoded | kind_name | kind_name_encoded | |
|---|---|---|---|---|
| 6 | United Arab Emirates | 126 | mobile | 2 |
| 28 | Brazil | 16 | mobile | 2 |
| 38 | China | 24 | mobile | 2 |
| 51 | Czechia | 32 | mobile | 2 |
| 54 | Germany | 43 | mobile | 2 |
| 62 | Egypt | 36 | mobile | 2 |
| 65 | Spain | 113 | mobile | 2 |
| 71 | France | 41 | mobile | 2 |
| 81 | Greece | 45 | mobile | 2 |
| 89 | Croatia | 29 | mobile | 2 |
| 95 | India | 52 | mobile | 2 |
| 119 | Korea (Republic of) | 64 | mobile | 2 |
| 121 | Kuwait | 65 | mobile | 2 |
| 127 | Sri Lanka | 114 | mobile | 2 |
| 136 | Morocco | 81 | mobile | 2 |
| 161 | Netherlands | 85 | mobile | 2 |
| 179 | Poland | 98 | mobile | 2 |
| 186 | Paraguay | 95 | mobile | 2 |
| 190 | Qatar | 100 | mobile | 2 |
| 215 | Sweden | 116 | mobile | 2 |
| 225 | Trkiye | 123 | mobile | 2 |
| 235 | United States | 128 | mobile | 2 |
In [55]:
df_kind_hdi.columns
Out[55]:
Index(['ccd3', 'kind_name', 'kind_count', 'iso3', 'country', 'hdicode',
'region', 'hdi_rank_2023', 'hdi_1990', 'hdi_1991',
...
'pop_total_2016', 'pop_total_2017', 'pop_total_2018', 'pop_total_2019',
'pop_total_2020', 'pop_total_2021', 'pop_total_2022', 'pop_total_2023',
'kind_name_encoded', 'country_encoded'],
dtype='object', length=1117)
Machine Learning by Scikit-learn¶
Predict Lab Kinds¶
In [75]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt
df_kind_hdi = df_kind_hdi.fillna(0)
X = df_kind_hdi.loc[:,['le_2023','mys_2023','gnipc_2023']]
columns = X.columns
X = X.to_numpy()
print(X.shape)
Y = df_kind_hdi['kind_name_encoded'].to_numpy()
Y = np.ravel(Y)
print(Y.shape)
scaler = StandardScaler()
X = scaler.fit_transform(X)
#Y = scaler.fit_transform(Y)
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size=0.2)
print(X_train.shape, X_test.shape,y_train.shape,y_test.shape)
model = MLPClassifier()
starttime = dt.now()
model.fit(X_train,y_train)
endtime = dt.now()
print("Predict:",model.score(X_test,y_test)," time:", (endtime.timestamp() - starttime.timestamp()))
plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()
(242, 3) (242,) (193, 3) (49, 3) (193,) (49,) Predict: 0.6122448979591837 time: 0.09344983100891113
In [76]:
predict = model.predict(X_test)
predict
Out[76]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0])
In [77]:
y_test
Out[77]:
array([0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 2, 0, 2, 0, 0,
0, 1, 0, 0, 1])
In [78]:
target = np.array([11.765668,55060.160380,83.325]).reshape(1,-1)
target = scaler.fit_transform(target)
predict = model.predict(target)
predict
Out[78]:
array([0])
Predict Lab Nos¶
In [85]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt
df_labno_hdi = df_labno_hdi.fillna(0)
X = df_labno_hdi.loc[:,['le_2023','mys_2023','gnipc_2023']]
columns = X.columns
X = X.to_numpy()
print(X.shape)
Y = df_labno_hdi['lab_count'].to_numpy()
Y = np.ravel(Y)
print(Y.shape)
scaler = StandardScaler()
X = scaler.fit_transform(X)
#Y = scaler.fit_transform(Y)
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size=0.2)
print(X_train.shape, X_test.shape,y_train.shape,y_test.shape)
model = MLPClassifier()
starttime = dt.now()
model.fit(X_train,y_train)
endtime = dt.now()
print("Predict:",model.score(X_test,y_test)," time:", (endtime.timestamp() - starttime.timestamp()))
plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()
(144, 3) (144,) (115, 3) (29, 3) (115,) (29,) Predict: 0.10344827586206896 time: 0.16406512260437012
/opt/conda/lib/python3.13/site-packages/sklearn/neural_network/_multilayer_perceptron.py:781: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn( /opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:98: UserWarning: The number of unique classes is greater than 50% of the number of samples. `y` could represent a regression problem, not a classification problem. type_true = type_of_target(y_true, input_name="y_true")
Add years¶
In [102]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# --- 安全にコピーして使う(元の df_kind_hdi を上書きしない) ---
df = df_kind_hdi.copy()
# --- 列名の前後空白を除去しておく(念のため) ---
df.columns = df.columns.str.strip()
# --- ラベルエンコード(元の 'country' / 'kind_name' が存在することを前提) ---
if 'country' not in df.columns or 'kind_name' not in df.columns:
print("現在の列一覧:", df.columns.tolist())
raise ValueError("country または kind_name 列が見つかりません。列名を確認してください。")
le_country = LabelEncoder()
le_kind = LabelEncoder()
df['country_encoded'] = le_country.fit_transform(df['country'].astype(str))
df['kind_name_encoded'] = le_kind.fit_transform(df['kind_name'].astype(str))
# --- 欠損値処理(エンコード後にも適用) ---
df.fillna(0, inplace=True)
# --- 年度付き特徴量列を抽出(あなたの実際の列名に合わせる) ---
feature_cols = [c for c in df.columns
if c.startswith('le_')
or c.startswith('mys_')
or c.startswith('gnipc_')]
if len(feature_cols) == 0:
print("データフレームの列一覧:", df.columns.tolist())
raise ValueError("HDI系の特徴量(le_ / mys_ / gnipc_)が見つかりません")
# --- 数値型に強制(文字列や混在型があればNaNに、さらに0で埋める) ---
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
# ここで**重要**:絶対に df を上書きしない(元の df に戻してしまう行を削除)
# 例: 「df = df_kind_hdi.fillna(0)」は入れないでください
# -----------------------------
# 2. 縦持ち(long形式)に変換
# -----------------------------
df_long = df.melt(
id_vars=['country_encoded', 'kind_name_encoded'],
value_vars=feature_cols,
var_name='var',
value_name='value'
)
# -----------------------------
# 3. 指標名と年を分割(右から分割して年を取得)
# -----------------------------
df_long[['indicator', 'year']] = df_long['var'].str.rsplit('_', n=1, expand=True)
# 年の列に数値以外が混じっているとエラーになるので安全に変換
df_long['year'] = pd.to_numeric(df_long['year'], errors='coerce')
if df_long['year'].isna().any():
bad_vals = df_long.loc[df_long['year'].isna(), 'var'].unique()
raise ValueError(f"year に変換できない var が存在します: {bad_vals}")
df_long['year'] = df_long['year'].astype(int)
# -----------------------------
# 4. 横持ちに戻す(年度を1サンプルに)
# -----------------------------
df_wide = df_long.pivot_table(
index=['country_encoded', 'kind_name_encoded', 'year'],
columns='indicator',
values='value',
aggfunc='first'
).reset_index()
df_wide = df_wide.fillna(0)
print(df_wide.head())
# -----------------------------
# 5. X と Y を作成
# -----------------------------
# pivot 後に 'le','mys','gnipc' カラムが存在するかチェック
for col in ['le', 'mys', 'gnipc']:
if col not in df_wide.columns:
print("pivot 後の列一覧:", df_wide.columns.tolist())
raise KeyError(f"期待するカラム {col} が pivot 後に存在しません")
X = df_wide[['le', 'mys', 'gnipc', 'year']].to_numpy()
Y = df_wide['kind_name_encoded'].to_numpy()
print("X shape:", X.shape)
print("Y shape:", Y.shape)
# -----------------------------
# 6. 標準化
# -----------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X)
# -----------------------------
# 7. 学習用とテスト用に分割
# -----------------------------
X_train, X_test, y_train, y_test = model_selection.train_test_split(
X, Y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape)
# -----------------------------
# 8. モデル作成と学習
# -----------------------------
model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42)
starttime = dt.now()
model.fit(X_train, y_train)
endtime = dt.now()
# -----------------------------
# 9. 精度と時間を表示
# -----------------------------
print("Accuracy:", model.score(X_test, y_test))
print("time:", (endtime.timestamp() - starttime.timestamp()))
# -----------------------------
# 10. ロスカーブ表示
# -----------------------------
plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()
indicator country_encoded kind_name_encoded year gnipc le \ 0 0 0 1990 3642.049616 45.118 1 0 0 1991 3192.813684 45.521 2 0 0 1992 2990.156395 46.569 3 0 0 1993 1993.698011 51.021 4 0 0 1994 1425.203987 50.969 indicator le_f le_m mys mys_f mys_m 0 47.703 42.712 0.871962 0.201659 1.493952 1 48.282 42.959 0.915267 0.218944 1.578809 2 49.607 43.765 0.958573 0.236229 1.663665 3 52.296 49.755 1.001878 0.253514 1.748522 4 53.135 48.883 1.045184 0.270800 1.833378 X shape: (7820, 4) Y shape: (7820,) (6256, 4) (1564, 4) Accuracy: 0.5754475703324808 time: 3.887295961380005
In [105]:
predict = model.predict(X_test)
predict
Out[105]:
array([0, 0, 0, ..., 0, 0, 0], shape=(1564,))
In [107]:
y_test
Out[107]:
array([0, 0, 1, ..., 0, 1, 1], shape=(1564,))
In [104]:
target = np.array([11.765668,55060.160380,83.325,2023]).reshape(1,-1)
target = scaler.fit_transform(target)
predict = model.predict(target)
predict
Out[104]:
array([0])
In [ ]:
#print(df_kind_hdi.columns.tolist())
In [ ]: