# Relative path of the HDI dataset CSV that is read into df_hdr25 below.
path_hdr25 = "./datasets/hdr25.csv"
#!pip install pycountry  #install library ”pycountry”
import pandas as pd #import Panda Library
import numpy as np  #import Numpy
import pycountry
import requests
import json
import matplotlib.pyplot as plt
# HDI dataset
# Load the CSV at path_hdr25 into a DataFrame.
# Decoding with errors ignored silently drops every byte that is not valid
# UTF-8, which is how country names lose characters (e.g. "Cte d'Ivoire",
# "Trkiye"). Decode strictly as UTF-8 first and fall back to Latin-1, which
# maps every byte to a character and therefore never discards data.
# NOTE(review): Latin-1 is an assumption about the file's real encoding --
# confirm against the data source.
try:
    df_hdr25 = pd.read_csv(path_hdr25, encoding='utf-8')
except UnicodeDecodeError:
    df_hdr25 = pd.read_csv(path_hdr25, encoding='latin-1')
# Replace missing values with 0 so that later steps never receive NaN.
# NOTE(review): this also turns missing indicator values into literal zeros.
df_hdr25.fillna(0, inplace=True)
# Lab list
# Download the lab list as JSON and load it into a DataFrame.
url = 'https://api.fablabs.io/0/labs.json'
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body.
r.raise_for_status()
data = r.json()
df_lablist = pd.DataFrame(data)
#CountryCodeMerge
def alpha2_to_alpha3(code2):
    """Convert an ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Args:
        code2: Two-letter country code in any letter case. Surrounding
            whitespace is ignored.

    Returns:
        The alpha-3 code of the matching pycountry entry, or None when
        ``code2`` is not a string (e.g. None or NaN from a missing value),
        is blank, or the lookup yields no country.
    """
    # Missing values reach this function through DataFrame.apply as None or
    # float NaN; calling .upper() on them would raise.
    if not isinstance(code2, str):
        return None
    code2 = code2.strip()
    if not code2:
        return None
    country = pycountry.countries.get(alpha_2=code2.upper())
    return country.alpha_3 if country else None
# Attach the alpha-3 country code to every lab; it is the join key against
# the 'iso3' column of the HDI table.
df_lablist['ccd3'] = df_lablist['country_code'].apply(alpha2_to_alpha3)

# Number of labs per (country, lab kind).
df_kindcount = (
    df_lablist.groupby(['ccd3', 'kind_name'])
    .agg(kind_count=('id', 'count'))
    .reset_index()
)

# Number of labs per country. Grouping is done on the raw alpha-2 code and
# the alpha-3 code is derived afterwards.
df_labcount = (
    df_lablist.groupby('country_code')
    .agg(lab_count=('id', 'count'))
    .reset_index()
)
df_labcount['ccd3'] = df_labcount['country_code'].apply(alpha2_to_alpha3)

# Merge HDI with lab counts. A left join keeps every lab row even when no
# HDI row matches; the HDI columns are then NaN for that row.
df_labno_hdi = pd.merge(df_labcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')

# Merge HDI with lab counts broken down by kind.
df_kind_hdi = pd.merge(df_kindcount, df_hdr25, left_on='ccd3', right_on='iso3', how='left')

from sklearn.preprocessing import LabelEncoder

# Encode the country names of df_labno_hdi as integer labels and store them
# in a new 'country_encoded' column.
le = LabelEncoder()
encoded_country = le.fit_transform(df_labno_hdi['country'].values)
df_labno_hdi['country_encoded'] = encoded_country
# Round-trip back to the original names (kept for inspection).
decoded_country = le.inverse_transform(encoded_country)

# Debug output, disabled:
#print('Known classes: ', le.classes_)
#print('Encoded result: ', encoded_country)
#print('Decoded back: ', decoded_country)

from sklearn.preprocessing import LabelEncoder

# Encode the lab kind names of df_kind_hdi as integer labels and store them
# in a new 'kind_name_encoded' column.
le = LabelEncoder()
encoded = le.fit_transform(df_kind_hdi['kind_name'].values)
df_kind_hdi['kind_name_encoded'] = encoded
# Round-trip back to the original names (kept for inspection).
decoded = le.inverse_transform(encoded)

# Show the fitted classes and the integer assigned to each known kind.
print('存在するクラス: ', le.classes_)
print(
    '変換先: a, b, c ->',
    le.transform(['fab_lab', 'mini_fab_lab', 'mobile']),
)
# Debug output, disabled:
#print('Encoded result: ', encoded)
#print('Decoded back: ', decoded)

存在するクラス:  ['fab_lab' 'mini_fab_lab' 'mobile']
変換先: a, b, c -> [0 1 2]

# Encode the country names of df_kind_hdi with a dedicated encoder.
# Refitting the shared `le` here would overwrite the kind_name classes it was
# fitted on, after which `le` could no longer encode or decode lab kinds.
le_country = LabelEncoder()
encoded_country = le_country.fit_transform(df_kind_hdi['country'].values)
decoded_country = le_country.inverse_transform(encoded_country)
df_kind_hdi['country_encoded'] = encoded_country

# Debug output, disabled:
#print('Known classes: ', le_country.classes_)
#print('Encoded result: ', encoded_country)
#print('Decoded back: ', decoded_country)

存在するクラス:  ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Bahrain' 'Bangladesh' 'Belarus' 'Belgium' 'Benin'
 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina'
 'Brazil' 'Bulgaria' 'Burkina Faso' 'Cambodia' 'Cameroon' 'Canada' 'Chad'
 'Chile' 'China' 'Colombia' 'Congo' 'Congo (Democratic Republic of the)'
 'Costa Rica' 'Croatia' "Cte d'Ivoire" 'Cyprus' 'Czechia' 'Denmark'
 'Djibouti' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Ethiopia' 'Finland'
 'France' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Guatemala' 'Guinea'
 'Honduras' 'Hong Kong, China (SAR)' 'Hungary' 'Iceland' 'India'
 'Indonesia' 'Iran (Islamic Republic of)' 'Iraq' 'Ireland' 'Israel'
 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kenya'
 'Korea (Republic of)' 'Kuwait' 'Kyrgyzstan' 'Latvia' 'Lebanon' 'Libya'
 'Lithuania' 'Luxembourg' 'Madagascar' 'Malaysia' 'Mali' 'Malta'
 'Mauritania' 'Mexico' 'Moldova (Republic of)' 'Mongolia' 'Montenegro'
 'Morocco' 'Myanmar' 'Namibia' 'Nepal' 'Netherlands' 'New Zealand' 'Niger'
 'Nigeria' 'North Macedonia' 'Norway' 'Oman' 'Pakistan'
 'Palestine, State of' 'Panama' 'Paraguay' 'Peru' 'Philippines' 'Poland'
 'Portugal' 'Qatar' 'Romania' 'Russian Federation' 'Rwanda' 'Saudi Arabia'
 'Senegal' 'Serbia' 'Sierra Leone' 'Singapore' 'Slovakia' 'Slovenia'
 'Somalia' 'South Africa' 'Spain' 'Sri Lanka' 'Suriname' 'Sweden'
 'Switzerland' 'Syrian Arab Republic' 'Tanzania (United Republic of)'
 'Thailand' 'Togo' 'Trinidad and Tobago' 'Trkiye' 'Tunisia' 'Ukraine'
 'United Arab Emirates' 'United Kingdom' 'United States' 'Uruguay'
 'Uzbekistan' 'Venezuela (Bolivarian Republic of)' 'Viet Nam' nan]

# Show the country / kind columns for the rows whose kind is encoded as 2.
df_kind_hdi[['country', 'country_encoded', 'kind_name', 'kind_name_encoded']].query('kind_name_encoded == 2')

# List the columns of the merged frame.
df_kind_hdi.columns

Index(['ccd3', 'kind_name', 'kind_count', 'iso3', 'country', 'hdicode',
       'region', 'hdi_rank_2023', 'hdi_1990', 'hdi_1991',
       ...
       'pop_total_2016', 'pop_total_2017', 'pop_total_2018', 'pop_total_2019',
       'pop_total_2020', 'pop_total_2021', 'pop_total_2022', 'pop_total_2023',
       'kind_name_encoded', 'country_encoded'],
      dtype='object', length=1117)

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt

# Train an MLP classifier that predicts the encoded lab kind from three
# 2023 indicator columns.

# Remaining NaN values (rows without an HDI match) become 0.
df_kind_hdi = df_kind_hdi.fillna(0)
X = df_kind_hdi.loc[:, ['le_2023', 'mys_2023', 'gnipc_2023']]
columns = X.columns
X = X.to_numpy()
print(X.shape)

Y = df_kind_hdi['kind_name_encoded'].to_numpy()
Y = np.ravel(Y)
print(Y.shape)

# Standardise the features; this scaler is reused for later predictions.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fixed seeds make the split, the weight initialisation and therefore the
# reported score reproducible from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

model = MLPClassifier(random_state=42)

starttime = dt.now()
model.fit(X_train, y_train)
endtime = dt.now()
# score() is the mean accuracy on the test split; the elapsed time is taken
# from the timedelta directly.
print("Predict:", model.score(X_test, y_test), " time:", (endtime - starttime).total_seconds())

plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()

(242, 3)
(242,)
(193, 3) (49, 3) (193,) (49,)
Predict: 0.6122448979591837  time: 0.09344983100891113

# Predicted labels for the held-out test rows.
predict = model.predict(X_test)
# Bare expression so a notebook cell displays the array.
predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

# True labels of the test rows, shown for comparison with `predict`.
y_test

array([0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 2, 0, 2, 0, 0,
       0, 1, 0, 0, 1])

# Predict the lab kind for a single hand-written sample.
# The values must follow the training column order:
# le_2023, mys_2023, gnipc_2023.
# NOTE(review): the sample was originally written as
# (11.765668, 55060.16038, 83.325), which reads as mys, gnipc, le; it is
# reordered here on that assumption -- confirm the intended values.
target = np.array([83.325, 11.765668, 55060.160380]).reshape(1, -1)
# Apply the scaler that was fitted on the training features. fit_transform
# would refit it on this one row, which turns every feature into 0 and
# discards the training statistics.
target = scaler.transform(target)
predict = model.predict(target)
predict

array([0])

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt

# Train an MLP that predicts the number of labs per country from three
# 2023 indicator columns.

# Remaining NaN values (rows without an HDI match) become 0.
df_labno_hdi = df_labno_hdi.fillna(0)
X = df_labno_hdi.loc[:, ['le_2023', 'mys_2023', 'gnipc_2023']]
columns = X.columns
X = X.to_numpy()
print(X.shape)

Y = df_labno_hdi['lab_count'].to_numpy()
Y = np.ravel(Y)
print(Y.shape)

# Standardise the features; this scaler is reused for later predictions.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fixed seed so the split and the reported score are reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# lab_count is a count, i.e. a numeric target. A classifier would treat
# every distinct count as its own class, so a regressor is used instead.
# max_iter is raised above the default of 200 to give the optimiser room to
# converge.
model = MLPRegressor(max_iter=2000, random_state=42)

starttime = dt.now()
model.fit(X_train, y_train)
endtime = dt.now()
# For a regressor, score() is the coefficient of determination (R^2) on the
# test split, not an accuracy.
print("Predict:", model.score(X_test, y_test), " time:", (endtime - starttime).total_seconds())

plt.title("Loss Curve")
plt.plot(model.loss_curve_)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()

(144, 3)
(144,)
(115, 3) (29, 3) (115,) (29,)
Predict: 0.10344827586206896  time: 0.16406512260437012

/opt/conda/lib/python3.13/site-packages/sklearn/neural_network/_multilayer_perceptron.py:781: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:98: UserWarning: The number of unique classes is greater than 50% of the number of samples. `y` could represent a regression problem, not a classification problem.
  type_true = type_of_target(y_true, input_name="y_true")

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import numpy as np
from datetime import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# --- Work on a copy so that df_kind_hdi itself is never modified here ---
df = df_kind_hdi.copy()

# --- Defensive clean-up: strip surrounding whitespace from column names ---
df.columns = df.columns.str.strip()

# --- Label encoding needs both 'country' and 'kind_name'; stop early, after
# --- listing the available columns, when either one is missing ---
if not all(required in df.columns for required in ('country', 'kind_name')):
    print("現在の列一覧:", df.columns.tolist())
    raise ValueError("country または kind_name 列が見つかりません。列名を確認してください。")

# One encoder per column so each mapping can be inverted on its own.
le_country = LabelEncoder()
le_kind = LabelEncoder()

# Values are cast to str before encoding.
df['country_encoded'] = le_country.fit_transform(df['country'].astype(str))
df['kind_name_encoded'] = le_kind.fit_transform(df['kind_name'].astype(str))

# --- Fill missing values with 0 (applied after encoding as well) ---
df.fillna(0, inplace=True)

# --- Collect the per-year feature columns by name prefix ---
feature_cols = [
    name for name in df.columns
    if name.startswith(('le_', 'mys_', 'gnipc_'))
]

if not feature_cols:
    print("データフレームの列一覧:", df.columns.tolist())
    raise ValueError("HDI系の特徴量（le_ / mys_ / gnipc_）が見つかりません")

# --- Force numeric dtype: unparseable entries become NaN, then 0 ---
df[feature_cols] = (
    df[feature_cols]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# IMPORTANT: do not reassign df from df_kind_hdi after this point
# (e.g. `df = df_kind_hdi.fillna(0)`); that would discard the work above.

# -----------------------------
# 2. Reshape to long format: one row per (country, kind, feature column)
# -----------------------------
df_long = pd.melt(
    df,
    id_vars=['country_encoded', 'kind_name_encoded'],
    value_vars=feature_cols,
    var_name='var',
    value_name='value',
)

# -----------------------------
# 3. Split the column name into indicator and year (split once, from the
#    right, so the year is always the last part)
# -----------------------------
df_long[['indicator', 'year']] = df_long['var'].str.rsplit('_', n=1, expand=True)
# Convert the year safely: anything non-numeric becomes NaN and is reported
# below instead of failing inside the conversion.
df_long['year'] = pd.to_numeric(df_long['year'], errors='coerce')
if df_long['year'].isna().any():
    bad_vals = df_long['var'][df_long['year'].isna()].unique()
    raise ValueError(f"year に変換できない var が存在します: {bad_vals}")
df_long['year'] = df_long['year'].astype(int)

# -----------------------------
# 4. Pivot back to wide format: one row per (country, kind, year) with one
#    column per indicator; gaps are filled with 0
# -----------------------------
df_wide = (
    pd.pivot_table(
        df_long,
        values='value',
        index=['country_encoded', 'kind_name_encoded', 'year'],
        columns='indicator',
        aggfunc='first',
    )
    .reset_index()
    .fillna(0)
)

print(df_wide.head())

# -----------------------------
# 5. Build X and Y
# -----------------------------
# The pivot must have produced the 'le', 'mys' and 'gnipc' columns; list the
# available columns and stop at the first one that is missing.
for col in ['le', 'mys', 'gnipc']:
    if col in df_wide.columns:
        continue
    print("pivot 後の列一覧:", df_wide.columns.tolist())
    raise KeyError(f"期待するカラム {col} が pivot 後に存在しません")

# Features: the three indicators plus the year. Target: the encoded lab kind.
X = df_wide.loc[:, ['le', 'mys', 'gnipc', 'year']].to_numpy()
Y = df_wide['kind_name_encoded'].to_numpy()

print("X shape:", X.shape)
print("Y shape:", Y.shape)

# -----------------------------
# 6. Standardise the features
# -----------------------------
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

# -----------------------------
# 7. Split into training and test sets (80 / 20, fixed seed)
# -----------------------------
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape)

# -----------------------------
# 8. Build and fit the model (one hidden layer of 50 units), timing the fit
# -----------------------------
model = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=500,
    random_state=42,
)

starttime = dt.now()
model.fit(X_train, y_train)
endtime = dt.now()

# -----------------------------
# 9. Report test accuracy and the fit time in seconds
# -----------------------------
print("Accuracy:", model.score(X_test, y_test))
print("time:", (endtime.timestamp() - starttime.timestamp()))

# -----------------------------
# 10. Plot the training loss per iteration
# -----------------------------
plt.plot(model.loss_curve_)
plt.title("Loss Curve")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid()
plt.show()

indicator  country_encoded  kind_name_encoded  year        gnipc      le  \
0                        0                  0  1990  3642.049616  45.118   
1                        0                  0  1991  3192.813684  45.521   
2                        0                  0  1992  2990.156395  46.569   
3                        0                  0  1993  1993.698011  51.021   
4                        0                  0  1994  1425.203987  50.969   

indicator    le_f    le_m       mys     mys_f     mys_m  
0          47.703  42.712  0.871962  0.201659  1.493952  
1          48.282  42.959  0.915267  0.218944  1.578809  
2          49.607  43.765  0.958573  0.236229  1.663665  
3          52.296  49.755  1.001878  0.253514  1.748522  
4          53.135  48.883  1.045184  0.270800  1.833378  
X shape: (7820, 4)
Y shape: (7820,)
(6256, 4) (1564, 4)
Accuracy: 0.5754475703324808
time: 3.887295961380005

# Predicted labels for the held-out test rows.
predict = model.predict(X_test)
# Bare expression so a notebook cell displays the array.
predict

array([0, 0, 0, ..., 0, 0, 0], shape=(1564,))

# True labels of the test rows, shown for comparison with `predict`.
y_test

array([0, 0, 1, ..., 0, 1, 1], shape=(1564,))

# Predict the lab kind for a single hand-written sample.
# The values must follow the training column order: le, mys, gnipc, year.
# NOTE(review): the sample was originally written as
# (11.765668, 55060.16038, 83.325, 2023), which reads as mys, gnipc, le,
# year; it is reordered here on that assumption -- confirm the intended values.
target = np.array([83.325, 11.765668, 55060.160380, 2023]).reshape(1, -1)
# Apply the scaler that was fitted on the training features. fit_transform
# would refit it on this one row, which turns every feature into 0 and
# discards the training statistics.
target = scaler.transform(target)
predict = model.predict(target)
predict

array([0])

#print(df_kind_hdi.columns.tolist())

Week2-2: Machine Learning

Encoding

Machine Learning by Scikit-learn

Predict Lab Kinds

Predict Lab Nos

Add years

	country	country_encoded	kind_name	kind_name_encoded
6	United Arab Emirates	126	mobile	2
28	Brazil	16	mobile	2
38	China	24	mobile	2
51	Czechia	32	mobile	2
54	Germany	43	mobile	2
62	Egypt	36	mobile	2
65	Spain	113	mobile	2
71	France	41	mobile	2
81	Greece	45	mobile	2
89	Croatia	29	mobile	2
95	India	52	mobile	2
119	Korea (Republic of)	64	mobile	2
121	Kuwait	65	mobile	2
127	Sri Lanka	114	mobile	2
136	Morocco	81	mobile	2
161	Netherlands	85	mobile	2
179	Poland	98	mobile	2
186	Paraguay	95	mobile	2
190	Qatar	100	mobile	2
215	Sweden	116	mobile	2
225	Trkiye	123	mobile	2
235	United States	128	mobile	2