import pandas as pd
import numpy as np

# Load CSV
df = pd.read_csv("datasets/Final_report_tables_2021AS.csv", header=1)

# Clean up column names and Dzongkhag names
df.columns = df.columns.str.strip()
df['Dzongkhag'] = df['Dzongkhag'].str.strip()

df.head()

# Replace missing values and commas
df.replace(to_replace=r'\s*-\s*', value='0', regex=True, inplace=True)
df.replace(to_replace=r',', value='', regex=True, inplace=True)

# Get column names
cols = df.columns.tolist()

# Extract "Total Tree" and "Production (MT)" columns
total_tree_cols = [cols[i] for i in range(1, len(cols), 3)]   # Every 3rd col starting at 1
production_cols = [cols[i] for i in range(3, len(cols), 3)]  # Every 3rd col starting at 3

# Convert to numeric
for col in total_tree_cols + production_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Define features (X) and target (y)
X = df[total_tree_cols].values
y = df[production_cols].sum(axis=1).values  # Total production per Dzongkhag
dzongkhags = df['Dzongkhag'].values

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Optional: split into train/test (though dataset is small)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Build and train model
regressor = MLPRegressor(
    solver='adam',
    hidden_layer_sizes=(50,),
    activation='relu',
    random_state=1,
    max_iter=1000,
    tol=1e-4
)

regressor.fit(X_train, y_train)

# Evaluate
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
y_pred = regressor.predict(X_test)

print(f"Train R²: {train_score:.4f}")
print(f"Test R²: {test_score:.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

Train R²: -10.5560
Test R²: -8.4357
Test RMSE: 6657.3471

import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Actual Total Production (MT)")
plt.ylabel("Predicted Total Production (MT)")
plt.title("MLP Regressor: Actual vs Predicted")
plt.grid(True)
plt.show()

	Dzongkhag	Total Tree	Bearing Tree	Production (MT)	Total Tree.1	Bearing Tree.1	Production (MT).1	Total Tree.2	Bearing Tree.2	Production (MT).2	...	Production (MT).21	Total Tree.21	Bearing Tree.21	Production (MT).22	Total Tree.22	Bearing Tree.22	Production (MT).23	Total Tree.23	Bearing Tree.23	Production (MT).24
0	Bumthang	5,939	2,182	57.12	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
1	Chukha	4,782	1,492	36.8	2,29,264	83,737	700.41	1,14,184	49,171	1,353.37	...	14.38	72	56	1.98	6,316	2,680	2.71	239	189	3.69
2	Dagana	258	18	0.16	4,30,893	1,65,405	1,111.14	2,25,820	1,31,028	2,791.97	...	42.77	1,989	1,363	38.9	40,830	15,648	26.89	415	310	5.77
3	Gasa	6	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
4	Haa	13,099	7,075	84.94	-	-	-	5,661	1,527	19.45	...	3.33	-	-	-	99	10	0.01	126	112	1.22

Lesson 4: Training a Model¶