< Home
Lesson 4: Training a Model¶
Step 1: Load and Inspect the Data
In [1]:
import pandas as pd
import numpy as np
# Load CSV
df = pd.read_csv("datasets/Final_report_tables_2021AS.csv", header=1)
# Clean up column names and Dzongkhag names
df.columns = df.columns.str.strip()
df['Dzongkhag'] = df['Dzongkhag'].str.strip()
df.head()
Out[1]:
| Dzongkhag | Total Tree | Bearing Tree | Production (MT) | Total Tree.1 | Bearing Tree.1 | Production (MT).1 | Total Tree.2 | Bearing Tree.2 | Production (MT).2 | ... | Production (MT).21 | Total Tree.21 | Bearing Tree.21 | Production (MT).22 | Total Tree.22 | Bearing Tree.22 | Production (MT).23 | Total Tree.23 | Bearing Tree.23 | Production (MT).24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bumthang | 5,939 | 2,182 | 57.12 | - | - | - | - | - | - | ... | - | - | - | - | - | - | - | - | - | - |
| 1 | Chukha | 4,782 | 1,492 | 36.8 | 2,29,264 | 83,737 | 700.41 | 1,14,184 | 49,171 | 1,353.37 | ... | 14.38 | 72 | 56 | 1.98 | 6,316 | 2,680 | 2.71 | 239 | 189 | 3.69 |
| 2 | Dagana | 258 | 18 | 0.16 | 4,30,893 | 1,65,405 | 1,111.14 | 2,25,820 | 1,31,028 | 2,791.97 | ... | 42.77 | 1,989 | 1,363 | 38.9 | 40,830 | 15,648 | 26.89 | 415 | 310 | 5.77 |
| 3 | Gasa | 6 | - | - | - | - | - | - | - | - | ... | - | - | - | - | - | - | - | - | - | - |
| 4 | Haa | 13,099 | 7,075 | 84.94 | - | - | - | 5,661 | 1,527 | 19.45 | ... | 3.33 | - | - | - | 99 | 10 | 0.01 | 126 | 112 | 1.22 |
5 rows × 76 columns
Step 2: Preprocess the Data
- Identify columns that represent "Total Tree" counts (every 3rd column starting from index 1).
- Identify columns that represent "Production (MT)" (every 3rd column starting from index 3).
- Clean values: replace " - " with 0 and remove commas.
In [5]:
# Replace missing values and commas
df.replace(to_replace=r'\s*-\s*', value='0', regex=True, inplace=True)
df.replace(to_replace=r',', value='', regex=True, inplace=True)
# Get column names
cols = df.columns.tolist()
# Extract "Total Tree" and "Production (MT)" columns
total_tree_cols = [cols[i] for i in range(1, len(cols), 3)] # Every 3rd col starting at 1
production_cols = [cols[i] for i in range(3, len(cols), 3)] # Every 3rd col starting at 3
# Convert to numeric
for col in total_tree_cols + production_cols:
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
# Define features (X) and target (y)
X = df[total_tree_cols].values
y = df[production_cols].sum(axis=1).values # Total production per Dzongkhag
dzongkhags = df['Dzongkhag'].values
Step 3: Fit an MLP Regressor use MLPRegressor from scikit-learn for regression.
In [4]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Optional: split into train/test (though dataset is small)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25, random_state=1
)
# Build and train model
regressor = MLPRegressor(
solver='adam',
hidden_layer_sizes=(50,),
activation='relu',
random_state=1,
max_iter=1000,
tol=1e-4
)
regressor.fit(X_train, y_train)
# Evaluate
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
y_pred = regressor.predict(X_test)
print(f"Train R²: {train_score:.4f}")
print(f"Test R²: {test_score:.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
Train R²: -10.5560 Test R²: -8.4357 Test RMSE: 6657.3471
Step 4: Show Predictions vs Actual
In [6]:
import matplotlib.pyplot as plt
plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Actual Total Production (MT)")
plt.ylabel("Predicted Total Production (MT)")
plt.title("MLP Regressor: Actual vs Predicted")
plt.grid(True)
plt.show()