country_name = "United States of America" #select the country to show

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet") 
dfmedian=(df.groupby(["Country", "year"], as_index=False)["Yield"].median().rename(columns={"Yield":"median_yield"}))
sel = dfmedian[(dfmedian["Country"] == country_name) & dfmedian["median_yield"]]
x = sel["year"]
y = sel["median_yield"]
plt.scatter(
    x,
    y, 
    label= f"Median by year",
)
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.show()
dfmedian.head()

#1.Training pipeline (Python / scikit-learn)
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
#2. Country-based train/test split 
## Step 1: Select held-out countries
countries = dfmedian['Country'].unique()
np.random.seed(42)
test_countries = np.random.choice(
    countries,
    size=int(0.2 * len(countries)),
    replace=False
)
train_countries = [c for c in countries if c not in test_countries]
## Step 2: Split the data
train_df = dfmedian[dfmedian['Country'].isin(train_countries)]
test_df  = dfmedian[dfmedian['Country'].isin(test_countries)]
X_train, y_train = train_df[['year', 'Country']], train_df['median_yield']
X_test, y_test   = test_df[['year', 'Country']], test_df['median_yield']
#train_df.head()
#test_df.head()
#3. Model pipeline (ridge = Tikhonov)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['year']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Country'])
    ]
)
ridge_model = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('model', Ridge(alpha=1.0))
    ]
)
#4. Fit the model
ridge_model.fit(X_train, y_train)
#5. Evaluate on unseen countries
from sklearn.metrics import mean_squared_error
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE on unseen countries: {rmse:.3f}")

RMSE on unseen countries: 1.261

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# ---------------------------------------------------
# 1. Load data and compute yearly median yield
# ---------------------------------------------------
country_name = "United States of America"

df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

dfmedian = (
    df.groupby(["Country", "year"], as_index=False)["Yield"]
    .median()
    .rename(columns={"Yield": "median_yield"})
)

sel = dfmedian[dfmedian["Country"] == country_name].copy()

X = sel["year"].to_numpy().reshape(-1, 1)
y = sel["median_yield"].to_numpy()

# ---------------------------------------------------
# 2. Machine-Learning Model: Random Forest
# ---------------------------------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,        # allow flexibility to model nonlinear trends
    min_samples_split=2,
    random_state=42
)

rf.fit(X, y)

# Trend curve (ML-based estimate of structural trend)
sel["trend_pred"] = rf.predict(X)

# ---------------------------------------------------
# 3. Residuals = deviations from ML-estimated trend
# ---------------------------------------------------
sel["residual"] = sel["median_yield"] - sel["trend_pred"]

# Anomaly threshold: 2 standard deviations of residuals
threshold = 2 * sel["residual"].std()
sel["anomaly"] = sel["residual"].abs() > threshold

# ---------------------------------------------------
# 4. Plot ML-trend + anomalies
# ---------------------------------------------------
plt.figure(figsize=(12, 4))
plt.scatter(sel["year"], sel["median_yield"], label="Median yield", alpha=0.7)
plt.plot(sel["year"], sel["trend_pred"], color="orange", linewidth=2, label="ML Trend (Random Forest)")

# Highlight anomalies
anomalies = sel[sel["anomaly"]]
plt.scatter(
    anomalies["year"], anomalies["median_yield"],
    color="red", s=80, label="Anomalous years"
)

plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"ML-detected trend and anomalies in {country_name}")
plt.legend()
plt.show()

# ---------------------------------------------------
# 5. Report anomalies
# ---------------------------------------------------
print("Detected anomalous years (machine-learning residual analysis):")
print(anomalies[["year", "median_yield", "residual"]])

Detected anomalous years (machine-learning residual analysis):
      year  median_yield  residual
1067  1994      2.954587  0.161505

	Country	year	median_yield
0	Argentina	1982	2.075966
1	Argentina	1983	1.851908
2	Argentina	1984	2.436925
3	Argentina	1985	1.999814
4	Argentina	1986	2.105784

Week 2 - Fitting a machine learning model¶