Jeoffrey George - Fab Futures - Data Science
Home About

Jeoffrey George final presentation¶

presentation.png

The dataset¶

In [1]:
import os #os.path to manipulate paths
import xarray as xr #for multidimensional labeled arrays and datasets, using dimensions, coordinates, and attributes on top of NumPy-like arrays

from pathlib import Path
directory = Path("/home/jovyan/work/jeogeorge/datasets/soybean_yield_1981-2016")
files = sorted(                         # Sort the resulting list alphabetically
    str(p)                              # Convert each Path object to a string
    for p in directory.iterdir()        # Iterate over all items in the directory
    if p.name.startswith("yield_")      # Keep only files whose names start with 'yield_'
    and p.name.endswith(".nc4")         # Keep only files whose names end with '.nc4'
)

# Extract years from filenames
years = [
    int(os.path.basename(f).split("_")[1].split(".")[0])
    for f in files
]
datasets = []
for f, y in zip(files, years):
    ds = xr.open_dataset(f)
    # add a year dimension
    ds = ds.expand_dims(year=[y])
    datasets.append(ds)

ds_all = xr.concat(datasets, dim="year")

# Rename variable and unit
ds_all = ds_all.rename({"var": "Yield"})
ds_all["Yield"].attrs["units"] = r"t.ha$^{-1}$"

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import cartopy.crs as ccrs
years = range(1981, 2016)
fig = plt.figure(figsize=(10, 5))
ax = plt.axes(projection=ccrs.PlateCarree())
proj = ccrs.PlateCarree() 
im = ds_all["Yield"].sel(year=years[0]).plot(
    ax=ax,
    transform=ccrs.PlateCarree(),
    cmap="viridis",
    add_colorbar=False
)
cbar = plt.colorbar(im, ax=ax, orientation="vertical", pad=0.02)
cbar.set_label("Yield (t.ha$^{-1}$)")
ax.coastlines()
ax.set_global()
ax.set_title(f"Soybean Yield in {int(years[0])} (t.ha$^{-1}$)")
def update(year):
    ax.clear()
    data = ds_all["Yield"].sel(year=year)
    im = data.plot(
        ax=ax,
        transform=proj,
        cmap="viridis",
        add_colorbar=False
    )
    ax.coastlines()
    ax.set_global()
    ax.set_title(f"Soybean Yield in {int(year)} (t.ha$^{-1}$)")
    return im,
ani = animation.FuncAnimation(
    fig,
    update,
    frames=years,   # use all available years in the dataset
    interval=100    # milliseconds between frames
)
plt.show()
No description has been provided for this image
In [3]:
sel = df[(df["Country"] == country_name) & df["Yield"]]
groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]

plt.figure(figsize=(12, 5))
plt.boxplot(
    data_by_year,
    positions=np.arange(len(years)),
    showfliers=True,  
)

plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()
No description has been provided for this image

Fitting a function (Tikhonov regularization)¶

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet") 
import numpy as np
import matplotlib.pyplot as plt
country_name = "United States of America" #select the country to show
sel = df[(df["Country"] == country_name) & df["Yield"]]
groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]

sel_median = sel.groupby("year")["Yield"].median()
x = sel_median.index.to_numpy() 
y = sel_median.values 
xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
ncenters = npts

# generate Vandermonde matrix
#
indices = np.random.uniform(low=0,high=len(x),size=ncenters).astype(int)
centers = x[indices]
M = np.abs(np.outer(x,np.ones(ncenters))
   -np.outer(np.ones(npts),centers))**3
#
# invert matrices to find weight-regularized coefficients
#
Lambda = 20
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff1 = np.linalg.pinv(left)@right
Lambda = 0
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff0 = np.linalg.pinv(left)@right
#
# plot fits
#
xfit = np.linspace(xmin,xmax,npts)
yfit0 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff0
yfit1 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff1
plt.scatter(
    x,
    y, 
    label= f"Median by year",
)
plt.plot(x,y,'o',alpha=0.5)
plt.plot(xfit,yfit0,'y-',label=r'$\lambda=0$')
plt.plot(xfit,yfit1,'r-',label=r'$\lambda=20$')
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.legend()
plt.show()
No description has been provided for this image

Machine learning¶

ChatGPT 5.1 prompt "I want to use a machine-learning model to analyze soybean yield trends over time. Specifically, I want to: fit an ML model to capture long-term technological yield trends, separate trend from weather-driven variability, and detect anomalous years like drought or flood years. Please adapt my existing Python code to use a machine-learning model (e.g., Random Forest or Gradient Boosting) to estimate the trend and identify anomalies.*"

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# ---------------------------------------------------
# 1. Load data and compute yearly median yield
# ---------------------------------------------------
country_name = "United States of America"

df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

dfmedian = (
    df.groupby(["Country", "year"], as_index=False)["Yield"]
    .median()
    .rename(columns={"Yield": "median_yield"})
)

sel = dfmedian[dfmedian["Country"] == country_name].copy()

X = sel["year"].to_numpy().reshape(-1, 1)
y = sel["median_yield"].to_numpy()

# ---------------------------------------------------
# 2. Machine-Learning Model: Random Forest
# ---------------------------------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,        # allow flexibility to model nonlinear trends
    min_samples_split=2,
    random_state=42
)

rf.fit(X, y)

# Trend curve (ML-based estimate of structural trend)
sel["trend_pred"] = rf.predict(X)

# ---------------------------------------------------
# 3. Residuals = deviations from ML-estimated trend
# ---------------------------------------------------
sel["residual"] = sel["median_yield"] - sel["trend_pred"]

# Anomaly threshold: 2 standard deviations of residuals
threshold = 2 * sel["residual"].std()
sel["anomaly"] = sel["residual"].abs() > threshold

# ---------------------------------------------------
# 4. Plot ML-trend + anomalies
# ---------------------------------------------------
plt.figure(figsize=(12, 4))
plt.scatter(sel["year"], sel["median_yield"], label="Median yield", alpha=0.7)
plt.plot(sel["year"], sel["trend_pred"], color="orange", linewidth=2, label="ML Trend (Random Forest)")

# Highlight anomalies
anomalies = sel[sel["anomaly"]]
plt.scatter(
    anomalies["year"], anomalies["median_yield"],
    color="red", s=80, label="Anomalous years"
)

plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"ML-detected trend and anomalies in {country_name}")
plt.legend()
plt.show()

# ---------------------------------------------------
# 5. Report anomalies
# ---------------------------------------------------
print("Detected anomalous years (machine-learning residual analysis):")
print(anomalies[["year", "median_yield", "residual"]])
No description has been provided for this image
Detected anomalous years (machine-learning residual analysis):
      year  median_yield  residual
1067  1994      2.954587  0.161505

Distribution of the data¶

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

yieldusa = sel["Yield"]
x = yieldusa.values
#
# plot histogram and points
#
plt.hist(x,bins=100,density=True)
plt.title(f"Distribution of soybean yield in {country_name}")
plt.xlabel(r"Yield (t.ha$^{-1}$)")
plt.plot()
Out[4]:
[]
No description has been provided for this image
In [5]:
sel = sel.copy()

sel["location"] = list(
    zip(sel["lat"].round(2), sel["lon"].round(2))
)
top_locations = (
    sel["location"]
    .value_counts()
    .head(10)
    .index
)
subset = sel[sel["location"].isin(top_locations)]
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(
    data=subset,
    x="Yield",
    hue="location",
    common_norm=False
)
plt.show()
No description has been provided for this image

Density estimation¶

Gaussian Mixture model¶

Chatgpt5.1 prompt "how to explore bimodal distribution of a dataset"

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet") 
import numpy as np
import matplotlib.pyplot as plt
country_name = "United States of America" #select the country to show
sel = df[(df["Country"] == country_name) & df["Yield"]]
groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]

sel_median = sel.groupby("year")["Yield"].median()
x = sel_median.index.to_numpy() 
y = sel_median.values 

yieldusa = sel["Yield"]
x = yieldusa.values

xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
ncenters = npts

# Gaussian Mixture Models (GMM)
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(x.reshape(-1, 1))

means = gmm.means_
weights = gmm.weights_

means, weights

xs = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
logprob = gmm.score_samples(xs)
pdf = np.exp(logprob)

plt.hist(x, bins=60, density=True)
plt.plot(xs, pdf)
plt.show()
No description has been provided for this image

Transform¶

Chatgpt 5.2 prompt "How to explore the dataset using a PCA plot?"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# --- Build a matrix: rows=countries, columns=years, values=Yield ---
mat = (
    df.loc[df["Yield"].notna(), ["Country", "year", "Yield"]]
      .pivot_table(index="Country", columns="year", values="Yield", aggfunc="median")
)

# (Optional) keep only years with decent coverage to reduce missingness
min_countries_per_year = int(0.6 * mat.shape[0])  # keep years present for >=60% of countries
mat = mat.loc[:, mat.notna().sum(axis=0) >= min_countries_per_year]

# --- Preprocess: impute missing years + scale ---
X = mat.to_numpy()
X = SimpleImputer(strategy="median").fit_transform(X)   # fill missing yields
X = StandardScaler().fit_transform(X)                   # scale features (years)

# --- PCA to 2D ---
pca = PCA(n_components=2, random_state=0)
Z = pca.fit_transform(X)

# --- Plot ---
country_name = "United States of America"  # highlight this one
idx = mat.index.to_numpy()

plt.figure(figsize=(8, 6))
plt.scatter(Z[:, 0], Z[:, 1], s=18, alpha=0.6)

# highlight the selected country (if it exists in the matrix)
if country_name in mat.index:
    i = np.where(idx == country_name)[0][0]
    plt.scatter(Z[i, 0], Z[i, 1], s=120, marker="*", edgecolor="k")
    plt.text(Z[i, 0], Z[i, 1], "  " + country_name, va="center")

plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% var)")
plt.title("PCA of countries by soybean yield time-series (years as features)")
plt.grid(True, alpha=0.2)
plt.show()
No description has been provided for this image

Next¶

Analyze the data again with groups of location which have similar yields