Jeoffrey George final presentation¶
The dataset¶
In [1]:
import os  # os.path to manipulate paths
import xarray as xr  # multidimensional labeled arrays and datasets: dimensions, coordinates, and attributes on top of NumPy-like arrays
from pathlib import Path

directory = Path("/home/jovyan/work/jeogeorge/datasets/soybean_yield_1981-2016")
files = sorted(                        # sort the resulting list alphabetically
    str(p)                             # convert each Path object to a string
    for p in directory.iterdir()       # iterate over all items in the directory
    if p.name.startswith("yield_")     # keep only files whose names start with 'yield_'
    and p.name.endswith(".nc4")        # and end with '.nc4'
)

# Extract years from filenames (yield_YYYY.nc4 -> YYYY)
years = [
    int(os.path.basename(f).split("_")[1].split(".")[0])
    for f in files
]

datasets = []
for f, y in zip(files, years):
    ds = xr.open_dataset(f)
    ds = ds.expand_dims(year=[y])  # add a year dimension
    datasets.append(ds)
ds_all = xr.concat(datasets, dim="year")

# Rename the variable and set its unit
ds_all = ds_all.rename({"var": "Yield"})
ds_all["Yield"].attrs["units"] = r"t ha$^{-1}$"
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import cartopy.crs as ccrs

years = ds_all["year"].values  # all available years in the dataset (1981-2016)
proj = ccrs.PlateCarree()

# Fix the color scale across frames so the single colorbar stays valid for every year
vmin = float(ds_all["Yield"].min())
vmax = float(ds_all["Yield"].max())

fig = plt.figure(figsize=(10, 5))
ax = plt.axes(projection=proj)
im = ds_all["Yield"].sel(year=years[0]).plot(
    ax=ax,
    transform=proj,
    cmap="viridis",
    vmin=vmin,
    vmax=vmax,
    add_colorbar=False
)
cbar = plt.colorbar(im, ax=ax, orientation="vertical", pad=0.02)
cbar.set_label(r"Yield (t ha$^{-1}$)")
ax.coastlines()
ax.set_global()
ax.set_title(f"Soybean Yield in {int(years[0])}")

def update(year):
    ax.clear()
    data = ds_all["Yield"].sel(year=year)
    im = data.plot(
        ax=ax,
        transform=proj,
        cmap="viridis",
        vmin=vmin,
        vmax=vmax,
        add_colorbar=False
    )
    ax.coastlines()
    ax.set_global()
    ax.set_title(f"Soybean Yield in {int(year)}")
    return im,

ani = animation.FuncAnimation(
    fig,
    update,
    frames=years,    # one frame per year
    interval=100     # milliseconds between frames
)
plt.show()
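In JupyterLab, the final plt.show() may display only a static first frame. If the animation does not play inline, a common workaround (a minimal sketch using IPython's display utilities, not part of the original cell) is to render it as an interactive HTML/JavaScript player:

from IPython.display import HTML
HTML(ani.to_jshtml())  # convert the FuncAnimation to an inline JS player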
In [3]:
sel = df[(df["Country"] == country_name) & df["Yield"].notna()]  # keep rows with a non-missing yield
groups = sel.groupby("year")
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]
plt.figure(figsize=(12, 5))
plt.boxplot(
    data_by_year,
    positions=np.arange(len(years)),
    showfliers=True,
)
plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()
Fitting a function (Tikhonov regularization)¶
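Tikhonov (ridge) regularization fits the coefficients $c$ of a cubic radial-basis expansion by minimizing $\lVert Mc - y\rVert^2 + \lambda\lVert c\rVert^2$, which has the closed-form solution $c = (M^\top M + \lambda I)^{-1} M^\top y$. The penalty term damps the oscillations an unregularized fit produces between data points; the cell below builds $M_{ij} = |x_i - c_j|^3$ from randomly drawn year centers $c_j$ and compares $\lambda = 0$ against $\lambda = 20$.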
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

country_name = "United States of America"  # select the country to show
sel = df[(df["Country"] == country_name) & df["Yield"].notna()]
groups = sel.groupby("year")
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]

sel_median = sel.groupby("year")["Yield"].median()
x = sel_median.index.to_numpy()
y = sel_median.values
xmin = x.min()
xmax = x.max()
npts = xmax - xmin + 1  # assumes one median per consecutive year
ncenters = npts
#
# build the cubic radial-basis design matrix M[i, j] = |x_i - center_j|**3
# (centers are drawn at random, with replacement, from the observed years)
#
indices = np.random.randint(0, len(x), size=ncenters)
centers = x[indices]
M = np.abs(np.outer(x, np.ones(ncenters))
           - np.outer(np.ones(len(x)), centers))**3
#
# solve the regularized normal equations for the coefficients
#
Lambda = 20
left = (M.T) @ M + Lambda * np.eye(ncenters)
right = (M.T) @ y
coeff1 = np.linalg.pinv(left) @ right

Lambda = 0
left = (M.T) @ M + Lambda * np.eye(ncenters)
right = (M.T) @ y
coeff0 = np.linalg.pinv(left) @ right
#
# plot the fits
#
xfit = np.linspace(xmin, xmax, npts)
yfit0 = (np.abs(np.outer(xfit, np.ones(ncenters)) - np.outer(np.ones(npts), centers))**3) @ coeff0
yfit1 = (np.abs(np.outer(xfit, np.ones(ncenters)) - np.outer(np.ones(npts), centers))**3) @ coeff1

plt.plot(x, y, 'o', alpha=0.5, label="Median by year")
plt.plot(xfit, yfit0, 'y-', label=r'$\lambda=0$')
plt.plot(xfit, yfit1, 'r-', label=r'$\lambda=20$')
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Median soybean yield in {country_name} by year")
plt.legend()
plt.show()
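The value $\lambda = 20$ above is chosen by eye. A minimal sketch of a more principled choice, scoring a few candidate values on held-out years (the even/odd split and the candidate grid are illustrative assumptions, reusing x, y, centers, and ncenters from the cell above):

train = np.arange(len(x)) % 2 == 0   # fit on even-indexed years...
test = ~train                        # ...score on odd-indexed years
D = np.abs(np.subtract.outer(x.astype(float), centers.astype(float))) ** 3
for lam in [0, 1, 5, 20, 100]:
    A = D[train].T @ D[train] + lam * np.eye(ncenters)
    c = np.linalg.pinv(A) @ (D[train].T @ y[train])
    rmse = np.sqrt(np.mean((D[test] @ c - y[test]) ** 2))
    print(f"lambda={lam:>4}: holdout RMSE = {rmse:.3f}")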
Machine learning¶
ChatGPT 5.1 prompt: "I want to use a machine-learning model to analyze soybean yield trends over time. Specifically, I want to: fit an ML model to capture long-term technological yield trends, separate trend from weather-driven variability, and detect anomalous years like drought or flood years. Please adapt my existing Python code to use a machine-learning model (e.g., Random Forest or Gradient Boosting) to estimate the trend and identify anomalies."
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# ---------------------------------------------------
# 1. Load data and compute yearly median yield
# ---------------------------------------------------
country_name = "United States of America"
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")
dfmedian = (
    df.groupby(["Country", "year"], as_index=False)["Yield"]
    .median()
    .rename(columns={"Yield": "median_yield"})
)
sel = dfmedian[dfmedian["Country"] == country_name].copy()
X = sel["year"].to_numpy().reshape(-1, 1)
y = sel["median_yield"].to_numpy()
# ---------------------------------------------------
# 2. Machine-Learning Model: Random Forest
# ---------------------------------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,        # allow flexibility to model nonlinear trends
    min_samples_split=2,
    random_state=42
)
rf.fit(X, y)
# Trend curve (ML-based estimate of structural trend)
sel["trend_pred"] = rf.predict(X)
# ---------------------------------------------------
# 3. Residuals = deviations from ML-estimated trend
# ---------------------------------------------------
sel["residual"] = sel["median_yield"] - sel["trend_pred"]
# Anomaly threshold: 2 standard deviations of residuals
threshold = 2 * sel["residual"].std()
sel["anomaly"] = sel["residual"].abs() > threshold
# ---------------------------------------------------
# 4. Plot ML-trend + anomalies
# ---------------------------------------------------
plt.figure(figsize=(12, 4))
plt.scatter(sel["year"], sel["median_yield"], label="Median yield", alpha=0.7)
plt.plot(sel["year"], sel["trend_pred"], color="orange", linewidth=2, label="ML Trend (Random Forest)")
# Highlight anomalies
anomalies = sel[sel["anomaly"]]
plt.scatter(
    anomalies["year"], anomalies["median_yield"],
    color="red", s=80, label="Anomalous years"
)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"ML-detected trend and anomalies in {country_name}")
plt.legend()
plt.show()
# ---------------------------------------------------
# 5. Report anomalies
# ---------------------------------------------------
print("Detected anomalous years (machine-learning residual analysis):")
print(anomalies[["year", "median_yield", "residual"]])
Detected anomalous years (machine-learning residual analysis):
      year  median_yield  residual
1067  1994      2.954587  0.161505
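One caveat: the forest above is scored on the very years it was trained on, so the residuals are nearly memorized and the anomaly list is conservative (only 1994 clears the 2-sigma bar). A minimal sketch of a fairer variant using out-of-bag predictions, where each year is predicted only by trees that never saw it (same X and y as above; rf_oob is an illustrative name):

rf_oob = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=42)
rf_oob.fit(X, y)
resid_oob = y - rf_oob.oob_prediction_  # out-of-bag residuals
print("in-sample residual std: ", round(float((y - rf.predict(X)).std()), 3))
print("out-of-bag residual std:", round(float(resid_oob.std()), 3))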
Distribution of the data¶
In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
yieldusa = sel["Yield"]
x = yieldusa.values
#
# plot the yield histogram
#
plt.hist(x,bins=100,density=True)
plt.title(f"Distribution of soybean yield in {country_name}")
plt.xlabel(r"Yield (t ha$^{-1}$)")
plt.show()
In [5]:
sel = sel.copy()
sel["location"] = list(
    zip(sel["lat"].round(2), sel["lon"].round(2))
)
top_locations = (
    sel["location"]
    .value_counts()
    .head(10)
    .index
)
subset = sel[sel["location"].isin(top_locations)]

import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(
    data=subset,
    x="Yield",
    hue="location",
    common_norm=False
)
plt.xlabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Yield distributions at the 10 most frequent locations in {country_name}")
plt.show()
ChatGPT 5.1 prompt: "how to explore bimodal distribution of a dataset"
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import GaussianMixture

df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

country_name = "United States of America"  # select the country to show
sel = df[(df["Country"] == country_name) & df["Yield"].notna()]
x = sel["Yield"].values  # all per-location yield values for the country

# Gaussian Mixture Model (GMM) with two components
gmm = GaussianMixture(n_components=2)
gmm.fit(x.reshape(-1, 1))
means = gmm.means_
weights = gmm.weights_
print("component means:", means.ravel(), "weights:", weights)

# Overlay the fitted mixture density on the histogram
xs = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
logprob = gmm.score_samples(xs)
pdf = np.exp(logprob)

plt.hist(x, bins=60, density=True)
plt.plot(xs, pdf)
plt.xlabel(r"Yield (t ha$^{-1}$)")
plt.show()
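The choice of two components is an assumption. A small follow-up sketch (reusing x from the cell above; X1 is an illustrative name) compares the Bayesian information criterion across component counts to check whether two modes are actually favored:

X1 = x.reshape(-1, 1)
for k in range(1, 6):
    g = GaussianMixture(n_components=k, random_state=0).fit(X1)
    print(f"k={k}: BIC = {g.bic(X1):.1f}")  # lower BIC = better fit/complexity trade-off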
Transform¶
ChatGPT 5.2 prompt: "How to explore the dataset using a PCA plot?"
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# --- Build a matrix: rows=countries, columns=years, values=Yield ---
mat = (
    df.loc[df["Yield"].notna(), ["Country", "year", "Yield"]]
    .pivot_table(index="Country", columns="year", values="Yield", aggfunc="median")
)
# (Optional) keep only years with decent coverage to reduce missingness
min_countries_per_year = int(0.6 * mat.shape[0]) # keep years present for >=60% of countries
mat = mat.loc[:, mat.notna().sum(axis=0) >= min_countries_per_year]
# --- Preprocess: impute missing years + scale ---
X = mat.to_numpy()
X = SimpleImputer(strategy="median").fit_transform(X) # fill missing yields
X = StandardScaler().fit_transform(X) # scale features (years)
# --- PCA to 2D ---
pca = PCA(n_components=2, random_state=0)
Z = pca.fit_transform(X)
# --- Plot ---
country_name = "United States of America" # highlight this one
idx = mat.index.to_numpy()
plt.figure(figsize=(8, 6))
plt.scatter(Z[:, 0], Z[:, 1], s=18, alpha=0.6)
# highlight the selected country (if it exists in the matrix)
if country_name in mat.index:
    i = np.where(idx == country_name)[0][0]
    plt.scatter(Z[i, 0], Z[i, 1], s=120, marker="*", edgecolor="k")
    plt.text(Z[i, 0], Z[i, 1], " " + country_name, va="center")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% var)")
plt.title("PCA of countries by soybean yield time-series (years as features)")
plt.grid(True, alpha=0.2)
plt.show()
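To see which countries sit at the extremes of PC1, the component scores can be ranked (a short follow-up using the Z and mat already computed; scores is an illustrative name):

scores = pd.DataFrame(Z, index=mat.index, columns=["PC1", "PC2"])
print(scores.sort_values("PC1").head(5))  # one extreme of PC1
print(scores.sort_values("PC1").tail(5))  # the other extreme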
Next¶
Analyze the data again using groups of locations that have similar yields, as sketched below.
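A minimal sketch of how such groups could be formed, assuming df still holds the per-location table with lat, lon, and Yield columns; the choice of k = 4 clusters and the 0.01-degree rounding are illustrative, not results from the analysis above:

from sklearn.cluster import KMeans

d = df[df["Yield"].notna()].copy()
d["lat2"] = d["lat"].round(2)   # collapse each grid cell to one location key
d["lon2"] = d["lon"].round(2)
loc = d.groupby(["lat2", "lon2"], as_index=False)["Yield"].median()

km = KMeans(n_clusters=4, n_init=10, random_state=0)
loc["group"] = km.fit_predict(loc[["Yield"]])  # cluster locations by median yield level
print(loc.groupby("group")["Yield"].agg(["count", "mean", "min", "max"]))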