Transform
ChatGPT 5.2 prompt: "How to explore the dataset using a PCA plot?"
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Country whose rows are extracted into `sel`.
country_name = "United States of America"

# Load the soybean yield table; the code below reads its "Country" and
# "Yield" columns.
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

# Keep the selected country's rows that have a yield value.
# The mask uses an explicit `.notna()` rather than the raw "Yield" column:
# and-ing a float column into a boolean mask makes pandas coerce it by
# truthiness, which would also discard legitimate rows whose yield is 0.
# `.copy()` detaches `sel` from `df` so later edits to `sel` neither touch
# `df` nor trigger chained-assignment warnings.
sel = df.loc[(df["Country"] == country_name) & df["Yield"].notna()].copy()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Reshape the long table into a wide one: one row per country, one column
# per year, each cell holding the median of the yields recorded for that pair.
has_yield = df["Yield"].notna()
mat = df.loc[has_yield, ["Country", "year", "Yield"]].pivot_table(
    index="Country",
    columns="year",
    values="Yield",
    aggfunc="median",
)

# Discard sparsely covered years: a year column survives only when at least
# 60% of the countries have a value for it.
min_countries_per_year = int(0.6 * mat.shape[0])
countries_per_year = mat.notna().sum(axis=0)
mat = mat.loc[:, countries_per_year >= min_countries_per_year]

# Fill the remaining gaps by median imputation, then standardise the
# feature columns (the years) before running PCA.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
X = scaler.fit_transform(imputer.fit_transform(mat.to_numpy()))

# Project every country onto the first two principal components.
pca = PCA(n_components=2, random_state=0)
Z = pca.fit_transform(X)

# Scatter plot of all countries in PC space, with one country emphasised.
country_name = "United States of America"  # country to highlight
idx = mat.index.to_numpy()
explained_pct = pca.explained_variance_ratio_ * 100

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Z[:, 0], Z[:, 1], s=18, alpha=0.6)

# Mark and label the chosen country only when it is present in the matrix.
if country_name in mat.index:
    i = np.flatnonzero(idx == country_name)[0]
    ax.scatter(Z[i, 0], Z[i, 1], s=120, marker="*", edgecolor="k")
    ax.text(Z[i, 0], Z[i, 1], " " + country_name, va="center")

ax.set_xlabel(f"PC1 ({explained_pct[0]:.1f}% var)")
ax.set_ylabel(f"PC2 ({explained_pct[1]:.1f}% var)")
ax.set_title("PCA of countries by soybean yield time-series (years as features)")
ax.grid(True, alpha=0.2)
plt.show()