country_name = "United States of America"  # country to show: compared against df["Country"] when building `sel`

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the soybean yield table. The columns read in this file are
# "Country", "year" and "Yield".
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

# Rows of the selected country that actually carry a yield value.
# The mask tests for missing values explicitly with .notna(): AND-ing the
# boolean country mask with the raw numeric "Yield" column relies on implicit
# numeric-to-bool coercion, whose result depends on the column dtype (bitwise
# AND for integer values), and it is inconsistent with the explicit .notna()
# filter used when the country x year matrix is built.
# NOTE(review): rows whose Yield is exactly 0 are now kept; add
# `& (df["Yield"] != 0)` if zeros stand for missing data in this dataset.
# .copy() gives an independent frame so later column assignments on `sel`
# do not trigger SettingWithCopyWarning.
sel = df[(df["Country"] == country_name) & df["Yield"].notna()].copy()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# --- Country x year matrix: one row per country, one column per year,
#     each cell holding the median Yield of that (country, year) pair ---
mat = pd.pivot_table(
    df.loc[df["Yield"].notna(), ["Country", "year", "Yield"]],
    index="Country",
    columns="year",
    values="Yield",
    aggfunc="median",
)

# (Optional) drop sparsely covered years to reduce missingness:
# a year survives only if at least 60% of the countries have a value for it.
min_countries_per_year = int(0.6 * len(mat))
year_coverage = mat.notna().sum(axis=0)  # number of countries with data, per year
mat = mat.loc[:, year_coverage >= min_countries_per_year]

# --- Preprocess: fill the missing yields with the median strategy, then
#     scale the features (years) ---
X = StandardScaler().fit_transform(
    SimpleImputer(strategy="median").fit_transform(mat.to_numpy())
)

# --- Project every country onto the first two principal components ---
pca = PCA(random_state=0, n_components=2)
Z = pca.fit_transform(X)

# --- Plot: all countries in PC space, with one country starred ---
country_name = "United States of America"  # highlight this one
idx = mat.index.to_numpy()

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Z[:, 0], Z[:, 1], s=18, alpha=0.6)

# Star and label the selected country, but only when it is a row of the matrix.
if country_name in mat.index:
    i = np.where(idx == country_name)[0][0]
    star_x, star_y = Z[i, 0], Z[i, 1]
    ax.scatter(star_x, star_y, s=120, marker="*", edgecolor="k")
    ax.text(star_x, star_y, "  " + country_name, va="center")

# Axis labels report the share of variance explained by each component.
ax.set(
    xlabel=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% var)",
    ylabel=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% var)",
    title="PCA of countries by soybean yield time-series (years as features)",
)
ax.grid(True, alpha=0.2)
plt.show()
# --- Transform ---