country_name = "United States of America" #select the country to show

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet") 
sel = df[(df["Country"] == country_name) & df["Yield"]]
sel = sel.copy()
sel["location"] = np.column_stack((sel["lat"], sel["lon"])).tolist()
sel["location"] = sel["location"].apply(tuple)
sel.head()

groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]
plt.boxplot(
    data_by_year,
    positions=np.arange(len(years)),
    showfliers=True,  
)
plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

yieldusa = sel["Yield"]
x = yieldusa.values
#
# plot histogram and points
#
plt.hist(x,bins=100,density=True)
plt.title(f"Distribution of soybean yield in {country_name}")
plt.xlabel(r"Yield (t.ha$^{-1}$)")
plt.plot()

[]

mean = x.mean()
print(f"mean  of soybean yield in {country_name}: {mean} t.ha-1")
stddev = x.std()
print(f"standard deviation  of soybean yield in {country_name}: {stddev} t.ha-1")

mean  of soybean yield in United States of America: 2.648063898086548 t.ha-1
standard deviation  of soybean yield in United States of America: 1.0439186096191406 t.ha-1

import seaborn as sns
sns.histplot(x, bins=60,kde=True)
plt.show()

#Kernel density estimation
import numpy as np
from scipy.stats import gaussian_kde

kde = gaussian_kde(x)
xs = np.linspace(min(x), max(x), 1000)

plt.plot(xs, kde(xs))
plt.show()

#Find peaks (modes) numerically
from scipy.signal import find_peaks
density = kde(xs)
peaks, _ = find_peaks(density, distance=100)
xs[peaks]

array([0.7550473, 2.8326888], dtype=float32)

# Gaussian Mixture Models (GMM)
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(x.reshape(-1, 1))

means = gmm.means_
weights = gmm.weights_

means, weights

xs = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
logprob = gmm.score_samples(xs)
pdf = np.exp(logprob)

plt.hist(x, bins=60, density=True)
plt.plot(xs, pdf)
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

xs = np.linspace(x.min(), x.max(), 100)

plt.hist(x, bins=60, density=True, alpha=0.5)

for mean, var, weight in zip(gmm.means_.flatten(),
                             gmm.covariances_.flatten(),
                             gmm.weights_):
    plt.plot(xs, weight * norm.pdf(xs, mean, np.sqrt(var)))

plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(data=sel, x="Yield", hue="year", common_norm=False)
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(data=sel, x="Yield", hue="location", common_norm=False,legend=False)
plt.show()

sel = sel.copy()

sel["location"] = list(
    zip(sel["lat"].round(2), sel["lon"].round(2))
)
top_locations = (
    sel["location"]
    .value_counts()
    .head(10)
    .index
)
subset = sel[sel["location"].isin(top_locations)]
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(
    data=subset,
    x="Yield",
    hue="location",
    common_norm=False
)
plt.show()

low_mode, high_mode = sorted(gmm.means_.flatten())
low_mode, high_mode
loc_means = (
    sel.groupby("location")["Yield"]
       .mean()
       .sort_values()
)

loc_means

location
(36.75, 259.25)    0.464795
(35.25, 256.75)    0.466836
(33.75, 257.75)    0.472078
(33.25, 258.25)    0.485208
(32.75, 257.25)    0.488031
                     ...   
(38.75, 280.75)    4.435028
(41.25, 284.75)    4.440778
(33.75, 267.25)    4.452085
(32.25, 271.75)    4.504502
(35.75, 267.25)    4.544434
Name: Yield, Length: 1409, dtype: float32

sel["component"] = gmm.predict(
    sel["Yield"].values.reshape(-1,1)
)
import pandas as pd
ct = pd.crosstab(
    sel["location"].astype(str),  
    sel["component"],
    normalize="index"
)

ct = ct.sort_values(by=ct.columns.max())
ct

low, high = sorted(gmm.means_.flatten())

ct.columns = [
    "low_yield_mode" if c == ct.columns.min() else "high_yield_mode"
    for c in ct.columns
]

ct

import matplotlib.pyplot as plt
import seaborn as sns

# Select top locations by sample size
top_locations = (
    sel["location"]
    .value_counts()
    .head(8)
    .index
)

subset = sel[sel["location"].isin(top_locations)].copy()

# Convert location to string for plotting
subset["location_str"] = subset["location"].astype(str)

# Plot
plt.figure(figsize=(10, 4))
sns.stripplot(
    data=subset,
    x="location_str",
    y="Yield",
    hue="component",
    dodge=True,
    alpha=0.5,
    jitter=True
)
plt.xticks(rotation=45)
plt.xlabel("Location (lat, lon)")
plt.ylabel("Yield")
plt.title("Yield by location, colored by GMM component")
plt.legend(title="Component")
plt.tight_layout()
plt.show()

# Use string version of location for stats
sel = sel.copy()
sel["location_str"] = sel["location"].astype(str)

from statsmodels.formula.api import ols
import statsmodels.api as sm

model = ols("Yield ~ C(location_str)", data=sel).fit()
anova = sm.stats.anova_lm(model, typ=2)
anova

loc_means = (
    sel.groupby("location")["Yield"]
       .mean()
       .sort_values()
)

loc_means.head(), loc_means.tail()

(location
 (36.75, 259.25)    0.464795
 (35.25, 256.75)    0.466836
 (33.75, 257.75)    0.472078
 (33.25, 258.25)    0.485208
 (32.75, 257.25)    0.488031
 Name: Yield, dtype: float32,
 location
 (38.75, 280.75)    4.435028
 (41.25, 284.75)    4.440778
 (33.75, 267.25)    4.452085
 (32.25, 271.75)    4.504502
 (35.75, 267.25)    4.544434
 Name: Yield, dtype: float32)

sel[["Yield", "lat","lon", "year"]].corr()

sel[["Yield", "lat", "lon"]].corr()

	year	lat	lon	Yield	Country	location
167563	1981	26.25	261.75	0.765011	United States of America	(26.25, 261.75)
167564	1981	26.25	262.25	0.703201	United States of America	(26.25, 262.25)
170444	1981	28.25	262.25	1.045607	United States of America	(28.25, 262.25)
170445	1981	28.25	262.75	1.321302	United States of America	(28.25, 262.75)
171163	1981	28.75	261.75	0.934217	United States of America	(28.75, 261.75)

component	0	1
location
(43.75, 270.25)	1.0	0.0
(40.75, 280.25)	1.0	0.0
(43.25, 274.25)	1.0	0.0
(40.75, 279.75)	1.0	0.0
(39.25, 282.25)	1.0	0.0
...	...	...
(42.25, 259.75)	0.0	1.0
(42.25, 257.25)	0.0	1.0
(35.75, 257.75)	0.0	1.0
(35.25, 261.25)	0.0	1.0
(26.25, 261.75)	0.0	1.0

	low_yield_mode	high_yield_mode
location
(43.75, 270.25)	1.0	0.0
(40.75, 280.25)	1.0	0.0
(43.25, 274.25)	1.0	0.0
(40.75, 279.75)	1.0	0.0
(39.25, 282.25)	1.0	0.0
...	...	...
(42.25, 259.75)	0.0	1.0
(42.25, 257.25)	0.0	1.0
(35.75, 257.75)	0.0	1.0
(35.25, 261.25)	0.0	1.0
(26.25, 261.75)	0.0	1.0

	sum_sq	df	F	PR(>F)
C(location_str)	45629.241783	1408.0	174.282443	0.0
Residual	9071.564035	48786.0	NaN	NaN

	Yield	lat	lon	year
Yield	1.000000	-0.129658	0.661155	0.328514
lat	-0.129658	1.000000	-0.237207	0.009547
lon	0.661155	-0.237207	1.000000	0.000773
year	0.328514	0.009547	0.000773	1.000000

Probability distribution¶