Probability distribution¶
InĀ [1]:
country_name = "United States of America" #select the country to show
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")
sel = df[(df["Country"] == country_name) & df["Yield"]]
sel = sel.copy()
sel["location"] = np.column_stack((sel["lat"], sel["lon"])).tolist()
sel["location"] = sel["location"].apply(tuple)
sel.head()
Out[1]:
| year | lat | lon | Yield | Country | location | |
|---|---|---|---|---|---|---|
| 167563 | 1981 | 26.25 | 261.75 | 0.765011 | United States of America | (26.25, 261.75) |
| 167564 | 1981 | 26.25 | 262.25 | 0.703201 | United States of America | (26.25, 262.25) |
| 170444 | 1981 | 28.25 | 262.25 | 1.045607 | United States of America | (28.25, 262.25) |
| 170445 | 1981 | 28.25 | 262.75 | 1.321302 | United States of America | (28.25, 262.75) |
| 171163 | 1981 | 28.75 | 261.75 | 0.934217 | United States of America | (28.75, 261.75) |
InĀ [2]:
groups = sel.groupby("year")
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]
plt.boxplot(
data_by_year,
positions=np.arange(len(years)),
showfliers=True,
)
plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()
InĀ [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
yieldusa = sel["Yield"]
x = yieldusa.values
#
# plot histogram and points
#
plt.hist(x,bins=100,density=True)
plt.title(f"Distribution of soybean yield in {country_name}")
plt.xlabel(r"Yield (t.ha$^{-1}$)")
plt.plot()
Out[3]:
[]
InĀ [4]:
mean = x.mean()
print(f"mean of soybean yield in {country_name}: {mean} t.ha-1")
stddev = x.std()
print(f"standard deviation of soybean yield in {country_name}: {stddev} t.ha-1")
mean of soybean yield in United States of America: 2.648063898086548 t.ha-1 standard deviation of soybean yield in United States of America: 1.0439186096191406 t.ha-1
Chatgpt5.1 prompt "how to explore bimodal distribution of a dataset"
InĀ [5]:
import seaborn as sns
sns.histplot(x, bins=60,kde=True)
plt.show()
InĀ [6]:
#Kernel density estimation
import numpy as np
from scipy.stats import gaussian_kde
kde = gaussian_kde(x)
xs = np.linspace(min(x), max(x), 1000)
plt.plot(xs, kde(xs))
plt.show()
InĀ [7]:
#Find peaks (modes) numerically
from scipy.signal import find_peaks
density = kde(xs)
peaks, _ = find_peaks(density, distance=100)
xs[peaks]
Out[7]:
array([0.7550473, 2.8326888], dtype=float32)
InĀ [22]:
# Gaussian Mixture Models (GMM)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
gmm.fit(x.reshape(-1, 1))
means = gmm.means_
weights = gmm.weights_
means, weights
xs = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
logprob = gmm.score_samples(xs)
pdf = np.exp(logprob)
plt.hist(x, bins=60, density=True)
plt.plot(xs, pdf)
plt.show()
InĀ [9]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
xs = np.linspace(x.min(), x.max(), 100)
plt.hist(x, bins=60, density=True, alpha=0.5)
for mean, var, weight in zip(gmm.means_.flatten(),
gmm.covariances_.flatten(),
gmm.weights_):
plt.plot(xs, weight * norm.pdf(xs, mean, np.sqrt(var)))
plt.show()
InĀ [10]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(data=sel, x="Yield", hue="year", common_norm=False)
plt.show()
InĀ [11]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(data=sel, x="Yield", hue="location", common_norm=False,legend=False)
plt.show()
ChatGPT5.1 "I suspect that the bimodal is coming from different location, how to explore it?"
InĀ [12]:
sel = sel.copy()
sel["location"] = list(
zip(sel["lat"].round(2), sel["lon"].round(2))
)
top_locations = (
sel["location"]
.value_counts()
.head(10)
.index
)
subset = sel[sel["location"].isin(top_locations)]
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(
data=subset,
x="Yield",
hue="location",
common_norm=False
)
plt.show()
InĀ [Ā ]:
InĀ [13]:
low_mode, high_mode = sorted(gmm.means_.flatten())
low_mode, high_mode
loc_means = (
sel.groupby("location")["Yield"]
.mean()
.sort_values()
)
loc_means
Out[13]:
location
(36.75, 259.25) 0.464795
(35.25, 256.75) 0.466836
(33.75, 257.75) 0.472078
(33.25, 258.25) 0.485208
(32.75, 257.25) 0.488031
...
(38.75, 280.75) 4.435028
(41.25, 284.75) 4.440778
(33.75, 267.25) 4.452085
(32.25, 271.75) 4.504502
(35.75, 267.25) 4.544434
Name: Yield, Length: 1409, dtype: float32
InĀ [14]:
sel["component"] = gmm.predict(
sel["Yield"].values.reshape(-1,1)
)
import pandas as pd
ct = pd.crosstab(
sel["location"].astype(str),
sel["component"],
normalize="index"
)
ct = ct.sort_values(by=ct.columns.max())
ct
Out[14]:
| component | 0 | 1 |
|---|---|---|
| location | ||
| (43.75, 270.25) | 1.0 | 0.0 |
| (40.75, 280.25) | 1.0 | 0.0 |
| (43.25, 274.25) | 1.0 | 0.0 |
| (40.75, 279.75) | 1.0 | 0.0 |
| (39.25, 282.25) | 1.0 | 0.0 |
| ... | ... | ... |
| (42.25, 259.75) | 0.0 | 1.0 |
| (42.25, 257.25) | 0.0 | 1.0 |
| (35.75, 257.75) | 0.0 | 1.0 |
| (35.25, 261.25) | 0.0 | 1.0 |
| (26.25, 261.75) | 0.0 | 1.0 |
1409 rows Ć 2 columns
InĀ [15]:
low, high = sorted(gmm.means_.flatten())
ct.columns = [
"low_yield_mode" if c == ct.columns.min() else "high_yield_mode"
for c in ct.columns
]
ct
Out[15]:
| low_yield_mode | high_yield_mode | |
|---|---|---|
| location | ||
| (43.75, 270.25) | 1.0 | 0.0 |
| (40.75, 280.25) | 1.0 | 0.0 |
| (43.25, 274.25) | 1.0 | 0.0 |
| (40.75, 279.75) | 1.0 | 0.0 |
| (39.25, 282.25) | 1.0 | 0.0 |
| ... | ... | ... |
| (42.25, 259.75) | 0.0 | 1.0 |
| (42.25, 257.25) | 0.0 | 1.0 |
| (35.75, 257.75) | 0.0 | 1.0 |
| (35.25, 261.25) | 0.0 | 1.0 |
| (26.25, 261.75) | 0.0 | 1.0 |
1409 rows Ć 2 columns
InĀ [16]:
import matplotlib.pyplot as plt
import seaborn as sns
# Select top locations by sample size
top_locations = (
sel["location"]
.value_counts()
.head(8)
.index
)
subset = sel[sel["location"].isin(top_locations)].copy()
# Convert location to string for plotting
subset["location_str"] = subset["location"].astype(str)
# Plot
plt.figure(figsize=(10, 4))
sns.stripplot(
data=subset,
x="location_str",
y="Yield",
hue="component",
dodge=True,
alpha=0.5,
jitter=True
)
plt.xticks(rotation=45)
plt.xlabel("Location (lat, lon)")
plt.ylabel("Yield")
plt.title("Yield by location, colored by GMM component")
plt.legend(title="Component")
plt.tight_layout()
plt.show()
InĀ [17]:
# Use string version of location for stats
sel = sel.copy()
sel["location_str"] = sel["location"].astype(str)
from statsmodels.formula.api import ols
import statsmodels.api as sm
model = ols("Yield ~ C(location_str)", data=sel).fit()
anova = sm.stats.anova_lm(model, typ=2)
anova
Out[17]:
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(location_str) | 45629.241783 | 1408.0 | 174.282443 | 0.0 |
| Residual | 9071.564035 | 48786.0 | NaN | NaN |
InĀ [18]:
loc_means = (
sel.groupby("location")["Yield"]
.mean()
.sort_values()
)
loc_means.head(), loc_means.tail()
Out[18]:
(location (36.75, 259.25) 0.464795 (35.25, 256.75) 0.466836 (33.75, 257.75) 0.472078 (33.25, 258.25) 0.485208 (32.75, 257.25) 0.488031 Name: Yield, dtype: float32, location (38.75, 280.75) 4.435028 (41.25, 284.75) 4.440778 (33.75, 267.25) 4.452085 (32.25, 271.75) 4.504502 (35.75, 267.25) 4.544434 Name: Yield, dtype: float32)
InĀ [19]:
sel[["Yield", "lat","lon", "year"]].corr()
Out[19]:
| Yield | lat | lon | year | |
|---|---|---|---|---|
| Yield | 1.000000 | -0.129658 | 0.661155 | 0.328514 |
| lat | -0.129658 | 1.000000 | -0.237207 | 0.009547 |
| lon | 0.661155 | -0.237207 | 1.000000 | 0.000773 |
| year | 0.328514 | 0.009547 | 0.000773 | 1.000000 |
InĀ [20]:
sel[["Yield", "lat", "lon"]].corr()
Out[20]:
| Yield | lat | lon | |
|---|---|---|---|
| Yield | 1.000000 | -0.129658 | 0.661155 |
| lat | -0.129658 | 1.000000 | -0.237207 |
| lon | 0.661155 | -0.237207 | 1.000000 |
InĀ [Ā ]: