Jeoffrey George - Fab Futures - Data Science
Home About

< Week 2 - Machine learning ...Week 3 - Density estimation >

Probability distribution¶

InĀ [1]:
country_name = "United States of America" #select the country to show

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet") 
sel = df[(df["Country"] == country_name) & df["Yield"]]
sel = sel.copy()
sel["location"] = np.column_stack((sel["lat"], sel["lon"])).tolist()
sel["location"] = sel["location"].apply(tuple)
sel.head()
Out[1]:
year lat lon Yield Country location
167563 1981 26.25 261.75 0.765011 United States of America (26.25, 261.75)
167564 1981 26.25 262.25 0.703201 United States of America (26.25, 262.25)
170444 1981 28.25 262.25 1.045607 United States of America (28.25, 262.25)
170445 1981 28.25 262.75 1.321302 United States of America (28.25, 262.75)
171163 1981 28.75 261.75 0.934217 United States of America (28.75, 261.75)
InĀ [2]:
groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]
plt.boxplot(
    data_by_year,
    positions=np.arange(len(years)),
    showfliers=True,  
)
plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

yieldusa = sel["Yield"]
x = yieldusa.values
#
# plot histogram and points
#
plt.hist(x,bins=100,density=True)
plt.title(f"Distribution of soybean yield in {country_name}")
plt.xlabel(r"Yield (t.ha$^{-1}$)")
plt.plot()
Out[3]:
[]
No description has been provided for this image
InĀ [4]:
mean = x.mean()
print(f"mean  of soybean yield in {country_name}: {mean} t.ha-1")
stddev = x.std()
print(f"standard deviation  of soybean yield in {country_name}: {stddev} t.ha-1")
mean  of soybean yield in United States of America: 2.648063898086548 t.ha-1
standard deviation  of soybean yield in United States of America: 1.0439186096191406 t.ha-1

Chatgpt5.1 prompt "how to explore bimodal distribution of a dataset"

InĀ [5]:
import seaborn as sns
sns.histplot(x, bins=60,kde=True)
plt.show()
No description has been provided for this image
InĀ [6]:
#Kernel density estimation
import numpy as np
from scipy.stats import gaussian_kde

kde = gaussian_kde(x)
xs = np.linspace(min(x), max(x), 1000)

plt.plot(xs, kde(xs))
plt.show()
No description has been provided for this image
InĀ [7]:
#Find peaks (modes) numerically
from scipy.signal import find_peaks
density = kde(xs)
peaks, _ = find_peaks(density, distance=100)
xs[peaks]
Out[7]:
array([0.7550473, 2.8326888], dtype=float32)
InĀ [22]:
# Gaussian Mixture Models (GMM)
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(x.reshape(-1, 1))

means = gmm.means_
weights = gmm.weights_

means, weights

xs = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
logprob = gmm.score_samples(xs)
pdf = np.exp(logprob)

plt.hist(x, bins=60, density=True)
plt.plot(xs, pdf)
plt.show()
No description has been provided for this image
InĀ [9]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

xs = np.linspace(x.min(), x.max(), 100)

plt.hist(x, bins=60, density=True, alpha=0.5)

for mean, var, weight in zip(gmm.means_.flatten(),
                             gmm.covariances_.flatten(),
                             gmm.weights_):
    plt.plot(xs, weight * norm.pdf(xs, mean, np.sqrt(var)))

plt.show()
No description has been provided for this image
InĀ [10]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(data=sel, x="Yield", hue="year", common_norm=False)
plt.show()
No description has been provided for this image
InĀ [11]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(data=sel, x="Yield", hue="location", common_norm=False,legend=False)
plt.show()
No description has been provided for this image

ChatGPT5.1 "I suspect that the bimodal is coming from different location, how to explore it?"

InĀ [12]:
sel = sel.copy()

sel["location"] = list(
    zip(sel["lat"].round(2), sel["lon"].round(2))
)
top_locations = (
    sel["location"]
    .value_counts()
    .head(10)
    .index
)
subset = sel[sel["location"].isin(top_locations)]
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(
    data=subset,
    x="Yield",
    hue="location",
    common_norm=False
)
plt.show()
No description has been provided for this image
InĀ [Ā ]:
 
InĀ [13]:
low_mode, high_mode = sorted(gmm.means_.flatten())
low_mode, high_mode
loc_means = (
    sel.groupby("location")["Yield"]
       .mean()
       .sort_values()
)

loc_means
Out[13]:
location
(36.75, 259.25)    0.464795
(35.25, 256.75)    0.466836
(33.75, 257.75)    0.472078
(33.25, 258.25)    0.485208
(32.75, 257.25)    0.488031
                     ...   
(38.75, 280.75)    4.435028
(41.25, 284.75)    4.440778
(33.75, 267.25)    4.452085
(32.25, 271.75)    4.504502
(35.75, 267.25)    4.544434
Name: Yield, Length: 1409, dtype: float32
InĀ [14]:
sel["component"] = gmm.predict(
    sel["Yield"].values.reshape(-1,1)
)
import pandas as pd
ct = pd.crosstab(
    sel["location"].astype(str),  
    sel["component"],
    normalize="index"
)

ct = ct.sort_values(by=ct.columns.max())
ct
Out[14]:
component 0 1
location
(43.75, 270.25) 1.0 0.0
(40.75, 280.25) 1.0 0.0
(43.25, 274.25) 1.0 0.0
(40.75, 279.75) 1.0 0.0
(39.25, 282.25) 1.0 0.0
... ... ...
(42.25, 259.75) 0.0 1.0
(42.25, 257.25) 0.0 1.0
(35.75, 257.75) 0.0 1.0
(35.25, 261.25) 0.0 1.0
(26.25, 261.75) 0.0 1.0

1409 rows Ɨ 2 columns

InĀ [15]:
low, high = sorted(gmm.means_.flatten())

ct.columns = [
    "low_yield_mode" if c == ct.columns.min() else "high_yield_mode"
    for c in ct.columns
]

ct
Out[15]:
low_yield_mode high_yield_mode
location
(43.75, 270.25) 1.0 0.0
(40.75, 280.25) 1.0 0.0
(43.25, 274.25) 1.0 0.0
(40.75, 279.75) 1.0 0.0
(39.25, 282.25) 1.0 0.0
... ... ...
(42.25, 259.75) 0.0 1.0
(42.25, 257.25) 0.0 1.0
(35.75, 257.75) 0.0 1.0
(35.25, 261.25) 0.0 1.0
(26.25, 261.75) 0.0 1.0

1409 rows Ɨ 2 columns

InĀ [16]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select top locations by sample size
top_locations = (
    sel["location"]
    .value_counts()
    .head(8)
    .index
)

subset = sel[sel["location"].isin(top_locations)].copy()

# Convert location to string for plotting
subset["location_str"] = subset["location"].astype(str)

# Plot
plt.figure(figsize=(10, 4))
sns.stripplot(
    data=subset,
    x="location_str",
    y="Yield",
    hue="component",
    dodge=True,
    alpha=0.5,
    jitter=True
)
plt.xticks(rotation=45)
plt.xlabel("Location (lat, lon)")
plt.ylabel("Yield")
plt.title("Yield by location, colored by GMM component")
plt.legend(title="Component")
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [17]:
# Use string version of location for stats
sel = sel.copy()
sel["location_str"] = sel["location"].astype(str)

from statsmodels.formula.api import ols
import statsmodels.api as sm

model = ols("Yield ~ C(location_str)", data=sel).fit()
anova = sm.stats.anova_lm(model, typ=2)
anova
Out[17]:
sum_sq df F PR(>F)
C(location_str) 45629.241783 1408.0 174.282443 0.0
Residual 9071.564035 48786.0 NaN NaN
InĀ [18]:
loc_means = (
    sel.groupby("location")["Yield"]
       .mean()
       .sort_values()
)

loc_means.head(), loc_means.tail()
Out[18]:
(location
 (36.75, 259.25)    0.464795
 (35.25, 256.75)    0.466836
 (33.75, 257.75)    0.472078
 (33.25, 258.25)    0.485208
 (32.75, 257.25)    0.488031
 Name: Yield, dtype: float32,
 location
 (38.75, 280.75)    4.435028
 (41.25, 284.75)    4.440778
 (33.75, 267.25)    4.452085
 (32.25, 271.75)    4.504502
 (35.75, 267.25)    4.544434
 Name: Yield, dtype: float32)
InĀ [19]:
sel[["Yield", "lat","lon", "year"]].corr()
Out[19]:
Yield lat lon year
Yield 1.000000 -0.129658 0.661155 0.328514
lat -0.129658 1.000000 -0.237207 0.009547
lon 0.661155 -0.237207 1.000000 0.000773
year 0.328514 0.009547 0.000773 1.000000
InĀ [20]:
sel[["Yield", "lat", "lon"]].corr()
Out[20]:
Yield lat lon
Yield 1.000000 -0.129658 0.661155
lat -0.129658 1.000000 -0.237207
lon 0.661155 -0.237207 1.000000
InĀ [Ā ]: