import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")

country_name = "United States of America" #select the country to show

sel = df[(df["Country"] == country_name) & df["Yield"]]
groups = sel.groupby("year")  
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]

plt.figure(figsize=(12, 5))
plt.boxplot(
    data_by_year,
    positions=np.arange(len(years)),
    showfliers=True,  
)

plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()

print(sel)

         year    lat     lon     Yield                   Country
167563   1981  26.25  261.75  0.765011  United States of America
167564   1981  26.25  262.25  0.703201  United States of America
170444   1981  28.25  262.25  1.045607  United States of America
170445   1981  28.25  262.75  1.321302  United States of America
171163   1981  28.75  261.75  0.934217  United States of America
...       ...    ...     ...       ...                       ...
9271965  2016  48.75  262.75  2.287417  United States of America
9271966  2016  48.75  263.25  2.532017  United States of America
9271967  2016  48.75  263.75  3.050681  United States of America
9271968  2016  48.75  264.25  3.583090  United States of America
9271969  2016  48.75  264.75  3.735792  United States of America

[50195 rows x 5 columns]

# test 
sel_median = sel.groupby("year")["Yield"].median()
x = sel_median.index.to_numpy() 
y = sel_median.values 
print(sel_median)

year
1981    2.079195
1982    2.312607
1983    1.969006
1984    2.104958
1985    2.527668
1986    2.423721
1987    2.504457
1988    2.031349
1989    2.464397
1990    2.563608
1991    2.538106
1992    2.726767
1993    2.450094
1994    2.954587
1995    2.661250
1996    2.806201
1997    2.875361
1998    2.848866
1999    2.674586
2000    2.756408
2001    2.914491
2002    2.801623
2003    2.473367
2004    3.084394
2005    3.136048
2006    3.131395
2007    2.998413
2008    2.905602
2009    3.185418
2010    3.156585
2011    3.071516
2012    2.933459
2013    3.188277
2014    3.468918
2015    3.444777
2016    3.733463
Name: Yield, dtype: float32

xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1

coeff1 = np.polyfit(x,y,1) # fit first-order polynomial/linear
coeff2 = np.polyfit(x,y,3) # fit second-order polynomial/quadratic
xfit = np.arange(xmin,xmax,(xmax-xmin)/npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
plt.scatter(
    x,
    y, 
    label= f"Median by year",
)
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")

plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()

plt.tight_layout()
plt.show()

first-order fit coefficients: [ 3.51600937e-02 -6.74924212e+01]
second-order fit coefficients: [ 7.22950563e-05 -4.33412504e-01  8.66131059e+02 -5.76969747e+05]

import numpy as np
import matplotlib.pyplot as plt
xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
ncenters = npts
indices = np.random.uniform(low=2,high=len(x),size=ncenters).astype(int) # choose random RBF centers from data
centers = x[indices]
M = np.abs(np.outer(x,np.ones(ncenters)) # construct matrix of basis terms
   -np.outer(np.ones(npts),centers))**3
b,residuals,rank,values = np.linalg.lstsq(M,y) # do SVD fit
xfit = np.linspace(xmin,xmax,npts)
yfit = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@b # evaluate fit
plt.scatter(
    x,
    y, 
    label= f"Median by year",
)
plt.plot(xfit,yfit,'g-',label='RBF fit')
#plt.plot(xfit,(xfit-centers[0])**3,color=(0.75,0.75,0.75),label='$r^3$ basis functions')
#for i in range(ncenters):
#    plt.plot(xfit,np.abs(xfit-centers[i])**3,color=(0.75,0.75,0.75))
#plt.ylim(2,4)
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.legend()
plt.show()

import numpy as np
import matplotlib.pyplot as plt

xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
#xmin = -1
#xmax = 2
#noise = 1
#npts = 100
#c = [-3,2,1]
#np.random.seed(10)
#x = xmin+(xmax-xmin)*np.random.rand(npts)
#y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts)


# generate Vandermonde matrix
#
indices = np.random.uniform(low=0,high=len(x),size=ncenters).astype(int)
centers = x[indices]
M = np.abs(np.outer(x,np.ones(ncenters))
   -np.outer(np.ones(npts),centers))**3
#
# invert matrices to find weight-regularized coefficients
#
Lambda = 20
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff1 = np.linalg.pinv(left)@right
Lambda = 0
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff0 = np.linalg.pinv(left)@right
#
# plot fits
#
xfit = np.linspace(xmin,xmax,npts)
yfit0 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff0
yfit1 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff1
plt.scatter(
    x,
    y, 
    label= f"Median by year",
)
plt.plot(x,y,'o',alpha=0.5)
plt.plot(xfit,yfit0,'y-',label=r'$\lambda=0$')
plt.plot(xfit,yfit1,'r-',label=r'$\lambda=20$')
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.legend()
plt.show()

Week 2 - Fitting a function¶

Polynomial fit¶

Radial basis function (RBF)¶

Tikhonov regularization¶