Week 2 - Fitting a function¶
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_parquet("/home/jovyan/work/jeogeorge/datasets/soybean_yield_country.parquet")
In [2]:
country_name = "United States of America" #select the country to show
In [3]:
sel = df[(df["Country"] == country_name) & df["Yield"]]
groups = sel.groupby("year")
years = sorted(sel["year"].unique())
data_by_year = [groups.get_group(y)["Yield"].values for y in years]
plt.figure(figsize=(12, 5))
plt.boxplot(
data_by_year,
positions=np.arange(len(years)),
showfliers=True,
)
plt.xticks(np.arange(len(years)), years, rotation=90)
plt.xlabel("Year")
plt.ylabel(r"Yield (t ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.tight_layout()
plt.show()
Polynomial fit¶
In [4]:
print(sel)
year lat lon Yield Country 167563 1981 26.25 261.75 0.765011 United States of America 167564 1981 26.25 262.25 0.703201 United States of America 170444 1981 28.25 262.25 1.045607 United States of America 170445 1981 28.25 262.75 1.321302 United States of America 171163 1981 28.75 261.75 0.934217 United States of America ... ... ... ... ... ... 9271965 2016 48.75 262.75 2.287417 United States of America 9271966 2016 48.75 263.25 2.532017 United States of America 9271967 2016 48.75 263.75 3.050681 United States of America 9271968 2016 48.75 264.25 3.583090 United States of America 9271969 2016 48.75 264.75 3.735792 United States of America [50195 rows x 5 columns]
In [5]:
# test
sel_median = sel.groupby("year")["Yield"].median()
x = sel_median.index.to_numpy()
y = sel_median.values
print(sel_median)
year 1981 2.079195 1982 2.312607 1983 1.969006 1984 2.104958 1985 2.527668 1986 2.423721 1987 2.504457 1988 2.031349 1989 2.464397 1990 2.563608 1991 2.538106 1992 2.726767 1993 2.450094 1994 2.954587 1995 2.661250 1996 2.806201 1997 2.875361 1998 2.848866 1999 2.674586 2000 2.756408 2001 2.914491 2002 2.801623 2003 2.473367 2004 3.084394 2005 3.136048 2006 3.131395 2007 2.998413 2008 2.905602 2009 3.185418 2010 3.156585 2011 3.071516 2012 2.933459 2013 3.188277 2014 3.468918 2015 3.444777 2016 3.733463 Name: Yield, dtype: float32
In [6]:
xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial/linear
coeff2 = np.polyfit(x,y,3) # fit second-order polynomial/quadratic
xfit = np.arange(xmin,xmax,(xmax-xmin)/npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
plt.scatter(
x,
y,
label= f"Median by year",
)
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()
plt.tight_layout()
plt.show()
first-order fit coefficients: [ 3.51600937e-02 -6.74924212e+01] second-order fit coefficients: [ 7.22950563e-05 -4.33412504e-01 8.66131059e+02 -5.76969747e+05]
Radial basis function (RBF)¶
In [7]:
import numpy as np
import matplotlib.pyplot as plt
xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
ncenters = npts
indices = np.random.uniform(low=2,high=len(x),size=ncenters).astype(int) # choose random RBF centers from data
centers = x[indices]
M = np.abs(np.outer(x,np.ones(ncenters)) # construct matrix of basis terms
-np.outer(np.ones(npts),centers))**3
b,residuals,rank,values = np.linalg.lstsq(M,y) # do SVD fit
xfit = np.linspace(xmin,xmax,npts)
yfit = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@b # evaluate fit
plt.scatter(
x,
y,
label= f"Median by year",
)
plt.plot(xfit,yfit,'g-',label='RBF fit')
#plt.plot(xfit,(xfit-centers[0])**3,color=(0.75,0.75,0.75),label='$r^3$ basis functions')
#for i in range(ncenters):
# plt.plot(xfit,np.abs(xfit-centers[i])**3,color=(0.75,0.75,0.75))
#plt.ylim(2,4)
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.legend()
plt.show()
Tikhonov regularization¶
In [8]:
import numpy as np
import matplotlib.pyplot as plt
xmin = x.min()
xmax = x.max()
npts = xmax-xmin+1
#xmin = -1
#xmax = 2
#noise = 1
#npts = 100
#c = [-3,2,1]
#np.random.seed(10)
#x = xmin+(xmax-xmin)*np.random.rand(npts)
#y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts)
# generate Vandermonde matrix
#
indices = np.random.uniform(low=0,high=len(x),size=ncenters).astype(int)
centers = x[indices]
M = np.abs(np.outer(x,np.ones(ncenters))
-np.outer(np.ones(npts),centers))**3
#
# invert matrices to find weight-regularized coefficients
#
Lambda = 20
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff1 = np.linalg.pinv(left)@right
Lambda = 0
left = (M.T)@M+Lambda*np.eye(ncenters)
right = (M.T)@y
coeff0 = np.linalg.pinv(left)@right
#
# plot fits
#
xfit = np.linspace(xmin,xmax,npts)
yfit0 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff0
yfit1 = (np.abs(np.outer(xfit,np.ones(ncenters))-np.outer(np.ones(npts),centers))**3)@coeff1
plt.scatter(
x,
y,
label= f"Median by year",
)
plt.plot(x,y,'o',alpha=0.5)
plt.plot(xfit,yfit0,'y-',label=r'$\lambda=0$')
plt.plot(xfit,yfit1,'r-',label=r'$\lambda=20$')
plt.xlabel("Year")
plt.ylabel(r"Yield (t.ha$^{-1}$)")
plt.title(f"Distribution of soybean yield in {country_name} by year")
plt.legend()
plt.show()