import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load and prepare real data (replace path if needed)
df = pd.read_csv('datasets/ICRISAT-District Level Data.csv')
df.replace(-1, np.nan, inplace=True)

durg_rice = df[(df['Dist Name'] == 'Durg') & (df['Year'] >= 1966) & (df['Year'] <= 2000)][['Year', 'RICE YIELD (Kg per ha)']].dropna()
durg_rice['Year'] = durg_rice['Year'].astype(int)

x = durg_rice['Year'].values  # x: Years
y = durg_rice['RICE YIELD (Kg per ha)'].values  # y: Yields

xmin = x.min()
xmax = x.max()
npts = len(x)

np.set_printoptions(precision=3)
np.random.seed(10)  # For reproducibility (not used in generation)

print(f"Dataset: Rice Yield in Durg District (1966-2000)")
print(f"Number of points: {npts}")

# Fit polynomials (no synthetic c or noise addition)
coeff1 = np.polyfit(x, y, 1)  # Linear fit
coeff2 = np.polyfit(x, y, 2)  # Quadratic fit

xfit = np.linspace(xmin, xmax, npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit)  # Evaluate linear fit
print(f"first-order fit coefficients: {coeff1}")

pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit)  # Evaluate quadratic fit
print(f"second-order fit coefficients: {coeff2}")

# Plot
plt.figure()
plt.plot(x, y, 'o', label='Observed')  # Real data points
plt.plot(xfit, yfit1, 'g-', label='linear')
plt.plot(xfit, yfit2, 'r-', label='quadratic')
plt.xlabel('Year')
plt.ylabel('Rice Yield (kg/ha)')
plt.title('Rice Yield Trend in Durg District')
plt.legend()
plt.show()

Dataset: Rice Yield in Durg District (1966-2000)
Number of points: 35
first-order fit coefficients: [ 1.738e+01 -3.359e+04]
second-order fit coefficients: [-6.381e-02  2.704e+02 -2.845e+05]

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load & prepare
df = pd.read_csv('datasets/ICRISAT-District Level Data.csv')
df.replace(-1, np.nan, inplace=True)
durg_rice = df[(df['Dist Name'] == 'Durg') & (df['Year'] >= 1966) & (df['Year'] <= 2000)][['Year', 'RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)']].dropna()
durg_rice['Year'] = durg_rice['Year'].astype(int)

x, y, z = durg_rice['Year'].values, durg_rice['RICE AREA (1000 ha)'].values, durg_rice['RICE PRODUCTION (1000 tons)'].values

# Vandermonde & fit
M = np.c_[np.ones(len(x)), x, y, x*y, x**2, y**2]
cfit, _, _, _ = np.linalg.lstsq(M, z, rcond=None)

# Grid for surface
x_grid = np.linspace(x.min(), x.max(), 20)
y_grid = np.linspace(y.min(), y.max(), 20)
X, Y = np.meshgrid(x_grid, y_grid)
Z_fit = (cfit[0] + cfit[1]*X + cfit[2]*Y + cfit[3]*X*Y + cfit[4]*X**2 + cfit[5]*Y**2)

# Plot: Contour (top view of surface)
plt.figure(figsize=(8, 6))
plt.contourf(X, Y, Z_fit, levels=20, cmap='viridis')
plt.colorbar(label='Production (1000 tons)')
plt.scatter(x, y, c=z, edgecolors='black', s=30, label='Observed')
plt.xlabel('Year'); plt.ylabel('Rice Area (1000 ha)')
plt.title('2D Polynomial Contour: Rice Production in Durg')
plt.legend(); plt.show()

import numpy as np
import pandas as pd
from scipy.interpolate import RBFInterpolator
import matplotlib.pyplot as plt

# Load and prepare data
df = pd.read_csv('datasets/ICRISAT-District Level Data.csv')  # Ensure CSV path is correct
df.replace(-1, np.nan, inplace=True)
durg_rice = df[(df['Dist Name'] == 'Durg') & (df['Year'] >= 1966) & (df['Year'] <= 2000)][['Year', 'RICE YIELD (Kg per ha)']].dropna()
durg_rice['Year'] = durg_rice['Year'].astype(int)

x = durg_rice['Year'].values  # Input: Years
y = durg_rice['RICE YIELD (Kg per ha)'].values  # Target: Yields

# Fit RBF (Gaussian kernel; tune epsilon for smoothness)
rbf = RBFInterpolator(np.atleast_2d(x).T, y, kernel='gaussian', epsilon=10)  # Increased epsilon for time scale
x_new = np.linspace(x.min(), x.max(), 100)  # Fine grid for smooth curve
y_pred = rbf(np.atleast_2d(x_new).T).flatten()  # Predictions

# Plot
plt.figure(figsize=(8, 5))
plt.plot(x, y, 'o', label='Observed Data', color='blue', markersize=4)
plt.plot(x_new, y_pred, '-', label='RBF Fit', color='red', linewidth=2)
plt.xlabel('Year')
plt.ylabel('Rice Yield (kg/ha)')
plt.title('RBF Interpolation: Rice Yield in Durg District (1966–2000)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()  # This displays the chart!

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv('datasets/ICRISAT-District Level Data.csv')
df.replace(-1, np.nan, inplace=True)
durg_rice = df[(df['Dist Name'] == 'Durg') & (df['Year'] >= 1966) & (df['Year'] <= 2000)][['Year', 'RICE YIELD (Kg per ha)']].dropna()
durg_rice['Year'] = durg_rice['Year'].astype(int)

# Prepare data
X = durg_rice['Year'].values.reshape(-1, 1)
y = durg_rice['RICE YIELD (Kg per ha)'].values

# Split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to fit poly and compute R2
def fit_poly(degree):
    coeffs = np.polyfit(X_train.flatten(), y_train, degree)
    p = np.poly1d(coeffs)
    y_train_pred = p(X_train.flatten())
    y_test_pred = p(X_test.flatten())
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    return p, r2_train, r2_test

# Fit models
degrees = [1, 3, 5, 10]
models = {}
for deg in degrees:
    models[deg] = fit_poly(deg)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left: Fits on full data (for viz)
x_full = np.linspace(X.min(), X.max(), 100)
for deg in degrees:
    p = models[deg][0]
    ax1.plot(x_full, p(x_full), label=f'Deg {deg}')
ax1.scatter(X, y, color='black', s=20, label='Data')
ax1.set_title('Polynomial Fits (Overfitting Demo)')
ax1.legend()
ax1.set_xlabel('Year')
ax1.set_ylabel('Rice Yield (kg/ha)')

# Right: R2 comparison
train_r2 = [models[deg][1] for deg in degrees]
test_r2 = [models[deg][2] for deg in degrees]
ax2.plot(degrees, train_r2, 'o-', label='Train R²')
ax2.plot(degrees, test_r2, 's-', label='Test R²')
ax2.set_xlabel('Degree')
ax2.set_ylabel('R²')
ax2.set_title('Overfitting: Train vs Test R²')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()

Week 3: Data Fitting¶

Understanding the Dataset¶

Loading the Dataset¶

Radial Basis Function (RBF)¶