[Sedat Yalcin] - Fab Futures - Data Science

Data Science - Week 3: Fitting¶

Student: [Sedat Yalcin]

Overview: Fitting Methods¶

Fitting = Finding function parameters that best match data

Key Concepts:

  • Residual: $\epsilon_i = y_i - f(x_i)$ (difference between data and fit)
  • Loss: A function of the residuals; for least squares it is the sum of squared residuals, $\sum_i \epsilon_i^2$ (a small numeric sketch follows this list)
  • Linear least squares: Closed-form solution
  • Nonlinear least squares: Iterative optimization
  • Regularization: Prevent overfitting
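
A minimal numeric sketch of the residual and least-squares loss definitions above, using made-up values rather than the traffic data:

import numpy as np

# Toy data and a candidate linear model y ~ 2x + 1 (illustrative values only)
x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([1.1, 2.9, 5.2, 6.8])
y_hat = 2.0 * x + 1.0             # f(x_i)

residuals = y - y_hat             # eps_i = y_i - f(x_i)
loss = np.sum(residuals**2)       # least-squares loss: sum of squared residuals
print(residuals, loss)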

Two datasets:

  1. Istanbul Traffic: Real-world time series
  2. Synthetic: Mathematical concepts demonstration
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize, signal
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load traffic data
df = pd.read_csv('datasets/traffic_index.csv')
df['trafficindexdate'] = pd.to_datetime(df['trafficindexdate'])
df['year'] = df['trafficindexdate'].dt.year
df['month'] = df['trafficindexdate'].dt.month
df['dayofyear'] = df['trafficindexdate'].dt.dayofyear
df['dayofweek'] = df['trafficindexdate'].dt.dayofweek

print(f"Dataset: {len(df)} observations from {df['trafficindexdate'].min().date()} to {df['trafficindexdate'].max().date()}")
Dataset: 3332 observations from 2015-08-06 to 2024-10-18

Part 1: Linear & Polynomial Fitting¶

1.1 Linear Least Squares¶

Theory:

  • Model: $y = ax + b$ (affine function)
  • Vandermonde matrix: $M = [1, x]$
  • Solution: $\vec{c} = (M^T M)^{-1} M^T \vec{y}$ (checked in the sketch after this list)
  • NumPy: np.linalg.lstsq(M, y)
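
As a quick check of the closed-form solution above, a minimal sketch on made-up points (not the traffic data) showing that the normal-equations formula agrees with np.linalg.lstsq, which solves the same problem via an SVD and is numerically safer than inverting $M^T M$ explicitly:

import numpy as np

# Illustrative points roughly on y = 2x + 1
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([1.0, 3.1, 4.9, 7.2, 8.8])

M = np.c_[np.ones(len(x)), x]                    # Vandermonde matrix [1, x]
c_normal = np.linalg.inv(M.T @ M) @ M.T @ y      # closed form (M^T M)^-1 M^T y
c_lstsq, *_ = np.linalg.lstsq(M, y, rcond=None)  # NumPy least-squares solver
print(c_normal, c_lstsq)                         # both ~ [intercept, slope]
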
In [2]:
# Linear trend: yearly average traffic
yearly = df.groupby('year')['average_traffic_index'].mean().reset_index()
x = yearly['year'].values
y = yearly['average_traffic_index'].values

# Construct Vandermonde matrix
M = np.c_[np.ones(len(x)), x]
print(f"Vandermonde matrix shape: {M.shape}")
print(f"First 3 rows:\n{M[:3]}")

# Linear least squares
coeffs, residuals, rank, s = np.linalg.lstsq(M, y, rcond=None)
a, b = coeffs[1], coeffs[0]

print(f"\nFit: y = {a:.3f}x + {b:.3f}")
print(f"Interpretation: Traffic increases {a:.2f} points per year")

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(x, y, s=100, alpha=0.6, label='Data')
ax.plot(x, a*x + b, 'r-', linewidth=2, label=f'Linear fit: y={a:.2f}x+{b:.0f}')
ax.set_xlabel('Year')
ax.set_ylabel('Average Traffic Index')
ax.set_title('Linear Trend: Yearly Traffic Growth')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Residual analysis
y_pred = a*x + b
residuals_vals = y - y_pred
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

print(f"\nModel Quality:")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")
Vandermonde matrix shape: (10, 2)
First 3 rows:
[[1.000e+00 2.015e+03]
 [1.000e+00 2.016e+03]
 [1.000e+00 2.017e+03]]

Fit: y = 0.276x + -528.752
Interpretation: Traffic increases 0.28 points per year
[Figure: Linear Trend: Yearly Traffic Growth (scatter of yearly averages with linear fit)]
Model Quality:
RMSE: 2.63
R²: 0.083

1.2 Polynomial Fitting¶

Theory:

  • Model: $y = c_0 + c_1 x + c_2 x^2 + ... + c_n x^n$
  • Vandermonde matrix: $M = [1, x, x^2, ..., x^n]$
  • NumPy: np.polyfit(x, y, degree) or np.linalg.lstsq on a Vandermonde matrix (both routes are compared in the sketch below)
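
A small sketch (synthetic points, illustrative only) showing that np.polyfit and the explicit Vandermonde/lstsq route return the same coefficients, just in opposite order:

import numpy as np

x = np.linspace(0, 5, 20)
y = 1.0 - 2.0 * x + 0.5 * x**2                     # noiseless quadratic

deg = 2
c_polyfit = np.polyfit(x, y, deg)                  # highest power first
M = np.vander(x, deg + 1, increasing=True)         # columns [1, x, x^2]
c_lstsq, *_ = np.linalg.lstsq(M, y, rcond=None)    # lowest power first
print(c_polyfit, c_lstsq[::-1])                    # same values after reversing
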
In [3]:
# Monthly pattern: Does traffic vary within the year?
monthly_avg = df.groupby('month')['average_traffic_index'].mean().reset_index()
x_month = monthly_avg['month'].values
y_month = monthly_avg['average_traffic_index'].values

# Compare polynomial degrees
degrees = [1, 2, 3, 5]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, deg in enumerate(degrees):
    # Fit polynomial
    coeffs = np.polyfit(x_month, y_month, deg)
    poly_func = np.poly1d(coeffs)
    
    # Evaluate
    x_fit = np.linspace(1, 12, 100)
    y_fit = poly_func(x_fit)
    y_pred = poly_func(x_month)
    
    rmse = np.sqrt(mean_squared_error(y_month, y_pred))
    r2 = r2_score(y_month, y_pred)
    
    # Plot
    ax = axes[idx]
    ax.scatter(x_month, y_month, s=100, alpha=0.6, label='Data')
    ax.plot(x_fit, y_fit, 'r-', linewidth=2, label=f'Degree {deg}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Avg Traffic')
    ax.set_title(f'Polynomial Degree {deg}\nRMSE={rmse:.2f}, R²={r2:.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xticks(range(1, 13))

plt.tight_layout()
plt.show()

print("Observation: Higher degree polynomials fit training data better,")
print("but may overfit (capture noise instead of signal)")
[Figure: polynomial fits of degree 1, 2, 3, and 5 to the monthly averages]
Observation: Higher degree polynomials fit training data better,
but may overfit (capture noise instead of signal)

1.3 Residual Analysis¶

Good fit indicators:

  • Residuals randomly distributed around zero
  • No patterns in residuals
  • Roughly constant variance (homoscedastic); a rough numeric check is sketched below
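
Beyond eyeballing the plots, a crude numeric check for roughly constant variance, sketched here as a hypothetical helper (not a formal statistical test):

import numpy as np

def residual_spread_check(y_pred, residuals):
    # Compare residual std in the lower vs upper half of the fitted values;
    # a ratio far from 1 hints at heteroscedasticity.
    order = np.argsort(y_pred)
    half = len(order) // 2
    return np.std(residuals[order[:half]]), np.std(residuals[order[half:]])

# e.g. residual_spread_check(y_pred, y_month - y_pred) after fitting below
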
In [4]:
# Residual plots for degree 2 and 5
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for idx, deg in enumerate([2, 5]):
    coeffs = np.polyfit(x_month, y_month, deg)
    poly_func = np.poly1d(coeffs)
    y_pred = poly_func(x_month)
    residuals = y_month - y_pred
    
    ax = axes[idx]
    ax.scatter(y_pred, residuals, s=100, alpha=0.6)
    ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'Residual Plot: Degree {deg}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Degree 2: Residuals show pattern (underfitting)")
print("Degree 5: Residuals more random (better fit, but risk of overfitting)")
[Figure: residual plots for the degree 2 and degree 5 fits]
Degree 2: Residuals show pattern (underfitting)
Degree 5: Residuals more random (better fit, but risk of overfitting)

Part 2: Nonlinear Fitting with scipy.optimize¶

2.1 Sinusoidal Pattern (Weekly Cycle)¶

Theory:

  • Model: $y = A \sin(B x + C) + D$
  • Parameters: $A$ (amplitude), $B$ (frequency), $C$ (phase), $D$ (offset)
  • Method: scipy.optimize.curve_fit (Levenberg-Marquardt algorithm)
  • Requires initial guess for parameters
In [5]:
# Day-of-week pattern (7-day cycle)
weekly = df.groupby('dayofweek')['average_traffic_index'].mean().reset_index()
x_week = weekly['dayofweek'].values
y_week = weekly['average_traffic_index'].values

# Define sinusoidal function
def sinusoidal(x, A, B, C, D):
    return A * np.sin(B * x + C) + D

# Initial guess
amplitude_guess = (y_week.max() - y_week.min()) / 2
frequency_guess = 2 * np.pi / 7  # weekly cycle
phase_guess = 0
offset_guess = y_week.mean()

p0 = [amplitude_guess, frequency_guess, phase_guess, offset_guess]
print(f"Initial guess: A={p0[0]:.2f}, B={p0[1]:.3f}, C={p0[2]:.2f}, D={p0[3]:.2f}")

# Fit
params, covariance = optimize.curve_fit(sinusoidal, x_week, y_week, p0=p0)
A, B, C, D = params

print(f"\nFitted parameters:")
print(f"  Amplitude (A): {A:.2f}")
print(f"  Frequency (B): {B:.3f} rad")
print(f"  Phase (C): {C:.3f} rad")
print(f"  Offset (D): {D:.2f}")

# Plot
x_fit = np.linspace(0, 6, 100)
y_fit = sinusoidal(x_fit, A, B, C, D)

fig, ax = plt.subplots(figsize=(10, 5))
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax.scatter(x_week, y_week, s=100, alpha=0.6, label='Data', zorder=3)
ax.plot(x_fit, y_fit, 'r-', linewidth=2, label='Sinusoidal fit', zorder=2)
ax.set_xlabel('Day of Week')
ax.set_ylabel('Average Traffic Index')
ax.set_title('Weekly Pattern: Sinusoidal Fit')
ax.set_xticks(range(7))
ax.set_xticklabels(days)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

y_pred = sinusoidal(x_week, A, B, C, D)
rmse = np.sqrt(mean_squared_error(y_week, y_pred))
r2 = r2_score(y_week, y_pred)
print(f"\nRMSE: {rmse:.2f}, R²: {r2:.3f}")
Initial guess: A=6.62, B=0.898, C=0.00, D=27.83
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 20
     17 print(f"Initial guess: A={p0[0]:.2f}, B={p0[1]:.3f}, C={p0[2]:.2f}, D={p0[3]:.2f}")
     19 # Fit
---> 20 params, covariance = optimize.curve_fit(sinusoidal, x_week, y_week, p0=p0)
     21 A, B, C, D = params
     23 print(f"\nFitted parameters:")

File /opt/conda/lib/python3.13/site-packages/scipy/optimize/_minpack_py.py:1026, in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, jac, full_output, nan_policy, **kwargs)
   1024     cost = np.sum(infodict['fvec'] ** 2)
   1025     if ier not in [1, 2, 3, 4]:
-> 1026         raise RuntimeError("Optimal parameters not found: " + errmsg)
   1027 else:
   1028     # Rename maxfev (leastsq) to max_nfev (least_squares), if specified.
   1029     if 'max_nfev' not in kwargs:

RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 1000.
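
The optimizer hits its iteration limit here: with only seven points and the frequency B left free, Levenberg-Marquardt can wander away from the weekly cycle. A hedged sketch of one way to coax convergence, holding the known weekly frequency at 2π/7, fitting only A, C, D, and raising maxfev (parameter choices are assumptions; the output is not reproduced here):

# Sketch: hold the weekly frequency fixed and fit the remaining parameters
# (reuses x_week, y_week, amplitude_guess, offset_guess from the cell above)
def weekly_sinusoid(x, A, C, D):
    return A * np.sin(2 * np.pi / 7 * x + C) + D

p0_fixed = [amplitude_guess, 0.0, offset_guess]
params_fixed, cov_fixed = optimize.curve_fit(
    weekly_sinusoid, x_week, y_week, p0=p0_fixed, maxfev=10000)
print(params_fixed)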

2.2 Custom Nonlinear Function¶

Example: Hyperbolic tangent (saturation model)

  • Model: $y = A \cdot (B + \tanh(C \cdot (x - D)))$
  • Useful for growth/saturation curves
In [6]:
# Traffic growth over years (with saturation)
def tanh_model(x, A, B, C, D):
    return A * (B + np.tanh(C * (x - D)))

# Normalize x for numerical stability
x_norm = (yearly['year'].values - 2015) / 10
y_yearly = yearly['average_traffic_index'].values

# Initial guess
p0 = [50, 1, 1, 0.5]

try:
    params, cov = optimize.curve_fit(tanh_model, x_norm, y_yearly, p0=p0, maxfev=5000)
    
    x_fit = np.linspace(x_norm.min(), x_norm.max(), 100)
    y_fit = tanh_model(x_fit, *params)
    x_fit_year = x_fit * 10 + 2015
    
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(yearly['year'], y_yearly, s=100, alpha=0.6, label='Data')
    ax.plot(x_fit_year, y_fit, 'r-', linewidth=2, label='Tanh fit')
    ax.set_xlabel('Year')
    ax.set_ylabel('Average Traffic')
    ax.set_title('Nonlinear Growth Model: Hyperbolic Tangent')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    y_pred = tanh_model(x_norm, *params)
    rmse = np.sqrt(mean_squared_error(y_yearly, y_pred))
    r2 = r2_score(y_yearly, y_pred)
    print(f"RMSE: {rmse:.2f}, R²: {r2:.3f}")
    
except RuntimeError as e:
    print(f"Fit failed: {e}")
    print("Tanh model may not be appropriate for this data (insufficient curvature)")
[Figure: Nonlinear Growth Model: Hyperbolic Tangent (yearly averages with tanh fit)]
RMSE: 2.08, R²: 0.424

Part 3: Advanced Methods (Synthetic Data)¶

3.1 Radial Basis Functions (RBF)¶

Theory:

  • Model: $f(x) = \sum_i a_i \phi(|x - x_i|)$
  • Basis function: $\phi(r) = r^3$ (cubic RBF)
  • Centers: $x_i$ (anchor points)
  • Still linear in coefficients $a_i$ → linear least squares (SciPy's built-in RBF interpolator is sketched below)
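
The cell below builds the RBF matrix by hand; for comparison, SciPy ships the same idea as scipy.interpolate.RBFInterpolator (assuming SciPy ≥ 1.7). A sketch on made-up 1-D data:

import numpy as np
from scipy.interpolate import RBFInterpolator

rng = np.random.default_rng(0)
x = np.linspace(0, 2, 50)[:, None]                  # shape (npoints, ndims)
y = np.sin(3 * x[:, 0]) + 0.05 * rng.normal(size=50)

rbf = RBFInterpolator(x, y, kernel='cubic', smoothing=1e-3)  # phi(r) = r^3
y_new = rbf(np.linspace(0, 2, 200)[:, None])        # evaluate on a finer grid
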
In [7]:
# Generate synthetic data with noise
np.random.seed(42)
xmin, xmax = 0, 2
npts = 100
noise = 0.05

# True function: quadratic
x_synth = np.linspace(xmin, xmax, npts)
y_true = -0.3 * x_synth**2 + x_synth + 0.5
y_synth = y_true + np.random.normal(0, noise, npts)

# RBF with different numbers of centers
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
n_centers_list = [5, 10, 20, 50]

for idx, n_centers in enumerate(n_centers_list):
    # Choose random centers
    indices = np.random.choice(npts, n_centers, replace=False)
    centers = x_synth[indices]
    
    # Construct RBF matrix: M[i,j] = |x[i] - center[j]|^3
    M = np.abs(np.outer(x_synth, np.ones(n_centers)) - 
               np.outer(np.ones(npts), centers))**3
    
    # Fit
    coeffs, residuals, rank, s = np.linalg.lstsq(M, y_synth, rcond=None)
    
    # Predict
    x_fit = np.linspace(xmin, xmax, 200)
    M_fit = np.abs(np.outer(x_fit, np.ones(n_centers)) - 
                   np.outer(np.ones(200), centers))**3
    y_fit = M_fit @ coeffs
    
    # Plot
    ax = axes[idx]
    ax.scatter(x_synth, y_synth, alpha=0.5, s=20, label='Noisy data')
    ax.plot(x_synth, y_true, 'k--', linewidth=2, label='True function', alpha=0.5)
    ax.plot(x_fit, y_fit, 'r-', linewidth=2, label=f'RBF ({n_centers} centers)')
    ax.scatter(centers, np.zeros(n_centers), marker='x', s=100, c='green', 
               label='Centers', zorder=5)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(f'{n_centers} Centers')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-0.5, 1.8)

plt.tight_layout()
plt.show()

print("RBF Observations:")
print("- Few centers (5-10): Smooth, may underfit")
print("- Many centers (50): Wiggly, overfits noise")
print("- Optimal: Balance between smoothness and fit quality")
[Figure: cubic RBF fits with 5, 10, 20, and 50 centers]
RBF Observations:
- Few centers (5-10): Smooth, may underfit
- Many centers (50): Wiggly, overfits noise
- Optimal: Balance between smoothness and fit quality

3.2 2D Polynomial Fitting¶

Theory:

  • Model: $f(x,y) = c_0 + c_1 x + c_2 y + c_3 xy + c_4 x^2 + c_5 y^2$
  • Vandermonde matrix: $M = [1, x, y, xy, x^2, y^2]$
  • Extension to higher dimensions is straightforward (a scikit-learn shortcut is sketched below)
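
Writing the column list by hand gets tedious beyond degree 2; scikit-learn's PolynomialFeatures builds the same design matrix automatically. A sketch (the column order may differ from the hand-built M below):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
X = rng.uniform(-2, 2, size=(100, 2))             # columns are x and y
poly = PolynomialFeatures(degree=2, include_bias=True)
M_auto = poly.fit_transform(X)                    # [1, x, y, x^2, x*y, y^2]
print(poly.get_feature_names_out(['x', 'y']))     # confirms the column order
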
In [8]:
# Generate 2D synthetic data
np.random.seed(10)
xmin = ymin = -2
xmax = ymax = 2
npts = 100
noise = 0.1

# True coefficients
c_true = [1, -1, 0.5, -1.5, 2, 5]
print(f"True coefficients: {c_true}")

# Generate data
x = xmin + (xmax - xmin) * np.random.rand(npts)
y = ymin + (ymax - ymin) * np.random.rand(npts)
z = (c_true[0] + c_true[1]*x + c_true[2]*y + c_true[3]*x*y + 
     c_true[4]*x**2 + c_true[5]*y**2 + np.random.normal(0, noise, npts))

# Construct Vandermonde matrix
M = np.c_[np.ones(npts), x, y, x*y, x**2, y**2]
print(f"Vandermonde shape: {M.shape}")

# Fit
c_fit, residuals, rank, s = np.linalg.lstsq(M, z, rcond=None)
print(f"Fitted coefficients: {c_fit}")
print(f"Coefficient errors: {np.abs(c_fit - c_true)}")

# Evaluate on grid
x_grid = np.linspace(xmin, xmax, 50)
y_grid = np.linspace(ymin, ymax, 50)
X, Y = np.meshgrid(x_grid, y_grid)
Z = (c_fit[0] + c_fit[1]*X + c_fit[2]*Y + c_fit[3]*X*Y + 
     c_fit[4]*X**2 + c_fit[5]*Y**2)

# Plot
fig = plt.figure(figsize=(12, 5))

# 3D scatter + surface
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(x, y, z, alpha=0.5, s=20)
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.3)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('z')
ax1.set_title('2D Polynomial Fit')

# Contour plot
ax2 = fig.add_subplot(122)
contour = ax2.contourf(X, Y, Z, levels=20, cmap='viridis')
ax2.scatter(x, y, c=z, s=20, edgecolors='white', linewidth=0.5)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour View')
plt.colorbar(contour, ax=ax2)

plt.tight_layout()
plt.show()

z_pred = M @ c_fit
rmse = np.sqrt(mean_squared_error(z, z_pred))
r2 = r2_score(z, z_pred)
print(f"\nRMSE: {rmse:.3f}, R²: {r2:.3f}")
True coefficients: [1, -1, 0.5, -1.5, 2, 5]
Vandermonde shape: (100, 6)
Fitted coefficients: [ 1.00746166 -0.99880884  0.50892353 -1.4969289   1.99534833  4.99673621]
Coefficient errors: [0.00746166 0.00119116 0.00892353 0.0030711  0.00465167 0.00326379]
[Figure: 2D polynomial fit, 3D surface with data points and contour view]
RMSE: 0.088, R²: 1.000

Part 4: Regularization & Model Selection¶

4.1 Ridge Regression (Regularization)¶

Theory:

  • Problem: High-degree polynomials overfit
  • Solution: Penalize large coefficients
  • Loss: $|M\vec{c} - \vec{y}|^2 + \lambda |\vec{c}|^2$
  • Closed form: $\vec{c} = (M^T M + \lambda I)^{-1} M^T \vec{y}$
  • $\lambda$: Regularization strength (hyperparameter); a scikit-learn equivalent is sketched below
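
The same penalty is available off the shelf as sklearn.linear_model.Ridge. A sketch on made-up seasonal values (note that scikit-learn fits the intercept separately and does not penalize it, so the coefficients will not match the hand-rolled solve in the next cell exactly):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

x_demo = np.arange(1, 13, dtype=float)[:, None]             # months 1..12
y_demo = 60 + 10 * np.sin(2 * np.pi * x_demo[:, 0] / 12)    # illustrative values

# alpha plays the role of lambda; scaling keeps the degree-8 features well conditioned
model = make_pipeline(PolynomialFeatures(degree=8, include_bias=False),
                      StandardScaler(), Ridge(alpha=0.1))
model.fit(x_demo, y_demo)
print(model.predict(x_demo)[:3])
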
In [9]:
# High-degree polynomial on traffic data (prone to overfitting)
x_month = monthly_avg['month'].values
y_month = monthly_avg['average_traffic_index'].values

degree = 8

# Construct Vandermonde for degree 8
M = np.vander(x_month, degree + 1, increasing=True)

# Compare regularization strengths
lambdas = [0, 0.01, 0.1, 1.0, 10.0]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for idx, lam in enumerate(lambdas):
    # Ridge regression: (M^T M + lambda I) c = M^T y
    I = np.eye(M.shape[1])
    c = np.linalg.solve(M.T @ M + lam * I, M.T @ y_month)
    
    # Evaluate
    x_fit = np.linspace(1, 12, 100)
    M_fit = np.vander(x_fit, degree + 1, increasing=True)
    y_fit = M_fit @ c
    
    y_pred = M @ c
    rmse = np.sqrt(mean_squared_error(y_month, y_pred))
    coeff_norm = np.linalg.norm(c)
    
    # Plot
    ax = axes[idx]
    ax.scatter(x_month, y_month, s=100, alpha=0.6, label='Data')
    ax.plot(x_fit, y_fit, 'r-', linewidth=2, label=f'λ={lam}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Avg Traffic')
    ax.set_title(f'λ={lam}\nRMSE={rmse:.2f}, ||c||={coeff_norm:.1f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(50, 90)
    ax.set_xticks(range(1, 13))

# Remove extra subplot
fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()

print("Regularization Effect:")
print("- λ=0: No regularization, wiggly (overfitting)")
print("- λ small (0.01-0.1): Balanced")
print("- λ large (10): Over-regularized, too smooth (underfitting)")
[Figure: degree-8 ridge fits of the monthly pattern for λ = 0, 0.01, 0.1, 1, 10]
Regularization Effect:
- λ=0: No regularization, wiggly (overfitting)
- λ small (0.01-0.1): Balanced
- λ large (10): Over-regularized, too smooth (underfitting)

4.2 Cross-Validation & Overfitting¶

Theory:

  • Split data: Training (fit) + Test (evaluate)
  • Training error ↓ with model complexity
  • Test error: U-shaped curve (bias-variance tradeoff)
  • Optimal model: Minimum test error (a k-fold cross-validation sketch follows)
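
The cell below uses a single train/test split; k-fold cross-validation averages the test error over several splits and is less sensitive to how the split falls. A sketch with scikit-learn (the degree range and data are illustrative):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(42)
x_cv = np.sort(rng.uniform(0, 2, 60))[:, None]
y_cv = -0.3 * x_cv[:, 0]**2 + x_cv[:, 0] + 0.5 + rng.normal(0, 0.05, 60)

for degree in [1, 2, 5, 10]:
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    scores = cross_val_score(model, x_cv, y_cv, cv=KFold(n_splits=5),
                             scoring='neg_mean_squared_error')
    print(degree, -scores.mean())   # average held-out MSE across the 5 folds
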
In [10]:
# RBF overfitting demonstration
np.random.seed(42)
npts_train = 50
npts_test = 50

# Generate data
x_train = np.sort(np.random.uniform(0, 2, npts_train))
y_train = -0.3 * x_train**2 + x_train + 0.5 + np.random.normal(0, 0.05, npts_train)

x_test = np.sort(np.random.uniform(0, 2, npts_test))
y_test = -0.3 * x_test**2 + x_test + 0.5 + np.random.normal(0, 0.05, npts_test)

# Test different numbers of centers
n_centers_range = range(2, 41, 2)
train_errors = []
test_errors = []

for n_centers in n_centers_range:
    # Choose centers
    indices = np.linspace(0, len(x_train)-1, n_centers).astype(int)
    centers = x_train[indices]
    
    # Construct matrices
    M_train = np.abs(np.outer(x_train, np.ones(n_centers)) - 
                     np.outer(np.ones(npts_train), centers))**3
    M_test = np.abs(np.outer(x_test, np.ones(n_centers)) - 
                    np.outer(np.ones(npts_test), centers))**3
    
    # Fit on training data
    coeffs, _, _, _ = np.linalg.lstsq(M_train, y_train, rcond=None)
    
    # Predict
    y_train_pred = M_train @ coeffs
    y_test_pred = M_test @ coeffs
    
    # Errors
    train_errors.append(np.mean((y_train - y_train_pred)**2))
    test_errors.append(np.mean((y_test - y_test_pred)**2))

# Plot error curves
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Error vs complexity
ax1.plot(n_centers_range, train_errors, 'b-o', label='Training error', linewidth=2)
ax1.plot(n_centers_range, test_errors, 'r-o', label='Test error', linewidth=2)
optimal_idx = np.argmin(test_errors)
optimal_n = list(n_centers_range)[optimal_idx]
ax1.axvline(optimal_n, color='g', linestyle='--', linewidth=2, 
            label=f'Optimal: {optimal_n} centers')
ax1.set_xlabel('Number of Centers (Model Complexity)')
ax1.set_ylabel('Mean Squared Error')
ax1.set_title('Bias-Variance Tradeoff')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.annotate('Underfitting', xy=(5, 0.004), fontsize=12, color='blue')
ax1.annotate('Overfitting', xy=(30, 0.008), fontsize=12, color='red')

# Show optimal fit
indices = np.linspace(0, len(x_train)-1, optimal_n).astype(int)
centers = x_train[indices]
M_train = np.abs(np.outer(x_train, np.ones(optimal_n)) - 
                 np.outer(np.ones(npts_train), centers))**3
coeffs, _, _, _ = np.linalg.lstsq(M_train, y_train, rcond=None)

x_plot = np.linspace(0, 2, 200)
M_plot = np.abs(np.outer(x_plot, np.ones(optimal_n)) - 
                np.outer(np.ones(200), centers))**3
y_plot = M_plot @ coeffs

ax2.scatter(x_train, y_train, alpha=0.6, label='Training data', s=50)
ax2.scatter(x_test, y_test, alpha=0.6, label='Test data', s=50, marker='s')
ax2.plot(x_plot, y_plot, 'g-', linewidth=2, label=f'Optimal fit ({optimal_n} centers)')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Optimal Model')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nOptimal number of centers: {optimal_n}")
print(f"Training MSE: {train_errors[optimal_idx]:.5f}")
print(f"Test MSE: {test_errors[optimal_idx]:.5f}")
[Figure: training vs. test error curve (bias-variance tradeoff) and the optimal RBF fit]
Optimal number of centers: 16
Training MSE: 0.00150
Test MSE: 0.00225

Summary¶

Fitting Methods Covered:¶

  1. Linear Least Squares

    • Closed-form solution: $(M^T M)^{-1} M^T y$
    • Fast, stable for linear/polynomial models
  2. Polynomial Fitting

    • Flexible but prone to overfitting
    • Higher degree ≠ better model
  3. Nonlinear Optimization

    • scipy.optimize.curve_fit for nonlinear parameters
    • Requires good initial guess
    • Examples: sinusoidal (hit the iteration limit here with a free frequency), tanh
  4. Radial Basis Functions

    • Flexible local approximation
    • Number of centers = complexity
  5. Regularization

    • Ridge regression prevents overfitting
    • Balances fit quality vs coefficient magnitude
  6. Model Selection

    • Training vs test error
    • Bias-variance tradeoff
    • Cross-validation for optimal complexity

Key Insights:¶

  • Traffic Data: Weak upward yearly trend, weekly periodicity, monthly variation
  • Synthetic Data: Clear mathematical relationships, overfitting demonstration
  • Always validate: Test error much larger than training error indicates overfitting
  • Simpler often better: Occam's razor applies to models

Tools Used:

  • NumPy: Linear algebra, least squares
  • SciPy: Nonlinear optimization
  • Scikit-learn: Model evaluation metrics
  • Matplotlib: Visualization

Data Sources:

  • Istanbul Traffic: İBB Açık Veri Portalı
  • Synthetic: Generated for demonstration

AI Assistance: Claude AI (Anthropic)