Karma Tshomo - Fab Futures - Data Science

Fitting

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import least_squares
In [12]:
df = pd.read_csv("datasets/Housing.csv")
df.head()
Out[12]:
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea furnishingstatus
0 13300000 7420 4 2 3 yes no no no yes 2 yes furnished
1 12250000 8960 4 4 4 yes no no no yes 3 no furnished
2 12250000 9960 3 2 2 yes no yes no no 2 yes semi-furnished
3 12215000 7500 4 2 2 yes no yes no yes 3 yes furnished
4 11410000 7420 4 1 2 yes yes yes no yes 2 no furnished

Polynomial

In [13]:
# Use 'area' as predictor and 'price' as target
x = df['area'].values
y = df['price'].values
In [14]:
x_smooth = np.linspace(x.min(), x.max(), 500) 
In [15]:
# Fit first-order (linear) polynomial
coeff1 = np.polyfit(x, y, 1)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(x_smooth)  # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
first-order fit coefficients: [4.61974894e+02 2.38730848e+06]
In [16]:
coeff2 = np.polyfit(x, y, 2)
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(x_smooth)  # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
second-order fit coefficients: [-4.35645185e-02  1.03518489e+03  7.95440758e+05]
In [17]:
plt.figure(figsize=(10,6))
plt.scatter(x, y, color='blue', alpha=0.6, label='Data')
plt.plot(x_smooth, yfit1, 'g-', linewidth=2, label='Linear fit')
plt.plot(x_smooth, yfit2, 'r-', linewidth=2, label='Quadratic fit')
plt.xlabel('Area')
plt.ylabel('Price')
plt.title('Polynomial Fit of House Prices vs Area')
plt.legend()
plt.show()
[Figure: scatter of price vs. area with the linear fit (green) and quadratic fit (red)]

Interpretation:

The polynomial fitting section compares how well linear and quadratic models can describe the relationship between house area and price. The scatter plot of the raw data shows that larger houses generally have higher prices. The linear model, represented by a straight green line, captures this upward trend but oversimplifies the relationship.

In contrast, the quadratic model, shown in red, bends slightly and fits the data more naturally. This indicates that the relationship between area and price is not perfectly straight but follows a mildly curved pattern. Overall, the quadratic fit provides a more realistic representation of how housing prices increase with area.
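
To make the comparison concrete, one could compute the training root-mean-square error of each fit (a quick check sketched here, not part of the original notebook; rmse1 and rmse2 are names introduced for illustration):

# training RMSE of each polynomial, evaluated at the data points
rmse1 = np.sqrt(np.mean((pfit1(x) - y)**2))
rmse2 = np.sqrt(np.mean((pfit2(x) - y)**2))
print(f"linear fit RMSE:    {rmse1:,.0f}")
print(f"quadratic fit RMSE: {rmse2:,.0f}")

A lower RMSE for the quadratic fit would support the visual impression, though on training data a higher-order polynomial always fits at least as well.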

Radial basis function (RBF)

In [18]:
x = df['area'].values
y = df['price'].values

npts = len(x)
ncenters = 15
np.random.seed(0)

# pick random data points to serve as RBF centers
indices = np.random.choice(npts, size=ncenters, replace=False)
centers = x[indices]

# design matrix of cubic radial basis functions |x - c|^3, one column per center
M = np.abs(np.outer(x, np.ones(ncenters)) - np.outer(np.ones(npts), centers))**3
# solve the linear least-squares problem M @ b ~ y for the weights b
b, lsq_residuals, rank, sv = np.linalg.lstsq(M, y, rcond=None)

xfit = np.linspace(x.min(), x.max(), npts)
# evaluate the fitted RBF expansion on an evenly spaced grid
yfit = (np.abs(np.outer(xfit, np.ones(ncenters)) - np.outer(np.ones(npts), centers))**3) @ b

plt.figure(figsize=(10,6))
plt.plot(x, y, 'o', label='Data')
plt.plot(xfit, yfit, 'g-', label='RBF fit')
for i in range(ncenters):
    # plot each weighted basis component b[i]*|x - c_i|^3; these sum to the green fit
    plt.plot(xfit, b[i] * np.abs(xfit - centers[i])**3, color=(0.75, 0.75, 0.75))
plt.xlabel('Area')
plt.ylabel('Price')
plt.title('RBF Fit of House Prices')
plt.legend()
plt.show()
[Figure: housing data (points) with the RBF fit (green) and the weighted basis components (grey)]

Interpretation:

The radial basis function model offers a much more flexible approach to fitting the data. By using 15 randomly chosen centers and constructing cubic radial basis functions around them, the model produces a smooth curve that adapts to the local structure of the dataset. The green curve closely follows the ups and downs of the data points, offering a more detailed fit than the simple polynomial models.

The many grey curves in the background represent the individual radial basis components that combine to create the final fit. This method captures subtle variations and provides a more responsive shape, making RBF a powerful tool for modeling nonlinear relationships.
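
The cubic kernel is only one choice. As a sketch (an assumed variant, not in the original notebook; width, G, bg, and yfit_gauss are names introduced here), the same least-squares machinery works with Gaussian basis functions, where the width parameter sets how local each bump is:

# Gaussian RBF variant; 'width' is an assumed length scale, tune to taste
width = (x.max() - x.min()) / ncenters
G = np.exp(-((x[:, None] - centers[None, :]) / width)**2)  # design matrix
bg, *_ = np.linalg.lstsq(G, y, rcond=None)
yfit_gauss = np.exp(-((xfit[:, None] - centers[None, :]) / width)**2) @ bg

Narrower widths make the fit more wiggly; wider ones smooth it toward a global trend.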

Nonlinear least squares

In [19]:
x = df['area'].values
y = df['price'].values

# scale x to 0-1 to make tanh fit visible
x_scaled = (x - x.min()) / (x.max() - x.min())

coeff = np.array([y.max(), 0.5, 5, 0.5])  # scale coefficients to match data magnitude

def f(coeff, x):
    return coeff[0] * (coeff[1] + np.tanh(coeff[2] * (x - coeff[3])))

def residuals(coeff, x, y):
    return f(coeff, x) - y

result2 = least_squares(residuals, coeff, args=(x_scaled, y), max_nfev=2)
result10 = least_squares(residuals, coeff, args=(x_scaled, y), max_nfev=10)
resultend = least_squares(residuals, coeff, args=(x_scaled, y))

x_sorted = np.sort(x_scaled)

plt.figure(figsize=(10,6))
plt.scatter(x_scaled, y, color='blue', alpha=0.6, label='data')
plt.plot(x_sorted, f(coeff, x_sorted), 'b-', label='start')
plt.plot(x_sorted, f(result2.x, x_sorted), 'c-', label='2 evaluations')
plt.plot(x_sorted, f(result10.x, x_sorted), 'g-', label='10 evaluations')
plt.plot(x_sorted, f(resultend.x, x_sorted), 'r-', label='end')
plt.xlabel('Area (scaled)')
plt.ylabel('Price')
plt.title('Nonlinear Least Squares Fit (tanh) for Housing Data')
plt.legend()
plt.show()
[Figure: tanh fits at successive stages: start (blue), 2 evaluations (cyan), 10 evaluations (green), final (red)]

Interpretation:

The nonlinear least squares section uses a tanh-based model to fit the scaled area data. The graph shows different stages of the optimization process: the initial guess (blue line), the fit after 2 function evaluations (cyan), the fit after 10 evaluations (green), and the final optimized fit (red).

At first, the function does not match the data well, but as the algorithm performs more iterations, the curve progressively improves. The final fit aligns closely with the data’s overall shape, demonstrating how nonlinear optimization gradually adjusts parameters to reduce error. This experiment shows how nonlinear models require careful tuning but can produce smooth and meaningful representations once optimized.
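
One way to see this numerically (a diagnostic added here, not in the original notebook) is to print the cost that least_squares minimizes, 0.5 times the sum of squared residuals, at each stage:

# cost and parameters at each optimization stage
for label, res in [('2 evals', result2), ('10 evals', result10), ('final', resultend)]:
    print(f"{label}: cost = {res.cost:.3e}, params = {np.round(res.x, 3)}")

The cost should shrink as the optimizer is allowed more function evaluations.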

Overfitting

In [20]:
x = df['area'].values
y = df['price'].values

order = 15  # deliberately high order to provoke overfitting

coeff2 = np.polyfit(x, y, 2)
coeffN = np.polyfit(x, y, order)  # ill-conditioned at this order; NumPy may emit a RankWarning

xfit = np.linspace(x.min(), x.max(), len(x))

pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit)

pfitN = np.poly1d(coeffN)
yfitN = pfitN(xfit)

plt.figure(figsize=(10,6))
plt.scatter(x, y, color='blue', alpha=0.6, label='Data')
plt.plot(xfit, yfit2, 'g-', label='order 2')
plt.plot(xfit, yfitN, 'r-', label=f'order {order}')
plt.xlabel('Area')
plt.ylabel('Price')
plt.title('Polynomial Fit of House Prices')
plt.legend()
plt.show()
[Figure: housing data with the order-2 fit (green) and the order-15 fit (red)]

Interpretation:

The overfitting experiment compares a simple second-order polynomial with a highly complex fifteenth-order polynomial. The second-order fit (green) captures the general increasing trend between area and price, creating a smooth curve that reflects the underlying relationship.

In contrast, the fifteenth-order fit (red) wiggles dramatically, trying to pass through or near every point. Although it fits the training data extremely closely, the curve behaves unrealistically and fails to generalize the true pattern.

This demonstrates the problem of overfitting: using a model that is too complex causes it to learn noise rather than meaningful structure. The comparison highlights why simpler models often perform better in real-world prediction tasks.
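
A minimal hold-out experiment (an assumed extension, not in the original notebook; the 80/20 split and seed are arbitrary choices) makes the point quantitatively: fit both orders on a random 80% of the data and measure the error on the remaining 20%:

# out-of-sample comparison of the two polynomial orders
rng = np.random.default_rng(0)
idx = rng.permutation(len(x))
split = int(0.8 * len(x))
train, test = idx[:split], idx[split:]
for d in (2, order):
    p = np.poly1d(np.polyfit(x[train], y[train], d))  # high orders may trigger a RankWarning
    test_rmse = np.sqrt(np.mean((p(x[test]) - y[test])**2))
    print(f"order {d}: test RMSE = {test_rmse:,.0f}")

If the order-15 polynomial is overfitting, its test RMSE will typically be worse than the order-2 fit despite its lower training error.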