Fitting¶
- residual: difference $\epsilon_i$ between data value $y_i$ and fit $f(x_i)$
- $\epsilon_i = y_i-f(x_i)$
- loss: a single number that aggregates the residuals (the quantity the fit minimizes)
- least squares ($L^2$ norm): sum of squares
- $\sum_i \epsilon_i^2$
- most common
- $L^1$ norm: sum of absolute values
- $\sum_i |\epsilon_i|$
- less sensitive to outliers (see the sketch after this list)
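A minimal sketch (with made-up residuals, not from the dataset) showing why the $L^2$ loss is more outlier-sensitive than $L^1$: squaring lets a single large residual dominate the total.
In [ ]:
import numpy as np

# Hypothetical residuals: four small errors and one large outlier
eps = np.array([0.1, -0.2, 0.15, -0.1, 5.0])

l2_loss = np.sum(eps**2)       # outlier contributes 25.0 of ~25.08
l1_loss = np.sum(np.abs(eps))  # outlier contributes 5.0 of 5.55

print(f"L2 loss: {l2_loss:.2f}")
print(f"L1 loss: {l1_loss:.2f}")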
linear least squares¶
- used for models where the coefficients appear linearly
- algorithm: Singular Value Decomposition (SVD); see the lstsq sketch below
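A minimal sketch of linear least squares on synthetic data, using np.linalg.lstsq (which calls an SVD-based LAPACK routine). The model $y = c_0 + c_1 x$ is linear in its coefficients, so fitting reduces to solving $\min_c \lVert Ac - y \rVert_2$ for a design matrix $A$.
In [ ]:
import numpy as np

# Synthetic data for illustration: y = 2 + 0.5 x plus noise
rng = np.random.default_rng(0)
x = np.linspace(0, 10, 50)
y = 2.0 + 0.5 * x + rng.normal(scale=0.3, size=x.size)

# Design matrix with columns [1, x]; lstsq minimizes ||A c - y||_2
A = np.column_stack([np.ones_like(x), x])
coeffs, residuals, rank, sv = np.linalg.lstsq(A, y, rcond=None)
print(coeffs)  # approximately [2.0, 0.5]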
polynomial¶
1D¶
- routine: polyfit
- find the least-squares fit for a 1D polynomial
- e.g., quadratic model: $y = c_0 + c_1 x + c_2 x^2$
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ----------------------------------
# LOAD YOUR DATA
# ----------------------------------
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
# Select columns
x_col = "Age"
y_col = "Depression"
# Drop missing values
df = data[[x_col, y_col]].dropna()
x = df[x_col].values
y = df[y_col].values
np.set_printoptions(precision=3)
# ----------------------------------
# POLYNOMIAL FITTING
# ----------------------------------
# First-order (linear) fit
coeff1 = np.polyfit(x, y, 1)
pfit1 = np.poly1d(coeff1)
# Second-order (quadratic) fit
coeff2 = np.polyfit(x, y, 2)
pfit2 = np.poly1d(coeff2)
print(f"First-order fit coefficients (linear): {coeff1}")
print(f"Second-order fit coefficients (quadratic): {coeff2}")
# ----------------------------------
# CREATE SMOOTH FIT CURVES
# ----------------------------------
xmin, xmax = x.min(), x.max()
xfit = np.linspace(xmin, xmax, 200)
yfit1 = pfit1(xfit)
yfit2 = pfit2(xfit)
# ----------------------------------
# PLOTTING
# ----------------------------------
plt.figure(figsize=(8, 6))
plt.plot(x, y, 'o', alpha=0.6, label="Observed data")
plt.plot(xfit, yfit1, 'g-', linewidth=2, label="Linear fit")
plt.plot(xfit, yfit2, 'r-', linewidth=2, label="Quadratic fit")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title("Linear vs Quadratic Fit using Real Data")
plt.legend()
plt.grid(True)
plt.show()
First-order fit coefficients (linear): [-0.023 1.173]
Second-order fit coefficients (quadratic): [-0.001 0.019 0.651]
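Note that np.polyfit returns coefficients from the highest degree down, so the linear fit above reads $y \approx -0.023\,x + 1.173$.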
the problems with polynomials¶
- can't fit sharp features
- diverge rapidly outside the data range (high-degree extrapolation blows up)
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ----------------------------------
# LOAD YOUR DATA
# ----------------------------------
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
# Choose columns
x_col = "Age" # independent variable
y_col = "Depression" # dependent variable
# Drop missing values
df = data[[x_col, y_col]].dropna()
x = df[x_col].values
y = df[y_col].values
# ----------------------------------
# SORT DATA (IMPORTANT FOR SMOOTH PLOTS)
# ----------------------------------
idx = np.argsort(x)
x = x[idx]
y = y[idx]
xmin, xmax = x.min(), x.max()
npts = len(x)
# ----------------------------------
# FIT POLYNOMIALS OF DIFFERENT ORDERS
# ----------------------------------
xplot = np.linspace(xmin - 0.2, xmax + 0.2, 300)
# Order 1 (linear)
coeff1 = np.polyfit(x, y, 1)
yfit1 = np.poly1d(coeff1)(xplot)
# Order 4 (moderate complexity)
coeff4 = np.polyfit(x, y, 4)
yfit4 = np.poly1d(coeff4)(xplot)
# Order 15 (high-degree / overfitting)
coeff15 = np.polyfit(x, y, 15)
yfit15 = np.poly1d(coeff15)(xplot)
# ----------------------------------
# PLOTTING
# ----------------------------------
fig = plt.figure(figsize=(8, 6))
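# Hide the figure header (only takes effect with the interactive ipympl backend)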
fig.canvas.header_visible = False
plt.plot(x, y, 'bo', alpha=0.6, label='Observed data')
plt.plot(xplot, yfit1, 'g-', linewidth=2, label='Order 1 (Underfit)')
plt.plot(xplot, yfit4, 'c-', linewidth=2, label='Order 4')
plt.plot(xplot, yfit15, 'r-', linewidth=2, label='Order 15 (Overfit)')
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title("Polynomial Fit Comparison Using Real Data")
plt.legend()
plt.grid(True)
plt.show()
/tmp/ipykernel_22878/193982490.py:44: RankWarning: Polyfit may be poorly conditioned
  coeff15 = np.polyfit(x, y, 15)
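The RankWarning means the degree-15 Vandermonde matrix is nearly singular, so the coefficients are numerically unreliable. A sketch of one common mitigation, assuming the x, y, and xplot arrays from the cell above: numpy's newer polynomial API rescales x onto $[-1, 1]$ internally, which improves conditioning.
In [ ]:
from numpy.polynomial import Polynomial

# Polynomial.fit maps x onto the window [-1, 1] before fitting,
# which keeps the high-degree fit better conditioned
p15 = Polynomial.fit(x, y, deg=15)
yfit15_stable = p15(xplot)  # evaluate on the plotting grid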