< Home
Day 3: Fitting
Assignment 20/11/2025
Fit a function to your data
Fitting
Year 2009
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# ------------------------------------------------------------------
# Under- vs overfitting demo on the 2009 mean temperature (TMEDIA)
# ------------------------------------------------------------------

# --- 1. Load the semicolon-separated CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)
df.head()  # preview; a notebook only renders it as the last expression

# --- 2. Keep the 2009 rows in chronological order ---
df_2009 = df.loc[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")

# Mean temperature (TMEDIA); entries that cannot be parsed become NaN.
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
x = np.arange(len(y))  # positional day index used as the regressor

# Drop the days without a valid reading from both arrays.
mask = y.notna()
x, y = x[mask], y[mask]

# --- 3. Fit the two models ---
# A) Degree 2: deliberately too simple (underfitting).
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Degree 25: deliberately too flexible (overfitting).
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# --- 4. Plot the data together with both fitted curves ---
fig, ax = plt.subplots(figsize=(14, 6))
ax.scatter(x, y, s=10, label="Real Data (2009)", alpha=0.7)
ax.plot(x, y_pred_low, linewidth=2, linestyle='--',
        label="Low-degree fit (Underfitting)")
ax.plot(x, y_pred_high, linewidth=2, alpha=0.8,
        label="High-degree fit (Overfitting)")
ax.set_title("Overfitting demonstration – Temperature 2009")
ax.set_xlabel("Day index (2009)")
ax.set_ylabel("Temperature (°C)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Mean temperature (TMEDIA); unparseable entries become NaN.
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
x = np.arange(len(y))  # simple numeric index for fitting
mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# Daily extremes for the shaded band. They are coerced exactly like TMEDIA
# so that a stray non-numeric entry becomes a gap (NaN) in the band instead
# of being handed to matplotlib as a raw string.
tmin = pd.to_numeric(df_2009["TMIN"], errors="coerce")
tmax = pd.to_numeric(df_2009["TMAX"], errors="coerce")

# ---------------------------
# 3. FIT TWO MODELS
# ---------------------------
# A) Underfitting: very simple model (degree 2)
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Overfitting: too many parameters (degree 25)
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# ---------------------------
# 4. PLOT RESULTS + TMIN/TMAX
# ---------------------------
plt.figure(figsize=(14, 6))

# Thermal band between the daily minimum and maximum (all 2009 rows).
plt.fill_between(
    np.arange(len(df_2009)),
    tmin,
    tmax,
    color="lightblue",
    alpha=0.4,
    label="Rango diario (TMIN – TMAX)",
)

# Scatter of TMEDIA (valid values only)
plt.scatter(x, y, s=10, label="TMEDIA (2009)", alpha=0.8, color="blue")

# Simple fit
plt.plot(x, y_pred_low, linewidth=2, linestyle="--",
         label="Low-degree fit (Underfitting)")

# Complex fit (overfitting)
plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
         label="High-degree fit (Overfitting)")

plt.title("Temperaturas 2009: Banda térmica (TMIN–TMAX) + Overfitting TMEDIA")
plt.xlabel("Índice de día (2009)")
plt.ylabel("Temperatura (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)
df.head()

# --- 2. 2009 rows, chronologically sorted ---
df_2009 = df.loc[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")

# Positional day index shared by every series of the year.
x_full = np.arange(len(df_2009))

# Numeric versions of the three temperature columns (bad entries -> NaN).
tmed, tmin, tmax = (
    pd.to_numeric(df_2009[column], errors="coerce")
    for column in ("TMEDIA", "TMIN", "TMAX")
)

# Boolean masks marking the valid (non-NaN) readings of each series.
mask_tmed, mask_tmin, mask_tmax = tmed.notna(), tmin.notna(), tmax.notna()
# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Fit a low-degree and a high-degree polynomial to the valid samples.

    Parameters
    ----------
    x : numpy.ndarray
        Sample positions, same length as ``y``.
    y : array-like of float
        Observations; NaN marks a missing sample and is dropped.
    simple_deg : int
        Degree of the simple (underfitting) model.
    complex_deg : int
        Requested degree of the complex (overfitting) model. It is capped at
        ``n_valid - 2`` and never allowed to drop below 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)``: the valid samples and both
        fitted curves evaluated at ``x_masked``.

    Raises
    ------
    ValueError
        If ``y`` contains no valid sample.
    """
    # Build the validity mask once and reuse it for both arrays.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    if not valid.any():
        raise ValueError("fit_series: no valid (non-NaN) samples to fit")
    x_masked = x[valid]
    y_masked = y[valid]

    # Simple model
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Complex model: the max() keeps the degree non-negative when only one
    # valid point is left (len - 2 would be -1, which Polynomial.fit rejects).
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)

    return x_masked, y_masked, y_low, y_high
# Fit each temperature series separately (valid samples only).
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# ---------------------------
# 4. PLOT RESULTS
# ---------------------------
plt.figure(figsize=(16, 8))

# Thermal band TMIN–TMAX. Uses the numeric (coerced) series rather than the
# raw DataFrame columns, so unparseable entries show up as gaps instead of
# being passed to matplotlib as raw strings.
plt.fill_between(
    x_full,
    tmin,
    tmax,
    color="lightblue",
    alpha=0.3,
    label="Rango diario (TMIN–TMAX)",
)

# --- TMEDIA ---
plt.scatter(x_tmed, y_tmed, s=10, color="blue", label="TMEDIA")
plt.plot(x_tmed, y_tmed_low, "--", linewidth=2, color="blue",
         label="TMEDIA simple fit")
plt.plot(x_tmed, y_tmed_high, linewidth=2, alpha=0.7, color="blue",
         label="TMEDIA overfit")

# --- TMIN ---
plt.scatter(x_tmin, y_tmin, s=10, color="green", alpha=0.7, label="TMIN")
plt.plot(x_tmin, y_tmin_low, "--", linewidth=2, color="green",
         label="TMIN simple fit")
plt.plot(x_tmin, y_tmin_high, linewidth=2, alpha=0.7, color="green",
         label="TMIN overfit")

# --- TMAX ---
plt.scatter(x_tmax, y_tmax, s=10, color="red", alpha=0.7, label="TMAX")
plt.plot(x_tmax, y_tmax_low, "--", linewidth=2, color="red",
         label="TMAX simple fit")
plt.plot(x_tmax, y_tmax_high, linewidth=2, alpha=0.7, color="red",
         label="TMAX overfit")

plt.title("Overfitting demonstration – TMEDIA, TMIN, TMAX (2009)")
plt.xlabel("Day index (2009)")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# --- 2. 2009 rows, chronologically sorted ---
df_2009 = df.loc[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")

# Positional day index shared by every series of the year.
x_full = np.arange(len(df_2009))

# Numeric versions of the columns that get plotted (bad entries -> NaN).
tmed, tmin, tmax, prec = (
    pd.to_numeric(df_2009[column], errors="coerce")
    for column in ("TMEDIA", "TMIN", "TMAX", "PRECIPITACION")
)
# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Return the valid samples plus a simple and an overfitting polynomial fit.

    Parameters
    ----------
    x : numpy.ndarray
        Sample positions, same length as ``y``.
    y : array-like of float
        Observations; NaN entries are treated as missing and dropped.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)`` with both curves evaluated at
        ``x_masked``.

    Raises
    ------
    ValueError
        If there is no valid sample in ``y``.
    """
    # Single validity mask, shared by x and y.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    if not valid.any():
        raise ValueError("fit_series: no valid (non-NaN) samples to fit")
    x_masked = x[valid]
    y_masked = y[valid]

    # Simple
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfitting: floor the capped degree at 0 so a one-point series does
    # not request a negative degree.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)

    return x_masked, y_masked, y_low, y_high
# Fit every temperature series on its own valid samples.
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# --- 4. One axis holding precipitation bars, thermal band and all fits ---
fig, ax1 = plt.subplots(figsize=(18, 9))

# Precipitation as bars.
ax1.bar(x_full, prec, color="gray", alpha=0.4, label="Precipitación")
ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.set_xlabel("Índice de día (2009)")

# Thermal band between daily minimum and maximum.
ax1.fill_between(x_full, tmin, tmax, color="lightblue", alpha=0.25,
                 label="Banda térmica (TMIN–TMAX)")

# Scatter + simple fit + overfit for each series, in legend order.
# TMEDIA points keep matplotlib's default opacity (alpha=None).
for name, colour, dot_alpha, xs, ys, ys_low, ys_high in (
    ("TMEDIA", "blue", None, x_tmed, y_tmed, y_tmed_low, y_tmed_high),
    ("TMIN", "green", 0.7, x_tmin, y_tmin, y_tmin_low, y_tmin_high),
    ("TMAX", "red", 0.7, x_tmax, y_tmax, y_tmax_low, y_tmax_high),
):
    ax1.scatter(xs, ys, s=10, color=colour, alpha=dot_alpha, label=name)
    ax1.plot(xs, ys_low, "--", linewidth=2, color=colour,
             label=f"{name} simple fit")
    ax1.plot(xs, ys_high, linewidth=2, alpha=0.8, color=colour,
             label=f"{name} overfit")

ax1.set_title("Temperaturas + Precipitación (2009) — TMEDIA, TMIN, TMAX con Overfitting")
ax1.grid(True)
ax1.legend(loc="upper left", ncol=3)
fig.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# --- 2. 2009 rows, chronologically sorted ---
df_2009 = df.loc[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")

# Positional day index used as the regressor.
x_full = np.arange(len(df_2009))

# Mean temperature and precipitation as numbers (bad entries -> NaN).
tmed, prec = (
    pd.to_numeric(df_2009[column], errors="coerce")
    for column in ("TMEDIA", "PRECIPITACION")
)
# ---------------------------
# 3. FIT MODELS (TMEDIA + PRECIP)
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Return masked x, masked y, the simple fit and the overfit.

    Parameters
    ----------
    x : numpy.ndarray
        Sample positions, same length as ``y``.
    y : array-like of float
        Observations; NaN entries are dropped before fitting.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)``; both curves are evaluated at
        ``x_masked``.

    Raises
    ------
    ValueError
        If ``y`` has no valid sample.
    """
    # Compute the validity mask once instead of once per array.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    if not valid.any():
        raise ValueError("fit_series: no valid (non-NaN) samples to fit")
    x_masked = x[valid]
    y_masked = y[valid]

    # Simple fit
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfitting: never ask for a negative degree on very short series.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)

    return x_masked, y_masked, y_low, y_high
# Fit mean temperature and precipitation on their own valid samples.
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)
x_prec, y_prec, y_prec_low, y_prec_high = fit_series(x_full, prec)

# --- 4. Mean temperature and precipitation on one axis ---
fig, ax1 = plt.subplots(figsize=(18, 7))

# Precipitation bars followed by their two fitted curves.
ax1.bar(x_prec, y_prec, color="gray", alpha=0.4, label="Precipitación (mm)")
ax1.plot(x_prec, y_prec_low, "--", color="black", linewidth=2,
         label="Prec.: simple fit")
ax1.plot(x_prec, y_prec_high, color="black", linewidth=2, alpha=0.8,
         label="Prec.: overfit")

# Mean temperature points followed by their two fitted curves.
ax1.scatter(x_tmed, y_tmed, s=12, color="blue", label="TMEDIA")
ax1.plot(x_tmed, y_tmed_low, "--", color="blue", linewidth=2,
         label="TMEDIA simple fit")
ax1.plot(x_tmed, y_tmed_high, color="blue", linewidth=2, alpha=0.8,
         label="TMEDIA overfit")

ax1.set_title("TMEDIA + Precipitación (2009) con Underfitting y Overfitting")
ax1.set_xlabel("Índice de día (2009)")
ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.grid(True)
ax1.legend(loc="upper left", ncol=2)
fig.tight_layout()
plt.show()
### Year 2009 vs 2024
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)
# Function to process and plot a given year
def plot_year_fit(year, simple_deg=2, complex_deg=25):
    """Plot the TMEDIA readings of one year with a simple and an overfit curve.

    Reads the module-level ``df`` (needs a datetime ``FECHA`` column and a
    ``TMEDIA`` column), fits two polynomials to the valid readings and shows
    a single figure.

    Parameters
    ----------
    year : int
        Calendar year to select from ``df``.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Raises
    ------
    ValueError
        If the year has no valid TMEDIA reading.
    """
    # Rows of the requested year in chronological order.
    df_year = df[df["FECHA"].dt.year == year].copy()
    df_year = df_year.sort_values("FECHA")

    # Mean temperature as numbers; unparseable entries become NaN.
    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    valid = y.notna().to_numpy()
    if not valid.any():
        raise ValueError(f"plot_year_fit: no valid TMEDIA data for {year}")
    x = np.arange(len(y))[valid]
    y = y[valid]

    # Simple model
    y_pred_low = Polynomial.fit(x, y, deg=simple_deg)(x)

    # Complex model (overfitting). The degree is capped by the sample count
    # and floored at 0 so a one-point year does not request a negative degree.
    deg = max(0, min(complex_deg, len(x) - 2))
    y_pred_high = Polynomial.fit(x, y, deg=deg)(x)

    # Figure
    plt.figure(figsize=(14, 6))
    plt.scatter(x, y, s=10, label=f"Real Data ({year})", alpha=0.7)
    plt.plot(x, y_pred_low, linewidth=2, linestyle='--',
             label=f"Simple fit (Degree {simple_deg})")
    plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
             label=f"Overfitting (Degree {deg})")
    plt.title(f"Overfitting demonstration – Temperature {year}")
    plt.xlabel(f"Day index ({year})")
    plt.ylabel("Temperature (°C)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
# --- 2. Draw one figure per year of interest ---
for year_to_plot in (2009, 2024):
    plot_year_fit(year_to_plot)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)
# ---------------------------
# 2. FUNCTION TO FIT MODELS
# ---------------------------
def fit_models(df_year, simple_deg=2, complex_deg=25):
    """Fit a simple and an overfitting polynomial to one year's TMEDIA values.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Rows of a single year; must contain a ``TMEDIA`` column. Values that
        cannot be parsed as numbers are treated as missing.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Returns
    -------
    tuple
        ``(x, y, y_low, y_high)``: positional index of the valid rows, their
        TMEDIA values and both fitted curves evaluated at ``x``.

    Raises
    ------
    ValueError
        If ``df_year`` has no valid TMEDIA value.
    """
    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    valid = y.notna().to_numpy()
    if not valid.any():
        raise ValueError("fit_models: no valid TMEDIA values to fit")

    # x is the row position inside the year, so gaps keep their spacing.
    x = np.arange(len(y))[valid]
    y = y[valid]

    # Simple model
    y_low = Polynomial.fit(x, y, deg=simple_deg)(x)

    # Complex model (overfitting); floor at 0 for one-point inputs.
    deg = max(0, min(complex_deg, len(x) - 2))
    y_high = Polynomial.fit(x, y, deg=deg)(x)

    return x, y, y_low, y_high
# --- 3. Fit both models for each year of interest ---
years = [2009, 2024]
data = {}
for yr in years:
    df_y = df.loc[df["FECHA"].dt.year == yr].copy().sort_values("FECHA")
    x, y, y_low, y_high = fit_models(df_y)
    data[yr] = (x, y, y_low, y_high)

# --- 4. Overlay both years in a single figure ---
fig, ax = plt.subplots(figsize=(16, 7))

# Each tuple in ``data`` is (x, y, simple fit, overfit).
# Real data first, so the points sit underneath the curves.
for plotted_year in years:
    xs, ys = data[plotted_year][0], data[plotted_year][1]
    ax.scatter(xs, ys, s=12, alpha=0.6, label=f"Real {plotted_year}")

# Simple fits
for plotted_year in years:
    ax.plot(data[plotted_year][0], data[plotted_year][2], linewidth=2,
            linestyle='--', label=f"Simple fit {plotted_year} (deg=2)")

# Overfitting fits
for plotted_year in years:
    ax.plot(data[plotted_year][0], data[plotted_year][3], linewidth=2,
            alpha=0.8, label=f"Overfitting {plotted_year} (deg≈25)")

ax.set_title("Overfitting vs Underfitting – TMEDIA (2009 & 2024)")
ax.set_xlabel("Day index")
ax.set_ylabel("Temperature (°C)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV; parse dates and make TMEDIA numeric (bad -> NaN) ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True),
    TMEDIA=pd.to_numeric(df["TMEDIA"], errors="coerce"),
)
# ---------------------------
# FUNCTION: prepare normalized X axis (all months equal)
# ---------------------------
def prepare_year_normalized(year, data=None):
    """Return one year's TMEDIA values on a month-normalized X axis.

    Every month occupies one unit of X regardless of its length:
    ``x = month + (day - 1) / days_in_month``, so values lie in ``[1, 13)``.

    Parameters
    ----------
    year : int
        Calendar year to extract.
    data : pandas.DataFrame, optional
        Table with a datetime ``FECHA`` column and a ``TMEDIA`` column.
        Defaults to the module-level ``df``. Rows whose TMEDIA is NaN are
        dropped; the column is expected to be numeric already.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` sorted by date. Both are empty when the year has no
        valid row.
    """
    frame = df if data is None else data
    rows = (
        frame.loc[frame["FECHA"].dt.year == year]
        .dropna(subset=["TMEDIA"])
        .sort_values("FECHA")
    )
    dates = rows["FECHA"].dt
    # Month number (1..12) plus the elapsed fraction of that month.
    x = dates.month + (dates.day - 1) / dates.days_in_month
    return x.values, rows["TMEDIA"].values
# --- 2. Month-normalized daily data for both years ---
(x09, y09), (x24, y24) = (
    prepare_year_normalized(wanted_year) for wanted_year in (2009, 2024)
)
# ---------------------------
# 3. Fit simple and complex model
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    """Evaluate a simple and an overfitting polynomial fit at the sample points.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values of equal length, without NaN.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``len(x) - 2`` and
        floored at 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(low, high)``: both fitted curves evaluated at ``x``.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    if len(x) == 0:
        raise ValueError("fit: cannot fit a polynomial to an empty series")
    low = Polynomial.fit(x, y, deg=simple_deg)(x)
    # max() keeps the degree valid when fewer than two samples are available.
    deg = max(0, min(complex_deg, len(x) - 2))
    high = Polynomial.fit(x, y, deg=deg)(x)
    return low, high
# Simple and overfit curves for each year.
y09_low, y09_high = fit(x09, y09)
y24_low, y24_high = fit(x24, y24)

# --- 4. Plot both years on the month-normalized axis ---
fig, ax = plt.subplots(figsize=(18, 7))

# Real data
ax.plot(x09, y09, ".", alpha=0.4, label="Real 2009")
ax.plot(x24, y24, ".", alpha=0.4, label="Real 2024")

# Simple fits
ax.plot(x09, y09_low, "--", linewidth=2, label="Simple fit 2009")
ax.plot(x24, y24_low, "--", linewidth=2, label="Simple fit 2024")

# Overfitting fits
ax.plot(x09, y09_high, linewidth=2, label="Overfitting 2009")
ax.plot(x24, y24_high, linewidth=2, label="Overfitting 2024")

# Twelve equally wide months; +0.5 centres each label inside its month.
month_ticks = np.arange(1, 13)
month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
ax.set_xticks(month_ticks + 0.5)
ax.set_xticklabels(month_labels)

ax.set_title("TMEDIA – Over/Underfitting (2009 vs 2024) – Months equally sized")
ax.set_ylabel("Temperatura (°C)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# --- 2. One chronologically sorted frame per year ---
record_year = df["FECHA"].dt.year
df_2009 = df.loc[record_year == 2009].copy().sort_values("FECHA")
df_2024 = df.loc[record_year == 2024].copy().sort_values("FECHA")

# Positional day index of each year.
x_2009, x_2024 = np.arange(len(df_2009)), np.arange(len(df_2024))

# Mean temperature and precipitation as numbers (bad entries -> NaN).
tmed_2009, prec_2009 = (
    pd.to_numeric(df_2009[column], errors="coerce")
    for column in ("TMEDIA", "PRECIPITACION")
)
tmed_2024, prec_2024 = (
    pd.to_numeric(df_2024[column], errors="coerce")
    for column in ("TMEDIA", "PRECIPITACION")
)
# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Fit a simple and an overfitting polynomial to the valid samples of ``y``.

    Parameters
    ----------
    x : numpy.ndarray
        Sample positions, same length as ``y``.
    y : array-like of float
        Observations; NaN entries are dropped before fitting.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)``; both curves are evaluated at
        ``x_masked``.

    Raises
    ------
    ValueError
        If ``y`` has no valid sample.
    """
    # One validity mask for both arrays.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    if not valid.any():
        raise ValueError("fit_series: no valid (non-NaN) samples to fit")
    x_masked = x[valid]
    y_masked = y[valid]

    # Simple fit
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfit: the floor at 0 avoids a negative degree on one-point series.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)

    return x_masked, y_masked, y_low, y_high
# Fits for 2009
x_tmed_2009, y_tmed_2009, y_tmed_2009_low, y_tmed_2009_high = fit_series(x_2009, tmed_2009)
x_prec_2009, y_prec_2009, y_prec_2009_low, y_prec_2009_high = fit_series(x_2009, prec_2009)

# Fits for 2024
x_tmed_2024, y_tmed_2024, y_tmed_2024_low, y_tmed_2024_high = fit_series(x_2024, tmed_2024)
x_prec_2024, y_prec_2024, y_prec_2024_low, y_prec_2024_high = fit_series(x_2024, prec_2024)

# --- 4. Comparison figure: precipitation first, then mean temperature ---
fig, ax = plt.subplots(figsize=(18, 10))

# Precipitation: bars plus both fitted curves, one year after the other.
for tag, bar_colour, line_colour, xs, ys, ys_low, ys_high in (
    ("2009", "gray", "black",
     x_prec_2009, y_prec_2009, y_prec_2009_low, y_prec_2009_high),
    ("2024", "orange", "orange",
     x_prec_2024, y_prec_2024, y_prec_2024_low, y_prec_2024_high),
):
    ax.bar(xs, ys, color=bar_colour, alpha=0.3, label=f"Prec {tag}")
    ax.plot(xs, ys_low, "--", color=line_colour,
            label=f"Prec {tag} simple fit")
    ax.plot(xs, ys_high, color=line_colour, alpha=0.7,
            label=f"Prec {tag} overfit")

# Mean temperature: points plus both fitted curves, one year after the other.
for tag, colour, xs, ys, ys_low, ys_high in (
    ("2009", "blue",
     x_tmed_2009, y_tmed_2009, y_tmed_2009_low, y_tmed_2009_high),
    ("2024", "red",
     x_tmed_2024, y_tmed_2024, y_tmed_2024_low, y_tmed_2024_high),
):
    ax.scatter(xs, ys, s=12, color=colour, label=f"TMEDIA {tag}")
    ax.plot(xs, ys_low, "--", color=colour, linewidth=2,
            label=f"TMEDIA {tag} simple fit")
    ax.plot(xs, ys_high, color=colour, linewidth=2, alpha=0.8,
            label=f"TMEDIA {tag} overfit")

ax.set_title("Comparativa TMEDIA + Precipitación (2009 vs 2024) con Underfitting y Overfitting")
ax.set_xlabel("Índice de día")
ax.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax.grid(True)
ax.legend(loc="upper left", ncol=2)
fig.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# --- 2. One chronologically sorted frame per year ---
record_year = df["FECHA"].dt.year
df_2009 = df.loc[record_year == 2009].copy().sort_values("FECHA")
df_2024 = df.loc[record_year == 2024].copy().sort_values("FECHA")

# Positional day index of each year.
x_2009, x_2024 = np.arange(len(df_2009)), np.arange(len(df_2024))

# Precipitation as numbers (bad entries -> NaN).
prec_2009, prec_2024 = (
    pd.to_numeric(frame["PRECIPITACION"], errors="coerce")
    for frame in (df_2009, df_2024)
)
# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Fit a simple and an overfitting polynomial to the valid samples of ``y``.

    Parameters
    ----------
    x : numpy.ndarray
        Sample positions, same length as ``y``.
    y : array-like of float
        Observations; NaN entries are dropped before fitting.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``n_valid - 2`` and
        floored at 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)``; both curves are evaluated at
        ``x_masked``.

    Raises
    ------
    ValueError
        If ``y`` has no valid sample.
    """
    # One validity mask for both arrays.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    if not valid.any():
        raise ValueError("fit_series: no valid (non-NaN) samples to fit")
    x_masked = x[valid]
    y_masked = y[valid]

    # Simple fit
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfit: the floor at 0 avoids a negative degree on one-point series.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)

    return x_masked, y_masked, y_low, y_high
# Fit precipitation for both years.
x_p2009, y_p2009, y_p2009_low, y_p2009_high = fit_series(x_2009, prec_2009)
x_p2024, y_p2024, y_p2024_low, y_p2024_high = fit_series(x_2024, prec_2024)

# --- 4. Precipitation-only comparison ---
fig, ax = plt.subplots(figsize=(18, 7))

# Bars plus both fitted curves, one year after the other.
for tag, bar_colour, line_colour, xs, ys, ys_low, ys_high in (
    ("2009", "gray", "black", x_p2009, y_p2009, y_p2009_low, y_p2009_high),
    ("2024", "orange", "orange", x_p2024, y_p2024, y_p2024_low, y_p2024_high),
):
    ax.bar(xs, ys, color=bar_colour, alpha=0.4, label=f"Precipitación {tag}")
    ax.plot(xs, ys_low, "--", color=line_colour, linewidth=2,
            label=f"{tag} simple fit")
    ax.plot(xs, ys_high, color=line_colour, linewidth=2, alpha=0.8,
            label=f"{tag} overfit")

ax.set_title("Comparación de Precipitación – 2009 vs 2024 (simple fit vs overfit)")
ax.set_xlabel("Índice de día")
ax.set_ylabel("Precipitación (mm)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
Overfitting vs Fitting by Year
This interactive graph compares the daily mean temperature patterns of two selected years by applying two different polynomial models: a simple low-degree fit and a complex high-degree overfitting model. By normalizing the X-axis so that all months have equal width, the visualization highlights seasonal behavior independently of month length or missing data. The comparison reveals how each model captures long-term trends versus noise, and how temperature patterns differ between the two chosen years.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, Dropdown, fixed
# --- 1. Load the CSV; parse dates and make TMEDIA numeric (bad -> NaN) ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True),
    TMEDIA=pd.to_numeric(df["TMEDIA"], errors="coerce"),
)
# ---------------------------
# 2. FUNCTION: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year, data=None):
    """Return one year's TMEDIA values on a month-normalized X axis.

    Every month occupies one unit of X regardless of its length:
    ``x = month + (day - 1) / days_in_month``, so values lie in ``[1, 13)``.

    Parameters
    ----------
    year : int
        Calendar year to extract.
    data : pandas.DataFrame, optional
        Table with a datetime ``FECHA`` column and a ``TMEDIA`` column.
        Defaults to the module-level ``df``. Rows whose TMEDIA is NaN are
        dropped; the column is expected to be numeric already.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` sorted by date. Both are empty when the year has no
        valid row.
    """
    frame = df if data is None else data
    rows = (
        frame.loc[frame["FECHA"].dt.year == year]
        .dropna(subset=["TMEDIA"])
        .sort_values("FECHA")
    )
    dates = rows["FECHA"].dt
    # Normalized X = month number plus the elapsed fraction of that month.
    x = dates.month + (dates.day - 1) / dates.days_in_month
    return x.values, rows["TMEDIA"].values
# ---------------------------
# 3. FIT MODELS
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    """Evaluate a simple and an overfitting polynomial fit at the sample points.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values of equal length, without NaN.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model, capped at ``len(x) - 2`` and
        floored at 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(low, high)``: both fitted curves evaluated at ``x``.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    if len(x) == 0:
        raise ValueError("fit: cannot fit a polynomial to an empty series")
    low = Polynomial.fit(x, y, deg=simple_deg)(x)
    # max() keeps the degree valid when fewer than two samples are available.
    deg = max(0, min(complex_deg, len(x) - 2))
    high = Polynomial.fit(x, y, deg=deg)(x)
    return low, high
# ---------------------------
# 4. INTERACTIVE PLOT FUNCTION
# ---------------------------
def plot_comparison(yearA, yearB):
    """Plot real data, simple fit and overfit of two years on one figure.

    Both years share a month-normalized X axis (each month one unit wide).
    Data come from ``prepare_year_normalized`` and curves from ``fit``.
    """
    fig, ax = plt.subplots(figsize=(18, 7))

    # Gather (year, x, y, simple fit, overfit) for A and then B.
    curves = []
    for year in (yearA, yearB):
        xs, ys = prepare_year_normalized(year)
        ys_low, ys_high = fit(xs, ys)
        curves.append((year, xs, ys, ys_low, ys_high))

    # Real data
    for year, xs, ys, _, _ in curves:
        ax.plot(xs, ys, ".", alpha=0.4, label=f"Real {year}")

    # Simple fit
    for year, xs, _, ys_low, _ in curves:
        ax.plot(xs, ys_low, "--", linewidth=2, label=f"Simple fit {year}")

    # Overfit
    for year, xs, _, _, ys_high in curves:
        ax.plot(xs, ys_high, linewidth=2, label=f"Overfit {year}")

    # Month labels centred inside each equally wide month.
    month_ticks = np.arange(1, 13)
    month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
    ax.set_xticks(month_ticks + 0.5)
    ax.set_xticklabels(month_labels)

    ax.set_title(f"Comparación TMEDIA – {yearA} vs {yearB} (Meses igualados)")
    ax.set_ylabel("Temperatura (°C)")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()
# --- 5. Interactive widget: one dropdown per compared year ---
years = list(range(2009, 2025))
year_a_picker = Dropdown(options=years, value=2009, description="Año A")
year_b_picker = Dropdown(options=years, value=2024, description="Año B")
interact(plot_comparison, yearA=year_a_picker, yearB=year_b_picker)
interactive(children=(Dropdown(description='Año A', options=(2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2…
<function __main__.plot_comparison(yearA, yearB)>
Compare multiple years
This interactive graph lets you compare daily temperatures between any years from 2009 to 2024. All months have the same width on the X-axis, making seasonal patterns easier to see. You can display real data or apply simple and complex polynomial fits to explore how models capture trends or overfit noise. This tool helps visualize climate variations across years and demonstrates key Data Science concepts like seasonality, model complexity, and overfitting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, SelectMultiple, Dropdown, IntSlider, fixed
# --- 1. Load the CSV; parse dates and make TMEDIA numeric (bad -> NaN) ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True),
    TMEDIA=pd.to_numeric(df["TMEDIA"], errors="coerce"),
)
# ---------------------------
# Function: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year, data=None):
    """Return one year's TMEDIA values on a month-normalized X axis.

    Every month occupies one unit of X regardless of its length:
    ``x = month + (day - 1) / days_in_month``, so values lie in ``[1, 13)``.

    Parameters
    ----------
    year : int
        Calendar year to extract.
    data : pandas.DataFrame, optional
        Table with a datetime ``FECHA`` column and a ``TMEDIA`` column.
        Defaults to the module-level ``df``. Rows whose TMEDIA is NaN are
        dropped; the column is expected to be numeric already.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` sorted by date. Both are empty when the year has no
        valid row.
    """
    frame = df if data is None else data
    rows = (
        frame.loc[frame["FECHA"].dt.year == year]
        .dropna(subset=["TMEDIA"])
        .sort_values("FECHA")
    )
    dates = rows["FECHA"].dt
    # Month number plus the elapsed fraction of that month.
    x = dates.month + (dates.day - 1) / dates.days_in_month
    return x.values, rows["TMEDIA"].values
# ---------------------------
# Fit function
# ---------------------------
def fit_curve(x, y, degree):
    """Fit a polynomial to ``(x, y)`` and return it evaluated at ``x``.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values of equal length, without NaN.
    degree : int
        Requested polynomial degree, capped at ``len(x) - 2`` and floored
        at 0.

    Returns
    -------
    numpy.ndarray
        Fitted values at ``x``.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    if len(x) == 0:
        raise ValueError("fit_curve: cannot fit a polynomial to an empty series")
    # Cap by the sample count, but never go below degree 0.
    degree = max(0, min(degree, len(x) - 2))
    return Polynomial.fit(x, y, deg=degree)(x)
# ---------------------------
# Interactive plot
# ---------------------------
def plot_years(selected_years, curve_type, degree):
    """Plot TMEDIA for the selected years on a month-normalized X axis.

    Parameters
    ----------
    selected_years : iterable of int
        Years to draw; may be empty, in which case only the empty axes are
        shown.
    curve_type : str
        One of ``"Datos reales"`` (raw points), ``"Ajuste simple (deg 2)"``
        (degree-2 fit) or ``"Ajuste complejo (deg variable)"`` (fit of the
        requested ``degree``). Any other value draws nothing.
    degree : int
        Polynomial degree used only by the complex fit.
    """
    plt.figure(figsize=(19, 8))

    for year in selected_years:
        x, y = prepare_year_normalized(year)
        if curve_type == "Datos reales":
            plt.plot(x, y, ".", markersize=3, alpha=0.5, label=f"{year}")
        elif curve_type == "Ajuste simple (deg 2)":
            y_low = fit_curve(x, y, 2)
            plt.plot(x, y_low, "-", linewidth=2, alpha=0.8,
                     label=f"{year} (fit2)")
        elif curve_type == "Ajuste complejo (deg variable)":
            y_high = fit_curve(x, y, degree)
            plt.plot(x, y_high, "-", linewidth=2, alpha=0.8,
                     label=f"{year} (fit{degree})")

    # X axis: month labels centred inside each equally wide month.
    month_ticks = np.arange(1, 13)
    month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
    plt.xticks(month_ticks + 0.5, month_labels)

    plt.title(f"TMEDIA 2009–2024 · Curva: {curve_type}")
    plt.ylabel("Temperatura (ºC)")
    plt.grid(True)
    # Only build a legend when something labelled was drawn; with an empty
    # selection matplotlib would otherwise emit a "no artists" warning.
    handles, _ = plt.gca().get_legend_handles_labels()
    if handles:
        plt.legend()
    plt.tight_layout()
    plt.show()
# --- Widgets: year multi-select, curve type and degree slider ---
years = list(range(2009, 2025))

year_selector = SelectMultiple(
    options=years,
    value=(2009, 2010, 2024),
    description="Años:",
)
curve_selector = Dropdown(
    options=["Datos reales", "Ajuste simple (deg 2)", "Ajuste complejo (deg variable)"],
    value="Datos reales",
    description="Tipo curva:",
)
# The slider value is only used by the complex fit.
degree_slider = IntSlider(
    value=15, min=3, max=40, step=1,
    description="Grado (si complejo):",
)

interact(
    plot_years,
    selected_years=year_selector,
    curve_type=curve_selector,
    degree=degree_slider,
)
interactive(children=(SelectMultiple(description='Años:', index=(0, 1, 15), options=(2009, 2010, 2011, 2012, 2…
<function __main__.plot_years(selected_years, curve_type, degree)>
Find out which year has the most daily data
import pandas as pd
# Load the CSV and parse the date column.
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# Count the records of each year, largest count first.
rows_per_year = df.groupby(df["FECHA"].dt.year).size()
registros_por_ano = rows_per_year.sort_values(ascending=False)
registros_por_ano
FECHA 2012 366 2016 366 2020 366 2017 365 2018 365 2021 365 2011 364 2013 364 2023 364 2010 363 2019 363 2014 362 2015 361 2009 360 2022 356 2024 345 2025 310 2008 80 dtype: int64
### 2012 vs 2023
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# --- 1. Load the CSV and parse the date column ---
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df = df.assign(
    FECHA=pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
)

# --- 2. One chronologically sorted frame per year (2012 and 2023) ---
record_year = df["FECHA"].dt.year
df_2012 = df.loc[record_year == 2012].copy().sort_values("FECHA")
df_2023 = df.loc[record_year == 2023].copy().sort_values("FECHA")

# Positional day index of each year.
x_2012, x_2023 = np.arange(len(df_2012)), np.arange(len(df_2023))

# Precipitation as numbers (bad entries -> NaN).
prec_2012, prec_2023 = (
    pd.to_numeric(frame["PRECIPITACION"], errors="coerce")
    for frame in (df_2012, df_2023)
)
# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Fit a low- and a high-degree polynomial to the non-NaN points of a series.

    Parameters
    ----------
    x : array-like
        Sample positions, same length as ``y``.
    y : array-like
        Sample values. NaN entries are excluded from both fits.
    simple_deg : int, default 2
        Degree of the simple (underfitting) polynomial.
    complex_deg : int, default 20
        Requested degree of the complex (overfitting) polynomial. It is
        capped at ``n_valid - 2`` and never allowed to go below 0.

    Returns
    -------
    tuple
        ``(x_mask, y_mask, y_low, y_high)``: the retained positions, the
        retained values, and both fitted curves evaluated at ``x_mask``.
        ``y_mask`` keeps the type of ``y`` when it is a pandas Series or a
        NumPy array; any other sequence is returned as a float array.
    """
    # Build the mask once, as a plain boolean array, so x and y are filtered
    # by position and plain Python sequences are accepted too.
    valid = ~np.isnan(np.asarray(y, dtype=float))
    x_mask = np.asarray(x)[valid]
    if isinstance(y, (pd.Series, np.ndarray)):
        y_mask = y[valid]
    else:
        y_mask = np.asarray(y, dtype=float)[valid]

    # Simple fit
    coefs_low = Polynomial.fit(x_mask, y_mask, deg=simple_deg)
    y_low = coefs_low(x_mask)

    # Overfit: cap the degree by the number of valid points, but never pass
    # a negative degree to Polynomial.fit.
    deg = max(0, min(complex_deg, len(x_mask) - 2))
    coefs_high = Polynomial.fit(x_mask, y_mask, deg=deg)
    y_high = coefs_high(x_mask)

    return x_mask, y_mask, y_low, y_high
# Run the simple/complex fits for each year.
x_p2012, y_p2012, y_p2012_low, y_p2012_high = fit_series(x_2012, prec_2012)
x_p2023, y_p2023, y_p2023_low, y_p2023_high = fit_series(x_2023, prec_2023)

# ---------------------------
# 4. PLOT PRECIPITATION ONLY
# ---------------------------
plt.figure(figsize=(18, 7))

# One entry per year: (label, bar colour, bar opacity, line colour, fit data).
precip_layers = (
    ("2012", "gray", 0.4, "black", (x_p2012, y_p2012, y_p2012_low, y_p2012_high)),
    ("2023", "green", 0.3, "green", (x_p2023, y_p2023, y_p2023_low, y_p2023_high)),
)
for tag, bar_color, bar_alpha, line_color, (xs, ys, ys_low, ys_high) in precip_layers:
    # Bars show the daily values; dashed line = simple fit, solid = overfit.
    plt.bar(xs, ys, color=bar_color, alpha=bar_alpha, label=f"Precipitación {tag}")
    plt.plot(xs, ys_low, "--", color=line_color, linewidth=2, label=f"{tag} simple fit")
    plt.plot(xs, ys_high, color=line_color, linewidth=2, alpha=0.8, label=f"{tag} overfit")

plt.title("Comparación de Precipitación – 2012 vs 2023 (simple fit vs overfit)")
plt.xlabel("Índice de día")
plt.ylabel("Precipitación (mm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from itertools import combinations

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Variable to compare
var = "TMEDIA"  # Alternatives: "TMIN", "TMAX", "PRECIPITACION"

# ------------------------------------------
# 2. GROUP BY YEAR
# ------------------------------------------
# Coerce to numeric BEFORE dropping missing rows: the per-year row counts
# used for min_len then match the number of usable values, so no year is
# silently discarded for having a few non-numeric entries.
df[var] = pd.to_numeric(df[var], errors="coerce")
# Sort chronologically because the series are compared position by position.
df = df.dropna(subset=[var]).sort_values("FECHA").copy()
df["AÑO"] = df["FECHA"].dt.year

# Shortest year's length; every series is trimmed to it so the vectors
# passed to euclidean() have equal length.
# NOTE(review): trimming aligns years by position, not by calendar day, so a
# partial year shifts which days are compared — confirm this is intended.
min_len = df.groupby("AÑO").size().min()

# Mapping year -> first min_len values of that year
series = {
    year: group[var].to_numpy()[:min_len]
    for year, group in df.groupby("AÑO")
}
years = sorted(series)

# ------------------------------------------
# 3. COMPUTE DISTANCE BETWEEN EVERY PAIR OF YEARS
# ------------------------------------------
best_pair = None
best_distance = float("inf")
for year_a, year_b in combinations(years, 2):
    dist = euclidean(series[year_a], series[year_b])
    # Strict "<" keeps the earliest pair when distances tie.
    if dist < best_distance:
        best_distance = dist
        best_pair = (year_a, year_b)

print("The two most similar years in", var, "son:", best_pair)
print("Distance:", best_distance)
The two most similar years in TMEDIA son: (2014, 2015) Distance: 23.189221634198937
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from itertools import combinations

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

var = "PRECIPITACION"  # rainfall variable

# Coerce to numeric first so the yearly totals and the similarity analysis
# below both work on the same cleaned values (non-numeric entries -> NaN).
df[var] = pd.to_numeric(df[var], errors="coerce")
# Sort chronologically because the series are compared position by position.
df = df.dropna(subset=[var]).sort_values("FECHA").copy()
df["YEAR"] = df["FECHA"].dt.year

# ------------------------------------------
# 2. FIND THE YEAR WITH THE MOST RAINFALL
# ------------------------------------------
total_rain_per_year = df.groupby("YEAR")[var].sum()
year_max_rain = total_rain_per_year.idxmax()
max_rain_value = total_rain_per_year.max()

print("--------------------------------------------------------")
print("YEAR WITH THE HIGHEST TOTAL RAINFALL:")
print(f"➡ Year {year_max_rain} with {max_rain_value:.2f} mm")
print("--------------------------------------------------------")

# ------------------------------------------
# 3. PREPARE SERIES FOR SIMILARITY ANALYSIS
# ------------------------------------------
# Trim every year to the shortest year's length so the vectors passed to
# euclidean() have equal length.
# NOTE(review): this aligns years by position, not by calendar day, so a
# partial year shifts which days are compared — confirm this is intended.
min_len = df.groupby("YEAR").size().min()
series = {
    year: group[var].to_numpy()[:min_len]
    for year, group in df.groupby("YEAR")
}
years = sorted(series)

# ------------------------------------------
# 4. COMPUTE YEAR-TO-YEAR DISTANCES
# ------------------------------------------
best_pair = None
best_distance = float("inf")
for year_a, year_b in combinations(years, 2):
    dist = euclidean(series[year_a], series[year_b])  # similarity measure
    # Strict "<" keeps the earliest pair when distances tie.
    if dist < best_distance:
        best_distance = dist
        best_pair = (year_a, year_b)

print("Most similar years in PRECIPITATION:", best_pair)
print("Distance (lower = more similar):", best_distance)
print("--------------------------------------------------------")
print("Total annual rainfall (mm):")
print(total_rain_per_year)
print("--------------------------------------------------------")
-------------------------------------------------------- YEAR WITH THE HIGHEST TOTAL RAINFALL: ➡ Year 2023 with 2211.40 mm -------------------------------------------------------- Most similar years in PRECIPITATION: (2011, 2012) Distance (lower = more similar): 49.25038070918843 -------------------------------------------------------- Total annual rainfall (mm): YEAR 2008 217.2 2009 1397.9 2010 1338.8 2011 678.8 2012 1056.0 2013 1316.8 2014 1386.3 2015 1036.2 2016 1868.7 2017 1170.8 2018 1275.1 2019 1725.5 2020 1727.1 2021 1829.8 2022 1272.0 2023 2211.4 2024 1559.0 2025 1604.8 Name: PRECIPITACION, dtype: float64 --------------------------------------------------------
Humedad¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Strip stray whitespace from the header names.
df.columns = list(map(str.strip, df.columns))

# Columns holding the timestamp and the humidity reading.
date_col, hum_col = "UTC", "Hum"

df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
# Drop rows missing either field, then order them by time.
df = df.dropna(subset=[date_col, hum_col]).sort_values(date_col)

# ---------------------------
# 2. PREPARE SERIES
# ---------------------------
y = pd.to_numeric(df[hum_col], errors="coerce")
mask = y.notna()
# Positional time index, restricted to rows with a numeric humidity value.
x = np.arange(len(y))[mask]
y = y[mask]

# ---------------------------
# 3. FIT MODELS
# ---------------------------
# Degree 2 is the simple (underfitting) model; the complex (overfitting)
# degree is capped by the number of points to avoid a crash on tiny inputs.
deg = min(20, len(x) - 2)
coefs_low, coefs_high = (Polynomial.fit(x, y, deg=d) for d in (2, deg))
y_low, y_high = coefs_low(x), coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(16, 7))
plt.scatter(x, y, s=10, color="blue", alpha=0.7, label="Humidity (real data)")
plt.plot(x, y_low, "--", linewidth=2, color="green", label="Simple fit (deg=2)")
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8, label=f"Overfit (deg={deg})")
plt.title("Humidity with Underfitting and Overfitting")
plt.xlabel("Time index")
plt.ylabel("Humidity (%)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric before averaging; bad entries become NaN
# and are dropped together with unparseable timestamps.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# ---------------------------
# 2. GROUP BY MONTH
# ---------------------------
df["MONTH"] = df["UTC"].dt.month

# Average humidity per month (one entry per month present in the data)
monthly = df.groupby("MONTH")["Hum"].mean()

# Take x from the grouped index rather than assuming all 12 months exist,
# so x and y always have the same length.
x = monthly.index.to_numpy()
y = monthly.to_numpy()  # mean humidity per month

# ---------------------------
# 3. FIT MODELS
# ---------------------------
# Simple model (underfitting)
simple_deg = 2
coefs_low = Polynomial.fit(x, y, deg=simple_deg)
y_low = coefs_low(x)

# Overfitting model: capped by the number of months, never negative
complex_deg = max(0, min(8, len(x) - 2))
coefs_high = Polynomial.fit(x, y, deg=complex_deg)
y_high = coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(14, 6))

# Real data
plt.plot(x, y, 'o', color="blue", label="Real humidity (monthly average)")

# Simple fit
plt.plot(x, y_low, '--', linewidth=2, color="green",
         label=f"Simple fit (deg={simple_deg})")

# Overfit
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8,
         label=f"Overfit (deg={complex_deg})")

plt.title("Monthly Humidity with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1, 13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric before averaging; bad entries become NaN.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Extract year and month
df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# ---------------------------
# 2. FILTER YEARS 2020–2024
# ---------------------------
# Single source of truth for the range, reused in the plot title below.
first_year, last_year = 2020, 2024
df = df[(df["YEAR"] >= first_year) & (df["YEAR"] <= last_year)]

# Group into a table: rows=months, columns=years
monthly_year = df.groupby(["YEAR", "MONTH"])["Hum"].mean().unstack(level=0)

# Force one row per calendar month. Without this the table only has rows
# for months present in the data, and a per-year mask built from it cannot
# be applied to the 12-element `months` array.
months = np.arange(1, 13)
monthly_year = monthly_year.reindex(months)

# ---------------------------
# 3. PLOT — Humidity + Fitting
# ---------------------------
plt.figure(figsize=(18, 9))
years = sorted(monthly_year.columns.dropna().tolist())

for year in years:
    y = monthly_year[year].values
    x = months

    # Remove months with no data for this year
    mask = ~np.isnan(y)
    x_clean = x[mask]
    y_clean = y[mask]

    if len(x_clean) < 3:
        continue  # not enough data to fit

    # Simple fit (underfitting)
    deg_low = 2
    coef_low = Polynomial.fit(x_clean, y_clean, deg=deg_low)
    y_low = coef_low(x_clean)

    # Complex fit (overfitting), degree capped by the number of months
    deg_high = min(8, len(x_clean) - 2)
    coef_high = Polynomial.fit(x_clean, y_clean, deg=deg_high)
    y_high = coef_high(x_clean)

    # Plot real data
    plt.plot(x_clean, y_clean, marker="o", linewidth=2, label=f"{year} real")
    # Simple fit
    plt.plot(x_clean, y_low, "--", linewidth=2, label=f"{year} simple fit")
    # Overfit
    plt.plot(x_clean, y_high, linewidth=2, alpha=0.7, label=f"{year} overfit")

# The title reports the range actually used by the filter above.
plt.title(f"Monthly Humidity ({first_year}–{last_year}) with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(months)
plt.grid(True)
plt.legend(ncol=3, fontsize=9)
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric; bad entries become NaN and are dropped.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Prepare data
df = df.sort_values("UTC")
y = df["Hum"].to_numpy()

# Time index rescaled to [-1, 1]. PolynomialFeatures raises the raw feature
# to each power, so an integer index 0..N-1 at degree 19 produces extremely
# ill-conditioned design matrices. An affine rescale spans the same family
# of polynomials but keeps every power bounded by 1.
X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

# ---------------------------
# 2. CROSS-VALIDATION SETUP
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
degrees = range(1, 20)  # test polynomial degrees 1 to 19
cv_errors = []

# ---------------------------
# 3. RUN CROSS-VALIDATION FOR EACH DEGREE
# ---------------------------
for deg in degrees:
    fold_errors = []
    poly = PolynomialFeatures(degree=deg)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Transform features
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)

        # Fit model
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        # Predict + calculate error on the held-out fold
        y_pred = model.predict(X_test_poly)
        fold_errors.append(mean_squared_error(y_test, y_pred))

    # Average CV error for this degree
    cv_errors.append(np.mean(fold_errors))

# ---------------------------
# 4. PLOT CROSS-VALIDATION RESULTS
# ---------------------------
plt.figure(figsize=(12, 6))
plt.plot(degrees, cv_errors, marker="o", linewidth=2)
plt.xlabel("Polynomial Degree")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Curve for Humidity Fitting")
plt.grid(True)
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric; bad entries become NaN and are dropped.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
# Sort chronologically: the row position is used as the time axis below.
df = df.dropna(subset=["UTC", "Hum"]).sort_values("UTC")
df["YEAR"] = df["UTC"].dt.year

# Years to evaluate
years = [2020, 2021, 2022, 2023, 2024]

# Polynomial degrees
degrees = range(1, 12)

# Store results: year -> list of mean CV errors, one per degree
errors_by_year = {}

# ---------------------------
# 2. CROSS-VALIDATION PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 10:
        print(f"Skipping {year}: not enough data")
        continue

    y = df_y["Hum"].to_numpy()
    # Time index rescaled to [-1, 1]: an affine rescale spans the same
    # polynomials as the raw 0..N-1 index but keeps high powers
    # well-conditioned.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    year_errors = []
    for deg in degrees:
        fold_errors = []
        poly = PolynomialFeatures(degree=deg)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train_poly = poly.fit_transform(X_train)
            X_test_poly = poly.transform(X_test)

            model = LinearRegression()
            model.fit(X_train_poly, y_train)

            y_pred = model.predict(X_test_poly)
            fold_errors.append(mean_squared_error(y_test, y_pred))

        year_errors.append(np.mean(fold_errors))

    errors_by_year[year] = year_errors

# ---------------------------
# 3. PLOT CROSS-VALIDATION CURVES
# ---------------------------
plt.figure(figsize=(14, 7))
for year in errors_by_year:
    plt.plot(degrees, errors_by_year[year], marker="o", linewidth=2, label=str(year))

plt.title("Cross-Validation Error (Humidity) — 2020 to 2024")
plt.xlabel("Polynomial Degree")
plt.ylabel("MSE (Cross-Validation Error)")
plt.grid(True)
plt.legend(title="Year")
plt.tight_layout()
plt.show()

# ---------------------------
# 4. SHOW ERRORS IN A TABLE
# ---------------------------
# Rows = polynomial degree, columns = year
error_table = pd.DataFrame(errors_by_year, index=degrees)
error_table.index.name = "Degree"
error_table
| Degree | 2020 | 2021 | 2022 | 2023 | 2024 |
|---|---|---|---|---|---|
| 1 | 216.068824 | 218.752229 | 205.806426 | 223.386426 | 121.662982 |
| 2 | 199.123679 | 206.903699 | 197.047975 | 206.637540 | 120.667957 |
| 3 | 199.019093 | 201.704614 | 196.737505 | 206.017819 | 120.021651 |
| 4 | 197.902190 | 207.015701 | 196.320941 | 207.876785 | 116.337365 |
| 5 | 197.509293 | 210.007744 | 195.493466 | 209.290042 | 116.981801 |
| 6 | 199.035569 | 210.817214 | 194.980695 | 209.993881 | 116.845109 |
| 7 | 201.411285 | 210.744167 | 195.175063 | 210.322704 | 116.787237 |
| 8 | 203.592694 | 210.467845 | 195.974085 | 210.498426 | 116.688301 |
| 9 | 205.158340 | 210.258218 | 197.035062 | 210.616151 | 116.304289 |
| 10 | 206.100096 | 210.176259 | 198.047936 | 210.704588 | 119.011331 |
| 11 | 206.568808 | 210.195144 | 198.845317 | 210.768333 | 119.901257 |
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric; bad entries become NaN and are dropped.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
# Sort chronologically: the row position is used as the time axis below.
df = df.dropna(subset=["UTC", "Hum"]).sort_values("UTC")
df["YEAR"] = df["UTC"].dt.year

# Years to analyze
years = [2020, 2021, 2022, 2023, 2024]

# Regularization strengths (λ)
lambdas = np.logspace(-4, 4, 20)

# Where to store results: year -> list of mean CV errors, one per λ
reg_errors = {}

# ---------------------------
# 2. REGULARIZATION ANALYSIS PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 20:
        print(f"Skipping {year}: not enough samples")
        continue

    y = df_y["Hum"].to_numpy()
    # Time index rescaled to [-1, 1]. Ridge's penalty depends on the scale
    # of the feature: on a raw 0..N-1 index the slope is tiny, so the λ grid
    # would leave the fit almost unchanged.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    year_errors = []
    for lam in lambdas:
        fold_errors = []
        model = Ridge(alpha=lam)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            fold_errors.append(mean_squared_error(y_test, y_pred))

        year_errors.append(np.mean(fold_errors))

    reg_errors[year] = year_errors

# ---------------------------
# 3. PLOT REGULARIZATION CURVES
# ---------------------------
plt.figure(figsize=(14, 7))
for year in reg_errors:
    plt.plot(lambdas, reg_errors[year], marker="o", linewidth=2, label=str(year))

plt.xscale("log")
plt.xlabel("Regularization strength (lambda)")
plt.ylabel("Validation Error (MSE)")
plt.title("Regularization Curve per Year (Humidity) — Ridge Regression")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
# Make sure humidity is numeric; bad entries become NaN and are dropped.
df["Hum"] = pd.to_numeric(df["Hum"], errors="coerce")
# Sort chronologically: the row position is used as the time axis below.
df = df.dropna(subset=["UTC", "Hum"]).sort_values("UTC")
df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# Years you want to analyze
years = sorted(df["YEAR"].unique())

# Regularization strengths (lambda)
lambdas = np.logspace(-4, 4, 15)

# K-fold cross-validation setup (identical for every year, so built once)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------------
# 2. REGULARIZATION PER YEAR
# ---------------------------
best_models = {}  # year -> that year's rows plus a "Pred" column

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 30:
        print(f"Skipping {year}: not enough data")
        continue

    # Daily humidity
    y = df_y["Hum"].to_numpy()
    # Time index rescaled to [-1, 1]: Ridge's penalty is scale-dependent, and
    # on a raw 0..N-1 index the λ grid would barely change the fit.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    # Mean cross-validated MSE for each lambda
    mse_list = []
    for lam in lambdas:
        fold_errors = []
        model = Ridge(alpha=lam)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            fold_errors.append(mean_squared_error(y_test, y_pred))

        mse_list.append(np.mean(fold_errors))

    # Pick best lambda
    best_idx = int(np.argmin(mse_list))
    best_lambda = lambdas[best_idx]

    # Refit on the full year with the winning lambda, so the prediction does
    # not depend on whichever CV fold happened to be fitted last.
    best_model = Ridge(alpha=best_lambda).fit(X, y)

    # Predict using the BEST model
    df_y["Pred"] = best_model.predict(X)
    best_models[year] = df_y

# ---------------------------
# 3. AGGREGATE PREDICTIONS BY MONTH (X-axis = months)
# ---------------------------
plt.figure(figsize=(16, 8))

for year, df_y in best_models.items():
    monthly_pred = df_y.groupby("MONTH")["Pred"].mean()
    plt.plot(
        monthly_pred.index,
        monthly_pred.values,
        marker="o",
        linewidth=2,
        label=f"{year} (best λ)"
    )

plt.title("Monthly Humidity (Daily Fitting + Ridge Regularization)\nBest Regularized Model for Each Year")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1, 13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()