Luis Diaz-Faes - Fab Futures - Data Science
Home About A Industriosa

< Home

Day 3: Fitting¶

Assignment 20/11/2025¶

Fit a function to your data

Fitting¶

Year 2009¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Use temperature: TMED or any parameter you want
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
x = np.arange(len(y))  # simple numeric index for fitting

mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# ---------------------------
# 3. FIT TWO MODELS
# ---------------------------

# A) Underfitting: very simple model (degree 2)
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Overfitting: too many parameters (degree 25)
# WARNING: high degree = noisy overfit
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# ---------------------------
# 4. PLOT RESULTS
# ---------------------------
plt.figure(figsize=(14,6))

plt.scatter(x, y, s=10, label="Real Data (2009)", alpha=0.7)

plt.plot(x, y_pred_low, linewidth=2, label="Low-degree fit (Underfitting)", linestyle='--')
plt.plot(x, y_pred_high, linewidth=2, label="High-degree fit (Overfitting)", alpha=0.8)

plt.title("Overfitting demonstration – Temperature 2009")
plt.xlabel("Day index (2009)")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Use temperature: TMED or any parameter you want
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
x = np.arange(len(y))  # simple numeric index for fitting

mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# ---------------------------
# 3. FIT TWO MODELS
# ---------------------------

# A) Underfitting: very simple model (degree 2)
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Overfitting: too many parameters (degree 25)
# WARNING: high degree = noisy overfit
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# ---------------------------
# 4. PLOT RESULTS + TMIN/TMAX
# ---------------------------
plt.figure(figsize=(14,6))

# Banda térmica TMIN-TMAX
plt.fill_between(
    np.arange(len(df_2009)),
    df_2009["TMIN"],
    df_2009["TMAX"],
    color="lightblue",
    alpha=0.4,
    label="Rango diario (TMIN – TMAX)"
)

# Scatter de TMEDIA (solo valores válidos)
plt.scatter(x, y, s=10, label="TMEDIA (2009)", alpha=0.8, color="blue")

# Ajuste simple
plt.plot(x, y_pred_low, linewidth=2, linestyle="--",
         label="Low-degree fit (Underfitting)")

# Ajuste complejo (overfitting)
plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
         label="High-degree fit (Overfitting)")

plt.title("Temperaturas 2009: Banda térmica (TMIN–TMAX) + Overfitting TMEDIA")
plt.xlabel("Índice de día (2009)")
plt.ylabel("Temperatura (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Index for fitting
x_full = np.arange(len(df_2009))

# Extract series
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
tmin = pd.to_numeric(df_2009["TMIN"], errors="coerce")
tmax = pd.to_numeric(df_2009["TMAX"], errors="coerce")

# Remove NaN
mask_tmed = ~np.isnan(tmed)
mask_tmin = ~np.isnan(tmin)
mask_tmax = ~np.isnan(tmax)

# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------

def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Return simple fit + overfitting predictions."""
    x_masked = x[~np.isnan(y)]
    y_masked = y[~np.isnan(y)]
    # Simple
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)
    # Overfit
    deg = min(complex_deg, len(x_masked)-2)
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)

# TMIN
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)

# TMAX
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# ---------------------------
# 4. PLOT RESULTS
# ---------------------------
plt.figure(figsize=(16,8))

# Banda térmica TMIN–TMAX
plt.fill_between(
    x_full,
    df_2009["TMIN"],
    df_2009["TMAX"],
    color="lightblue",
    alpha=0.3,
    label="Rango diario (TMIN–TMAX)"
)

# --- TMEDIA ---
plt.scatter(x_tmed, y_tmed, s=10, color="blue", label="TMEDIA")
plt.plot(x_tmed, y_tmed_low, "--", linewidth=2, color="blue",
         label="TMEDIA simple fit")
plt.plot(x_tmed, y_tmed_high, linewidth=2, alpha=0.7, color="blue",
         label="TMEDIA overfit")

# --- TMIN ---
plt.scatter(x_tmin, y_tmin, s=10, color="green", alpha=0.7, label="TMIN")
plt.plot(x_tmin, y_tmin_low, "--", linewidth=2, color="green",
         label="TMIN simple fit")
plt.plot(x_tmin, y_tmin_high, linewidth=2, alpha=0.7, color="green",
         label="TMIN overfit")

# --- TMAX ---
plt.scatter(x_tmax, y_tmax, s=10, color="red", alpha=0.7, label="TMAX")
plt.plot(x_tmax, y_tmax_low, "--", linewidth=2, color="red",
         label="TMAX simple fit")
plt.plot(x_tmax, y_tmax_high, linewidth=2, alpha=0.7, color="red",
         label="TMAX overfit")

plt.title("Overfitting demonstration – TMEDIA, TMIN, TMAX (2009)")
plt.xlabel("Day index (2009)")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Index for fitting
x_full = np.arange(len(df_2009))

# Extract series
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
tmin = pd.to_numeric(df_2009["TMIN"], errors="coerce")
tmax = pd.to_numeric(df_2009["TMAX"], errors="coerce")
prec = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Return simple fit + overfitting predictions."""
    x_masked = x[~np.isnan(y)]
    y_masked = y[~np.isnan(y)]
    # Simple
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)
    # Overfitting
    deg = min(complex_deg, len(x_masked)-2)
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)
# TMIN
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)
# TMAX
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# ---------------------------
# 4. PLOT RESULTS + PRECIPITATION
# ---------------------------
fig, ax1 = plt.subplots(figsize=(18,9))

# ----- PRECIPITACIÓN (barras) -----
ax1.bar(
    x_full,
    prec,
    color="gray",
    alpha=0.4,
    label="Precipitación",
)

ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.set_xlabel("Índice de día (2009)")

# ----- Banda térmica -----
ax1.fill_between(
    x_full,
    tmin,
    tmax,
    color="lightblue",
    alpha=0.25,
    label="Banda térmica (TMIN–TMAX)"
)

# ----- TMEDIA -----
ax1.scatter(x_tmed, y_tmed, s=10, color="blue", label="TMEDIA")
ax1.plot(x_tmed, y_tmed_low, "--", linewidth=2, color="blue", label="TMEDIA simple fit")
ax1.plot(x_tmed, y_tmed_high, linewidth=2, alpha=0.8, color="blue", label="TMEDIA overfit")

# ----- TMIN -----
ax1.scatter(x_tmin, y_tmin, s=10, color="green", alpha=0.7, label="TMIN")
ax1.plot(x_tmin, y_tmin_low, "--", linewidth=2, color="green", label="TMIN simple fit")
ax1.plot(x_tmin, y_tmin_high, linewidth=2, alpha=0.8, color="green", label="TMIN overfit")

# ----- TMAX -----
ax1.scatter(x_tmax, y_tmax, s=10, color="red", alpha=0.7, label="TMAX")
ax1.plot(x_tmax, y_tmax_low, "--", linewidth=2, color="red", label="TMAX simple fit")
ax1.plot(x_tmax, y_tmax_high, linewidth=2, alpha=0.8, color="red", label="TMAX overfit")

plt.title("Temperaturas + Precipitación (2009) — TMEDIA, TMIN, TMAX con Overfitting")
plt.grid(True)
plt.legend(loc="upper left", ncol=3)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Index for fitting
x_full = np.arange(len(df_2009))

# Extract series as numeric
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
prec = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT MODELS (TMEDIA + PRECIP)
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Return masked x, y, simple fit, overfit"""
    x_masked = x[~np.isnan(y)]
    y_masked = y[~np.isnan(y)]
    # Simple fit
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)
    # Overfitting
    deg = min(complex_deg, len(x_masked)-2)
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)

# PRECIPITACION
x_prec, y_prec, y_prec_low, y_prec_high = fit_series(x_full, prec)

# ---------------------------
# 4. PLOT TMEDIA + PRECIP
# ---------------------------
fig, ax1 = plt.subplots(figsize=(18,7))

# ----- PRECIPITACIÓN (barras) -----
ax1.bar(
    x_prec,
    y_prec,
    color="gray",
    alpha=0.4,
    label="Precipitación (mm)"
)

# Fitting precipitación
ax1.plot(
    x_prec, y_prec_low,
    "--", color="black",
    linewidth=2,
    label="Prec.: simple fit"
)
ax1.plot(
    x_prec, y_prec_high,
    color="black",
    linewidth=2,
    alpha=0.8,
    label="Prec.: overfit"
)

# ----- TMEDIA -----
ax1.scatter(x_tmed, y_tmed, s=12, color="blue", label="TMEDIA")
ax1.plot(x_tmed, y_tmed_low, "--", color="blue", linewidth=2, label="TMEDIA simple fit")
ax1.plot(x_tmed, y_tmed_high, color="blue", linewidth=2, alpha=0.8, label="TMEDIA overfit")

ax1.set_title("TMEDIA + Precipitación (2009) con Underfitting y Overfitting")
ax1.set_xlabel("Índice de día (2009)")
ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.grid(True)
ax1.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
### Year 2009 vs 2024
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Function to process and plot a given year
def plot_year_fit(year, simple_deg=2, complex_deg=25):

    # ---------------------------
    # FILTER YEAR
    # ---------------------------
    df_year = df[df["FECHA"].dt.year == year].copy()
    df_year = df_year.sort_values("FECHA")

    # Extract TMEDIA
    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    x = np.arange(len(y))

    # Clean NaN
    mask = ~np.isnan(y)
    x = x[mask]
    y = y[mask]

    # ---------------------------
    # FIT MODELS
    # ---------------------------
    # Simple model
    coefs_low = Polynomial.fit(x, y, deg=simple_deg)
    y_pred_low = coefs_low(x)

    # Complex model (overfitting)
    deg = min(complex_deg, len(x)-2)  # avoid numerical crash
    coefs_high = Polynomial.fit(x, y, deg=deg)
    y_pred_high = coefs_high(x)

    # ---------------------------
    # PLOT
    # ---------------------------
    plt.figure(figsize=(14,6))
    plt.scatter(x, y, s=10, label=f"Real Data ({year})", alpha=0.7)
    plt.plot(x, y_pred_low, linewidth=2, linestyle='--',
             label=f"Simple fit (Degree {simple_deg})")
    plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
             label=f"Overfitting (Degree {deg})")

    plt.title(f"Overfitting demonstration – Temperature {year}")
    plt.xlabel(f"Day index ({year})")
    plt.ylabel("Temperature (°C)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# ---------------------------
# 2. PLOT YEARS
# ---------------------------
plot_year_fit(2009)
plot_year_fit(2024)
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FUNCTION TO FIT MODELS
# ---------------------------
def fit_models(df_year, simple_deg=2, complex_deg=25):

    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    x = np.arange(len(y))

    mask = ~np.isnan(y)
    x = x[mask]
    y = y[mask]

    # Simple model
    coefs_low = Polynomial.fit(x, y, deg=simple_deg)
    y_low = coefs_low(x)

    # Complex model (overfitting)
    deg = min(complex_deg, len(x)-2)
    coefs_high = Polynomial.fit(x, y, deg=deg)
    y_high = coefs_high(x)

    return x, y, y_low, y_high

# ---------------------------
# 3. PREPARE YEARS 2009 & 2024
# ---------------------------
years = [2009, 2024]
data = {}

for yr in years:
    df_y = df[df["FECHA"].dt.year == yr].copy().sort_values("FECHA")
    x, y, y_low, y_high = fit_models(df_y)
    data[yr] = (x, y, y_low, y_high)

# ---------------------------
# 4. PLOT EVERYTHING TOGETHER
# ---------------------------
plt.figure(figsize=(16,7))

# Real data
plt.scatter(data[2009][0], data[2009][1], s=12, alpha=0.6, label="Real 2009")
plt.scatter(data[2024][0], data[2024][1], s=12, alpha=0.6, label="Real 2024")

# Simple fits
plt.plot(data[2009][0], data[2009][2], linewidth=2, linestyle='--',
         label="Simple fit 2009 (deg=2)")
plt.plot(data[2024][0], data[2024][2], linewidth=2, linestyle='--',
         label="Simple fit 2024 (deg=2)")

# Overfitting fits
plt.plot(data[2009][0], data[2009][3], linewidth=2, alpha=0.8,
         label="Overfitting 2009 (deg≈25)")
plt.plot(data[2024][0], data[2024][3], linewidth=2, alpha=0.8,
         label="Overfitting 2024 (deg≈25)")

plt.title("Overfitting vs Underfitting – TMEDIA (2009 & 2024)")
plt.xlabel("Day index")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")

# ---------------------------
# FUNCTION: prepare normalized X axis (all months equal)
# ---------------------------
def prepare_year_normalized(year):
    df_y = df[df["FECHA"].dt.year == year].copy()
    df_y = df_y.dropna(subset=["TMEDIA"]).sort_values("FECHA")

    # month number: 1..12
    month = df_y["FECHA"].dt.month

    # day of month: 1..31
    day = df_y["FECHA"].dt.day

    # total days in that month
    days_in_month = df_y["FECHA"].dt.days_in_month

    # Normalized X: month + fraction of month
    x = month + (day - 1) / days_in_month

    y = df_y["TMEDIA"].values
    return x.values, y

# ---------------------------
# 2. Get normalized daily data
# ---------------------------
x09, y09 = prepare_year_normalized(2009)
x24, y24 = prepare_year_normalized(2024)

# ---------------------------
# 3. Fit simple and complex model
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    low = Polynomial.fit(x, y, deg=simple_deg)(x)
    deg = min(complex_deg, len(x)-2)
    high = Polynomial.fit(x, y, deg=deg)(x)
    return low, high

y09_low, y09_high = fit(x09, y09)
y24_low, y24_high = fit(x24, y24)

# ---------------------------
# 4. Plot
# ---------------------------
plt.figure(figsize=(18,7))

# real data
plt.plot(x09, y09, ".", alpha=0.4, label="Real 2009")
plt.plot(x24, y24, ".", alpha=0.4, label="Real 2024")

# simple fits
plt.plot(x09, y09_low, "--", linewidth=2, label="Simple fit 2009")
plt.plot(x24, y24_low, "--", linewidth=2, label="Simple fit 2024")

# overfitting fits
plt.plot(x09, y09_high, linewidth=2, label="Overfitting 2009")
plt.plot(x24, y24_high, linewidth=2, label="Overfitting 2024")

# X ticks: 12 months equally spaced
month_ticks = np.arange(1, 13)
month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
plt.xticks(month_ticks + 0.5, month_labels)  # center labels

plt.title("TMEDIA – Over/Underfitting (2009 vs 2024) – Months equally sized")
plt.ylabel("Temperatura (°C)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")
df_2024 = df[df["FECHA"].dt.year == 2024].copy().sort_values("FECHA")

# Index
x_2009 = np.arange(len(df_2009))
x_2024 = np.arange(len(df_2024))

# Extract numeric series
tmed_2009 = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
prec_2009 = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

tmed_2024 = pd.to_numeric(df_2024["TMEDIA"], errors="coerce")
prec_2024 = pd.to_numeric(df_2024["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    x_masked = x[~np.isnan(y)]
    y_masked = y[~np.isnan(y)]
    # Simple fit
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)
    # Overfit
    deg = min(complex_deg, len(x_masked) - 2)
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# Fittings 2009
x_tmed_2009, y_tmed_2009, y_tmed_2009_low, y_tmed_2009_high = fit_series(x_2009, tmed_2009)
x_prec_2009, y_prec_2009, y_prec_2009_low, y_prec_2009_high = fit_series(x_2009, prec_2009)

# Fittings 2024
x_tmed_2024, y_tmed_2024, y_tmed_2024_low, y_tmed_2024_high = fit_series(x_2024, tmed_2024)
x_prec_2024, y_prec_2024, y_prec_2024_low, y_prec_2024_high = fit_series(x_2024, prec_2024)

# ---------------------------
# 4. PLOT COMPARISON
# ---------------------------
plt.figure(figsize=(18,10))

# -------- PRECIP 2009 --------
plt.bar(
    x_prec_2009,
    y_prec_2009,
    color="gray",
    alpha=0.3,
    label="Prec 2009"
)

# Fitting precipitation (2009)
plt.plot(x_prec_2009, y_prec_2009_low, "--", color="black", label="Prec 2009 simple fit")
plt.plot(x_prec_2009, y_prec_2009_high, color="black", alpha=0.7, label="Prec 2009 overfit")

# -------- PRECIP 2024 --------
plt.bar(
    x_prec_2024,
    y_prec_2024,
    color="orange",
    alpha=0.3,
    label="Prec 2024"
)

plt.plot(x_prec_2024, y_prec_2024_low, "--", color="orange", label="Prec 2024 simple fit")
plt.plot(x_prec_2024, y_prec_2024_high, color="orange", alpha=0.7, label="Prec 2024 overfit")

# -------- TMEDIA 2009 --------
plt.scatter(x_tmed_2009, y_tmed_2009, s=12, color="blue", label="TMEDIA 2009")
plt.plot(x_tmed_2009, y_tmed_2009_low, "--", color="blue", linewidth=2, label="TMEDIA 2009 simple fit")
plt.plot(x_tmed_2009, y_tmed_2009_high, color="blue", linewidth=2, alpha=0.8, label="TMEDIA 2009 overfit")

# -------- TMEDIA 2024 --------
plt.scatter(x_tmed_2024, y_tmed_2024, s=12, color="red", label="TMEDIA 2024")
plt.plot(x_tmed_2024, y_tmed_2024_low, "--", color="red", linewidth=2, label="TMEDIA 2024 simple fit")
plt.plot(x_tmed_2024, y_tmed_2024_high, color="red", linewidth=2, alpha=0.8, label="TMEDIA 2024 overfit")

plt.title("Comparativa TMEDIA + Precipitación (2009 vs 2024) con Underfitting y Overfitting")
plt.xlabel("Índice de día")
plt.ylabel("Temperatura (°C) / Precipitación (mm)")
plt.grid(True)
plt.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")
df_2024 = df[df["FECHA"].dt.year == 2024].copy().sort_values("FECHA")

# Index for each year
x_2009 = np.arange(len(df_2009))
x_2024 = np.arange(len(df_2024))

# Precipitation as numeric
prec_2009 = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")
prec_2024 = pd.to_numeric(df_2024["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    x_masked = x[~np.isnan(y)]
    y_masked = y[~np.isnan(y)]
    # Simple fit
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)
    # Overfit
    deg = min(complex_deg, len(x_masked)-2)
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# Fittings
x_p2009, y_p2009, y_p2009_low, y_p2009_high = fit_series(x_2009, prec_2009)
x_p2024, y_p2024, y_p2024_low, y_p2024_high = fit_series(x_2024, prec_2024)

# ---------------------------
# 4. PLOT PRECIP ONLY
# ---------------------------
plt.figure(figsize=(18,7))

# ----- PRECIP 2009 -----
plt.bar(x_p2009, y_p2009, color="gray", alpha=0.4, label="Precipitación 2009")
plt.plot(x_p2009, y_p2009_low, "--", color="black", linewidth=2, label="2009 simple fit")
plt.plot(x_p2009, y_p2009_high, color="black", linewidth=2, alpha=0.8, label="2009 overfit")

# ----- PRECIP 2024 -----
plt.bar(x_p2024, y_p2024, color="orange", alpha=0.4, label="Precipitación 2024")
plt.plot(x_p2024, y_p2024_low, "--", color="orange", linewidth=2, label="2024 simple fit")
plt.plot(x_p2024, y_p2024_high, color="orange", linewidth=2, alpha=0.8, label="2024 overfit")

plt.title("Comparación de Precipitación – 2009 vs 2024 (simple fit vs overfit)")
plt.xlabel("Índice de día")
plt.ylabel("Precipitación (mm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image

Overfitting vs Fitting by Year¶

This interactive graph compares the daily mean temperature patterns of two selected years by applying two different polynomial models: a simple low-degree fit and a complex high-degree overfitting model. By normalizing the X-axis so that all months have equal width, the visualization highlights seasonal behavior independently of month length or missing data. The comparison reveals how each model captures long-term trends versus noise, and how temperature patterns differ between the two chosen years.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, Dropdown, fixed

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")


# ---------------------------
# 2. FUNCTION: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year):
    df_y = df[df["FECHA"].dt.year == year].copy()
    df_y = df_y.dropna(subset=["TMEDIA"]).sort_values("FECHA")

    month = df_y["FECHA"].dt.month
    day = df_y["FECHA"].dt.day
    days_in_month = df_y["FECHA"].dt.days_in_month

    # Normalized X = month + fraction of month
    x = month + (day - 1) / days_in_month
    y = df_y["TMEDIA"].values
    return x.values, y


# ---------------------------
# 3. FIT MODELS
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    low = Polynomial.fit(x, y, deg=simple_deg)(x)
    deg = min(complex_deg, len(x)-2)
    high = Polynomial.fit(x, y, deg=deg)(x)
    return low, high


# ---------------------------
# 4. INTERACTIVE PLOT FUNCTION
# ---------------------------
def plot_comparison(yearA, yearB):
    xA, yA = prepare_year_normalized(yearA)
    xB, yB = prepare_year_normalized(yearB)

    yA_low, yA_high = fit(xA, yA)
    yB_low, yB_high = fit(xB, yB)

    plt.figure(figsize=(18,7))

    # Real data
    plt.plot(xA, yA, ".", alpha=0.4, label=f"Real {yearA}")
    plt.plot(xB, yB, ".", alpha=0.4, label=f"Real {yearB}")

    # Simple fit
    plt.plot(xA, yA_low, "--", linewidth=2, label=f"Simple fit {yearA}")
    plt.plot(xB, yB_low, "--", linewidth=2, label=f"Simple fit {yearB}")

    # Overfit
    plt.plot(xA, yA_high, linewidth=2, label=f"Overfit {yearA}")
    plt.plot(xB, yB_high, linewidth=2, label=f"Overfit {yearB}")

    # X ticks
    month_ticks = np.arange(1, 13)
    month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
    plt.xticks(month_ticks + 0.5, month_labels)

    plt.title(f"Comparación TMEDIA – {yearA} vs {yearB} (Meses igualados)")
    plt.ylabel("Temperatura (°C)")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


# ---------------------------
# 5. INTERACTIVE WIDGET
# ---------------------------
years = list(range(2009, 2025))

interact(
    plot_comparison,
    yearA=Dropdown(options=years, value=2009, description="Año A"),
    yearB=Dropdown(options=years, value=2024, description="Año B")
)
interactive(children=(Dropdown(description='Año A', options=(2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2…
Out[1]:
<function __main__.plot_comparison(yearA, yearB)>

Compare multiple years¶

This interactive graph lets you compare daily temperatures between any years from 2009 to 2024. All months have the same width on the X-axis, making seasonal patterns easier to see. You can display real data or apply simple and complex polynomial fits to explore how models capture trends or overfit noise. This tool helps visualize climate variations across years and demonstrates key Data Science concepts like seasonality, model complexity, and overfitting.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, SelectMultiple, Dropdown, IntSlider, fixed

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")

# ---------------------------
# Function: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year):
    df_y = df[df["FECHA"].dt.year == year].copy()
    df_y = df_y.dropna(subset=["TMEDIA"]).sort_values("FECHA")

    month = df_y["FECHA"].dt.month
    day   = df_y["FECHA"].dt.day
    days  = df_y["FECHA"].dt.days_in_month

    x = month + (day - 1) / days
    y = df_y["TMEDIA"].values
    return x.values, y

# ---------------------------
# Fit function
# ---------------------------
def fit_curve(x, y, degree):
    degree = min(degree, len(x)-2)
    return Polynomial.fit(x, y, deg=degree)(x)

# ---------------------------
# Interactive plot
# ---------------------------
def plot_years(selected_years, curve_type, degree):

    plt.figure(figsize=(19,8))

    for year in selected_years:
        x, y = prepare_year_normalized(year)

        if curve_type == "Datos reales":
            plt.plot(x, y, ".", markersize=3, alpha=0.5, label=f"{year}")
        elif curve_type == "Ajuste simple (deg 2)":
            y_low = fit_curve(x, y, 2)
            plt.plot(x, y_low, "-", linewidth=2, alpha=0.8, label=f"{year} (fit2)")
        elif curve_type == "Ajuste complejo (deg variable)":
            y_high = fit_curve(x, y, degree)
            plt.plot(x, y_high, "-", linewidth=2, alpha=0.8, label=f"{year} (fit{degree})")

    # X axis: month labels equally spaced
    month_ticks = np.arange(1, 13)
    month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
    plt.xticks(month_ticks + 0.5, month_labels)

    plt.title(f"TMEDIA 2009–2024 · Curva: {curve_type}")
    plt.ylabel("Temperatura (ºC)")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

# ---------------------------
# Create widgets
# ---------------------------
years = list(range(2009, 2025))

interact(
    plot_years,
    selected_years = SelectMultiple(
        options=years,
        value=(2009, 2010, 2024),
        description="Años:"
    ),
    curve_type = Dropdown(
        options=["Datos reales", "Ajuste simple (deg 2)", "Ajuste complejo (deg variable)"],
        value="Datos reales",
        description="Tipo curva:"
    ),
    degree = IntSlider(
        value=15, min=3, max=40, step=1,
        description="Grado (si complejo):"
    )
)
interactive(children=(SelectMultiple(description='Años:', index=(0, 1, 15), options=(2009, 2010, 2011, 2012, 2…
Out[5]:
<function __main__.plot_years(selected_years, curve_type, degree)>

Find out which year has the most daily data¶

In [16]:
import pandas as pd

# Cargar CSV
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Agrupar por año y contar
registros_por_ano = df.groupby(df["FECHA"].dt.year).size().sort_values(ascending=False)

registros_por_ano
Out[16]:
FECHA
2012    366
2016    366
2020    366
2017    365
2018    365
2021    365
2011    364
2013    364
2023    364
2010    363
2019    363
2014    362
2015    361
2009    360
2022    356
2024    345
2025    310
2008     80
dtype: int64
In [ ]:
### 2012 vs 2023
In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS 2012 & 2023
# ---------------------------
df_2012 = df[df["FECHA"].dt.year == 2012].copy().sort_values("FECHA")
df_2023 = df[df["FECHA"].dt.year == 2023].copy().sort_values("FECHA")

x_2012 = np.arange(len(df_2012))
x_2023 = np.arange(len(df_2023))

prec_2012 = pd.to_numeric(df_2012["PRECIPITACION"], errors="coerce")
prec_2023 = pd.to_numeric(df_2023["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    x_mask = x[~np.isnan(y)]
    y_mask = y[~np.isnan(y)]
    # Simple fit
    coefs_low = Polynomial.fit(x_mask, y_mask, deg=simple_deg)
    y_low = coefs_low(x_mask)
    # Overfit
    deg = min(complex_deg, len(x_mask) - 2)
    coefs_high = Polynomial.fit(x_mask, y_mask, deg=deg)
    y_high = coefs_high(x_mask)
    return x_mask, y_mask, y_low, y_high

# Apply fitting to both years
x_p2012, y_p2012, y_p2012_low, y_p2012_high = fit_series(x_2012, prec_2012)
x_p2023, y_p2023, y_p2023_low, y_p2023_high = fit_series(x_2023, prec_2023)

# ---------------------------
# 4. PLOT PRECIPITATION ONLY
# ---------------------------
plt.figure(figsize=(18,7))

# ----- PRECIP 2012 -----
plt.bar(x_p2012, y_p2012, color="gray", alpha=0.4, label="Precipitación 2012")
plt.plot(x_p2012, y_p2012_low, "--", color="black", linewidth=2, label="2012 simple fit")
plt.plot(x_p2012, y_p2012_high, color="black", linewidth=2, alpha=0.8, label="2012 overfit")

# ----- PRECIP 2023 -----
plt.bar(x_p2023, y_p2023, color="green", alpha=0.3, label="Precipitación 2023")
plt.plot(x_p2023, y_p2023_low, "--", color="green", linewidth=2, label="2023 simple fit")
plt.plot(x_p2023, y_p2023_high, color="green", linewidth=2, alpha=0.8, label="2023 overfit")

plt.title("Comparación de Precipitación – 2012 vs 2023 (simple fit vs overfit)")
plt.xlabel("Índice de día")
plt.ylabel("Precipitación (mm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Variable a comparar
var = "TMEDIA"   # Puedes poner: "TMIN", "TMAX", "PRECIPITACION"

# ------------------------------------------
# 2. AGRUPAR POR AÑO
# ------------------------------------------
df = df.dropna(subset=[var])
df["AÑO"] = df["FECHA"].dt.year

# Diccionario año -> serie normalizada a longitud igual
series = {}

# Encontrar longitud mínima (para alinear)
min_len = df.groupby("AÑO").size().min()

for year, group in df.groupby("AÑO"):
    y = pd.to_numeric(group[var], errors="coerce").dropna().values
    if len(y) >= min_len:
        y = y[:min_len]  # recortar a misma longitud
        series[year] = y

years = sorted(series.keys())

# ------------------------------------------
# 3. CALCULAR DISTANCIA ENTRE CADA PAR DE AÑOS
# ------------------------------------------
best_pair = None
best_distance = float("inf")

for i in range(len(years)):
    for j in range(i+1, len(years)):
        y1 = series[years[i]]
        y2 = series[years[j]]
        
        dist = euclidean(y1, y2)
        
        if dist < best_distance:
            best_distance = dist
            best_pair = (years[i], years[j])

print("The two most similar years in", var, "son:", best_pair)
print("Distance:", best_distance)
The two most similar years in TMEDIA son: (2014, 2015)
Distance: 23.189221634198937
In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

var = "PRECIPITACION"   # rainfall variable

df = df.dropna(subset=[var])
df["YEAR"] = df["FECHA"].dt.year

# ------------------------------------------
# 2. FIND THE YEAR WITH THE MOST RAINFALL
# ------------------------------------------
total_rain_per_year = df.groupby("YEAR")[var].sum()

year_max_rain = total_rain_per_year.idxmax()
max_rain_value = total_rain_per_year.max()

print("--------------------------------------------------------")
print("YEAR WITH THE HIGHEST TOTAL RAINFALL:")
print(f"➡ Year {year_max_rain} with {max_rain_value:.2f} mm")
print("--------------------------------------------------------")

# ------------------------------------------
# 3. PREPARE SERIES FOR SIMILARITY ANALYSIS
# ------------------------------------------
series = {}
min_len = df.groupby("YEAR").size().min()   # align different years

for year, group in df.groupby("YEAR"):
    y = pd.to_numeric(group[var], errors="coerce").dropna().values
    if len(y) >= min_len:
        series[year] = y[:min_len]

years = sorted(series.keys())

# ------------------------------------------
# 4. COMPUTE YEAR-TO-YEAR DISTANCES
# ------------------------------------------
best_pair = None
best_distance = float("inf")

for i in range(len(years)):
    for j in range(i+1, len(years)):
        y1 = series[years[i]]
        y2 = series[years[j]]

        dist = euclidean(y1, y2)  # similarity measure

        if dist < best_distance:
            best_distance = dist
            best_pair = (years[i], years[j])

print("Most similar years in PRECIPITATION:", best_pair)
print("Distance (lower = more similar):", best_distance)
print("--------------------------------------------------------")
print("Total annual rainfall (mm):")
print(total_rain_per_year)
print("--------------------------------------------------------")
--------------------------------------------------------
YEAR WITH THE HIGHEST TOTAL RAINFALL:
➡ Year 2023 with 2211.40 mm
--------------------------------------------------------
Most similar years in PRECIPITATION: (2011, 2012)
Distance (lower = more similar): 49.25038070918843
--------------------------------------------------------
Total annual rainfall (mm):
YEAR
2008     217.2
2009    1397.9
2010    1338.8
2011     678.8
2012    1056.0
2013    1316.8
2014    1386.3
2015    1036.2
2016    1868.7
2017    1170.8
2018    1275.1
2019    1725.5
2020    1727.1
2021    1829.8
2022    1272.0
2023    2211.4
2024    1559.0
2025    1604.8
Name: PRECIPITACION, dtype: float64
--------------------------------------------------------

Humedad¶

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Fix column names
df.columns = [c.strip() for c in df.columns]

# Use your real column names
date_col = "UTC"
hum_col = "Hum"

df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
df = df.dropna(subset=[date_col, hum_col])

# Sort by time
df = df.sort_values(date_col)

# ---------------------------
# 2. PREPARE SERIES
# ---------------------------
y = pd.to_numeric(df[hum_col], errors="coerce")
x = np.arange(len(y))

mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# ---------------------------
# 3. FIT MODELS
# ---------------------------

# Simple model (underfitting)
coefs_low = Polynomial.fit(x, y, deg=2)
y_low = coefs_low(x)

# Complex model (overfitting)
deg = min(20, len(x) - 2)   # avoid crash if few points
coefs_high = Polynomial.fit(x, y, deg=deg)
y_high = coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(16,7))

plt.scatter(x, y, s=10, color="blue", alpha=0.7, label="Humidity (real data)")
plt.plot(x, y_low, "--", linewidth=2, color="green", label="Simple fit (deg=2)")
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8, label=f"Overfit (deg={deg})")

plt.title("Humidity with Underfitting and Overfitting")
plt.xlabel("Time index")
plt.ylabel("Humidity (%)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# ---------------------------
# 2. GROUP BY MONTH
# ---------------------------
df["MONTH"] = df["UTC"].dt.month

# Average humidity per month
monthly = df.groupby("MONTH")["Hum"].mean()

# Prepare the series
x = np.arange(1, 13)              # months 1–12
y = monthly.values                # mean humidity per month

# ---------------------------
# 3. FIT MODELS
# ---------------------------

# Simple model (underfitting)
simple_deg = 2
coefs_low = Polynomial.fit(x, y, deg=simple_deg)
y_low = coefs_low(x)

# Overfitting model
complex_deg = min(8, len(x)-2)
coefs_high = Polynomial.fit(x, y, deg=complex_deg)
y_high = coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(14,6))

# Real data
plt.plot(x, y, 'o', color="blue", label="Real humidity (monthly average)")

# Simple fit
plt.plot(x, y_low, '--', linewidth=2, color="green",
         label=f"Simple fit (deg={simple_deg})")

# Overfit
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8,
         label=f"Overfit (deg={complex_deg})")

plt.title("Monthly Humidity with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1, 13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Extract year and month
df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# ---------------------------
# 2. FILTER YEARS 2020–2024
# ---------------------------
df = df[(df["YEAR"] >= 2020) & (df["YEAR"] <= 2024)]

# Group into a table: rows=months, columns=years
monthly_year = df.groupby(["YEAR", "MONTH"])["Hum"].mean().unstack(level=0)

# ---------------------------
# 3. PLOT — Humidity + Fitting
# ---------------------------
plt.figure(figsize=(18,9))

years = sorted(monthly_year.columns.dropna().tolist())
months = np.arange(1, 13)

for year in years:
    y = monthly_year[year].values
    x = months
    
    # Remove NaN
    mask = ~np.isnan(y)
    x_clean = x[mask]
    y_clean = y[mask]
    
    if len(x_clean) < 3:
        continue  # not enough data to fit
    
    # Fitting simple (underfitting)
    deg_low = 2
    coef_low = Polynomial.fit(x_clean, y_clean, deg=deg_low)
    y_low = coef_low(x_clean)

    # Fitting complex (overfitting)
    deg_high = min(8, len(x_clean) - 2)
    coef_high = Polynomial.fit(x_clean, y_clean, deg=deg_high)
    y_high = coef_high(x_clean)
    
    # Plot real data
    plt.plot(x_clean, y_clean, marker="o", linewidth=2, label=f"{year} real")
    
    # Simple fit
    plt.plot(x_clean, y_low, "--", linewidth=2, label=f"{year} simple fit")
    
    # Overfit
    plt.plot(x_clean, y_high, linewidth=2, alpha=0.7, label=f"{year} overfit")

plt.title("Monthly Humidity (2019–2025) with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(months)
plt.grid(True)
plt.legend(ncol=3, fontsize=9)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Prepare data
df = df.sort_values("UTC")
y = df["Hum"].values
X = np.arange(len(y)).reshape(-1, 1)  # simple time index

# ---------------------------
# 2. CROSS-VALIDATION SETUP
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

degrees = range(1, 20)  # test polynomial degrees 1 to 19
cv_errors = []

# ---------------------------
# 3. RUN CROSS-VALIDATION FOR EACH DEGREE
# ---------------------------
for deg in degrees:
    fold_errors = []
    poly = PolynomialFeatures(degree=deg)
    
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Transform features
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)

        # Fit model
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        # Predict + calculate error
        y_pred = model.predict(X_test_poly)
        error = mean_squared_error(y_test, y_pred)
        fold_errors.append(error)

    # Average CV error for this degree
    cv_errors.append(np.mean(fold_errors))

# ---------------------------
# 4. PLOT CROSS-VALIDATION RESULTS
# ---------------------------
plt.figure(figsize=(12,6))
plt.plot(degrees, cv_errors, marker="o", linewidth=2)
plt.xlabel("Polynomial Degree")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Curve for Humidity Fitting")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year

# Years to evaluate
years = [2020, 2021, 2022, 2023, 2024]

# Polynomial degrees
degrees = range(1, 12)

# Store results
errors_by_year = {}

# ---------------------------
# 2. CROSS-VALIDATION PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 10:
        print(f"Skipping {year}: not enough data")
        continue

    y = df_y["Hum"].values
    X = np.arange(len(y)).reshape(-1, 1)

    year_errors = []

    for deg in degrees:
        fold_errors = []
        poly = PolynomialFeatures(degree=deg)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train_poly = poly.fit_transform(X_train)
            X_test_poly = poly.transform(X_test)

            model = LinearRegression()
            model.fit(X_train_poly, y_train)

            y_pred = model.predict(X_test_poly)
            fold_errors.append(mean_squared_error(y_test, y_pred))

        year_errors.append(np.mean(fold_errors))

    errors_by_year[year] = year_errors

# ---------------------------
# 3. PLOT CROSS-VALIDATION CURVES
# ---------------------------
plt.figure(figsize=(14,7))

for year in errors_by_year:
    plt.plot(degrees, errors_by_year[year], marker="o", linewidth=2, label=str(year))

plt.title("Cross-Validation Error (Humidity) — 2020 to 2024")
plt.xlabel("Polynomial Degree")
plt.ylabel("MSE (Cross-Validation Error)")
plt.grid(True)
plt.legend(title="Year")
plt.tight_layout()
plt.show()

# ---------------------------
# 4. SHOW ERRORS IN A TABLE
# ---------------------------
error_table = pd.DataFrame(errors_by_year, index=degrees)
error_table.index.name = "Degree"
error_table
No description has been provided for this image
Out[13]:
2020 2021 2022 2023 2024
Degree
1 216.068824 218.752229 205.806426 223.386426 121.662982
2 199.123679 206.903699 197.047975 206.637540 120.667957
3 199.019093 201.704614 196.737505 206.017819 120.021651
4 197.902190 207.015701 196.320941 207.876785 116.337365
5 197.509293 210.007744 195.493466 209.290042 116.981801
6 199.035569 210.817214 194.980695 209.993881 116.845109
7 201.411285 210.744167 195.175063 210.322704 116.787237
8 203.592694 210.467845 195.974085 210.498426 116.688301
9 205.158340 210.258218 197.035062 210.616151 116.304289
10 206.100096 210.176259 198.047936 210.704588 119.011331
11 206.568808 210.195144 198.845317 210.768333 119.901257
In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year

# Years to analyze
years = [2020, 2021, 2022, 2023, 2024]

# Regularization strengths (λ)
lambdas = np.logspace(-4, 4, 20)

# Where to store results
reg_errors = {}

# ---------------------------
# 2. REGULARIZATION ANALYSIS PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 20:
        print(f"Skipping {year}: not enough samples")
        continue
    
    y = df_y["Hum"].values
    X = np.arange(len(y)).reshape(-1, 1)
    
    year_errors = []
    
    for lam in lambdas:
        fold_errors = []
        model = Ridge(alpha=lam)
        
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            fold_errors.append(mean_squared_error(y_test, y_pred))
        
        year_errors.append(np.mean(fold_errors))
    
    reg_errors[year] = year_errors

# ---------------------------
# 3. PLOT REGULARIZATION CURVES
# ---------------------------
plt.figure(figsize=(14,7))

for year in reg_errors:
    plt.plot(lambdas, reg_errors[year], marker="o", linewidth=2, label=str(year))

plt.xscale("log")
plt.xlabel("Regularization strength (lambda)")
plt.ylabel("Validation Error (MSE)")
plt.title("Regularization Curve per Year (Humidity) — Ridge Regression")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# Years you want to analyze
years = sorted(df["YEAR"].unique())

# Regularization strengths (lambda)
lambdas = np.logspace(-4, 4, 15)

# ---------------------------
# 2. REGULARIZATION PER YEAR
# ---------------------------
best_models = {}   # to store best model prediction per year

for year in years:
    df_y = df[df["YEAR"] == year].copy()
    if len(df_y) < 30:
        print(f"Skipping {year}: not enough data")
        continue
    
    # Daily humidity
    y = df_y["Hum"].values
    X = np.arange(len(y)).reshape(-1, 1)  # simple time index
    
    # K-fold cross-validation setup
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Evaluate each lambda
    mse_list = []
    models = []
    
    for lam in lambdas:
        fold_errors = []
        model = Ridge(alpha=lam)
        
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            fold_errors.append(mean_squared_error(y_test, y_pred))
        
        mse_list.append(np.mean(fold_errors))
        models.append(model)
    
    # Pick best lambda
    best_idx = np.argmin(mse_list)
    best_lambda = lambdas[best_idx]
    best_model = models[best_idx]

    # Predict using the BEST model
    df_y["Pred"] = best_model.predict(X)
    best_models[year] = df_y


# ---------------------------
# 3. AGGREGATE PREDICTIONS BY MONTH (X-axis = months)
# ---------------------------
plt.figure(figsize=(16,8))

for year, df_y in best_models.items():
    monthly_pred = df_y.groupby("MONTH")["Pred"].mean()
    
    plt.plot(
        monthly_pred.index,
        monthly_pred.values,
        marker="o",
        linewidth=2,
        label=f"{year} (best λ)"
    )

plt.title("Monthly Humidity (Daily Fitting + Ridge Regularization)\nBest Regularized Model for Each Year")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1,13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: