import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
# Keep only the 2009 rows (as an independent copy) in chronological order.
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Series to fit: the "TMEDIA" column (any other numeric column could be used);
# entries that cannot be parsed as numbers become NaN.
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
# Fit abscissa: the row position 0..n-1, not a calendar day-of-year.
x = np.arange(len(y))

# Drop the NaN positions from both arrays so x and y stay aligned.
mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# ---------------------------
# 3. FIT TWO MODELS
# ---------------------------

# A) Underfitting: very simple model (degree 2)
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Overfitting: too many parameters (degree 25)
# WARNING: high degree = noisy overfit
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# ---------------------------
# 4. PLOT RESULTS
# ---------------------------
plt.figure(figsize=(14,6))

plt.scatter(x, y, s=10, label="Real Data (2009)", alpha=0.7)

plt.plot(x, y_pred_low, linewidth=2, label="Low-degree fit (Underfitting)", linestyle='--')
plt.plot(x, y_pred_high, linewidth=2, label="High-degree fit (Overfitting)", alpha=0.8)

plt.title("Overfitting demonstration – Temperature 2009")
plt.xlabel("Day index (2009)")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Use temperature: TMED or any parameter you want
y = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
x = np.arange(len(y))  # simple numeric index for fitting

mask = ~np.isnan(y)
x = x[mask]
y = y[mask]

# ---------------------------
# 3. FIT TWO MODELS
# ---------------------------

# A) Underfitting: very simple model (degree 2)
coefs_low = Polynomial.fit(x, y, deg=2)
y_pred_low = coefs_low(x)

# B) Overfitting: too many parameters (degree 25)
# WARNING: high degree = noisy overfit
coefs_high = Polynomial.fit(x, y, deg=25)
y_pred_high = coefs_high(x)

# ---------------------------
# 4. PLOT RESULTS + TMIN/TMAX
# ---------------------------
plt.figure(figsize=(14,6))

# Thermal band TMIN-TMAX. The bounds are coerced to numeric exactly like
# TMEDIA above, so an unparseable cell becomes NaN instead of reaching
# matplotlib as a raw object value.
plt.fill_between(
    np.arange(len(df_2009)),
    pd.to_numeric(df_2009["TMIN"], errors="coerce"),
    pd.to_numeric(df_2009["TMAX"], errors="coerce"),
    color="lightblue",
    alpha=0.4,
    label="Rango diario (TMIN – TMAX)"
)

# Scatter of TMEDIA (valid values only)
plt.scatter(x, y, s=10, label="TMEDIA (2009)", alpha=0.8, color="blue")

# Simple fit
plt.plot(x, y_pred_low, linewidth=2, linestyle="--",
         label="Low-degree fit (Underfitting)")

# Complex fit (overfitting)
plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
         label="High-degree fit (Overfitting)")

plt.title("Temperaturas 2009: Banda térmica (TMIN–TMAX) + Overfitting TMEDIA")
plt.xlabel("Índice de día (2009)")
plt.ylabel("Temperatura (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df.head()

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Row position 0..n-1 over the 2009 rows, used as the fit abscissa.
x_full = np.arange(len(df_2009))

# Extract each series as numeric; entries that cannot be parsed become NaN.
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
tmin = pd.to_numeric(df_2009["TMIN"], errors="coerce")
tmax = pd.to_numeric(df_2009["TMAX"], errors="coerce")

# Validity masks (True where a value is present). Nothing is removed here;
# these only flag the NaN positions. No other visible code reads them:
# fit_series() recomputes the same mask from the series it receives.
mask_tmed = ~np.isnan(tmed)
mask_tmin = ~np.isnan(tmin)
mask_tmax = ~np.isnan(tmax)

# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------

def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Fit a low- and a high-degree polynomial to the non-NaN part of a series.

    Parameters
    ----------
    x : numpy.ndarray
        Abscissa values, same length as ``y``.
    y : array-like of float
        Observations; NaN entries are dropped together with their ``x``.
    simple_deg : int
        Degree of the deliberately simple (underfitting) polynomial.
    complex_deg : int
        Requested degree of the deliberately complex (overfitting)
        polynomial; capped at ``number of valid points - 2``, never below 0.

    Returns
    -------
    tuple
        ``(x_masked, y_masked, y_low, y_high)``: the valid points and both
        fitted curves evaluated at ``x_masked``. All four are empty when the
        series holds no valid value.
    """
    valid = ~np.isnan(y)
    x_masked = x[valid]
    y_masked = y[valid]

    # Nothing to fit: Polynomial.fit fails on empty input, so hand back empty
    # curves that the plotting code can still consume.
    if len(x_masked) == 0:
        return x_masked, y_masked, np.empty(0), np.empty(0)

    # Simple
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfit: cap the degree by the sample size; the floor at 0 keeps the
    # degree valid when fewer than two points survive the mask.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)

# TMIN
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)

# TMAX
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# ---------------------------
# 4. PLOT RESULTS
# ---------------------------
plt.figure(figsize=(16,8))

# Thermal band TMIN-TMAX, drawn from the numeric series extracted above
# (tmin / tmax) rather than the raw columns, so unparseable cells are NaN.
plt.fill_between(
    x_full,
    tmin,
    tmax,
    color="lightblue",
    alpha=0.3,
    label="Rango diario (TMIN–TMAX)"
)

# --- TMEDIA ---
plt.scatter(x_tmed, y_tmed, s=10, color="blue", label="TMEDIA")
plt.plot(x_tmed, y_tmed_low, "--", linewidth=2, color="blue",
         label="TMEDIA simple fit")
plt.plot(x_tmed, y_tmed_high, linewidth=2, alpha=0.7, color="blue",
         label="TMEDIA overfit")

# --- TMIN ---
plt.scatter(x_tmin, y_tmin, s=10, color="green", alpha=0.7, label="TMIN")
plt.plot(x_tmin, y_tmin_low, "--", linewidth=2, color="green",
         label="TMIN simple fit")
plt.plot(x_tmin, y_tmin_high, linewidth=2, alpha=0.7, color="green",
         label="TMIN overfit")

# --- TMAX ---
plt.scatter(x_tmax, y_tmax, s=10, color="red", alpha=0.7, label="TMAX")
plt.plot(x_tmax, y_tmax_low, "--", linewidth=2, color="red",
         label="TMAX simple fit")
plt.plot(x_tmax, y_tmax_high, linewidth=2, alpha=0.7, color="red",
         label="TMAX overfit")

plt.title("Overfitting demonstration – TMEDIA, TMIN, TMAX (2009)")
plt.xlabel("Day index (2009)")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Index for fitting
x_full = np.arange(len(df_2009))

# Extract series
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
tmin = pd.to_numeric(df_2009["TMIN"], errors="coerce")
tmax = pd.to_numeric(df_2009["TMAX"], errors="coerce")
prec = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT MODELS FOR EACH SERIES
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=25):
    """Return the valid points of a series plus a simple and an overfit curve.

    NaN entries of ``y`` are dropped together with the matching ``x``. A
    polynomial of degree ``simple_deg`` and one of degree ``complex_deg``
    (capped at ``valid points - 2``, never below 0) are fitted to what
    remains and evaluated at the surviving ``x``.

    Returns ``(x_masked, y_masked, y_low, y_high)``; all four are empty when
    ``y`` holds no valid value.
    """
    keep = ~np.isnan(y)
    x_masked = x[keep]
    y_masked = y[keep]

    # An all-NaN series cannot be fitted; return empty curves instead of
    # letting Polynomial.fit fail on empty input.
    if len(x_masked) == 0:
        return x_masked, y_masked, np.empty(0), np.empty(0)

    # Simple
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)

    # Overfitting: the max(0, ...) floor prevents a negative degree when
    # fewer than two valid points are available.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)
# TMIN
x_tmin, y_tmin, y_tmin_low, y_tmin_high = fit_series(x_full, tmin)
# TMAX
x_tmax, y_tmax, y_tmax_low, y_tmax_high = fit_series(x_full, tmax)

# ---------------------------
# 4. PLOT RESULTS + PRECIPITATION
# ---------------------------
fig, ax1 = plt.subplots(figsize=(18,9))

# ----- PRECIPITACIÓN (barras) -----
ax1.bar(
    x_full,
    prec,
    color="gray",
    alpha=0.4,
    label="Precipitación",
)

ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.set_xlabel("Índice de día (2009)")

# ----- Banda térmica -----
ax1.fill_between(
    x_full,
    tmin,
    tmax,
    color="lightblue",
    alpha=0.25,
    label="Banda térmica (TMIN–TMAX)"
)

# ----- TMEDIA -----
ax1.scatter(x_tmed, y_tmed, s=10, color="blue", label="TMEDIA")
ax1.plot(x_tmed, y_tmed_low, "--", linewidth=2, color="blue", label="TMEDIA simple fit")
ax1.plot(x_tmed, y_tmed_high, linewidth=2, alpha=0.8, color="blue", label="TMEDIA overfit")

# ----- TMIN -----
ax1.scatter(x_tmin, y_tmin, s=10, color="green", alpha=0.7, label="TMIN")
ax1.plot(x_tmin, y_tmin_low, "--", linewidth=2, color="green", label="TMIN simple fit")
ax1.plot(x_tmin, y_tmin_high, linewidth=2, alpha=0.8, color="green", label="TMIN overfit")

# ----- TMAX -----
ax1.scatter(x_tmax, y_tmax, s=10, color="red", alpha=0.7, label="TMAX")
ax1.plot(x_tmax, y_tmax_low, "--", linewidth=2, color="red", label="TMAX simple fit")
ax1.plot(x_tmax, y_tmax_high, linewidth=2, alpha=0.8, color="red", label="TMAX overfit")

plt.title("Temperaturas + Precipitación (2009) — TMEDIA, TMIN, TMAX con Overfitting")
plt.grid(True)
plt.legend(loc="upper left", ncol=3)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEAR 2009
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy()
df_2009 = df_2009.sort_values("FECHA")

# Index for fitting
x_full = np.arange(len(df_2009))

# Extract series as numeric
tmed = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
prec = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT MODELS (TMEDIA + PRECIP)
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Return masked x, masked y, the simple fit and the overfit.

    ``x`` and ``y`` must have the same length; positions where ``y`` is NaN
    are removed from both. The simple fit uses degree ``simple_deg``; the
    overfit uses ``complex_deg`` capped at ``valid points - 2`` and floored
    at 0. Both curves are evaluated at the masked ``x``. When no valid value
    exists, four empty sequences are returned.
    """
    valid = ~np.isnan(y)
    x_masked, y_masked = x[valid], y[valid]

    # Guard: Polynomial.fit cannot handle an empty sample.
    if len(x_masked) == 0:
        return x_masked, y_masked, np.empty(0), np.empty(0)

    # Simple fit
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfitting (degree never negative, even with a single valid point)
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)
    return x_masked, y_masked, y_low, y_high

# TMEDIA
x_tmed, y_tmed, y_tmed_low, y_tmed_high = fit_series(x_full, tmed)

# PRECIPITACION
x_prec, y_prec, y_prec_low, y_prec_high = fit_series(x_full, prec)

# ---------------------------
# 4. PLOT TMEDIA + PRECIP
# ---------------------------
fig, ax1 = plt.subplots(figsize=(18,7))

# ----- PRECIPITACIÓN (barras) -----
ax1.bar(
    x_prec,
    y_prec,
    color="gray",
    alpha=0.4,
    label="Precipitación (mm)"
)

# Fitting precipitación
ax1.plot(
    x_prec, y_prec_low,
    "--", color="black",
    linewidth=2,
    label="Prec.: simple fit"
)
ax1.plot(
    x_prec, y_prec_high,
    color="black",
    linewidth=2,
    alpha=0.8,
    label="Prec.: overfit"
)

# ----- TMEDIA -----
ax1.scatter(x_tmed, y_tmed, s=12, color="blue", label="TMEDIA")
ax1.plot(x_tmed, y_tmed_low, "--", color="blue", linewidth=2, label="TMEDIA simple fit")
ax1.plot(x_tmed, y_tmed_high, color="blue", linewidth=2, alpha=0.8, label="TMEDIA overfit")

ax1.set_title("TMEDIA + Precipitación (2009) con Underfitting y Overfitting")
ax1.set_xlabel("Índice de día (2009)")
ax1.set_ylabel("Temperatura (°C) / Precipitación (mm)")
ax1.grid(True)
ax1.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()

### Year 2009 vs 2024

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Function to process and plot a given year
def plot_year_fit(year, simple_deg=2, complex_deg=25):
    """Plot one year of TMEDIA with a simple and a high-degree polynomial fit.

    Reads the module-level ``df`` (columns "FECHA" and "TMEDIA"), fits both
    polynomials against the row position within the year and shows a
    matplotlib figure. Nothing is returned.

    Parameters
    ----------
    year : int
        Calendar year to select from ``df["FECHA"]``.
    simple_deg : int
        Degree of the simple (underfitting) polynomial.
    complex_deg : int
        Requested degree of the complex (overfitting) polynomial; capped at
        ``valid points - 2`` and never below 0.

    Raises
    ------
    ValueError
        If the year has no numeric TMEDIA value to fit.
    """
    # ---------------------------
    # FILTER YEAR
    # ---------------------------
    df_year = df[df["FECHA"].dt.year == year].copy()
    df_year = df_year.sort_values("FECHA")

    # Extract TMEDIA; unparseable entries become NaN
    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    x = np.arange(len(y))

    # Clean NaN
    mask = ~np.isnan(y)
    x = x[mask]
    y = y[mask]

    # Fail early with a readable message instead of an opaque numpy error
    # raised from inside Polynomial.fit.
    if len(x) == 0:
        raise ValueError(f"No valid TMEDIA data to fit for year {year}")

    # ---------------------------
    # FIT MODELS
    # ---------------------------
    # Simple model
    coefs_low = Polynomial.fit(x, y, deg=simple_deg)
    y_pred_low = coefs_low(x)

    # Complex model (overfitting): degree limited by the sample size and
    # floored at 0 so it can never become negative.
    deg = max(0, min(complex_deg, len(x) - 2))
    coefs_high = Polynomial.fit(x, y, deg=deg)
    y_pred_high = coefs_high(x)

    # ---------------------------
    # PLOT
    # ---------------------------
    plt.figure(figsize=(14,6))
    plt.scatter(x, y, s=10, label=f"Real Data ({year})", alpha=0.7)
    plt.plot(x, y_pred_low, linewidth=2, linestyle='--',
             label=f"Simple fit (Degree {simple_deg})")
    # The label reports the degree actually used, which may be below
    # complex_deg for short series.
    plt.plot(x, y_pred_high, linewidth=2, alpha=0.8,
             label=f"Overfitting (Degree {deg})")

    plt.title(f"Overfitting demonstration – Temperature {year}")
    plt.xlabel(f"Day index ({year})")
    plt.ylabel("Temperature (°C)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# ---------------------------
# 2. PLOT YEARS
# ---------------------------
plot_year_fit(2009)
plot_year_fit(2024)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA FROM CSV
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FUNCTION TO FIT MODELS
# ---------------------------
def fit_models(df_year, simple_deg=2, complex_deg=25):
    """Fit a simple and an overfitting polynomial to a frame's TMEDIA column.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Frame with a "TMEDIA" column; values are coerced to numeric and
        unparseable entries are dropped.
    simple_deg : int
        Degree of the simple model.
    complex_deg : int
        Requested degree of the complex model; capped at
        ``valid points - 2`` and never below 0.

    Returns
    -------
    tuple
        ``(x, y, y_low, y_high)`` where ``x`` is the row position of each
        valid value, ``y`` the valid values, and ``y_low`` / ``y_high`` the
        two fitted curves evaluated at ``x``. All four are empty when the
        column holds no valid value.
    """
    y = pd.to_numeric(df_year["TMEDIA"], errors="coerce")
    x = np.arange(len(y))

    mask = ~np.isnan(y)
    x = x[mask]
    y = y[mask]

    # No usable value: return empty curves rather than letting
    # Polynomial.fit fail on empty input.
    if len(x) == 0:
        return x, y, np.empty(0), np.empty(0)

    # Simple model
    coefs_low = Polynomial.fit(x, y, deg=simple_deg)
    y_low = coefs_low(x)

    # Complex model (overfitting); the floor at 0 avoids a negative degree
    # when fewer than two valid points exist.
    deg = max(0, min(complex_deg, len(x) - 2))
    coefs_high = Polynomial.fit(x, y, deg=deg)
    y_high = coefs_high(x)

    return x, y, y_low, y_high

# ---------------------------
# 3. PREPARE YEARS 2009 & 2024
# ---------------------------
years = [2009, 2024]
data = {}

# data[year] = (x, y, y_low, y_high) as returned by fit_models()
for yr in years:
    df_y = df[df["FECHA"].dt.year == yr].copy().sort_values("FECHA")
    x, y, y_low, y_high = fit_models(df_y)
    data[yr] = (x, y, y_low, y_high)

# ---------------------------
# 4. PLOT EVERYTHING TOGETHER
# ---------------------------
plt.figure(figsize=(16,7))

# Every layer iterates over `years`, so the figure follows that list instead
# of hard-coded keys. Layers are drawn in the order real data -> simple fits
# -> overfits, which keeps the colour assignment of each curve stable.

# Real data
for yr in years:
    plt.scatter(data[yr][0], data[yr][1], s=12, alpha=0.6, label=f"Real {yr}")

# Simple fits
for yr in years:
    plt.plot(data[yr][0], data[yr][2], linewidth=2, linestyle='--',
             label=f"Simple fit {yr} (deg=2)")

# Overfitting fits
for yr in years:
    plt.plot(data[yr][0], data[yr][3], linewidth=2, alpha=0.8,
             label=f"Overfitting {yr} (deg≈25)")

plt.title("Overfitting vs Underfitting – TMEDIA (2009 & 2024)")
plt.xlabel("Day index")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")

# ---------------------------
# FUNCTION: prepare normalized X axis (all months equal)
# ---------------------------
def prepare_year_normalized(year):
    """Return ``(x, y)`` for one year with every month given the same width.

    Works on the module-level ``df``: rows of ``year`` without a TMEDIA value
    are discarded and the rest are sorted by "FECHA".

    ``x`` is the month number plus the elapsed fraction of that month
    (day 1 -> +0.0), so it lies in [1, 13). ``y`` holds the matching TMEDIA
    values. Both are numpy arrays.
    """
    in_year = df["FECHA"].dt.year == year
    rows = df[in_year].copy().dropna(subset=["TMEDIA"]).sort_values("FECHA")

    when = rows["FECHA"].dt
    # Fraction of the month already elapsed at the start of each day.
    elapsed = (when.day - 1) / when.days_in_month
    position = when.month + elapsed

    return position.values, rows["TMEDIA"].values

# ---------------------------
# 2. Get normalized daily data
# ---------------------------
x09, y09 = prepare_year_normalized(2009)
x24, y24 = prepare_year_normalized(2024)

# ---------------------------
# 3. Fit simple and complex model
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    """Evaluate a simple and a complex polynomial fit of ``y`` over ``x``.

    Parameters
    ----------
    x, y : array-like
        Sample points and values of equal length (no NaN handling here).
    simple_deg : int
        Degree of the simple fit.
    complex_deg : int
        Requested degree of the complex fit; capped at ``len(x) - 2`` and
        never below 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(low, high)``: both fits evaluated at ``x``; two empty arrays when
        ``x`` is empty.
    """
    # Polynomial.fit fails on empty input; return empty curves instead.
    if len(x) == 0:
        return np.empty(0), np.empty(0)

    low = Polynomial.fit(x, y, deg=simple_deg)(x)
    # Floor at 0 so a single-point sample does not produce a negative degree.
    deg = max(0, min(complex_deg, len(x) - 2))
    high = Polynomial.fit(x, y, deg=deg)(x)
    return low, high

y09_low, y09_high = fit(x09, y09)
y24_low, y24_high = fit(x24, y24)

# ---------------------------
# 4. Plot
# ---------------------------
plt.figure(figsize=(18,7))

# real data
plt.plot(x09, y09, ".", alpha=0.4, label="Real 2009")
plt.plot(x24, y24, ".", alpha=0.4, label="Real 2024")

# simple fits
plt.plot(x09, y09_low, "--", linewidth=2, label="Simple fit 2009")
plt.plot(x24, y24_low, "--", linewidth=2, label="Simple fit 2024")

# overfitting fits
plt.plot(x09, y09_high, linewidth=2, label="Overfitting 2009")
plt.plot(x24, y24_high, linewidth=2, label="Overfitting 2024")

# X ticks: 12 months equally spaced
month_ticks = np.arange(1, 13)
month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
plt.xticks(month_ticks + 0.5, month_labels)  # center labels

plt.title("TMEDIA – Over/Underfitting (2009 vs 2024) – Months equally sized")
plt.ylabel("Temperatura (°C)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")
df_2024 = df[df["FECHA"].dt.year == 2024].copy().sort_values("FECHA")

# Index
x_2009 = np.arange(len(df_2009))
x_2024 = np.arange(len(df_2024))

# Extract numeric series
tmed_2009 = pd.to_numeric(df_2009["TMEDIA"], errors="coerce")
prec_2009 = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")

tmed_2024 = pd.to_numeric(df_2024["TMEDIA"], errors="coerce")
prec_2024 = pd.to_numeric(df_2024["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Fit a simple and an overfitting polynomial to the valid part of ``y``.

    Positions where ``y`` is NaN are removed from both ``x`` and ``y``. The
    overfit degree is ``complex_deg`` capped at ``valid points - 2`` and
    floored at 0.

    Returns ``(x_masked, y_masked, y_low, y_high)``, with both curves
    evaluated at ``x_masked``; all four are empty if nothing is valid.
    """
    present = ~np.isnan(y)
    x_masked = x[present]
    y_masked = y[present]

    # Empty sample: skip the fits, which would otherwise raise.
    if len(x_masked) == 0:
        return x_masked, y_masked, np.empty(0), np.empty(0)

    # Simple fit
    coefs_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)
    y_low = coefs_low(x_masked)

    # Overfit (degree kept non-negative for very short samples)
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    coefs_high = Polynomial.fit(x_masked, y_masked, deg=deg)
    y_high = coefs_high(x_masked)
    return x_masked, y_masked, y_low, y_high

# Fittings 2009
x_tmed_2009, y_tmed_2009, y_tmed_2009_low, y_tmed_2009_high = fit_series(x_2009, tmed_2009)
x_prec_2009, y_prec_2009, y_prec_2009_low, y_prec_2009_high = fit_series(x_2009, prec_2009)

# Fittings 2024
x_tmed_2024, y_tmed_2024, y_tmed_2024_low, y_tmed_2024_high = fit_series(x_2024, tmed_2024)
x_prec_2024, y_prec_2024, y_prec_2024_low, y_prec_2024_high = fit_series(x_2024, prec_2024)

# ---------------------------
# 4. PLOT COMPARISON
# ---------------------------
plt.figure(figsize=(18,10))

# -------- PRECIP 2009 --------
plt.bar(
    x_prec_2009,
    y_prec_2009,
    color="gray",
    alpha=0.3,
    label="Prec 2009"
)

# Fitting precipitation (2009)
plt.plot(x_prec_2009, y_prec_2009_low, "--", color="black", label="Prec 2009 simple fit")
plt.plot(x_prec_2009, y_prec_2009_high, color="black", alpha=0.7, label="Prec 2009 overfit")

# -------- PRECIP 2024 --------
plt.bar(
    x_prec_2024,
    y_prec_2024,
    color="orange",
    alpha=0.3,
    label="Prec 2024"
)

plt.plot(x_prec_2024, y_prec_2024_low, "--", color="orange", label="Prec 2024 simple fit")
plt.plot(x_prec_2024, y_prec_2024_high, color="orange", alpha=0.7, label="Prec 2024 overfit")

# -------- TMEDIA 2009 --------
plt.scatter(x_tmed_2009, y_tmed_2009, s=12, color="blue", label="TMEDIA 2009")
plt.plot(x_tmed_2009, y_tmed_2009_low, "--", color="blue", linewidth=2, label="TMEDIA 2009 simple fit")
plt.plot(x_tmed_2009, y_tmed_2009_high, color="blue", linewidth=2, alpha=0.8, label="TMEDIA 2009 overfit")

# -------- TMEDIA 2024 --------
plt.scatter(x_tmed_2024, y_tmed_2024, s=12, color="red", label="TMEDIA 2024")
plt.plot(x_tmed_2024, y_tmed_2024_low, "--", color="red", linewidth=2, label="TMEDIA 2024 simple fit")
plt.plot(x_tmed_2024, y_tmed_2024_high, color="red", linewidth=2, alpha=0.8, label="TMEDIA 2024 overfit")

plt.title("Comparativa TMEDIA + Precipitación (2009 vs 2024) con Underfitting y Overfitting")
plt.xlabel("Índice de día")
plt.ylabel("Temperatura (°C) / Precipitación (mm)")
plt.grid(True)
plt.legend(loc="upper left", ncol=2)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS
# ---------------------------
df_2009 = df[df["FECHA"].dt.year == 2009].copy().sort_values("FECHA")
df_2024 = df[df["FECHA"].dt.year == 2024].copy().sort_values("FECHA")

# Index for each year
x_2009 = np.arange(len(df_2009))
x_2024 = np.arange(len(df_2024))

# Precipitation as numeric
prec_2009 = pd.to_numeric(df_2009["PRECIPITACION"], errors="coerce")
prec_2024 = pd.to_numeric(df_2024["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Return the valid samples of ``y`` with a simple fit and an overfit.

    NaN positions of ``y`` are dropped from ``x`` and ``y`` alike. The
    result is ``(x_masked, y_masked, y_low, y_high)``: the simple fit has
    degree ``simple_deg``, the overfit has degree ``complex_deg`` capped at
    ``valid points - 2`` and never below 0. With no valid sample all four
    values are empty.
    """
    ok = ~np.isnan(y)
    x_masked = x[ok]
    y_masked = y[ok]

    # Without any valid sample there is nothing to fit.
    if len(x_masked) == 0:
        return x_masked, y_masked, np.empty(0), np.empty(0)

    # Simple fit
    y_low = Polynomial.fit(x_masked, y_masked, deg=simple_deg)(x_masked)

    # Overfit: max(0, ...) stops the degree from going negative when only
    # one sample is valid.
    deg = max(0, min(complex_deg, len(x_masked) - 2))
    y_high = Polynomial.fit(x_masked, y_masked, deg=deg)(x_masked)
    return x_masked, y_masked, y_low, y_high

# Fittings
x_p2009, y_p2009, y_p2009_low, y_p2009_high = fit_series(x_2009, prec_2009)
x_p2024, y_p2024, y_p2024_low, y_p2024_high = fit_series(x_2024, prec_2024)

# ---------------------------
# 4. PLOT PRECIP ONLY
# ---------------------------
plt.figure(figsize=(18,7))

# ----- PRECIP 2009 -----
plt.bar(x_p2009, y_p2009, color="gray", alpha=0.4, label="Precipitación 2009")
plt.plot(x_p2009, y_p2009_low, "--", color="black", linewidth=2, label="2009 simple fit")
plt.plot(x_p2009, y_p2009_high, color="black", linewidth=2, alpha=0.8, label="2009 overfit")

# ----- PRECIP 2024 -----
plt.bar(x_p2024, y_p2024, color="orange", alpha=0.4, label="Precipitación 2024")
plt.plot(x_p2024, y_p2024_low, "--", color="orange", linewidth=2, label="2024 simple fit")
plt.plot(x_p2024, y_p2024_high, color="orange", linewidth=2, alpha=0.8, label="2024 overfit")

plt.title("Comparación de Precipitación – 2009 vs 2024 (simple fit vs overfit)")
plt.xlabel("Índice de día")
plt.ylabel("Precipitación (mm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, Dropdown, fixed

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")


# ---------------------------
# 2. FUNCTION: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year):
    """Build the month-normalized x axis and the TMEDIA values for ``year``.

    Selects the rows of the module-level ``df`` whose "FECHA" falls in
    ``year``, drops those without TMEDIA and sorts them by date. Returns a
    pair of numpy arrays ``(x, y)`` where ``x = month + (day - 1) /
    days_in_month`` and ``y`` is TMEDIA.
    """
    df_y = df.loc[df["FECHA"].dt.year == year].copy()
    df_y = df_y.dropna(subset=["TMEDIA"]).sort_values("FECHA")

    fecha = df_y["FECHA"]
    # Share of the month elapsed before each day (0 on the first day).
    month_fraction = (fecha.dt.day - 1) / fecha.dt.days_in_month

    return (fecha.dt.month + month_fraction).values, df_y["TMEDIA"].values


# ---------------------------
# 3. FIT MODELS
# ---------------------------
def fit(x, y, simple_deg=2, complex_deg=25):
    """Return ``(low, high)``: a simple and a complex polynomial fit at ``x``.

    ``low`` uses degree ``simple_deg``. ``high`` uses ``complex_deg`` capped
    at ``len(x) - 2`` and floored at 0. Both are evaluated at the input
    ``x``. For empty input two empty arrays are returned, because
    Polynomial.fit cannot run on an empty sample.
    """
    if len(x) == 0:
        return np.empty(0), np.empty(0)

    simple_poly = Polynomial.fit(x, y, deg=simple_deg)
    # Degree limited by the sample size and kept non-negative.
    deg = max(0, min(complex_deg, len(x) - 2))
    complex_poly = Polynomial.fit(x, y, deg=deg)
    return simple_poly(x), complex_poly(x)


# ---------------------------
# 4. INTERACTIVE PLOT FUNCTION
# ---------------------------
def plot_comparison(yearA, yearB):
    """Overlay TMEDIA of two years with their simple and complex fits.

    For each year the data comes from ``prepare_year_normalized`` and the
    curves from ``fit`` (default degrees). A single matplotlib figure is
    shown; nothing is returned.
    """
    pair = (yearA, yearB)
    points = [prepare_year_normalized(yr) for yr in pair]
    curves = [fit(xs, ys) for xs, ys in points]

    plt.figure(figsize=(18,7))

    # Draw layer by layer (real data, simple fits, overfits) so the plotting
    # order is A, B within each layer.
    for yr, (xs, ys) in zip(pair, points):
        plt.plot(xs, ys, ".", alpha=0.4, label=f"Real {yr}")

    for yr, (xs, _), (low, _) in zip(pair, points, curves):
        plt.plot(xs, low, "--", linewidth=2, label=f"Simple fit {yr}")

    for yr, (xs, _), (_, high) in zip(pair, points, curves):
        plt.plot(xs, high, linewidth=2, label=f"Overfit {yr}")

    # One tick per month, shifted by half a unit to sit mid-month.
    plt.xticks(
        np.arange(1, 13) + 0.5,
        ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"],
    )

    plt.title(f"Comparación TMEDIA – {yearA} vs {yearB} (Meses igualados)")
    plt.ylabel("Temperatura (°C)")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


# ---------------------------
# 5. INTERACTIVE WIDGET
# ---------------------------
years = list(range(2009, 2025))

interact(
    plot_comparison,
    yearA=Dropdown(options=years, value=2009, description="Año A"),
    yearB=Dropdown(options=years, value=2024, description="Año B")
)

# [cell output] interactive(children=(Dropdown(description='Año A', options=(2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2…
#
# [cell output] <function __main__.plot_comparison(yearA, yearB)>

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial
from ipywidgets import interact, SelectMultiple, Dropdown, IntSlider, fixed

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)
df["TMEDIA"] = pd.to_numeric(df["TMEDIA"], errors="coerce")

# ---------------------------
# Function: normalized X axis (months equally sized)
# ---------------------------
def prepare_year_normalized(year):
    """Return month-normalized positions and TMEDIA values for ``year``.

    Takes the rows of the module-level ``df`` dated in ``year``, removes
    those with missing TMEDIA and orders them by "FECHA". The first array
    returned is ``month + (day - 1) / days_in_month`` per row; the second is
    the TMEDIA column.
    """
    selected = df[df["FECHA"].dt.year == year].copy()
    selected = selected.dropna(subset=["TMEDIA"])
    selected = selected.sort_values("FECHA")

    stamps = selected["FECHA"].dt
    x_norm = stamps.month + (stamps.day - 1) / stamps.days_in_month
    y_vals = selected["TMEDIA"].values
    return x_norm.values, y_vals

# ---------------------------
# Fit function
# ---------------------------
def fit_curve(x, y, degree):
    """Fit a polynomial to ``(x, y)`` and return it evaluated at ``x``.

    Parameters
    ----------
    x, y : array-like
        Sample points and values of equal length.
    degree : int
        Requested degree; capped at ``len(x) - 2`` and never below 0.

    Returns
    -------
    numpy.ndarray
        Fitted values at ``x``; an empty array when ``x`` is empty.
    """
    # Polynomial.fit fails on empty input; return an empty curve instead.
    if len(x) == 0:
        return np.empty(0)

    # Floor at 0: len(x) - 2 is negative for a single sample.
    degree = max(0, min(degree, len(x) - 2))
    return Polynomial.fit(x, y, deg=degree)(x)

# ---------------------------
# Interactive plot
# ---------------------------
def plot_years(selected_years, curve_type, degree):
    """Plot TMEDIA for several years on a month-normalized x axis.

    Parameters
    ----------
    selected_years : iterable of int
        Years to draw; each one is loaded with ``prepare_year_normalized``.
    curve_type : str
        "Datos reales" draws the raw points, "Ajuste simple (deg 2)" a
        degree-2 fit, "Ajuste complejo (deg variable)" a fit of the
        requested ``degree``. Any other value draws no curve.
    degree : int
        Requested degree for the complex fit (``fit_curve`` may lower it
        for short series).

    Shows a matplotlib figure; nothing is returned.
    """
    plt.figure(figsize=(19,8))

    for year in selected_years:
        x, y = prepare_year_normalized(year)

        # A year without any valid TMEDIA value has nothing to draw, and a
        # polynomial cannot be fitted to an empty sample.
        if len(x) == 0:
            continue

        if curve_type == "Datos reales":
            plt.plot(x, y, ".", markersize=3, alpha=0.5, label=f"{year}")
        elif curve_type == "Ajuste simple (deg 2)":
            y_low = fit_curve(x, y, 2)
            plt.plot(x, y_low, "-", linewidth=2, alpha=0.8, label=f"{year} (fit2)")
        elif curve_type == "Ajuste complejo (deg variable)":
            y_high = fit_curve(x, y, degree)
            plt.plot(x, y_high, "-", linewidth=2, alpha=0.8, label=f"{year} (fit{degree})")

    # X axis: month labels equally spaced, centred within each month
    month_ticks = np.arange(1, 13)
    month_labels = ["Ene","Feb","Mar","Abr","May","Jun","Jul","Ago","Sep","Oct","Nov","Dic"]
    plt.xticks(month_ticks + 0.5, month_labels)

    plt.title(f"TMEDIA 2009–2024 · Curva: {curve_type}")
    plt.ylabel("Temperatura (ºC)")
    plt.grid(True)
    # Build the legend only when something labelled was drawn (e.g. not when
    # no year is selected), which avoids matplotlib's empty-legend warning.
    handles, _ = plt.gca().get_legend_handles_labels()
    if handles:
        plt.legend()
    plt.tight_layout()
    plt.show()

# ---------------------------
# Create widgets
# ---------------------------
years = list(range(2009, 2025))

interact(
    plot_years,
    selected_years = SelectMultiple(
        options=years,
        value=(2009, 2010, 2024),
        description="Años:"
    ),
    curve_type = Dropdown(
        options=["Datos reales", "Ajuste simple (deg 2)", "Ajuste complejo (deg variable)"],
        value="Datos reales",
        description="Tipo curva:"
    ),
    degree = IntSlider(
        value=15, min=3, max=40, step=1,
        description="Grado (si complejo):"
    )
)

# [cell output] interactive(children=(SelectMultiple(description='Años:', index=(0, 1, 15), options=(2009, 2010, 2011, 2012, 2…
#
# [cell output] <function __main__.plot_years(selected_years, curve_type, degree)>

import pandas as pd

# Cargar CSV
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Agrupar por año y contar
registros_por_ano = df.groupby(df["FECHA"].dt.year).size().sort_values(ascending=False)

registros_por_ano

# [cell output] Rows per year, most complete first:
# FECHA
# 2012    366
# 2016    366
# 2020    366
# 2017    365
# 2018    365
# 2021    365
# 2011    364
# 2013    364
# 2023    364
# 2010    363
# 2019    363
# 2014    362
# 2015    361
# 2009    360
# 2022    356
# 2024    345
# 2025    310
# 2008     80
# dtype: int64

### 2012 vs 2023

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ---------------------------
# 2. FILTER YEARS 2012 & 2023
# ---------------------------
df_2012 = df[df["FECHA"].dt.year == 2012].copy().sort_values("FECHA")
df_2023 = df[df["FECHA"].dt.year == 2023].copy().sort_values("FECHA")

x_2012 = np.arange(len(df_2012))
x_2023 = np.arange(len(df_2023))

prec_2012 = pd.to_numeric(df_2012["PRECIPITACION"], errors="coerce")
prec_2023 = pd.to_numeric(df_2023["PRECIPITACION"], errors="coerce")

# ---------------------------
# 3. FIT FUNCTION
# ---------------------------
def fit_series(x, y, simple_deg=2, complex_deg=20):
    """Fit a low-degree and a high-degree polynomial to the non-NaN samples.

    Args:
        x: Sample positions; must support boolean-mask indexing.
        y: Sample values; entries that are NaN are left out of both fits.
        simple_deg: Degree of the low-order polynomial.
        complex_deg: Requested degree of the high-order polynomial; the
            degree actually used is capped at ``len(valid samples) - 2``.

    Returns:
        A tuple ``(x_valid, y_valid, y_low, y_high)`` holding the retained
        samples and both fitted polynomials evaluated at ``x_valid``.
    """
    valid = ~np.isnan(y)
    x_mask, y_mask = x[valid], y[valid]

    # Low-order polynomial, evaluated on the retained positions.
    low_poly = Polynomial.fit(x_mask, y_mask, deg=simple_deg)
    y_low = low_poly(x_mask)

    # High-order polynomial; the cap keeps the degree below the sample count.
    capped_deg = min(complex_deg, len(x_mask) - 2)
    high_poly = Polynomial.fit(x_mask, y_mask, deg=capped_deg)
    y_high = high_poly(x_mask)

    return x_mask, y_mask, y_low, y_high

# Run both polynomial fits on each year's precipitation series.
x_p2012, y_p2012, y_p2012_low, y_p2012_high = fit_series(x_2012, prec_2012)
x_p2023, y_p2023, y_p2023_low, y_p2023_high = fit_series(x_2023, prec_2023)

# ---------------------------
# 4. PLOT PRECIPITATION ONLY
# ---------------------------
# One entry per year:
# (label, x, observed, low fit, high fit, bar colour, bar alpha, line colour)
capas = (
    ("2012", x_p2012, y_p2012, y_p2012_low, y_p2012_high, "gray", 0.4, "black"),
    ("2023", x_p2023, y_p2023, y_p2023_low, y_p2023_high, "green", 0.3, "green"),
)

plt.figure(figsize=(18, 7))

for etiqueta, xs, ys, ys_low, ys_high, bar_color, bar_alpha, line_color in capas:
    # Observed values as translucent bars, then both fitted curves on top.
    plt.bar(xs, ys, color=bar_color, alpha=bar_alpha,
            label=f"Precipitación {etiqueta}")
    plt.plot(xs, ys_low, "--", color=line_color, linewidth=2,
             label=f"{etiqueta} simple fit")
    plt.plot(xs, ys_high, color=line_color, linewidth=2, alpha=0.8,
             label=f"{etiqueta} overfit")

plt.title("Comparación de Precipitación – 2012 vs 2023 (simple fit vs overfit)")
plt.xlabel("Índice de día")
plt.ylabel("Precipitación (mm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Variable to compare
var = "TMEDIA"   # Alternatives: "TMIN", "TMAX", "PRECIPITACION"

# Two years are only compared when they share at least this many calendar
# days with valid data; this keeps partial years out of the ranking.
MIN_COMMON_DAYS = 300

# ------------------------------------------
# 2. BUILD ONE SERIES PER YEAR, ALIGNED BY CALENDAR DAY
# ------------------------------------------
# Coerce first so that rows with non-numeric values are dropped together with
# the genuinely missing ones.
df[var] = pd.to_numeric(df[var], errors="coerce")
df = df.dropna(subset=[var]).copy()
df["AÑO"] = df["FECHA"].dt.year
df["MES"] = df["FECHA"].dt.month
df["DIA"] = df["FECHA"].dt.day

# Rows = (month, day), columns = year. Aligning on the calendar day replaces
# the previous "cut every year to the shortest one" approach, which compared
# the first rows of each year regardless of the dates they belonged to.
by_day = df.pivot_table(index=["MES", "DIA"], columns="AÑO", values=var, aggfunc="mean")

# Dictionary year -> series indexed by calendar day (NaN where no record)
series = {
    int(year): by_day[year]
    for year in by_day.columns
    if by_day[year].notna().sum() >= MIN_COMMON_DAYS
}

years = sorted(series)

# ------------------------------------------
# 3. COMPUTE THE DISTANCE BETWEEN EVERY PAIR OF YEARS
# ------------------------------------------
best_pair = None
best_distance = float("inf")

for i, year_a in enumerate(years):
    for year_b in years[i + 1:]:
        serie_a, serie_b = series[year_a], series[year_b]

        # Only calendar days present in both years take part.
        both = serie_a.notna() & serie_b.notna()
        n_common = int(both.sum())
        if n_common < MIN_COMMON_DAYS:
            continue

        # Euclidean distance divided by sqrt(n) (i.e. the RMS daily
        # difference), so pairs with fewer shared days are not favoured.
        dist = euclidean(serie_a[both].to_numpy(), serie_b[both].to_numpy()) / np.sqrt(n_common)

        if dist < best_distance:
            best_distance = dist
            best_pair = (year_a, year_b)

print("The two most similar years in", var, "son:", best_pair)
print("Distance:", best_distance)

The two most similar years in TMEDIA son: (2014, 2015)
Distance: 23.189221634198937

import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

# ------------------------------------------
# 1. LOAD DATA
# ------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

var = "PRECIPITACION"   # rainfall variable

# Two years are only compared when they share at least this many calendar
# days with valid data; this keeps partial years out of the ranking.
MIN_COMMON_DAYS = 300

# Coerce before dropping/summing so non-numeric entries cannot reach the
# aggregation below.
df[var] = pd.to_numeric(df[var], errors="coerce")
df = df.dropna(subset=[var]).copy()
df["YEAR"] = df["FECHA"].dt.year
df["MONTH"] = df["FECHA"].dt.month
df["DAY"] = df["FECHA"].dt.day

# ------------------------------------------
# 2. FIND THE YEAR WITH THE MOST RAINFALL
# ------------------------------------------
total_rain_per_year = df.groupby("YEAR")[var].sum()

year_max_rain = total_rain_per_year.idxmax()
max_rain_value = total_rain_per_year.max()

print("--------------------------------------------------------")
print("YEAR WITH THE HIGHEST TOTAL RAINFALL:")
print(f"➡ Year {year_max_rain} with {max_rain_value:.2f} mm")
print("--------------------------------------------------------")

# ------------------------------------------
# 3. PREPARE SERIES FOR SIMILARITY ANALYSIS
# ------------------------------------------
# Rows = (month, day), columns = year. Aligning on the calendar day replaces
# the previous "cut every year to the shortest one" approach, which compared
# the first rows of each year regardless of the dates they belonged to.
by_day = df.pivot_table(index=["MONTH", "DAY"], columns="YEAR", values=var, aggfunc="mean")

series = {
    int(year): by_day[year]
    for year in by_day.columns
    if by_day[year].notna().sum() >= MIN_COMMON_DAYS
}

years = sorted(series)

# ------------------------------------------
# 4. COMPUTE YEAR-TO-YEAR DISTANCES
# ------------------------------------------
best_pair = None
best_distance = float("inf")

for i, year_a in enumerate(years):
    for year_b in years[i + 1:]:
        serie_a, serie_b = series[year_a], series[year_b]

        # Only calendar days present in both years take part.
        both = serie_a.notna() & serie_b.notna()
        n_common = int(both.sum())
        if n_common < MIN_COMMON_DAYS:
            continue

        # Euclidean distance divided by sqrt(n) (i.e. the RMS daily
        # difference), so pairs with fewer shared days are not favoured.
        dist = euclidean(serie_a[both].to_numpy(), serie_b[both].to_numpy()) / np.sqrt(n_common)

        if dist < best_distance:
            best_distance = dist
            best_pair = (year_a, year_b)

print("Most similar years in PRECIPITATION:", best_pair)
print("Distance (lower = more similar):", best_distance)
print("--------------------------------------------------------")
print("Total annual rainfall (mm):")
print(total_rain_per_year)
print("--------------------------------------------------------")

--------------------------------------------------------
YEAR WITH THE HIGHEST TOTAL RAINFALL:
➡ Year 2023 with 2211.40 mm
--------------------------------------------------------
Most similar years in PRECIPITATION: (2011, 2012)
Distance (lower = more similar): 49.25038070918843
--------------------------------------------------------
Total annual rainfall (mm):
YEAR
2008     217.2
2009    1397.9
2010    1338.8
2011     678.8
2012    1056.0
2013    1316.8
2014    1386.3
2015    1036.2
2016    1868.7
2017    1170.8
2018    1275.1
2019    1725.5
2020    1727.1
2021    1829.8
2022    1272.0
2023    2211.4
2024    1559.0
2025    1604.8
Name: PRECIPITACION, dtype: float64
--------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
date_col = "UTC"   # timestamp column
hum_col = "Hum"    # humidity column

df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Strip surrounding whitespace from every header before the columns are used.
df = df.rename(columns=str.strip)

# Unparseable timestamps become NaT; rows lacking a timestamp or a humidity
# value are discarded and the remainder is ordered chronologically.
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
df = df.dropna(subset=[date_col, hum_col]).sort_values(date_col)

# ---------------------------
# 2. PREPARE SERIES
# ---------------------------
# Humidity as numbers (non-numeric entries become NaN) against a positional
# time index; positions whose humidity is NaN are removed from both.
y = pd.to_numeric(df[hum_col], errors="coerce")
mask = ~np.isnan(y)
x = np.arange(len(y))[mask]
y = y[mask]

# ---------------------------
# 3. FIT MODELS
# ---------------------------

# Degree of the complex model: 20, capped at len(x) - 2 when data is scarce.
deg = min(20, len(x) - 2)

# Simple model (underfitting) first, complex model (overfitting) second.
coefs_low = Polynomial.fit(x, y, deg=2)
coefs_high = Polynomial.fit(x, y, deg=deg)
y_low, y_high = coefs_low(x), coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(16, 7))

plt.scatter(x, y, s=10, color="blue", alpha=0.7, label="Humidity (real data)")
plt.plot(x, y_low, "--", linewidth=2, color="green", label="Simple fit (deg=2)")
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8, label=f"Overfit (deg={deg})")

plt.title("Humidity with Underfitting and Overfitting")
plt.xlabel("Time index")
plt.ylabel("Humidity (%)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Unparseable timestamps become NaT and are dropped with the missing values.
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# ---------------------------
# 2. GROUP BY MONTH
# ---------------------------
df["MONTH"] = df["UTC"].dt.month

# Average humidity per month (only months that actually have data appear)
monthly = df.groupby("MONTH")["Hum"].mean()

# Prepare the series. x is taken from the grouped index rather than a fixed
# 1..12 range so that x and y always have the same length and each mean stays
# attached to its own month even when some month has no records.
x = monthly.index.to_numpy()      # months present in the data
y = monthly.values                # mean humidity per month

# ---------------------------
# 3. FIT MODELS
# ---------------------------

# Simple model (underfitting)
simple_deg = 2
coefs_low = Polynomial.fit(x, y, deg=simple_deg)
y_low = coefs_low(x)

# Overfitting model: degree 8, capped by the number of available months and
# never negative (a negative degree is rejected by Polynomial.fit).
complex_deg = max(0, min(8, len(x) - 2))
coefs_high = Polynomial.fit(x, y, deg=complex_deg)
y_high = coefs_high(x)

# ---------------------------
# 4. PLOT
# ---------------------------
plt.figure(figsize=(14,6))

# Real data
plt.plot(x, y, 'o', color="blue", label="Real humidity (monthly average)")

# Simple fit
plt.plot(x, y_low, '--', linewidth=2, color="green",
         label=f"Simple fit (deg={simple_deg})")

# Overfit
plt.plot(x, y_high, linewidth=2, color="red", alpha=0.8,
         label=f"Overfit (deg={complex_deg})")

plt.title("Monthly Humidity with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1, 13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import Polynomial

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Extract year and month
df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# ---------------------------
# 2. FILTER YEARS 2020–2024
# ---------------------------
# The range is named once so the filter and the plot title cannot disagree.
FIRST_YEAR, LAST_YEAR = 2020, 2024
df = df[(df["YEAR"] >= FIRST_YEAR) & (df["YEAR"] <= LAST_YEAR)]

months = np.arange(1, 13)

# Group into a table: rows=months, columns=years. Reindexing to 1..12 ensures
# one row per month (NaN where there is no data), so every column has exactly
# the same length as `months` when the NaN mask is applied below.
monthly_year = (
    df.groupby(["YEAR", "MONTH"])["Hum"].mean().unstack(level=0).reindex(months)
)

# ---------------------------
# 3. PLOT — Humidity + Fitting
# ---------------------------
plt.figure(figsize=(18,9))

years = sorted(monthly_year.columns.dropna().tolist())

for year in years:
    y = monthly_year[year].values
    x = months

    # Remove NaN (months without data for this year)
    mask = ~np.isnan(y)
    x_clean = x[mask]
    y_clean = y[mask]

    if len(x_clean) < 3:
        continue  # not enough data to fit

    # Fitting simple (underfitting)
    deg_low = 2
    coef_low = Polynomial.fit(x_clean, y_clean, deg=deg_low)
    y_low = coef_low(x_clean)

    # Fitting complex (overfitting); degree capped by the months available
    deg_high = min(8, len(x_clean) - 2)
    coef_high = Polynomial.fit(x_clean, y_clean, deg=deg_high)
    y_high = coef_high(x_clean)

    # Plot real data
    plt.plot(x_clean, y_clean, marker="o", linewidth=2, label=f"{year} real")

    # Simple fit
    plt.plot(x_clean, y_low, "--", linewidth=2, label=f"{year} simple fit")

    # Overfit
    plt.plot(x_clean, y_high, linewidth=2, alpha=0.7, label=f"{year} overfit")

# The title states the range that was actually filtered above.
plt.title(f"Monthly Humidity ({FIRST_YEAR}–{LAST_YEAR}) with Underfitting and Overfitting")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(months)
plt.grid(True)
plt.legend(ncol=3, fontsize=9)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

# Prepare data
df = df.sort_values("UTC")
y = df["Hum"].values

# Time index rescaled to [-1, 1]. Raising a raw 0..N-1 index to powers up to
# 19 produces columns that differ by many orders of magnitude, which makes
# the least-squares problem ill-conditioned; Polynomial.fit (used elsewhere in
# this notebook) performs the same rescaling internally.
X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

# ---------------------------
# 2. CROSS-VALIDATION SETUP
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

degrees = range(1, 20)  # test polynomial degrees 1 to 19
cv_errors = []

# ---------------------------
# 3. RUN CROSS-VALIDATION FOR EACH DEGREE
# ---------------------------
for deg in degrees:
    fold_errors = []
    poly = PolynomialFeatures(degree=deg)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Transform features (powers of the scaled time index)
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)

        # Fit model on the training fold only
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        # Predict + calculate error on the held-out fold
        y_pred = model.predict(X_test_poly)
        error = mean_squared_error(y_test, y_pred)
        fold_errors.append(error)

    # Average CV error for this degree
    cv_errors.append(np.mean(fold_errors))

# ---------------------------
# 4. PLOT CROSS-VALIDATION RESULTS
# ---------------------------
plt.figure(figsize=(12,6))
plt.plot(degrees, cv_errors, marker="o", linewidth=2)
plt.xlabel("Polynomial Degree")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Curve for Humidity Fitting")
plt.grid(True)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year

# Years to evaluate
years = [2020, 2021, 2022, 2023, 2024]

# Polynomial degrees
degrees = range(1, 12)

# Store results: year -> list of mean CV errors, one per degree
errors_by_year = {}

# ---------------------------
# 2. CROSS-VALIDATION PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    # Sort chronologically: the feature below is the row position, which is
    # only a time index if the rows are in time order.
    df_y = df[df["YEAR"] == year].sort_values("UTC")
    if len(df_y) < 10:
        print(f"Skipping {year}: not enough data")
        continue

    y = df_y["Hum"].values

    # Time index rescaled to [-1, 1]; raw powers of a 0..N-1 index are
    # numerically ill-conditioned at the higher degrees tested here.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    year_errors = []

    for deg in degrees:
        fold_errors = []
        poly = PolynomialFeatures(degree=deg)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train_poly = poly.fit_transform(X_train)
            X_test_poly = poly.transform(X_test)

            model = LinearRegression()
            model.fit(X_train_poly, y_train)

            y_pred = model.predict(X_test_poly)
            fold_errors.append(mean_squared_error(y_test, y_pred))

        # Mean held-out error across the folds for this degree
        year_errors.append(np.mean(fold_errors))

    errors_by_year[year] = year_errors

# ---------------------------
# 3. PLOT CROSS-VALIDATION CURVES
# ---------------------------
plt.figure(figsize=(14,7))

for year in errors_by_year:
    plt.plot(degrees, errors_by_year[year], marker="o", linewidth=2, label=str(year))

plt.title("Cross-Validation Error (Humidity) — 2020 to 2024")
plt.xlabel("Polynomial Degree")
plt.ylabel("MSE (Cross-Validation Error)")
plt.grid(True)
plt.legend(title="Year")
plt.tight_layout()
plt.show()

# ---------------------------
# 4. SHOW ERRORS IN A TABLE
# ---------------------------
# Rows = polynomial degree, columns = year; the bare expression makes the
# notebook render the table.
error_table = pd.DataFrame(errors_by_year, index=degrees)
error_table.index.name = "Degree"
error_table

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year

# Years to analyze
years = [2020, 2021, 2022, 2023, 2024]

# Regularization strengths (λ)
lambdas = np.logspace(-4, 4, 20)

# Where to store results: year -> list of mean CV errors, one per lambda
reg_errors = {}

# ---------------------------
# 2. REGULARIZATION ANALYSIS PER YEAR
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    # Sort chronologically: the feature below is the row position, which is
    # only a time index if the rows are in time order.
    df_y = df[df["YEAR"] == year].sort_values("UTC")
    if len(df_y) < 20:
        print(f"Skipping {year}: not enough samples")
        continue

    y = df_y["Hum"].values

    # Time index rescaled to [-1, 1]. The ridge penalty acts on the
    # coefficient, so its effect depends on the feature scale; with a raw
    # 0..N-1 index the penalty is negligible over the whole lambda grid.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    year_errors = []

    for lam in lambdas:
        fold_errors = []
        model = Ridge(alpha=lam)

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # fit() re-estimates the model from scratch on each training fold
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_errors.append(mean_squared_error(y_test, y_pred))

        # Mean held-out error across the folds for this lambda
        year_errors.append(np.mean(fold_errors))

    reg_errors[year] = year_errors

# ---------------------------
# 3. PLOT REGULARIZATION CURVES
# ---------------------------
plt.figure(figsize=(14,7))

for year in reg_errors:
    plt.plot(lambdas, reg_errors[year], marker="o", linewidth=2, label=str(year))

plt.xscale("log")
plt.xlabel("Regularization strength (lambda)")
plt.ylabel("Validation Error (MSE)")
plt.title("Regularization Curve per Year (Humidity) — Ridge Regression")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# ---------------------------
# 1. LOAD DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum"])

df["YEAR"] = df["UTC"].dt.year
df["MONTH"] = df["UTC"].dt.month

# Years you want to analyze
years = sorted(df["YEAR"].unique())

# Regularization strengths (lambda)
lambdas = np.logspace(-4, 4, 15)

# ---------------------------
# 2. REGULARIZATION PER YEAR
# ---------------------------
best_models = {}   # year -> that year's rows plus a "Pred" column

# K-fold cross-validation setup (identical for every year, so built once)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for year in years:
    # Sort chronologically: the feature below is the row position, which is
    # only a time index if the rows are in time order.
    df_y = df[df["YEAR"] == year].sort_values("UTC").copy()
    if len(df_y) < 30:
        print(f"Skipping {year}: not enough data")
        continue

    y = df_y["Hum"].values

    # Time index rescaled to [-1, 1] so the ridge penalty is meaningful
    # relative to the feature scale.
    X = np.linspace(-1.0, 1.0, len(y)).reshape(-1, 1)

    # Evaluate each lambda by its mean held-out error
    mse_list = []

    for lam in lambdas:
        fold_errors = []

        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = Ridge(alpha=lam)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_errors.append(mean_squared_error(y_test, y_pred))

        mse_list.append(np.mean(fold_errors))

    # Pick best lambda
    best_idx = int(np.argmin(mse_list))
    best_lambda = lambdas[best_idx]

    # Refit on the whole year with the selected lambda. Reusing the estimator
    # left over from the CV loop would predict with a model trained only on
    # the last fold's training split.
    best_model = Ridge(alpha=best_lambda)
    best_model.fit(X, y)

    # Predict using the BEST model
    df_y["Pred"] = best_model.predict(X)
    best_models[year] = df_y


# ---------------------------
# 3. AGGREGATE PREDICTIONS BY MONTH (X-axis = months)
# ---------------------------
plt.figure(figsize=(16,8))

for year, df_y in best_models.items():
    # Mean predicted humidity per calendar month
    monthly_pred = df_y.groupby("MONTH")["Pred"].mean()

    plt.plot(
        monthly_pred.index,
        monthly_pred.values,
        marker="o",
        linewidth=2,
        label=f"{year} (best λ)"
    )

plt.title("Monthly Humidity (Daily Fitting + Ridge Regularization)\nBest Regularized Model for Each Year")
plt.xlabel("Month")
plt.ylabel("Humidity (%)")
plt.xticks(np.arange(1,13))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

	2020	2021	2022	2023	2024
Degree
1	216.068824	218.752229	205.806426	223.386426	121.662982
2	199.123679	206.903699	197.047975	206.637540	120.667957
3	199.019093	201.704614	196.737505	206.017819	120.021651
4	197.902190	207.015701	196.320941	207.876785	116.337365
5	197.509293	210.007744	195.493466	209.290042	116.981801
6	199.035569	210.817214	194.980695	209.993881	116.845109
7	201.411285	210.744167	195.175063	210.322704	116.787237
8	203.592694	210.467845	195.974085	210.498426	116.688301
9	205.158340	210.258218	197.035062	210.616151	116.304289
10	206.100096	210.176259	198.047936	210.704588	119.011331
11	206.568808	210.195144	198.845317	210.768333	119.901257

Day 3: Fitting

Assignment 20/11/2025

Fitting

Year 2009

Overfitting vs Fitting by Year

Compare multiple years

Find out which year has the most daily data

Humedad