import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ------------------------------------
# 1. Cargar datos
# ------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# ------------------------------------
# 2. Filtrar de 2009 a 2024
# ------------------------------------
mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

# Series numéricas
temp = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna()
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna()

print(f"Nº datos temperatura: {len(temp)}, Nº datos precipitación: {len(precip)}")

# ------------------------------------
# 3. Función: histograma + palitos + gaussiana
# ------------------------------------
def plot_hist_with_gaussian(values, xlabel):
    mean = values.mean()
    std = values.std(ddof=0)
    npts = len(values)

    # Histograma (densidad)
    plt.hist(values,
             bins=max(20, npts // 50),
             density=True,
             alpha=0.5)

    # “Palitos” (rug plot)
    marker_size = max(2, npts / 50)
    plt.plot(values, np.zeros_like(values), '|', ms=marker_size)

    # Curva gaussiana con media y std de tus datos
    xi = np.linspace(mean - 3*std, mean + 3*std, 200)
    yi = np.exp(-(xi - mean)**2 / (2*std**2)) / np.sqrt(2*np.pi*std**2)
    plt.plot(xi, yi, 'r')

    plt.xlabel(xlabel)
    plt.ylabel("Densidad")
    plt.title(f"{xlabel} 2009–2024\nmedia = {mean:.2f}, std = {std:.2f}")
    plt.grid(alpha=0.3)

# ------------------------------------
# 4. Dibujar las dos gráficas
# ------------------------------------
plt.figure(figsize=(10, 8))

plt.subplot(2, 1, 1)
plot_hist_with_gaussian(temp, "Temperatura media (°C)")

plt.subplot(2, 1, 2)
plot_hist_with_gaussian(precip, "Precipitación (mm)")

plt.tight_layout()
plt.show()

Nº datos temperatura: 5744, Nº datos precipitación: 5288

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ------------------------------------
# 1. Load your data
# ------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Filter from 2009 to 2024
mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

# Numeric series
tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna()
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna()

print(f"Nº datos TMEDIA: {len(tmedia)}, Nº datos PRECIPITACION: {len(precip)}")

# ------------------------------------
# 2. Function to compute std of sample means
# ------------------------------------
def std_of_sample_means(values, Ns, n_trials=2000, seed=10):
    """
    values: 1D array-like of your real data
    Ns: array of sample sizes (N) to test
    n_trials: how many times we compute the mean for each N
    returns: empirical_std, theoretical_std
    """
    rng = np.random.default_rng(seed)
    values = np.asarray(values)
    sigma = values.std(ddof=0)

    empirical_std = []
    for N in Ns:
        # Draw n_trials x N samples with replacement
        samples = rng.choice(values, size=(n_trials, N), replace=True)
        means = samples.mean(axis=1)
        empirical_std.append(means.std(ddof=1))

    empirical_std = np.array(empirical_std)
    theoretical_std = sigma / np.sqrt(Ns)

    return empirical_std, theoretical_std

# ------------------------------------
# 3. Choose N values and compute
# ------------------------------------
Ns = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500])

emp_tmedia, theo_tmedia = std_of_sample_means(tmedia, Ns)
emp_precip, theo_precip = std_of_sample_means(precip, Ns)

# ------------------------------------
# 4. Plot results (log-log to see 1/sqrt(N) clearly)
# ------------------------------------
plt.figure(figsize=(10, 8))

# --- TMEDIA ---
plt.subplot(2, 1, 1)
plt.loglog(Ns, emp_tmedia, 'o-', label='Empirical std of mean (TMEDIA)')
plt.loglog(Ns, theo_tmedia, '--', label=r'Theoretical $\sigma/\sqrt{N}$')
plt.xlabel('N (days averaged)')
plt.ylabel('Std of sample mean (°C)')
plt.title('TMEDIA: averaging reduces error as 1/sqrt(N)')
plt.grid(True, which='both', ls=':')
plt.legend()

# --- PRECIPITATION ---
plt.subplot(2, 1, 2)
plt.loglog(Ns, emp_precip, 'o-', label='Empirical std of mean (PRECIPITACION)')
plt.loglog(Ns, theo_precip, '--', label=r'Theoretical $\sigma/\sqrt{N}$')
plt.xlabel('N (days averaged)')
plt.ylabel('Std of sample mean (mm)')
plt.title('PRECIPITACION: averaging reduces error as 1/sqrt(N)')
plt.grid(True, which='both', ls=':')
plt.legend()

plt.tight_layout()
plt.show()

Nº datos TMEDIA: 5744, Nº datos PRECIPITACION: 5288

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ------------------------------------
# 1. Cargar datos (si ya tienes df, puedes saltarte esto)
# ------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Filtrar 2009–2024
mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

# Serie numérica de TMEDIA
tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna().values

# ------------------------------------
# 2. Parámetros para el experimento
# ------------------------------------
trials = 100                         # número de repeticiones por N
points = np.arange(10, 500, 25)      # distintos tamaños de N
means = np.zeros((trials, len(points)))

mu = tmedia.mean()                   # media real de tus datos
sigma = tmedia.std(ddof=0)           # desviación típica real

rng = np.random.default_rng(10)

# ------------------------------------
# 3. Bucle: medias de N días muestreados de tus datos
# ------------------------------------
for j, N in enumerate(points):
    for trial in range(trials):
        sample = rng.choice(tmedia, size=N, replace=True)
        means[trial, j] = sample.mean()

# ------------------------------------
# 4. Calcular media y std de las estimaciones
# ------------------------------------
mean_est = np.mean(means, axis=0)
std_est = np.std(means, axis=0, ddof=1)

# ------------------------------------
# 5. Representar: mu ± sigma/sqrt(N) y estimaciones
# ------------------------------------
plt.figure(figsize=(8, 6))

# Curvas teóricas mu ± sigma/√N
plt.plot(points, mu + sigma/np.sqrt(points), 'r', label='μ ± σ/√N (theoretical)')
plt.plot(points, mu - sigma/np.sqrt(points), 'r')

# Medias estimadas con barras de error
plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7, label='estimated')

# Nubes de puntos de cada N
for j, N in enumerate(points):
    plt.plot(np.full(trials, N), means[:, j], 'o', markersize=2)

plt.xlabel('number of samples averaged')
plt.ylabel('mean TMEDIA estimates')
plt.title('TMEDIA: averaging N days (error ∝ 1/√N)')
plt.legend()
plt.tight_layout()
plt.show()

def plot_mean_convergence(values, label):
    values = np.asarray(values)
    trials = 100
    points = np.arange(10, 500, 25)
    means = np.zeros((trials, len(points)))

    mu = values.mean()
    sigma = values.std(ddof=0)
    rng = np.random.default_rng(10)

    for j, N in enumerate(points):
        for trial in range(trials):
            sample = rng.choice(values, size=N, replace=True)
            means[trial, j] = sample.mean()

    mean_est = means.mean(axis=0)
    std_est = means.std(axis=0, ddof=1)

    plt.figure(figsize=(8, 6))
    plt.plot(points, mu + sigma/np.sqrt(points), 'r', label='μ ± σ/√N (theoretical)')
    plt.plot(points, mu - sigma/np.sqrt(points), 'r')
    plt.errorbar(points, mean_est, yerr=std_est, fmt='k-o', capsize=7, label='estimated')
    for j, N in enumerate(points):
        plt.plot(np.full(trials, N), means[:, j], 'o', markersize=2)

    plt.xlabel('number of samples averaged')
    plt.ylabel(f'mean {label} estimates')
    plt.title(f'{label}: averaging N samples (error ∝ 1/√N)')
    plt.legend()
    plt.tight_layout()
    plt.show()

# TMEDIA
plot_mean_convergence(
    pd.to_numeric(data["TMEDIA"], errors="coerce").dropna(),
    "TMEDIA (°C)"
)

# PRECIPITACION
plot_mean_convergence(
    pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna(),
    "PRECIPITACION (mm)"
)

import pandas as pd
import numpy as np

# 1. Cargar datos
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# 2. Filtrar 2009–2024
mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

# 3. Seleccionar variables numéricas para la matriz de covarianza
cols = ["TMEDIA", "PRECIPITACION", "TMIN", "TMAX"]
data_cov = data[cols].apply(pd.to_numeric, errors="coerce")

# 4. Calcular matriz de covarianza
cov_matrix = data_cov.cov()   # por defecto es covarianza muestral

print(cov_matrix)

                  TMEDIA  PRECIPITACION       TMIN       TMAX
TMEDIA         24.296487      -8.927096  20.149995  28.449299
PRECIPITACION  -8.927096      94.637279  -0.414069 -17.427208
TMIN           20.149995      -0.414069  21.728499  18.576236
TMAX           28.449299     -17.427208  18.576236  38.339974

data_cov2 = data[["TMEDIA", "PRECIPITACION"]].apply(pd.to_numeric, errors="coerce")
cov_matrix_2 = data_cov2.cov()
print(cov_matrix_2)

                  TMEDIA  PRECIPITACION
TMEDIA         24.296487      -8.927096
PRECIPITACION  -8.927096      94.637279

import matplotlib.pyplot as plt

plt.figure(figsize=(7,7))
plt.imshow(cov_matrix, interpolation='nearest')
plt.colorbar(label="Covariance")
plt.xticks(range(len(cols)), cols, rotation=45)
plt.yticks(range(len(cols)), cols)
plt.title("Covariance matrix (2009–2024)")
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------
# 1. Cargar datos y filtrar 2009–2024
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna()
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna()

print(f"Nº datos TMEDIA: {len(tmedia)}, Nº datos PRECIPITACION: {len(precip)}")

# ---------------------------
# 2. Función para una gráfica 1D tipo variance
# ---------------------------
def plot_1d_variance(values, xlabel, title):
    vals = np.asarray(values)
    mu = vals.mean()
    sigma = vals.std(ddof=0)

    # Histograma con densidad
    plt.hist(vals, bins=40, density=True, alpha=0.5)

    # Media (línea vertical)
    plt.axvline(mu, color='r', linestyle='-', label=f"mean = {mu:.2f}")

    # Banda ±1 sigma
    plt.axvspan(mu - sigma, mu + sigma, alpha=0.2, color='r',
                label=f"±1σ = {sigma:.2f}")

    plt.xlabel(xlabel)
    plt.ylabel("Density")
    plt.title(title)
    plt.grid(alpha=0.3)
    plt.legend()

# ---------------------------
# 3. Gráfica de TMEDIA
# ---------------------------
plt.figure(figsize=(8, 5))
plot_1d_variance(tmedia,
                 xlabel="TMEDIA (°C)",
                 title="Distribution of TMEDIA (2009–2024)")
plt.tight_layout()
plt.show()

# ---------------------------
# 4. Gráfica de PRECIPITACION
# ---------------------------
plt.figure(figsize=(8, 5))
plot_1d_variance(precip,
                 xlabel="PRECIPITACION (mm)",
                 title="Distribution of PRECIPITACION (2009–2024)")
plt.tight_layout()
plt.show()

Nº datos TMEDIA: 5744, Nº datos PRECIPITACION: 5288

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --------------------------------------------------
# 1. Cargar datos (si ya tienes df, puedes saltarlo)
# --------------------------------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

# Filtrar 2009–2024 y ordenar por fecha
mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].sort_values("FECHA").copy()

# Convertir a numérico
tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna().values
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna().values

print(f"Nº datos TMEDIA: {len(tmedia)}, Nº datos PRECIPITACION: {len(precip)}")

np.random.seed(10)
np.set_printoptions(precision=2, suppress=True)

# --------------------------------------------------
# 2. Función: variance cloud vs covariance cloud
# --------------------------------------------------
def plot_variance_covariance_1d(series, title_text, variance_label="variance", covariance_label="covariance"):
    """
    series: array 1D con los datos (ordenados temporalmente).
    Hace:
      - variance samples: (x,y) independientes muestreados de la serie
      - covariance samples: (X_t, X_{t+1}) pares consecutivos
    """
    series = np.asarray(series)

    # Pares consecutivos (para la parte de covarianza)
    x_cov = series[:-1]
    y_cov = series[1:]
    n_cov = len(x_cov)

    # Elegimos un nº de puntos (para que no sea gigante)
    npts = min(n_cov, 2000)

    # Submuestreo aleatorio de los pares consecutivos
    idx_cov = np.random.choice(n_cov, size=npts, replace=False)
    covarsamples = np.column_stack((x_cov[idx_cov], y_cov[idx_cov]))

    # Variance samples: x e y independientes, extraídos al azar de la serie
    varsamples = np.column_stack((
        np.random.choice(series, size=npts, replace=True),
        np.random.choice(series, size=npts, replace=True)
    ))

    # --- "variance" stats ---
    varmean = np.mean(varsamples, axis=0)
    varstd  = np.sqrt(np.var(varsamples, axis=0))

    varplotx = [
        varmean[0] - varstd[0], varmean[0] + varstd[0],
        None,
        varmean[0],             varmean[0]
    ]
    varploty = [
        varmean[1],             varmean[1],
        None,
        varmean[1] - varstd[1], varmean[1] + varstd[1]
    ]

    # --- "covariance" stats (pares consecutivos) ---
    covarmean = np.mean(covarsamples, axis=0)
    covar     = np.cov(covarsamples, rowvar=False)
    evalu, evect = np.linalg.eig(covar)

    dx0 = evect[0, 0] * np.sqrt(evalu[0])
    dy0 = evect[1, 0] * np.sqrt(evalu[0])
    dx1 = evect[0, 1] * np.sqrt(evalu[1])
    dy1 = evect[1, 1] * np.sqrt(evalu[1])

    covarplotx = [
        covarmean[0] - dx0, covarmean[0] + dx0,
        None,
        covarmean[0] - dx1, covarmean[0] + dx1
    ]
    covarploty = [
        covarmean[1] + dy0, covarmean[1] - dy0,
        None,
        covarmean[1] + dy1, covarmean[1] - dy1
    ]

    # --- Plot combinado ---
    samples = np.vstack((varsamples, covarsamples))

    plt.figure(figsize=(7, 7))
    plt.hist2d(samples[:, 0], samples[:, 1], bins=30, cmap='binary')
    plt.plot(samples[:, 0], samples[:, 1], 'o', markersize=1.5, alpha=0.3)

    # Medias de cada nube
    plt.plot(varmean[0],   varmean[1],   'ro')
    plt.plot(covarmean[0], covarmean[1], 'ro')

    # Ejes de varianza independiente
    plt.plot(varplotx, varploty, 'r')

    # Ejes principales de la covarianza temporal
    plt.plot(covarplotx, covarploty, 'r')

    # Textos (ajusta si se solapan)
    plt.text(varmean[0],   varmean[1],
             variance_label,
             fontsize=12, ha='center', va='center')
    plt.text(covarmean[0], covarmean[1],
             covariance_label,
             fontsize=12, ha='center', va='center')

    plt.axis('off')
    plt.title(title_text)
    plt.tight_layout()
    plt.show()

    # Imprimir matrices
    print(title_text)
    print("  Covariance (variance samples, indep.):")
    print(np.cov(varsamples, rowvar=False))
    print("  Covariance (consecutive pairs):")
    print(covar)
    print()

# --------------------------------------------------
# 3. Gráfica TMEDIA: variance vs covariance (pares de días)
# --------------------------------------------------
plot_variance_covariance_1d(
    tmedia,
    title_text="TMEDIA (2009–2024): variance vs covariance (X_t vs X_{t+1})",
    variance_label="variance\n(independent samples)",
    covariance_label="covariance\n(consecutive days)"
)

# --------------------------------------------------
# 4. Gráfica PRECIPITACION: variance vs covariance (pares de días)
# --------------------------------------------------
plot_variance_covariance_1d(
    precip,
    title_text="PRECIPITACION (2009–2024): variance vs covariance (X_t vs X_{t+1})",
    variance_label="variance\n(independent samples)",
    covariance_label="covariance\n(consecutive days)"
)

Nº datos TMEDIA: 5744, Nº datos PRECIPITACION: 5288

TMEDIA (2009–2024): variance vs covariance (X_t vs X_{t+1})
  Covariance (variance samples, indep.):
[[24.46  0.08]
 [ 0.08 23.64]]
  Covariance (consecutive pairs):
[[24.4  22.12]
 [22.12 24.4 ]]

PRECIPITACION (2009–2024): variance vs covariance (X_t vs X_{t+1})
  Covariance (variance samples, indep.):
[[84.15  0.08]
 [ 0.08 87.13]]
  Covariance (consecutive pairs):
[[88.47 32.01]
 [32.01 80.6 ]]

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------
# 1. Load data and filter 2009–2024
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna().values
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna().values

print(f"Nº datos TMEDIA: {len(tmedia)}, Nº datos PRECIPITACION: {len(precip)}")

# ---------------------------
# 2. Shannon entropy from histogram
# ---------------------------
def shannon_entropy(values, bins):
    """
    Estimate Shannon entropy (in bits) using a histogram.
    H = - sum p_i log2 p_i
    """
    counts, edges = np.histogram(values, bins=bins)
    p = counts.astype(float) / counts.sum()
    # remove zero-probability bins to avoid log problems
    p = p[p > 0]
    H = -np.sum(p * np.log2(p))
    return H

# ---------------------------
# 3. Entropy vs number of bins for TMEDIA and PRECIPITACION
# ---------------------------
bins_list = np.arange(5, 105, 5)  # 5,10,...,100

H_tmedia = [shannon_entropy(tmedia, b) for b in bins_list]
H_precip = [shannon_entropy(precip, b) for b in bins_list]

plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(bins_list, H_tmedia, 'o-')
plt.xlabel("Number of bins")
plt.ylabel("Entropy H (bits)")
plt.title("TMEDIA entropy vs bins (2009–2024)")
plt.grid(alpha=0.3)

plt.subplot(1, 2, 2)
plt.plot(bins_list, H_precip, 'o-')
plt.xlabel("Number of bins")
plt.ylabel("Entropy H (bits)")
plt.title("PRECIPITACION entropy vs bins (2009–2024)")
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# ---------------------------
# 4. Optional: show one histogram with entropy in the title
# ---------------------------
bins_fixed = 40
H_t = shannon_entropy(tmedia, bins_fixed)
H_p = shannon_entropy(precip, bins_fixed)

plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.hist(tmedia, bins=bins_fixed, density=True, alpha=0.6)
plt.title(f"TMEDIA histogram (H ≈ {H_t:.2f} bits)")
plt.xlabel("TMEDIA (°C)")
plt.ylabel("Density")
plt.grid(alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(precip, bins=bins_fixed, density=True, alpha=0.6)
plt.title(f"PRECIPITACION histogram (H ≈ {H_p:.2f} bits)")
plt.xlabel("PRECIPITACION (mm)")
plt.ylabel("Density")
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

Nº datos TMEDIA: 5744, Nº datos PRECIPITACION: 5288

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------
# 1. Cargar datos y filtrar 2009–2024
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].copy()

# Nos quedamos solo con filas donde ambas columnas existen
pair = data[["TMEDIA", "PRECIPITACION"]].apply(pd.to_numeric, errors="coerce").dropna()
x = pair["TMEDIA"].values
y = pair["PRECIPITACION"].values

print(f"Nº pares TMEDIA–PRECIPITACION: {len(pair)}")

# ---------------------------
# 2. Mutual information con histograma
# ---------------------------
def mutual_information_hist(x, y, bins):
    """
    Aproxima I(X;Y) en bits usando histogramas conjuntos.
    """
    # histograma conjunto
    joint_counts, xedges, yedges = np.histogram2d(x, y, bins=bins)
    joint_prob = joint_counts / joint_counts.sum()

    # marginales
    px = joint_prob.sum(axis=1, keepdims=True)   # (bins, 1)
    py = joint_prob.sum(axis=0, keepdims=True)   # (1, bins)

    # evitar log(0): nos quedamos solo con entradas > 0
    nonzero = joint_prob > 0
    pij = joint_prob[nonzero]
    pix = px[nonzero.any(axis=1)][np.newaxis, :]  # forma un poco fea de alinear, mejor calculamos directo

    # Mejor: recalcular p_x y p_y planos
    px_flat = joint_prob.sum(axis=1)   # shape (bins,)
    py_flat = joint_prob.sum(axis=0)   # shape (bins,)

    # indices (i,j) donde p_xy > 0
    i_idx, j_idx = np.where(joint_prob > 0)
    pij = joint_prob[i_idx, j_idx]
    pix = px_flat[i_idx]
    pjy = py_flat[j_idx]

    mi = np.sum(pij * np.log2(pij / (pix * pjy)))
    return mi

# ---------------------------
# 3. Mutual information vs número de bins
# ---------------------------
bins_list = np.arange(5, 105, 5)  # 5,10,...,100
mi_values = [mutual_information_hist(x, y, b) for b in bins_list]

plt.figure(figsize=(7, 5))
plt.plot(bins_list, mi_values, 'o-')
plt.xlabel("Number of bins")
plt.ylabel("Mutual information I(TMEDIA; PRECIP) [bits]")
plt.title("Mutual information TMEDIA–PRECIPITACION vs bins (2009–2024)")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# ---------------------------
# 4. Una gráfica conjunta tipo heatmap + MI
# ---------------------------
bins_2d = 40
joint_counts, xedges, yedges = np.histogram2d(x, y, bins=bins_2d)
H, xedges, yedges = joint_counts, xedges, yedges

plt.figure(figsize=(7, 6))
plt.imshow(
    H.T,
    origin="lower",
    aspect="auto",
    extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]
)
plt.colorbar(label="Counts")
plt.xlabel("TMEDIA (°C)")
plt.ylabel("PRECIPITACION (mm)")

mi_fixed = mutual_information_hist(x, y, bins_2d)
plt.title(f"Joint histogram TMEDIA vs PRECIPITACION\nI ≈ {mi_fixed:.3f} bits (bins={bins_2d})")
plt.tight_layout()
plt.show()

Nº pares TMEDIA–PRECIPITACION: 5253

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------
# 1. Cargar datos y filtrar 2009–2024
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True)

mask = df["FECHA"].dt.year.between(2009, 2024)
data = df.loc[mask].sort_values("FECHA").copy()

# Series numéricas ordenadas temporalmente
tmedia = pd.to_numeric(data["TMEDIA"], errors="coerce").dropna().values
precip = pd.to_numeric(data["PRECIPITACION"], errors="coerce").dropna().values

print(f"Nº datos TMEDIA: {len(tmedia)}, Nº datos PRECIPITACION: {len(precip)}")

# ---------------------------
# 2. Mutual information entre X_t y X_{t+1}
# ---------------------------
def mutual_information_lag1(series, bins):
    """
    I(X_t ; X_{t+1}) en bits, usando histograma 2D.
    """
    series = np.asarray(series)

    x = series[:-1]
    y = series[1:]     # lag 1

    joint_counts, xedges, yedges = np.histogram2d(x, y, bins=bins)
    pxy = joint_counts / joint_counts.sum()

    # marginales
    px = pxy.sum(axis=1)  # shape (bins,)
    py = pxy.sum(axis=0)  # shape (bins,)

    # índices donde pxy > 0
    i_idx, j_idx = np.where(pxy > 0)
    pij = pxy[i_idx, j_idx]
    pi = px[i_idx]
    pj = py[j_idx]

    mi = np.sum(pij * np.log2(pij / (pi * pj)))
    return mi

# ---------------------------
# 3. Mutual information vs nº de bins
# ---------------------------
bins_list = np.arange(5, 105, 5)  # 5,10,...,100

mi_tmedia  = [mutual_information_lag1(tmedia,  b) for b in bins_list]
mi_precip  = [mutual_information_lag1(precip,  b) for b in bins_list]

# ---------------------------
# 4. Gráfica independiente para TMEDIA
# ---------------------------
plt.figure(figsize=(7, 5))
plt.plot(bins_list, mi_tmedia, 'o-')
plt.xlabel("Number of bins")
plt.ylabel("Mutual information I(TM_t ; TM_{t+1}) [bits]")
plt.title("TMEDIA: mutual information with next day (2009–2024)")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# ---------------------------
# 5. Gráfica independiente para PRECIPITACION
# ---------------------------
plt.figure(figsize=(7, 5))
plt.plot(bins_list, mi_precip, 'o-')
plt.xlabel("Number of bins")
plt.ylabel("Mutual information I(PR_t ; PR_{t+1}) [bits]")
plt.title("PRECIPITACION: mutual information with next day (2009–2024)")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Nº datos TMEDIA: 5744, Nº datos PRECIPITACION: 5288

Day 5: Probability¶

Assignment 2/12/2025¶

1. Histograms + Gaussian fit (TMEDIA and PRECIPITACION)¶

2. Spread of the sample mean vs sample size (TMEDIA and PRECIPITACION)¶

3. Convergence of TMEDIA and PRECIPITACION means as N grows¶

4. 4×4 covariance matrix (TMEDIA, PRECIPITACION, TMIN, TMAX)¶

5. 2×2 covariance matrix (TMEDIA–PRECIPITACION only)¶

6. Covariance matrix heatmap¶

7. 1D histograms with mean and ±1σ (TMEDIA and PRECIP)¶

8. “Variance vs covariance” scatter plots (Xₜ vs Xₜ₊₁) for TMEDIA and PRECIPITACION¶

9. Shannon entropy of TMEDIA and PRECIPITACION vs number of bins¶

10. Mutual information between TMEDIA and PRECIPITACION (same day)¶

11. Temporal mutual information (lag 1) for TMEDIA and PRECIPITACION¶

Weekly Conclusion¶