Day 6: Density Estimation¶
Assignment 4/12/2025¶
Fit a probability distribution to your data
1. Load and prepare the data¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# ---------------------------
# 1. LOAD WEATHER DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20081001-20251107.csv", sep=";")
# Parse date column
df["FECHA"] = pd.to_datetime(df["FECHA"], format="%d/%m/%y", dayfirst=True, errors="coerce")
# Select variables of interest for clustering
# We'll start with TMEDIA (°C) and PRECIPITACION (mm)
vars_for_clustering = ["TMEDIA", "PRECIPITACION"]
data = df[vars_for_clustering].dropna().copy()
print(data.head())
print("Number of valid days:", len(data))
   TMEDIA  PRECIPITACION
0    13.1            0.4
1    11.8            4.4
2     9.6            0.0
3     9.8            0.0
4    12.4            2.0
Number of valid days: 5597
2. Scaling of features¶
So that k-means and the GMM are not dominated by the different scales (°C vs mm of rain), we standardise the features.
# ---------------------------
# 2. STANDARDIZE FEATURES
# ---------------------------
X = data.values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Mean (after scaling):", X_scaled.mean(axis=0))
print("Std (after scaling):", X_scaled.std(axis=0))
Mean (after scaling): [-3.55461794e-16  1.01560513e-17]
Std (after scaling): [1. 1.]
3. K-means: choose number of clusters (elbow)¶
We test different values of k and look at the inertia (the sum of squared distances to the cluster centers).
# ---------------------------
# 3. K-MEANS: ELBOW CURVE
# ---------------------------
ks = range(1, 8)
inertias = []
for k in ks:
km = KMeans(n_clusters=k, random_state=0, n_init=10)
km.fit(X_scaled)
inertias.append(km.inertia_)
plt.figure(figsize=(6,4))
plt.plot(list(ks), inertias, "o-")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia (sum of squared distances)")
plt.title("K-means elbow curve (TMEDIA vs PRECIP)")
plt.grid(True)
plt.show()
4. K-means: visualising clusters in your climate¶
We choose a reasonable value of k (for example 3 or 4 according to the elbow). Here we use 3 as an example.
# ---------------------------
# 4. K-MEANS: CLUSTERING WITH k=3
# ---------------------------
k_best = 3
kmeans = KMeans(n_clusters=k_best, random_state=0, n_init=10)
labels_km = kmeans.fit_predict(X_scaled)
data_km = data.copy()
data_km["cluster_km"] = labels_km
plt.figure(figsize=(6,5))
scatter = plt.scatter(
data_km["TMEDIA"],
data_km["PRECIPITACION"],
c=data_km["cluster_km"],
s=10,
alpha=0.6
)
plt.xlabel("Daily mean temperature TMEDIA (°C)")
plt.ylabel("Daily precipitation (mm)")
plt.title(f"K-means clustering (k={k_best})")
plt.grid(True)
plt.show()
# Optional: cluster centers back-transformed to original units
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
for i, (tmean, prec) in enumerate(centers):
print(f"Cluster {i}: TMEDIA ≈ {tmean:.2f} °C, PRECIPITACION ≈ {prec:.2f} mm")
Cluster 0: TMEDIA ≈ 17.34 °C, PRECIPITACION ≈ 1.14 mm
Cluster 1: TMEDIA ≈ 10.96 °C, PRECIPITACION ≈ 31.82 mm
Cluster 2: TMEDIA ≈ 9.09 °C, PRECIPITACION ≈ 3.18 mm
One cluster may correspond to warm, dry days (high TMEDIA, PRECIP ≈ 0). Another may group cold, dry days (low TMEDIA, PRECIP ≈ 0). A third one gathers rainy days (high PRECIP, variable TMEDIA).
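To check this reading of the clusters, we can print each cluster's size and average conditions. A minimal sketch, assuming data_km from the k=3 fit above is still in scope:

# Sketch: summarise each cluster (assumes data_km from the k=3 fit above)
summary = (
    data_km
    .groupby("cluster_km")[["TMEDIA", "PRECIPITACION"]]
    .agg(["mean", "median", "count"])
)
print(summary)

# Share of days in each cluster (%)
print((data_km["cluster_km"].value_counts(normalize=True) * 100).round(1))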
And we also try with 4.
# ---------------------------
# 4. K-MEANS: CLUSTERING WITH k=4
# ---------------------------
k_best = 4
kmeans = KMeans(n_clusters=k_best, random_state=0, n_init=10)
labels_km = kmeans.fit_predict(X_scaled)
data_km = data.copy()
data_km["cluster_km"] = labels_km
plt.figure(figsize=(6,5))
scatter = plt.scatter(
data_km["TMEDIA"],
data_km["PRECIPITACION"],
c=data_km["cluster_km"],
s=10,
alpha=0.6
)
plt.xlabel("Daily mean temperature TMEDIA (°C)")
plt.ylabel("Daily precipitation (mm)")
plt.title(f"K-means clustering (k={k_best})")
plt.grid(True)
plt.show()
# Optional: cluster centers back-transformed to original units
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
for i, (tmean, prec) in enumerate(centers):
print(f"Cluster {i}: TMEDIA ≈ {tmean:.2f} °C, PRECIPITACION ≈ {prec:.2f} mm")
Cluster 0: TMEDIA ≈ 17.43 °C, PRECIPITACION ≈ 0.95 mm
Cluster 1: TMEDIA ≈ 10.28 °C, PRECIPITACION ≈ 18.53 mm
Cluster 2: TMEDIA ≈ 9.15 °C, PRECIPITACION ≈ 1.71 mm
Cluster 3: TMEDIA ≈ 11.85 °C, PRECIPITACION ≈ 50.13 mm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# ----------------------------------------
# MULTIPLE K-MEANS CLUSTERINGS (k = 2, 3, 4, 5)
# ----------------------------------------
ks_to_plot = [2, 3, 4, 5]
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
axes = axes.ravel()
for ax, k in zip(axes, ks_to_plot):
# Fit k-means
km = KMeans(n_clusters=k, random_state=0, n_init=10)
labels = km.fit_predict(X_scaled)
# Back-transform cluster centers to original units
centers_scaled = km.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
# Scatter of data points, colored by cluster
sc = ax.scatter(
data["TMEDIA"],
data["PRECIPITACION"],
c=labels,
s=8,
alpha=0.6
)
# Plot cluster centers
ax.scatter(
centers[:, 0],
centers[:, 1],
marker="X",
s=100,
edgecolor="black"
)
ax.set_title(f"K-means (k={k})")
ax.set_xlabel("TMEDIA (°C)")
ax.set_ylabel("PRECIPITACION (mm)")
ax.grid(True)
plt.suptitle("K-means clustering for several k values", y=1.02, fontsize=14)
plt.tight_layout()
plt.show()
In this figure I compare k-means clusterings with k = 2, 3, 4 and 5 on the (TMEDIA, PRECIPITACION) space. Each panel shows daily weather points colored by cluster, with the corresponding centroids marked as “X”. As k increases, the model splits the dataset into more refined types of days (e.g. separating cold–dry, warm–dry and rainy regimes), but also risks over-partitioning the data into clusters that are harder to interpret.
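To complement the visual comparison, the silhouette score gives a single number per k (higher means better-separated clusters). A minimal sketch, assuming X_scaled from section 2 is still in scope:

from sklearn.metrics import silhouette_score

# Sketch: silhouette score per k (assumes X_scaled from section 2)
for k in [2, 3, 4, 5]:
    km = KMeans(n_clusters=k, random_state=0, n_init=10)
    labels_k = km.fit_predict(X_scaled)
    print(f"k={k}: silhouette = {silhouette_score(X_scaled, labels_k):.3f}")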
5. Gaussian Mixture Models (GMM): choosing the number of components with BIC/AIC¶
We now fit Gaussian mixture models (GMMs) as density estimators and use BIC/AIC to select the number of components M.
# ---------------------------
# 5. GMM: MODEL SELECTION WITH BIC / AIC
# ---------------------------
ks = range(1, 8)
bics = []
aics = []
gmms = []
for k in ks:
gmm = GaussianMixture(
n_components=k,
covariance_type="full",
random_state=0
)
gmm.fit(X_scaled)
gmms.append(gmm)
bics.append(gmm.bic(X_scaled))
aics.append(gmm.aic(X_scaled))
plt.figure(figsize=(7,4))
plt.plot(ks, bics, "o-", label="BIC")
plt.plot(ks, aics, "o--", label="AIC")
plt.xlabel("Number of components (M)")
plt.ylabel("Information criterion")
plt.title("Gaussian Mixture Model selection (TMEDIA vs PRECIPITACION)")
plt.legend()
plt.grid(True)
plt.show()
best_k_bic = ks[int(np.argmin(bics))]
print("Best number of components by BIC:", best_k_bic)
Best number of components by BIC: 7
6. GMM: “soft” clusters and density map¶
We use the optimal number of components (according to BIC) and visualise the result.
# ---------------------------
# 6. FIT BEST GMM AND VISUALIZE CLUSTERS
# ---------------------------
M = best_k_bic
gmm_best = GaussianMixture(
n_components=M,
covariance_type="full",
random_state=0
)
gmm_best.fit(X_scaled)
# Most probable component for each day
labels_gmm = gmm_best.predict(X_scaled)
probs_gmm = gmm_best.predict_proba(X_scaled).max(axis=1)
data_gmm = data.copy()
data_gmm["cluster_gmm"] = labels_gmm
data_gmm["p_max"] = probs_gmm
plt.figure(figsize=(6,5))
plt.scatter(
data_gmm["TMEDIA"],
data_gmm["PRECIPITACION"],
c=data_gmm["cluster_gmm"],
s=10,
alpha=0.6
)
plt.xlabel("Daily mean temperature TMEDIA (°C)")
plt.ylabel("Daily precipitation (mm)")
plt.title(f"GMM clustering (M={M} components)")
plt.grid(True)
plt.show()
And now we plot the estimated density in the temperature–precipitation plane:
# ---------------------------
# 6b. GMM: DENSITY CONTOURS
# ---------------------------
# Create a grid in original units
tmin, tmax = data["TMEDIA"].min(), data["TMEDIA"].max()
pmin, pmax = data["PRECIPITACION"].min(), data["PRECIPITACION"].max()
t_vals = np.linspace(tmin, tmax, 100)
p_vals = np.linspace(pmin, pmax, 100)
TT, PP = np.meshgrid(t_vals, p_vals)
grid_points = np.column_stack([TT.ravel(), PP.ravel()])
grid_scaled = scaler.transform(grid_points)
# log probability density on grid
logp = gmm_best.score_samples(grid_scaled)
P = np.exp(logp).reshape(TT.shape)
plt.figure(figsize=(7,5))
# Filled contours of density
cont = plt.contourf(TT, PP, P, levels=20, alpha=0.7)
plt.colorbar(cont, label="Estimated density p(TMEDIA, PRECIP)")
# Original data for context
plt.scatter(
data["TMEDIA"],
data["PRECIPITACION"],
s=5,
alpha=0.3,
edgecolors="none"
)
plt.xlabel("Daily mean temperature TMEDIA (°C)")
plt.ylabel("Daily precipitation (mm)")
plt.title(f"GMM density estimation (M={M})")
plt.grid(True)
plt.show()
- The GMM produces smooth boundaries between regions of the space (no hard cuts like k-means).
- The density surface shows where the “typical” days are concentrated:
- A strong ridge around dry days (precipitation ≈ 0).
- “Tails” extending towards high-precipitation values with varying temperatures.
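Because the GMM is a full density model, we can also draw synthetic (TMEDIA, PRECIPITACION) days from it. A minimal sketch, assuming gmm_best and scaler from above are in scope (note that, without a positivity constraint, some sampled precipitation values can come out slightly negative):

# Sketch: draw synthetic days from the fitted mixture (assumes gmm_best and scaler are in scope)
X_sample_scaled, _ = gmm_best.sample(1000)            # samples in standardized space
X_sample = scaler.inverse_transform(X_sample_scaled)   # back to °C and mm

plt.figure(figsize=(6, 5))
plt.scatter(data["TMEDIA"], data["PRECIPITACION"], s=5, alpha=0.3, label="Observed days")
plt.scatter(X_sample[:, 0], X_sample[:, 1], s=5, alpha=0.3, label="GMM samples")
plt.xlabel("TMEDIA (°C)")
plt.ylabel("PRECIPITACION (mm)")
plt.title("Synthetic days sampled from the fitted GMM")
plt.legend()
plt.grid(True)
plt.show()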
from sklearn.mixture import GaussianMixture
import numpy as np
# ----------------------------------------
# MULTIPLE GMM CLUSTERINGS (M = 2, 3, 4)
# ----------------------------------------
components_to_plot = [2, 3, 4]
fig, axes = plt.subplots(1, len(components_to_plot), figsize=(15, 4), sharex=True, sharey=True)
if len(components_to_plot) == 1:
axes = [axes] # just in case
for ax, M in zip(axes, components_to_plot):
gmm = GaussianMixture(
n_components=M,
covariance_type="full",
random_state=0
)
gmm.fit(X_scaled)
labels_gmm = gmm.predict(X_scaled)
probs_gmm = gmm.predict_proba(X_scaled).max(axis=1)
# Plot points, colored by cluster, with alpha depending on confidence
sc = ax.scatter(
data["TMEDIA"],
data["PRECIPITACION"],
c=labels_gmm,
s=8,
alpha=0.6
)
ax.set_title(f"GMM (M={M})")
ax.set_xlabel("TMEDIA (°C)")
ax.set_ylabel("PRECIPITACION (mm)")
ax.grid(True)
plt.suptitle("Gaussian Mixture Models with several numbers of components", y=1.05, fontsize=14)
plt.tight_layout()
plt.show()
Here I fit Gaussian Mixture Models with M = 2, 3 and 4 components to the same (TMEDIA, PRECIPITACION) data. Compared to k-means, GMMs produce soft, probabilistic clusters and allow ellipsoidal shapes, which better follow the structure of the temperature–rainfall distribution. Increasing M yields more detailed regimes, but at the cost of more parameters and a higher risk of overfitting.
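One way to see the "soft" nature of the assignments is to look at the maximum posterior probability of each point. A minimal sketch for an M=3 mixture, assuming X_scaled is still in scope:

# Sketch: assignment confidence of a 3-component GMM (assumes X_scaled in scope)
gmm3 = GaussianMixture(n_components=3, covariance_type="full", random_state=0).fit(X_scaled)
p_max = gmm3.predict_proba(X_scaled).max(axis=1)

print("Median max posterior:", round(float(np.median(p_max)), 3))
print("Fraction of ambiguous points (p_max < 0.8):", round(float((p_max < 0.8).mean()), 3))

plt.figure(figsize=(6, 4))
plt.hist(p_max, bins=30)
plt.xlabel("Max posterior probability per point")
plt.ylabel("Number of days")
plt.title("How soft are the GMM assignments? (M=3)")
plt.grid(True)
plt.show()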
7. Clustering with TMIN and TMAX¶
Optionally, we repeat the visualisation using other variables (in this case TMIN and TMAX).
# ---------------------------
# 7. OPTIONAL: CLUSTERING WITH TMIN AND TMAX
# ---------------------------
vars_tt = ["TMIN", "TMAX"]
data_tt = df[vars_tt].dropna().copy()
X_tt = data_tt.values
scaler_tt = StandardScaler()
X_tt_scaled = scaler_tt.fit_transform(X_tt)
# Quick k-means with k=3
k_tt = 3
kmeans_tt = KMeans(n_clusters=k_tt, random_state=0, n_init=10)
labels_tt = kmeans_tt.fit_predict(X_tt_scaled)
plt.figure(figsize=(6,5))
plt.scatter(
data_tt["TMIN"],
data_tt["TMAX"],
c=labels_tt,
s=10,
alpha=0.6
)
plt.xlabel("Daily minimum temperature TMIN (°C)")
plt.ylabel("Daily maximum temperature TMAX (°C)")
plt.title(f"K-means clustering in (TMIN, TMAX), k={k_tt}")
plt.grid(True)
plt.show()
# ----------------------------------------
# OPTIONAL: MULTIPLE KMEANS IN (TMIN, TMAX)
# ----------------------------------------
vars_tt = ["TMIN", "TMAX"]
data_tt = df[vars_tt].dropna().copy()
X_tt = data_tt.values
scaler_tt = StandardScaler()
X_tt_scaled = scaler_tt.fit_transform(X_tt)
ks_tt = [2, 3, 4]
fig, axes = plt.subplots(1, len(ks_tt), figsize=(15, 4), sharex=True, sharey=True)
axes = axes.ravel()
for ax, k in zip(axes, ks_tt):
km_tt = KMeans(n_clusters=k, random_state=0, n_init=10)
labels_tt = km_tt.fit_predict(X_tt_scaled)
ax.scatter(
data_tt["TMIN"],
data_tt["TMAX"],
c=labels_tt,
s=8,
alpha=0.6
)
ax.set_title(f"K-means in (TMIN, TMAX), k={k}")
ax.set_xlabel("TMIN (°C)")
ax.set_ylabel("TMAX (°C)")
ax.grid(True)
plt.suptitle("Daily regimes based on (TMIN, TMAX) for several k", y=1.05)
plt.tight_layout()
plt.show()
The clusters can be interpreted as different types of days according to their thermal range (cold, mild, warm).
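A minimal sketch to back this up, refitting k=3 on X_tt_scaled and summarising the thermal range TMAX − TMIN per cluster (assumes data_tt and X_tt_scaled from above are in scope):

# Sketch: thermal range per cluster (refits k=3 to avoid reusing variables from the loop above)
km3_tt = KMeans(n_clusters=3, random_state=0, n_init=10)
data_tt_k = data_tt.copy()
data_tt_k["cluster"] = km3_tt.fit_predict(X_tt_scaled)
data_tt_k["RANGE"] = data_tt_k["TMAX"] - data_tt_k["TMIN"]

print(data_tt_k.groupby("cluster")[["TMIN", "TMAX", "RANGE"]].mean().round(2))
print(data_tt_k["cluster"].value_counts().sort_index())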
8. Density estimation and clustering on local weather data¶
In this assignment I applied k-means clustering and Gaussian Mixture Models (GMMs) to my daily weather records from AEMET (station 1363X, 2008–2025). Using daily mean temperature and precipitation as a 2D feature space, I used an elbow curve to select the number of clusters for k-means and BIC/AIC to select the number of components for the GMM. The k-means results reveal distinct types of days (warm–dry, cold–dry, rainy), while the GMM provides a smooth probabilistic description of their distribution, with a strong concentration of probability around dry days and long tails towards intense rainfall events. I repeated the analysis with (TMIN, TMAX) to cluster days by their thermal range. Together, these models give a compact, probabilistic description of the local climate that will be useful later for forecasting and generative simulations.
9. Clustering by year with humidity, temperature and precipitation¶
I repeated the clustering analysis year by year, this time using a 3D feature space given by (humidity Hum, temperature Temp, precipitation Prec) from the hourly records. For each year from 2019 to 2022 I ran k-means with k = 3 and k = 4 and visualised the clusters in three pairwise projections: humidity vs temperature, humidity vs precipitation, and temperature vs precipitation; a final run clusters the whole 2019–2024 period together.
The resulting clusters can be interpreted as different "weather regimes": warm and dry conditions, cold and dry conditions, and various types of rainy conditions with higher humidity and precipitation. Comparing the years shows how the relative frequency and position of these regimes change over time, providing a compact way to describe and compare the local climate at different points in time.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# ----------------------------------------------------
# 0. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC"])  # first require only a valid UTC timestamp
df["YEAR"] = df["UTC"].dt.year
df = df.sort_values("UTC")
print("Available columns in df:")
print(df.columns.tolist())
# ----------------------------------------------------
# 1. TRY TO GUESS COLUMN NAMES FOR Hum, tmed, Tprec
# ----------------------------------------------------
def guess_column(substring_options):
"""
Find first column whose lowercase name contains
ANY of the substrings in substring_options.
"""
cols = df.columns
for col in cols:
cl = col.lower()
for s in substring_options:
if s in cl:
return col
return None
# Try to detect humidity, mean temperature, precipitation
# Column names in this file (the guess_column() helper above can be used if they differ)
hum_col = "Hum"
tmed_col = "Temp"   # the exact name shown in df.columns
prec_col = "Prec"   # likewise
features = [hum_col, tmed_col, prec_col]
# If detection failed for any, raise an informative error
missing = []
if hum_col is None:
missing.append("humidity (e.g. 'Hum')")
if tmed_col is None:
missing.append("mean temperature (e.g. 'tmed' or 'TMEDIA')")
if prec_col is None:
missing.append("precipitation (e.g. 'Tprec', 'PRECIP', 'precp')")
if missing:
raise ValueError(
"Could not auto-detect the following columns: "
+ ", ".join(missing)
+ "\nCheck df.columns above and set hum_col / tmed_col / prec_col manually."
)
# Now we know the real names
features = [hum_col, tmed_col, prec_col]
# Also drop rows where these are NaN
df = df.dropna(subset=features)
print("\nUsing features:", features)
# ----------------------------------------------------
# 2. FILTER YEARS 2019–2022
# ----------------------------------------------------
df_2019 = df[df["YEAR"] == 2019].copy()
df_2020 = df[df["YEAR"] == 2020].copy()
df_2021 = df[df["YEAR"] == 2021].copy()
df_2022 = df[df["YEAR"] == 2022].copy()
print("Days in 2019:", len(df_2019))
print("Days in 2020:", len(df_2020))
print("Days in 2021:", len(df_2021))
print("Days in 2022:", len(df_2022))
# ----------------------------------------------------
# 3. BUILD FEATURE MATRICES AND STANDARDIZE
# ----------------------------------------------------
def prepare_X(df_year, feat_cols):
"""
From a yearly dataframe, build the feature matrix X,
standardize it and return (data, X_scaled, scaler).
"""
data = df_year[feat_cols].dropna().copy()
if data.empty:
print(f"WARNING: no valid rows for this year after dropna({feat_cols}).")
X = data.values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
return data, X_scaled, scaler
data_2019, X2019_scaled, scaler_2019 = prepare_X(df_2019, features)
data_2020, X2020_scaled, scaler_2020 = prepare_X(df_2020, features)
data_2021, X2021_scaled, scaler_2021 = prepare_X(df_2021, features)
data_2022, X2022_scaled, scaler_2022 = prepare_X(df_2022, features)
# ----------------------------------------------------
# 4. K-MEANS CLUSTERING FOR A GIVEN YEAR
# ----------------------------------------------------
def kmeans_for_year(data, X_scaled, scaler, year, feat_cols, k_values=(3, 4)):
"""
Run K-means for several k values on one year and plot
pairwise projections of the clusters.
"""
if data.empty:
print(f"No data available for year {year}, skipping.")
return
hum_name, tmed_name, prec_name = feat_cols
for k in k_values:
km = KMeans(n_clusters=k, random_state=0, n_init=10)
labels = km.fit_predict(X_scaled)
data_k = data.copy()
data_k["cluster"] = labels
# Cluster centers back to original units
centers_scaled = km.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
print(f"\nYear {year}, k={k} cluster centers ({hum_name}, {tmed_name}, {prec_name}):")
for i, (hum_c, tmed_c, prec_c) in enumerate(centers):
print(f" Cluster {i}: {hum_name}≈{hum_c:.2f}, {tmed_name}≈{tmed_c:.2f}, {prec_name}≈{prec_c:.2f}")
# Pairwise plots: (Hum vs tmed), (Hum vs Tprec), (tmed vs Tprec)
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharex=False, sharey=False)
ax1, ax2, ax3 = axes
# Hum vs tmed
ax1.scatter(
data_k[hum_name], data_k[tmed_name],
c=data_k["cluster"], s=10, alpha=0.6
)
ax1.set_xlabel(f"Humidity ({hum_name})")
ax1.set_ylabel(f"Mean temperature ({tmed_name})")
ax1.set_title(f"{year} – k-means (k={k})\n{hum_name} vs {tmed_name}")
ax1.grid(True)
# Hum vs Precip
ax2.scatter(
data_k[hum_name], data_k[prec_name],
c=data_k["cluster"], s=10, alpha=0.6
)
ax2.set_xlabel(f"Humidity ({hum_name})")
ax2.set_ylabel(f"Precipitation ({prec_name})")
ax2.set_title(f"{year} – k-means (k={k})\n{hum_name} vs {prec_name}")
ax2.grid(True)
# tmed vs Precip
ax3.scatter(
data_k[tmed_name], data_k[prec_name],
c=data_k["cluster"], s=10, alpha=0.6
)
ax3.set_xlabel(f"Mean temperature ({tmed_name})")
ax3.set_ylabel(f"Precipitation ({prec_name})")
ax3.set_title(f"{year} – k-means (k={k})\n{tmed_name} vs {prec_name}")
ax3.grid(True)
plt.suptitle(f"K-means clustering for year {year} (k={k})", y=1.05, fontsize=14)
plt.tight_layout()
plt.show()
# ----------------------------------------------------
# 5. RUN CLUSTERING FOR 2019, 2020, 2021 and 2022
# ----------------------------------------------------
kmeans_for_year(data_2019, X2019_scaled, scaler_2019, year=2019, feat_cols=features, k_values=(3, 4))
kmeans_for_year(data_2020, X2020_scaled, scaler_2020, year=2020, feat_cols=features, k_values=(3, 4))
kmeans_for_year(data_2021, X2021_scaled, scaler_2021, year=2021, feat_cols=features, k_values=(3, 4))
kmeans_for_year(data_2022, X2022_scaled, scaler_2022, year=2022, feat_cols=features, k_values=(3, 4))
Available columns in df:
['Id', 'Lon', 'Lat', 'Alt', 'Nombre', 'UTC', 'Prec', 'Hum', 'Temp', 'TempMin', 'TempMax', 'YEAR']

Using features: ['Hum', 'Temp', 'Prec']
Days in 2019: 7198
Days in 2020: 8690
Days in 2021: 8713
Days in 2022: 8228

Year 2019, k=3 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈91.53, Temp≈10.50, Prec≈0.12
  Cluster 1: Hum≈63.06, Temp≈19.55, Prec≈0.00
  Cluster 2: Hum≈96.56, Temp≈10.30, Prec≈3.69

Year 2019, k=4 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈94.41, Temp≈7.75, Prec≈0.17
  Cluster 1: Hum≈84.44, Temp≈14.82, Prec≈0.05
  Cluster 2: Hum≈56.83, Temp≈21.02, Prec≈0.00
  Cluster 3: Hum≈96.56, Temp≈10.30, Prec≈3.69

Year 2020, k=3 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈64.47, Temp≈19.85, Prec≈0.00
  Cluster 1: Hum≈92.22, Temp≈10.55, Prec≈0.12
  Cluster 2: Hum≈96.33, Temp≈10.92, Prec≈3.70

Year 2020, k=4 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈95.66, Temp≈7.88, Prec≈0.17
  Cluster 1: Hum≈58.33, Temp≈21.79, Prec≈0.00
  Cluster 2: Hum≈85.00, Temp≈14.50, Prec≈0.05
  Cluster 3: Hum≈96.26, Temp≈11.12, Prec≈3.75

Year 2021, k=3 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈96.24, Temp≈11.42, Prec≈3.74
  Cluster 1: Hum≈92.45, Temp≈9.66, Prec≈0.14
  Cluster 2: Hum≈66.48, Temp≈17.90, Prec≈0.00

Year 2021, k=4 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈60.90, Temp≈19.13, Prec≈0.00
  Cluster 1: Hum≈94.42, Temp≈6.49, Prec≈0.17
  Cluster 2: Hum≈87.33, Temp≈13.70, Prec≈0.09
  Cluster 3: Hum≈96.17, Temp≈11.54, Prec≈3.80

Year 2022, k=3 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈90.40, Temp≈10.26, Prec≈0.11
  Cluster 1: Hum≈94.53, Temp≈12.01, Prec≈4.44
  Cluster 2: Hum≈65.36, Temp≈19.11, Prec≈0.00

Year 2022, k=4 cluster centers (Hum, Temp, Prec):
  Cluster 0: Hum≈60.24, Temp≈20.51, Prec≈0.00
  Cluster 1: Hum≈84.57, Temp≈14.35, Prec≈0.07
  Cluster 2: Hum≈93.57, Temp≈6.88, Prec≈0.12
  Cluster 3: Hum≈94.53, Temp≈12.01, Prec≈4.44
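As mentioned above, the relative frequency of these regimes changes between years. A minimal sketch that fits k=3 per year and prints the share of records in each cluster; it assumes data_2019 … data_2022 and the corresponding scaled matrices from the cell above, and note that cluster indices are arbitrary, so regimes must be matched across years by their centres:

# Sketch: share of records in each regime, per year (k=3)
year_data = {
    2019: (data_2019, X2019_scaled),
    2020: (data_2020, X2020_scaled),
    2021: (data_2021, X2021_scaled),
    2022: (data_2022, X2022_scaled),
}
for year, (d, Xs) in year_data.items():
    if d.empty:
        continue
    km = KMeans(n_clusters=3, random_state=0, n_init=10)
    lab = km.fit_predict(Xs)
    shares = np.bincount(lab, minlength=3) / len(lab)
    print(f"{year}: cluster shares = {np.round(shares, 3)}")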
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# --------------------------------------
# 1. Load data and prepare YEAR
# --------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC with timezone
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only rows with valid time and variables
df = df.dropna(subset=["UTC", "Hum", "Temp", "Prec"])
# Add year column
df["YEAR"] = df["UTC"].dt.year
# Keep only years between 2019 and 2024 (this file covers 2019–2024)
df_years = df[df["YEAR"].between(2019, 2024)].copy()
print("Years present in data:", df_years["YEAR"].unique())
print("Total rows used:", len(df_years))
# --------------------------------------
# 2. Build feature matrix and standardize
# --------------------------------------
features = ["Hum", "Temp", "Prec"]
X = df_years[features].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# --------------------------------------
# 3. K-means clustering on ALL years together
# --------------------------------------
k = 4 # you can try 3, 4, 5...
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
labels = kmeans.fit_predict(X_scaled)
df_years["cluster"] = labels
# Optional: show cluster centers in original units
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
print(f"\nCluster centers (Hum, Temp, Prec) with k={k}:")
for i, (hum_c, temp_c, prec_c) in enumerate(centers):
print(f" Cluster {i}: Hum≈{hum_c:.2f}, Temp≈{temp_c:.2f} °C, Prec≈{prec_c:.3f} mm")
# --------------------------------------
# 4. Single scatter plot: ALL years, color = cluster
# X axis: Temp, Y axis: Prec
# --------------------------------------
plt.figure(figsize=(7, 5))
scatter = plt.scatter(
df_years["Temp"],
df_years["Prec"],
c=df_years["cluster"],
s=10,
alpha=0.6
)
plt.xlabel("Temperature (Temp, °C)")
plt.ylabel("Precipitation (Prec, mm)")
plt.title(f"K-means clustering (k={k}) for all years (2019–2024)")
plt.grid(True)
plt.tight_layout()
plt.show()
Years present in data: [2019 2020 2021 2022 2023 2024]
Total rows used: 43343

Cluster centers (Hum, Temp, Prec) with k=4:
  Cluster 0: Hum≈85.60, Temp≈14.27 °C, Prec≈0.077 mm
  Cluster 1: Hum≈59.24, Temp≈20.59 °C, Prec≈0.001 mm
  Cluster 2: Hum≈95.26, Temp≈11.37 °C, Prec≈4.549 mm
  Cluster 3: Hum≈94.14, Temp≈7.09 °C, Prec≈0.189 mm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# ---------------------------
# 1. LOAD AND PREPARE DATA
# ---------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC column
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce")
df = df.dropna(subset=["UTC", "Hum", "Temp", "Prec"])
# (Optional) add year if you want to filter
df["YEAR"] = df["UTC"].dt.year
# Select features and target
# X = [Hum, Temp], y = Prec
data = df[["Hum", "Temp", "Prec"]].dropna().copy()
X_raw = data[["Hum", "Temp"]].values
y = data["Prec"].values
print("Number of samples:", len(data))
print(data.head())
# ---------------------------
# 2. STANDARDIZE INPUT FEATURES
# ---------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
print("Mean after scaling:", X.mean(axis=0))
print("Std after scaling:", X.std(axis=0))
# ---------------------------
# 3. FIT GMM ON X (Hum, Temp)
# ---------------------------
K = 3 # number of clusters / components in the CWM (you can try 2, 3, 4...)
gmm = GaussianMixture(
n_components=K,
covariance_type="full",
random_state=0
)
gmm.fit(X)
# Responsibilities gamma_{nk}
resp = gmm.predict_proba(X) # shape (N, K)
labels_hard = resp.argmax(axis=1)
print("GMM converged:", gmm.converged_)
print("GMM weights:", gmm.weights_)
# ---------------------------
# 4. FIT ONE LINEAR REGRESSION PER CLUSTER (WEIGHTED)
# ---------------------------
regs = []
for k in range(K):
reg = LinearRegression()
# Weighted fit: sample_weight = responsibilities for component k
reg.fit(X, y, sample_weight=resp[:, k])
regs.append(reg)
print(f"Component {k}: coefficients = {reg.coef_}, intercept = {reg.intercept_}")
# ---------------------------
# 5. CWM PREDICTION FUNCTION
# ---------------------------
def cwm_predict(X_new_raw):
"""
X_new_raw: array of shape (N, 2) with columns [Hum, Temp] in original units.
Returns:
y_hat: final CWM prediction
resp_new: responsibilities (N, K)
preds_components: predictions of each component (N, K)
"""
# Standardize with same scaler used for training
X_new = scaler.transform(X_new_raw)
# Responsibilities under the GMM
resp_new = gmm.predict_proba(X_new) # (N, K)
# Predictions of each component
preds_components = np.column_stack([
regs[k].predict(X_new) for k in range(K)
]) # shape (N, K)
# Cluster-weighted prediction
y_hat = np.sum(resp_new * preds_components, axis=1)
return y_hat, resp_new, preds_components
# Evaluate on training data
y_hat_train, resp_train, preds_comp_train = cwm_predict(X_raw)
mse_cwm = mean_squared_error(y, y_hat_train)
print("CWM training MSE:", mse_cwm)
# ---------------------------
# 6. GLOBAL LINEAR BASELINE FOR COMPARISON
# ---------------------------
global_reg = LinearRegression()
global_reg.fit(X, y)
y_hat_global = global_reg.predict(X)
mse_global = mean_squared_error(y, y_hat_global)
print("Global linear regression MSE:", mse_global)
# ---------------------------
# 7.1 CLUSTER STRUCTURE IN (Hum, Temp)
# ---------------------------
plt.figure(figsize=(6, 5))
plt.scatter(
data["Hum"], data["Temp"],
c=labels_hard,
s=8,
alpha=0.6
)
plt.xlabel("Humidity (Hum)")
plt.ylabel("Temperature (Temp, °C)")
plt.title(f"GMM clusters in (Hum, Temp), K={K}")
plt.grid(True)
plt.tight_layout()
plt.show()
Number of samples: 43343
Hum Temp Prec
0 81.0 7.3 0.0
1 85.0 5.7 0.0
2 89.0 4.3 0.0
3 90.0 3.4 0.0
4 93.0 2.6 0.0
Mean after scaling: [-3.46230362e-16 2.41312070e-16]
Std after scaling: [1. 1.]
GMM converged: True
GMM weights: [0.36049034 0.27755092 0.36195875]
Component 0: coefficients = [0.09829515 0.20290521], intercept = 0.4433711161830282
Component 1: coefficients = [ 0.03844935 -0.00015775], intercept = 0.06689255128904004
Component 2: coefficients = [ 0.33390228 -0.00261882], intercept = 0.12025152271122325
CWM training MSE: 0.6711506717962583
Global linear regression MSE: 0.6884022354543123
# ---------------------------
# 7.2 TRUE vs PREDICTED PRECIPITATION (CWM)
# ---------------------------
plt.figure(figsize=(6, 5))
plt.scatter(
y, y_hat_train,
s=8,
alpha=0.5
)
mn = min(y.min(), y_hat_train.min())
mx = max(y.max(), y_hat_train.max())
plt.plot([mn, mx], [mn, mx])
plt.xlabel("True Prec (mm)")
plt.ylabel("Predicted Prec (mm)")
plt.title(f"Cluster-Weighted Model (K={K})\nMSE={mse_cwm:.4f}")
plt.grid(True)
plt.tight_layout()
plt.show()
# ---------------------------
# 7.3 COMPARE CWM vs GLOBAL LINEAR REGRESSION
# ---------------------------
plt.figure(figsize=(6, 5))
plt.scatter(
y_hat_global, y_hat_train,
s=8,
alpha=0.5
)
plt.xlabel("Global linear prediction (mm)")
plt.ylabel("CWM prediction (mm)")
plt.title("CWM vs global linear regression")
plt.grid(True)
plt.tight_layout()
plt.show()
10. Cluster-Weighted Modelling (CWM)¶
In this assignment I used a Cluster-Weighted Model (CWM) to predict daily precipitation Prec from humidity Hum and temperature Temp.
The idea is to combine clustering and regression in a single probabilistic model:
First, I fit a Gaussian Mixture Model (GMM) in the input space (Hum, Temp).
This defines K soft clusters and gives, for each data point x_n, the responsibilities
$$\gamma_{nk} = p(k \mid x_n),$$
i.e. the probability that point x_n belongs to component k.
Then, for each component k, I fit a local linear regression:
$$y = f_k(x) = a_k \cdot \text{Hum} + b_k \cdot \text{Temp} + c_k$$
Training is weighted by the responsibilities γ_{nk}, so points that belong more to component k have larger weight in that regression.
For a new input x, the final CWM prediction is a mixture of experts:
$$\hat{y}(x) = \sum_{k=1}^{K} p(k \mid x) \, f_k(x)$$
In other words, it is a weighted average of the local models, where the weights are the posterior probabilities of each cluster given x.
This approach allows the model to adapt to different weather regimes in the (Hum, Temp) plane (for example, dry vs humid conditions). Instead of forcing a single global linear relationship between Prec, Hum and Temp, the CWM uses several simpler local linear models and blends them smoothly according to the cluster structure learned by the GMM.
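As a toy numeric illustration of the prediction rule (made-up numbers, not taken from the fitted model): with K = 3, responsibilities p(k|x) = (0.7, 0.2, 0.1) and local predictions f_k(x) = (2.5, 0.3, 0.0) mm, the CWM output is the weighted average 0.7·2.5 + 0.2·0.3 + 0.1·0.0 = 1.81 mm.

# Toy illustration of the CWM prediction rule (made-up numbers, K = 3)
import numpy as np

resp_x = np.array([0.7, 0.2, 0.1])   # p(k | x): posterior weight of each component at x
f_x = np.array([2.5, 0.3, 0.0])      # f_k(x): local linear predictions (mm)

y_hat = np.sum(resp_x * f_x)          # 0.7*2.5 + 0.2*0.3 + 0.1*0.0
print(y_hat)                          # 1.81 mm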
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# -----------------------------------------
# 1. LOAD DATA
# -----------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC time
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only valid rows
df = df.dropna(subset=["UTC", "Hum", "Prec"])
df = df.sort_values("UTC").reset_index(drop=True)
print("Rows:", len(df))
print(df[["UTC", "Hum", "Prec"]].head())
# -----------------------------------------
# 2. SIMPLE RELATION: Hum vs Prec
# -----------------------------------------
plt.figure(figsize=(6, 5))
plt.scatter(df["Prec"], df["Hum"], s=5, alpha=0.3)
plt.xlabel("Precipitation (Prec, mm)")
plt.ylabel("Humidity (Hum, %)")
plt.title("Humidity vs precipitation (same time)")
plt.grid(True)
plt.tight_layout()
plt.show()
# -----------------------------------------
# 3. EFFECT OF PRECIPITATION ON HUMIDITY CHANGE
# ΔHum = Hum(t+1) - Hum(t) vs Prec(t)
# -----------------------------------------
# Shift humidity one step forward
df["Hum_next"] = df["Hum"].shift(-1)
df["dHum"] = df["Hum_next"] - df["Hum"]
# Drop last row (no next value) and NaNs
df_delta = df.dropna(subset=["dHum", "Prec"]).copy()
print("Rows for delta analysis:", len(df_delta))
plt.figure(figsize=(6, 5))
plt.scatter(df_delta["Prec"], df_delta["dHum"], s=5, alpha=0.3)
plt.axhline(0, color="gray", linestyle="--")
plt.xlabel("Precipitation at time t (Prec, mm)")
plt.ylabel("Change in humidity ΔHum = Hum(t+1) - Hum(t)")
plt.title("Humidity change vs precipitation")
plt.grid(True)
plt.tight_layout()
plt.show()
# -----------------------------------------
# 4. Binned average: how mean ΔHum depends on Prec
# -----------------------------------------
# Define precipitation bins
bins = np.linspace(0, df_delta["Prec"].max(), 15)  # 15 bin edges (14 bins) from 0 to max Prec
df_delta["Prec_bin"] = pd.cut(df_delta["Prec"], bins=bins, include_lowest=True)
# Compute mean ΔHum per bin
bin_stats = (
df_delta
.groupby("Prec_bin", observed=False)["dHum"]
.agg(["mean", "count"])
.reset_index()
)
# Use bin centers for plotting
bin_centers = [interval.mid for interval in bin_stats["Prec_bin"]]
plt.figure(figsize=(6, 5))
plt.scatter(df_delta["Prec"], df_delta["dHum"], s=5, alpha=0.1, label="Individual points")
plt.axhline(0, color="gray", linestyle="--")
plt.plot(bin_centers, bin_stats["mean"], marker="o", linewidth=2, label="Mean ΔHum per Prec bin")
plt.xlabel("Precipitation (Prec, mm)")
plt.ylabel("Change in humidity ΔHum")
plt.title("Average humidity change vs precipitation")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Rows: 43343
UTC Hum Prec
0 2019-02-15 21:00:00+00:00 81.0 0.0
1 2019-02-15 22:00:00+00:00 85.0 0.0
2 2019-02-15 23:00:00+00:00 89.0 0.0
3 2019-02-16 00:00:00+00:00 90.0 0.0
4 2019-02-16 01:00:00+00:00 93.0 0.0
Rows for delta analysis: 43342
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial import Voronoi, voronoi_plot_2d
# ----------------------------------------------------
# 1. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC time
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only valid rows for humidity and temperature
df = df.dropna(subset=["UTC", "Hum", "Temp"])
df = df.sort_values("UTC").reset_index(drop=True)
print("Rows:", len(df))
print(df[["UTC", "Hum", "Temp"]].head())
# Optional: restrict to a year range if you want
df["YEAR"] = df["UTC"].dt.year
df_sel = df[df["YEAR"].between(2019, 2024)].copy()
print("Years present:", df_sel["YEAR"].unique())
# ----------------------------------------------------
# 2. BUILD FEATURE MATRIX: X = [Hum, Temp]
# ----------------------------------------------------
X_raw = df_sel[["Hum", "Temp"]].values
# Standardize for k-means (Voronoi will use centers in original scale)
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
# ----------------------------------------------------
# 3. K-MEANS TO GET CLUSTER CENTERS
# ----------------------------------------------------
k = 5 # number of clusters for Voronoi; try 3–6
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
labels = kmeans.fit_predict(X)
# Centers in standardized space
centers_scaled = kmeans.cluster_centers_
# Back to original units (Hum, Temp)
centers = scaler.inverse_transform(centers_scaled)
print("\nCluster centers (Hum, Temp) for k =", k)
for i, (hum_c, temp_c) in enumerate(centers):
print(f" Cluster {i}: Hum≈{hum_c:.1f} %, Temp≈{temp_c:.1f} °C")
# ----------------------------------------------------
# 4. VORONOI TESSELLATION OF CLUSTER CENTERS
# ----------------------------------------------------
# Voronoi is computed directly in the (Hum, Temp) plane
vor = Voronoi(centers)
# ----------------------------------------------------
# 5. PLOT: VORONOI CELLS + DATA POINTS
# ----------------------------------------------------
fig, ax = plt.subplots(figsize=(7, 6))
# Plot Voronoi diagram (only edges)
voronoi_plot_2d(
vor,
ax=ax,
show_vertices=False,
line_colors="black",
line_width=1.0,
line_alpha=0.8,
point_size=0 # we will plot centers ourselves
)
# Scatter of data points, colored by cluster label
sc = ax.scatter(
df_sel["Hum"], df_sel["Temp"],
c=labels,
s=8,
alpha=0.4,
cmap="viridis"
)
# Plot centers on top
ax.scatter(
centers[:, 0], centers[:, 1],
marker="X",
s=120,
edgecolor="white",
linewidth=1.5,
c=range(k),
cmap="viridis"
)
ax.set_xlabel("Humidity (Hum, %)")
ax.set_ylabel("Temperature (Temp, °C)")
ax.set_title(f"Voronoi tessellation of k-means centers (k={k})\nIn (Hum, Temp) space")
ax.grid(True)
# Adjust limits a bit around data
hum_min, hum_max = df_sel["Hum"].min(), df_sel["Hum"].max()
temp_min, temp_max = df_sel["Temp"].min(), df_sel["Temp"].max()
ax.set_xlim(hum_min - 2, hum_max + 2)
ax.set_ylim(temp_min - 2, temp_max + 2)
plt.tight_layout()
plt.show()
Rows: 43629
UTC Hum Temp
0 2019-02-15 21:00:00+00:00 81.0 7.3
1 2019-02-15 22:00:00+00:00 85.0 5.7
2 2019-02-15 23:00:00+00:00 89.0 4.3
3 2019-02-16 00:00:00+00:00 90.0 3.4
4 2019-02-16 01:00:00+00:00 93.0 2.6
Years present: [2019 2020 2021 2022 2023 2024]
Cluster centers (Hum, Temp) for k = 5
Cluster 0: Hum≈92.6 %, Temp≈13.5 °C
Cluster 1: Hum≈53.0 %, Temp≈22.8 °C
Cluster 2: Hum≈74.0 %, Temp≈18.1 °C
Cluster 3: Hum≈95.2 %, Temp≈6.1 °C
Cluster 4: Hum≈75.0 %, Temp≈10.6 °C
11. Fog regimes and Voronoi tessellation in (Hum, Temp)¶
To analyse the typical conditions that lead to fog, I defined fog-like situations at the hourly scale using only humidity and temperature:
Hum ≥ 95 % and −2 °C ≤ Temp ≤ 15 °C
Then, I considered a fog day as any calendar day with at least 8 hours that satisfy these conditions. Using this definition, I selected all hourly records that are both:
- Inside a fog day, and
- Marked as fog-like according to the thresholds above.
On this subset of data (years 2019–2024), I built a 2D feature space with:
- x-axis: Humidity (Hum, %)
- y-axis: Temperature (Temp, °C)
I ran k-means on this (Hum, Temp) plane (using k = 5 clusters) and used the resulting centroids as generators of a Voronoi tessellation. The Voronoi diagram divides the plane into regions where every point is closer to one centroid than to any other.
Interpreting the figure:
- Each Voronoi cell represents a fog regime, i.e. a typical combination of humidity and temperature during fog episodes (for example, slightly warmer/humid fog vs colder fog).
- The coloured points are individual fog-like records (hours) and their colour indicates the k-means cluster they are assigned to.
- The “X” markers show the cluster centres in the original units of (Hum, Temp).
This representation provides a geometric view of fog conditions: instead of seeing fog as a single threshold (e.g. “Hum > 95 %”), we identify several sub-regimes of fog in the (Hum, Temp) space and how often the data falls into each of them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial import Voronoi, voronoi_plot_2d
# ----------------------------------------------------
# 1. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC time
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only valid rows for humidity and temperature
df = df.dropna(subset=["UTC", "Hum", "Temp"])
df = df.sort_values("UTC").reset_index(drop=True)
# Year and calendar day
df["YEAR"] = df["UTC"].dt.year
df["DATE"] = df["UTC"].dt.date
print("Rows total:", len(df))
print("Years present:", df["YEAR"].unique())
# ----------------------------------------------------
# 2. FLAG FOG-LIKE CONDITIONS HOURLY
# fog_cond = high humidity + relatively low temperature
# ----------------------------------------------------
fog_cond = (
(df["Hum"] >= 95.0) &
(df["Temp"] >= -2.0) &
(df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_cond
# ----------------------------------------------------
# 3. DAILY FOG COUNTS (DAYS 2019–2024)
# ----------------------------------------------------
daily_fog = (
df.groupby("DATE")
.agg(
fog_hours=("FOG_FLAG", "sum"),
n_obs=("FOG_FLAG", "size"),
YEAR=("YEAR", "first")
)
.reset_index()
)
# Keep only 2019–2024
daily_fog = daily_fog[daily_fog["YEAR"].between(2019, 2024)]
# Define "fog day" as >= 8 fog-like hours
threshold_hours = 8
fog_days = daily_fog.loc[daily_fog["fog_hours"] >= threshold_hours, "DATE"]
print(f"Number of fog days (>= {threshold_hours} fog-like hours):", len(fog_days))
# ----------------------------------------------------
# 4. SELECT ONLY FOG-LIKE HOURS OF FOG DAYS
# ----------------------------------------------------
df_fog = df[df["DATE"].isin(fog_days) & df["FOG_FLAG"]].copy()
print("Fog-like hourly records in fog days:", len(df_fog))
if len(df_fog) < 2:
raise ValueError("Not enough fog-like records to build clusters / Voronoi.")
# Restrict to 2019–2024 again (just in case)
df_fog = df_fog[df_fog["YEAR"].between(2019, 2024)].copy()
# ----------------------------------------------------
# 5. BUILD FEATURE MATRIX: X = [Hum, Temp] FOR FOG CONDITIONS
# ----------------------------------------------------
X_raw = df_fog[["Hum", "Temp"]].values
# Standardize for k-means
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
# ----------------------------------------------------
# 6. K-MEANS ON FOG CONDITIONS
# ----------------------------------------------------
# Choose number of clusters (limited by number of fog points)
k = min(5, len(df_fog)) # at most 5, but no more than number of points
if k < 2:
raise ValueError("Not enough fog points for k-means with k>=2.")
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
labels = kmeans.fit_predict(X)
# Centers in standardized space -> back to (Hum, Temp)
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
print("\nCluster centers for fog conditions (Hum, Temp):")
for i, (hum_c, temp_c) in enumerate(centers):
print(f" Cluster {i}: Hum≈{hum_c:.1f} %, Temp≈{temp_c:.1f} °C")
# ----------------------------------------------------
# 7. VORONOI TESSELLATION OF FOG CLUSTER CENTERS
# ----------------------------------------------------
vor = Voronoi(centers)
# ----------------------------------------------------
# 8. PLOT: VORONOI CELLS + FOG POINTS (2019–2024)
# ----------------------------------------------------
fig, ax = plt.subplots(figsize=(7, 6))
# Voronoi diagram (only edges)
voronoi_plot_2d(
vor,
ax=ax,
show_vertices=False,
line_colors="black",
line_width=1.0,
line_alpha=0.8,
point_size=0 # we'll plot centers ourselves
)
# Scatter of fog points, colored by cluster label
sc = ax.scatter(
df_fog["Hum"], df_fog["Temp"],
c=labels,
s=10,
alpha=0.5,
cmap="viridis"
)
# Plot cluster centers
ax.scatter(
centers[:, 0], centers[:, 1],
marker="X",
s=120,
edgecolor="white",
linewidth=1.5,
c=range(k),
cmap="viridis"
)
ax.set_xlabel("Humidity (Hum, %)")
ax.set_ylabel("Temperature (Temp, °C)")
ax.set_title(
f"Voronoi tessellation of fog regimes (2019–2024)\n"
f"Fog days: {len(fog_days)}, fog records: {len(df_fog)}, k={k}"
)
ax.grid(True)
# Adjust axis limits around fog data
hum_min, hum_max = df_fog["Hum"].min(), df_fog["Hum"].max()
temp_min, temp_max = df_fog["Temp"].min(), df_fog["Temp"].max()
ax.set_xlim(hum_min - 2, hum_max + 2)
ax.set_ylim(temp_min - 2, temp_max + 2)
plt.tight_layout()
plt.show()
Rows total: 43629
Years present: [2019 2020 2021 2022 2023 2024]
Number of fog days (>= 8 fog-like hours): 748
Fog-like hourly records in fog days: 9512

Cluster centers for fog conditions (Hum, Temp):
  Cluster 0: Hum≈96.2 %, Temp≈4.5 °C
  Cluster 1: Hum≈99.2 %, Temp≈7.6 °C
  Cluster 2: Hum≈99.3 %, Temp≈2.2 °C
  Cluster 3: Hum≈96.1 %, Temp≈10.9 °C
  Cluster 4: Hum≈99.0 %, Temp≈11.8 °C
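To quantify how often the data falls into each fog regime / Voronoi cell, a minimal sketch (assuming labels, centers, k and df_fog from the cell above are still in scope):

# Sketch: occupancy of each Voronoi cell / fog regime
counts = np.bincount(labels, minlength=k)
for i, (hum_c, temp_c) in enumerate(centers):
    share = 100.0 * counts[i] / len(df_fog)
    print(f"Regime {i}: Hum≈{hum_c:.1f} %, Temp≈{temp_c:.1f} °C -> {counts[i]} records ({share:.1f} %)")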
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # needed for 3D projection in some environments
import numpy as np
# ----------------------------------------------------
# 3D SCATTER OF FOG CONDITIONS: (Hum, Temp, Prec)
# ----------------------------------------------------
# We need precipitation too; ensure column Prec exists and drop NaNs.
# Attach the k-means cluster labels as a column first, so they stay aligned after rows are dropped.
df_fog_3d = df_fog.copy()
df_fog_3d["cluster"] = labels
df_fog_3d = df_fog_3d.dropna(subset=["Prec"])
print("Fog-like records with precipitation:", len(df_fog_3d))
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
p = ax.scatter(
df_fog_3d["Hum"], # X: humidity
df_fog_3d["Temp"], # Y: temperature
df_fog_3d["Prec"], # Z: precipitation
    c=df_fog_3d["cluster"],   # colour by cluster (same labels as in 2D, aligned with df_fog_3d rows)
s=10,
alpha=0.7,
cmap="viridis"
)
# Optional: cluster centers extended to 3D, using median Prec per cluster
centers_3d = []
for k_idx in range(len(centers)):
    mask_k = (df_fog_3d["cluster"] == k_idx)
    prec_med = df_fog_3d.loc[mask_k, "Prec"].median()
centers_3d.append((centers[k_idx, 0], centers[k_idx, 1], prec_med))
centers_3d = np.array(centers_3d)
ax.scatter(
centers_3d[:, 0],
centers_3d[:, 1],
centers_3d[:, 2],
marker="X",
s=120,
edgecolor="white",
linewidth=1.5,
c=range(len(centers_3d)),
cmap="viridis"
)
ax.set_xlabel("Humidity (Hum, %)")
ax.set_ylabel("Temperature (Temp, °C)")
ax.set_zlabel("Precipitation (Prec, mm)")
ax.set_title("Fog-like conditions in 3D: (Hum, Temp, Prec)")
plt.tight_layout()
plt.show()
Fog-like records with precipitation: 9381
The 3D scatter plot shows fog-like conditions in the space (Hum, Temp, Prec).
Each point corresponds to an hourly record flagged as fog, with:
- x-axis: humidity (Hum),
- y-axis: temperature (Temp),
- z-axis: precipitation (Prec),
and colour given by the same k-means cluster as in the 2D Voronoi plot. The “X” markers indicate the approximate centres of each fog regime, extended to 3D by using the median precipitation in each cluster. This view reveals how different fog regimes are not only separated in humidity–temperature space, but also tend to be associated with different typical levels of precipitation.
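To check the last point, a minimal sketch that summarises precipitation per fog regime (assuming df_fog_3d with the cluster column attached in the 3D cell above):

# Sketch: typical precipitation per fog regime
prec_stats = (
    df_fog_3d
    .groupby("cluster")["Prec"]
    .agg(["median", "mean", "max", "count"])
    .round(2)
)
print(prec_stats)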
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# ----------------------------------------------------
# 0. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC column with timezone
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only rows with valid time and main variables
df = df.dropna(subset=["UTC", "Hum", "Temp", "Prec"]).copy()
# Add year and date for convenience
df["YEAR"] = df["UTC"].dt.year
df["DATE"] = df["UTC"].dt.date
print("Columns:", df.columns.tolist())
print("Total rows after cleaning:", len(df))
# ----------------------------------------------------
# 1. DEFINE FOG-LIKE CONDITIONS (HOURLY)
# fog: high humidity + relatively low temperature
# ----------------------------------------------------
fog_mask = (
(df["YEAR"].between(2019, 2024)) &
(df["Hum"] >= 95.0) &
(df["Temp"] >= -2.0) &
(df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_mask
df_fog = df[df["FOG_FLAG"]].copy()
print("Fog-like hourly records (2019–2024):", len(df_fog))
if len(df_fog) < 50:
print("WARNING: very few fog-like records, CWM may not be very meaningful.")
# ----------------------------------------------------
# 2. SELECT FEATURES AND TARGET FOR FOG CONDITIONS
# X = [Hum, Temp], y = Prec
# ----------------------------------------------------
data_fog = df_fog[["Hum", "Temp", "Prec"]].dropna().copy()
X_raw = data_fog[["Hum", "Temp"]].values
y = data_fog["Prec"].values
print("Samples used for fog CWM:", len(data_fog))
print(data_fog.head())
# ----------------------------------------------------
# 3. STANDARDIZE INPUT FEATURES
# ----------------------------------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
print("Mean after scaling:", X.mean(axis=0))
print("Std after scaling:", X.std(axis=0))
# ----------------------------------------------------
# 4. FIT GMM ON X (Hum, Temp) FOR FOG CONDITIONS
# ----------------------------------------------------
# Choose number of fog regimes
K = 3 # you can try 2, 3, 4...
K = min(K, len(data_fog)) # basic safety check
gmm = GaussianMixture(
n_components=K,
covariance_type="full",
random_state=0
)
gmm.fit(X)
# Responsibilities gamma_{nk} = p(k | x_n)
resp = gmm.predict_proba(X) # shape (N, K)
labels_hard = resp.argmax(axis=1)
print("GMM converged:", gmm.converged_)
print("GMM weights:", gmm.weights_)
# ----------------------------------------------------
# 5. FIT ONE LINEAR REGRESSION PER FOG REGIME (WEIGHTED)
# Each component k has its own local regression:
# Prec ~ Hum, Temp, fitted with weights gamma_{nk}
# ----------------------------------------------------
regs = []
for k in range(K):
reg = LinearRegression()
# Weighted fit: sample_weight = responsibilities for component k
reg.fit(X, y, sample_weight=resp[:, k])
regs.append(reg)
print(f"Fog component {k}: coefficients = {reg.coef_}, intercept = {reg.intercept_}")
# ----------------------------------------------------
# 6. CWM PREDICTION FUNCTION (FOG-ONLY MODEL)
# y_hat(x) = sum_k p(k|x) * f_k(x)
# ----------------------------------------------------
def cwm_predict_fog(X_new_raw):
"""
X_new_raw: array of shape (N, 2) with columns [Hum, Temp] in original units.
Returns:
y_hat: final CWM prediction (N,)
resp_new: responsibilities (N, K)
preds_components: predictions of each component (N, K)
"""
# Standardize with same scaler used for training
X_new = scaler.transform(X_new_raw)
# Responsibilities under the fog GMM
resp_new = gmm.predict_proba(X_new) # (N, K)
# Predictions of each component
preds_components = np.column_stack([
regs[k].predict(X_new) for k in range(K)
]) # shape (N, K)
# Cluster-weighted prediction
y_hat = np.sum(resp_new * preds_components, axis=1)
return y_hat, resp_new, preds_components
# ----------------------------------------------------
# 7. EVALUATE FOG CWM ON TRAINING DATA
# ----------------------------------------------------
y_hat_fog, resp_fog, preds_comp_fog = cwm_predict_fog(X_raw)
mse_cwm_fog = mean_squared_error(y, y_hat_fog)
print("Fog CWM training MSE:", mse_cwm_fog)
# ----------------------------------------------------
# 8. GLOBAL LINEAR BASELINE (FOG-ONLY) FOR COMPARISON
# ----------------------------------------------------
global_reg_fog = LinearRegression()
global_reg_fog.fit(X, y)
y_hat_global_fog = global_reg_fog.predict(X)
mse_global_fog = mean_squared_error(y, y_hat_global_fog)
print("Global linear regression (fog-only) MSE:", mse_global_fog)
# ----------------------------------------------------
# 9. PLOTS FOR FOG CWM
# ----------------------------------------------------
# 9.1 Fog cluster structure in (Hum, Temp)
plt.figure(figsize=(6, 5))
plt.scatter(
data_fog["Hum"], data_fog["Temp"],
c=labels_hard,
s=8,
alpha=0.6
)
plt.xlabel("Humidity (Hum, %)")
plt.ylabel("Temperature (Temp, °C)")
plt.title(f"GMM fog clusters in (Hum, Temp), K={K}")
plt.grid(True)
plt.tight_layout()
plt.show()
# 9.2 True vs predicted precipitation (Fog CWM)
plt.figure(figsize=(6, 5))
plt.scatter(
y, y_hat_fog,
s=8,
alpha=0.5
)
mn = min(y.min(), y_hat_fog.min())
mx = max(y.max(), y_hat_fog.max())
plt.plot([mn, mx], [mn, mx])
plt.xlabel("True Prec (mm)")
plt.ylabel("Predicted Prec (mm)")
plt.title(f"Fog Cluster-Weighted Model (K={K})\nMSE={mse_cwm_fog:.4f}")
plt.grid(True)
plt.tight_layout()
plt.show()
# 9.3 Compare Fog CWM vs fog global linear regression
plt.figure(figsize=(6, 5))
plt.scatter(
y_hat_global_fog, y_hat_fog,
s=8,
alpha=0.5
)
plt.xlabel("Global linear prediction (fog-only, mm)")
plt.ylabel("CWM prediction (fog-only, mm)")
plt.title("Fog CWM vs global linear regression (fog episodes)")
plt.grid(True)
plt.tight_layout()
plt.show()
Columns: ['Id', 'Lon', 'Lat', 'Alt', 'Nombre', 'UTC', 'Prec', 'Hum', 'Temp', 'TempMin', 'TempMax', 'YEAR', 'DATE']
Total rows after cleaning: 43343
Fog-like hourly records (2019–2024): 11988
Samples used for fog CWM: 11988
Hum Temp Prec
7 95.0 1.1 0.0
8 96.0 0.7 0.0
9 97.0 0.9 0.0
10 97.0 1.3 0.0
11 96.0 1.6 0.0
Mean after scaling: [ 2.18117890e-16 -4.74169326e-17]
Std after scaling: [1. 1.]
GMM converged: True
GMM weights: [0.37104245 0.39268471 0.23627285]
Fog component 0: coefficients = [0.02901152 0.19581914], intercept = 0.46390083736812393
Fog component 1: coefficients = [-0.07801394 0.1880965 ], intercept = 0.4176650378550246
Fog component 2: coefficients = [0.00651661 0.17296774], intercept = 0.37399395595856066
Fog CWM training MSE: 1.2448069286055783
Global linear regression (fog-only) MSE: 1.2461277681804295
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ----------------------------------------------------
# 0. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC with timezone
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only valid rows for humidity and temperature
df = df.dropna(subset=["UTC", "Hum", "Temp"]).copy()
# Time fields
df["YEAR"] = df["UTC"].dt.year
df["HOUR"] = df["UTC"].dt.hour
print("Rows total:", len(df))
print("Years present:", df["YEAR"].unique())
# ----------------------------------------------------
# 1. DEFINE FOG-LIKE CONDITIONS (HOURLY)
# fog: high humidity + relatively low temperature
# ----------------------------------------------------
fog_mask = (
(df["YEAR"].between(2019, 2024)) &
(df["Hum"] >= 95.0) &
(df["Temp"] >= -2.0) &
(df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_mask
# Filter to 2019–2024 just to be explicit
df_period = df[df["YEAR"].between(2019, 2024)].copy()
# ----------------------------------------------------
# 2. FOG HOURS PER YEAR AND FOGGIEST YEAR
# ----------------------------------------------------
fog_hours_per_year = df_period[df_period["FOG_FLAG"]].groupby("YEAR").size()
print("\nFog hours per year (2019–2024):")
print(fog_hours_per_year)
foggiest_year = fog_hours_per_year.idxmax()
max_fog_hours = fog_hours_per_year.max()
print(f"\nFoggiest year: {foggiest_year} with {max_fog_hours} fog-like hours")
# ----------------------------------------------------
# 3. HOURLY DISTRIBUTION OF FOG IN THE FOGGIEST YEAR
# ----------------------------------------------------
# All data from the foggiest year
df_y = df_period[df_period["YEAR"] == foggiest_year].copy()
# Fog-only records in that year
df_y_fog = df_y[df_y["FOG_FLAG"]].copy()
# Count fog occurrences per hour (0–23)
fog_by_hour = df_y_fog.groupby("HOUR").size()
# Also count all observations per hour in that year
total_by_hour = df_y.groupby("HOUR").size()
# Make sure we have all hours 0–23 even if some are missing
hours = np.arange(24)
fog_by_hour = fog_by_hour.reindex(hours, fill_value=0)
total_by_hour = total_by_hour.reindex(hours, fill_value=0)
# Avoid division by zero
with np.errstate(divide="ignore", invalid="ignore"):
fog_percent = np.where(total_by_hour > 0,
100.0 * fog_by_hour / total_by_hour,
0.0)
# ----------------------------------------------------
# 4. PLOT: FOG HOURS BY HOUR OF DAY (FOGGIEST YEAR)
# ----------------------------------------------------
fig, ax1 = plt.subplots(figsize=(8, 5))
# Bar: number of fog-like records per hour
ax1.bar(hours, fog_by_hour, alpha=0.7)
ax1.set_xlabel("Hour of day")
ax1.set_ylabel("Number of fog-like records")
ax1.set_xticks(hours)
ax1.grid(True, axis="y", linestyle="--", alpha=0.5)
ax1.set_title(
f"Fog distribution by hour in {foggiest_year}\n"
f"(total fog-like hours: {int(max_fog_hours)})"
)
# Optional: second axis with percentage
ax2 = ax1.twinx()
ax2.plot(hours, fog_percent, marker="o", linewidth=2, color="tab:red")
ax2.set_ylabel("Fog percentage per hour (%)", color="tab:red")
ax2.tick_params(axis="y", labelcolor="tab:red")
plt.tight_layout()
plt.show()
Rows total: 43629
Years present: [2019 2020 2021 2022 2023 2024]

Fog hours per year (2019–2024):
YEAR
2019    2162
2020    2916
2021    2811
2022    1910
2023    1926
2024     410
dtype: int64

Foggiest year: 2020 with 2916 fog-like hours
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# ----------------------------------------------------
# 0. LOAD AND PREPARE DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")
# Parse UTC time with timezone
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
# Keep only valid rows
df = df.dropna(subset=["UTC", "Hum", "Temp"]).copy()
df["YEAR"] = df["UTC"].dt.year
# Restrict to 2019–2024
df = df[df["YEAR"].between(2019, 2024)].copy()
df = df.sort_values("UTC").reset_index(drop=True)
print("Rows in 2019–2024:", len(df))
# ----------------------------------------------------
# 1. DEFINE FOG-LIKE CONDITIONS (HOURLY)
# fog: high humidity + relatively low temperature
# ----------------------------------------------------
fog_mask = (
(df["Hum"] >= 95.0) &
(df["Temp"] >= -2.0) &
(df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_mask
df_fog = df[df["FOG_FLAG"]].copy()
print("Fog-like hourly records:", len(df_fog))
if len(df_fog) < 50:
print("WARNING: few fog-like records; results may be noisy.")
# ----------------------------------------------------
# 2. BUILD FEATURE MATRIX FOR FOG: X = [Hum, Temp]
# ----------------------------------------------------
X_raw = df_fog[["Hum", "Temp"]].values
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
# ----------------------------------------------------
# 3. K-MEANS CLUSTERING ON FOG CONDITIONS
# ----------------------------------------------------
K_kmeans = 4 # you can change this (e.g. 3, 4, 5)
K_kmeans = min(K_kmeans, len(df_fog)) # safety: cannot exceed number of points
kmeans = KMeans(n_clusters=K_kmeans, random_state=0, n_init=10)
labels_km = kmeans.fit_predict(X)
# Cluster centers in original (Hum, Temp) units
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)
print(f"\nK-means centers for fog (K={K_kmeans}):")
for i, (hum_c, temp_c) in enumerate(centers):
print(f" Cluster {i}: Hum≈{hum_c:.1f} %, Temp≈{temp_c:.1f} °C")
# PLOT: K-means clustering in (Hum, Temp) for fog
plt.figure(figsize=(6, 5))
plt.scatter(
df_fog["Hum"], df_fog["Temp"],
c=labels_km,
s=8,
alpha=0.6
)
plt.scatter(
centers[:, 0], centers[:, 1],
marker="X",
s=120,
edgecolor="white",
linewidth=1.5,
c=range(K_kmeans)
)
plt.xlabel("Humidity (Hum, %)")
plt.ylabel("Temperature (Temp, °C)")
plt.title(f"K-means clustering for fog conditions (Hum, Temp), K={K_kmeans}")
plt.grid(True)
plt.tight_layout()
plt.show()
# ----------------------------------------------------
# 4. GMM DENSITY ESTIMATION ON FOG CONDITIONS
# select number of components via BIC
# ----------------------------------------------------
max_components = min(6, len(df_fog)) # up to 6 components (or less if few points)
bics = []
gmms = []
components_range = range(1, max_components + 1)
for m in components_range:
gmm = GaussianMixture(
n_components=m,
covariance_type="full",
random_state=0
)
gmm.fit(X)
gmms.append(gmm)
bics.append(gmm.bic(X))
best_idx = int(np.argmin(bics))
best_M = components_range[best_idx]
gmm_best = gmms[best_idx]
print("\nGMM density estimation for fog:")
print(" Tested components (M):", list(components_range))
print(" BIC values:", [round(b, 1) for b in bics])
print(" Best number of components by BIC:", best_M)
# ----------------------------------------------------
# 5. PLOT BIC CURVE (OPTIONAL)
# ----------------------------------------------------
plt.figure(figsize=(6, 4))
plt.plot(list(components_range), bics, marker="o")
plt.xlabel("Number of components (M)")
plt.ylabel("BIC")
plt.title("GMM model selection for fog data (Hum, Temp)")
plt.grid(True)
plt.tight_layout()
plt.show()
# ----------------------------------------------------
# 6. 2D DENSITY MAP (GMM) IN (Hum, Temp)
# ----------------------------------------------------
# Create a grid in original (Hum, Temp) units
hum_min, hum_max = df_fog["Hum"].min(), df_fog["Hum"].max()
temp_min, temp_max = df_fog["Temp"].min(), df_fog["Temp"].max()
hum_vals = np.linspace(hum_min, hum_max, 100)
temp_vals = np.linspace(temp_min, temp_max, 100)
HH, TT = np.meshgrid(hum_vals, temp_vals)
grid_points = np.column_stack([HH.ravel(), TT.ravel()])
# Transform grid to scaled space and compute log density
grid_scaled = scaler.transform(grid_points)
logp = gmm_best.score_samples(grid_scaled)
P = np.exp(logp).reshape(HH.shape) # relative density (up to a constant factor)
# PLOT: GMM density estimation + fog points
plt.figure(figsize=(7, 5))
cont = plt.contourf(HH, TT, P, levels=20, alpha=0.7)
plt.colorbar(cont, label="Estimated density (relative)")
plt.scatter(
df_fog["Hum"], df_fog["Temp"],
s=5,
alpha=0.3,
edgecolors="none"
)
plt.xlabel("Humidity (Hum, %)")
plt.ylabel("Temperature (Temp, °C)")
plt.title(f"GMM density estimation for fog conditions (Hum, Temp)\nM={best_M} components")
plt.grid(True)
plt.tight_layout()
plt.show()
Rows in 2019–2024: 43629
Fog-like hourly records: 12135

K-means centers for fog (K=4):
  Cluster 0: Hum≈96.0 %, Temp≈11.3 °C
  Cluster 1: Hum≈96.1 %, Temp≈4.7 °C
  Cluster 2: Hum≈99.0 %, Temp≈10.5 °C
  Cluster 3: Hum≈99.3 %, Temp≈3.7 °C
GMM density estimation for fog:
  Tested components (M): [1, 2, 3, 4, 5, 6]
  BIC values: [68551.6, 66088.9, 65896.3, 32801.1, 63903.3, 10684.3]
  Best number of components by BIC: 6