Luis Diaz-Faes - Fab Futures - Data Science

Day 7: Transform¶

Assignment 9/12/2025¶
  • Analyze your data set
  • prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
  • include a 1920x1080 summary slide describing you, your data, and your analysis, store it in your repo's images folder, and call it presentation.png

PCA of fog conditions (Hum, Temp, Prec)¶

I applied Principal Component Analysis (PCA) to the fog-only records using the features (Hum, Temp, Prec). After standardising the variables, I projected the data onto the first two principal components. The PCA plot shows how the fog situations are distributed in a low-dimensional space that captures most of the variance. When coloured by the K-means clusters, the figure also reveals how the different fog regimes are separated along the main directions of variability in humidity, temperature and precipitation.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ----------------------------------------------------
# 0. LOAD AND PREPARE DATA (FOG CONDITIONS)
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Hum", "Temp", "Prec"]).copy()
df["YEAR"] = df["UTC"].dt.year

# 2019–2024
df = df[df["YEAR"].between(2019, 2024)].copy()

# Fog-like condition
fog_mask = (
    (df["Hum"] >= 95.0) &
    (df["Temp"] >= -2.0) &
    (df["Temp"] <= 15.0)
)
df_fog = df[fog_mask].copy()

print("Fog-like hourly records for PCA:", len(df_fog))

# ----------------------------------------------------
# 1. FEATURE MATRIX FOR PCA: X = [Hum, Temp, Prec]
# ----------------------------------------------------
X_raw = df_fog[["Hum", "Temp", "Prec"]].values

# Standardize before PCA
scaler_pca = StandardScaler()
X_scaled = scaler_pca.fit_transform(X_raw)

# ----------------------------------------------------
# 2. PCA: REDUCE TO 2 COMPONENTS
# ----------------------------------------------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

expl_var = pca.explained_variance_ratio_
print("Explained variance ratio (PC1, PC2):", expl_var)

# ----------------------------------------------------
# 3. TRY TO USE K-MEANS LABELS IF ALREADY DEFINED
# ----------------------------------------------------
try:
    # If labels_km already exists from a previous K-means run on the fog data,
    # use it only when there is exactly one label per fog record
    if len(labels_km) == len(df_fog):
        color_labels = labels_km
        use_clusters = True
    else:
        color_labels = "tab:blue"
        use_clusters = False
except NameError:
    # Otherwise, use a single colour
    color_labels = "tab:blue"
    use_clusters = False

# ----------------------------------------------------
# 4. PLOT: PCA SCATTER
# ----------------------------------------------------
plt.figure(figsize=(7, 5))
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=color_labels,
    s=8,
    alpha=0.6
)
plt.xlabel(f"PC1 ({expl_var[0]*100:.1f}% var)")
plt.ylabel(f"PC2 ({expl_var[1]*100:.1f}% var)")
title_extra = " (coloured by K-means cluster)" if use_clusters else ""
plt.title("PCA of fog conditions (Hum, Temp, Prec)" + title_extra)
plt.grid(True)
plt.tight_layout()
plt.show()
Fog-like hourly records for PCA: 11988
Explained variance ratio (PC1, PC2): [0.42381013 0.31488568]
[Figure: PCA scatter of fog-like records, PC1 vs PC2]

Principal Components Analysis (PCA)¶

Principal Components Analysis (PCA) is a linear transform that finds a new set of axes (the principal components) such that:

  • The components are uncorrelated with each other.
  • The first components maximize the explained variance of the data.

In practice, PCA is used to:

  • Reduce dimensionality, keeping only the most informative directions.
  • Discover features or patterns in high-dimensional data.
  • Make low-dimensional plots (2D or 3D) of high-dimensional datasets, so that structure and clusters are easier to visualise.

Mathematically, PCA is based on diagonalising the covariance matrix of the data. The principal components correspond to the eigenvectors of this covariance matrix, and the eigenvalues indicate how much variance is explained by each component.
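
The eigendecomposition view can be made concrete with a few lines of NumPy. The sketch below uses synthetic data (so the numbers are only illustrative): it centres the data, diagonalises the covariance matrix, and orders the eigenvectors by explained variance, which matches what sklearn's PCA computes up to the sign of each component.

import numpy as np

rng = np.random.default_rng(0)

# Synthetic 2-feature data with one dominant direction of variability
x = rng.normal(size=500)
X = np.column_stack([x + 0.1 * rng.normal(size=500),
                     0.5 * x + 0.1 * rng.normal(size=500)])

# Centre the data and diagonalise its covariance matrix
Xc = X - X.mean(axis=0)
cov = np.cov(Xc, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)   # eigh returns eigenvalues in ascending order
order = np.argsort(eigvals)[::-1]        # reorder: largest variance first
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

print("Explained variance ratio:", eigvals / eigvals.sum())

# Scores: projection of the centred data onto the principal components
scores = Xc @ eigvecs
print("Scores shape:", scores.shape)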

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ----------------------------------------------------
# 1. LOAD DATA AND BASIC PREPROCESSING
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Parse UTC datetime (with timezone)
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)

# Keep only rows with valid main variables
df = df.dropna(subset=["UTC", "Temp", "Hum", "Prec"]).copy()

# Time fields
df["YEAR"] = df["UTC"].dt.year

# Restrict to 2019–2024
df = df[df["YEAR"].between(2019, 2024)].copy()
df = df.sort_values("UTC").reset_index(drop=True)

print("Rows used:", len(df))
print("Years present:", df["YEAR"].unique())

# ----------------------------------------------------
# 2. BUILD FEATURE MATRIX: X = [Temp, Hum, Prec]
# ----------------------------------------------------
X_raw = df[["Temp", "Hum", "Prec"]].values

# Standardize features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)

# ----------------------------------------------------
# 3. PCA: REDUCE TO 2 PRINCIPAL COMPONENTS
# ----------------------------------------------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

expl_var = pca.explained_variance_ratio_
print(f"Explained variance ratio: PC1={expl_var[0]:.3f}, PC2={expl_var[1]:.3f}")

# ----------------------------------------------------
# 4. PLOT: PCA SCATTER (PC1 vs PC2), COLOURED BY YEAR
# ----------------------------------------------------
plt.figure(figsize=(8, 6))

scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["YEAR"],
    s=8,
    alpha=0.6,
    cmap="viridis"
)

cbar = plt.colorbar(scatter)
cbar.set_label("Year")

plt.xlabel(f"PC1 ({expl_var[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({expl_var[1]*100:.1f}% variance)")
plt.title("PCA of weather data (Temp, Hum, Prec)\n2019–2024, coloured by year")
plt.grid(True, alpha=0.4)
plt.tight_layout()
plt.show()
Rows used: 43343
Years present: [2019 2020 2021 2022 2023 2024]
Explained variance ratio: PC1=0.565, PC2=0.321
[Figure: PCA of weather data (Temp, Hum, Prec), 2019–2024, coloured by year]
In [3]:
from sklearn.datasets import fetch_openml
import numpy as np

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data   # shape (N, 784)
y = mnist.target # labels 0–9

print("MNIST X shape:", X.shape)  # e.g. (70000, 784)

# If you want to see one image as 28x28 again:
img0 = X[0].reshape(28, 28)
MNIST X shape: (70000, 784)
In [4]:
X_raw = df[["Temp", "Hum", "Prec"]].values
print(X_raw.shape)   # (N_hours, 3)
(43343, 3)
In [5]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# 1. Load MNIST
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data   # shape (N, 784)
y = mnist.target.astype(int)

print("MNIST X shape:", X.shape)  # e.g. (70000, 784)

# (Optionally subsample to go faster)
N = 5000
X_sub = X[:N]
y_sub = y[:N]

# 2. PCA to 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_sub)

print("Explained variance ratio:", pca.explained_variance_ratio_)

# 3. Plot PC1 vs PC2 coloured by digit
plt.figure(figsize=(7, 6))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=y_sub,
    s=5,
    alpha=0.6,
    cmap="tab10"
)
plt.colorbar(scatter, label="Digit label")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of MNIST (subsample)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
MNIST X shape: (70000, 784)
Explained variance ratio: [0.09867566 0.07404546]
[Figure: PCA of MNIST subsample, PC1 vs PC2 coloured by digit label]

Although PCA is often introduced with image datasets like MNIST, the same idea applies directly to my weather and fog data. In MNIST, each 28×28 image is flattened into a 784-dimensional vector, so the data matrix has shape (N, 784); PCA is then used to find a low-dimensional embedding (e.g. 2D) that preserves as much variance as possible and reveals structure among the digit classes. In my case, each hourly record is a much smaller feature vector, for example (Temp, Hum, Prec), so the data matrix has shape (N, 3). The dimensionality reduction is therefore less dramatic, but PCA plays the same role: it produces uncorrelated principal components that concentrate most of the variability in fog conditions, making it easier to visualise different “weather regimes” in a 2D plot, in exactly the same spirit as visualising clusters of digits in MNIST.

Independent Components Analysis (ICA)¶

Independent Components Analysis (ICA) is a statistical technique used to recover latent sources from observed mixed signals. A classic example is the cocktail party problem: several people are speaking at the same time, microphones record linear mixtures of their voices, and we want to recover the individual speakers from those mixed recordings.

In general, ICA assumes that:

  • The observed data are linear mixtures of some unknown source signals.
  • These latent sources are statistically independent and non-Gaussian.
  • The mixing process is unknown.

The goal is to find a linear transform

$$ X = A\,S \quad \Rightarrow \quad S = W\,X $$

where:

  • X is the matrix of observed signals (each row or column, depending on convention, is one mixed signal),
  • S is the matrix of recovered independent components (the estimated sources),
  • A is the unknown mixing matrix, and
  • W is an unmixing matrix learned from the data.

Unlike PCA, which decorrelates the data and orders components by variance, ICA goes further and tries to make the components as statistically independent as possible. This makes ICA particularly useful for blind source separation.

In practice, ICA is often implemented using algorithms like FastICA, which iteratively finds directions that maximize a measure of non-Gaussianity (such as kurtosis or negentropy). The resulting components can be interpreted as underlying “sources” that, when linearly combined, explain the observed signals.
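
The notebook does not apply ICA to the weather data, but the mixing/unmixing idea is easy to demonstrate on synthetic signals with scikit-learn's FastICA. The sketch below is a toy version of the cocktail party problem; the source waveforms and the mixing matrix A are arbitrary choices for illustration.

import numpy as np
from sklearn.decomposition import FastICA

t = np.linspace(0, 8, 2000)

# Two independent, non-Gaussian sources: a sine and a sawtooth-like signal
s1 = np.sin(2 * np.pi * t)
s2 = 2 * (t % 1) - 1
S_true = np.column_stack([s1, s2])

# Unknown mixing matrix A: each "microphone" records a linear mixture of the sources
A = np.array([[1.0, 0.5],
              [0.7, 1.2]])
X = S_true @ A.T          # observed mixed signals, shape (n_samples, 2)

# FastICA estimates an unmixing transform and recovers the sources
ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)   # estimated sources (up to scale, sign and order)

print("Estimated mixing matrix:\n", ica.mixing_)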

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -----------------------------------------
# LOAD DATA
# -----------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)

# Keep only rows with valid values
df = df.dropna(subset=["UTC", "Hum", "Prec"]).copy()
df["YEAR"] = df["UTC"].dt.year

print("Rows:", len(df))
print("Years:", df["YEAR"].unique())
print(df[["UTC", "Hum", "Prec"]].head())
Rows: 43343
Years: [2019 2020 2021 2022 2023 2024]
                        UTC   Hum  Prec
0 2019-02-15 21:00:00+00:00  81.0   0.0
1 2019-02-15 22:00:00+00:00  85.0   0.0
2 2019-02-15 23:00:00+00:00  89.0   0.0
3 2019-02-16 00:00:00+00:00  90.0   0.0
4 2019-02-16 01:00:00+00:00  93.0   0.0
In [7]:
# -----------------------------------------
# SCATTER: HUMIDITY vs PRECIPITATION
# -----------------------------------------
plt.figure(figsize=(7, 5))

plt.scatter(
    df["Hum"],
    df["Prec"],
    s=5,
    alpha=0.3
)

plt.xlabel("Humidity (Hum, %)")
plt.ylabel("Precipitation (Prec, mm)")
plt.title("Humidity vs precipitation (all records)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
[Figure: scatter of humidity vs precipitation, all records]

Now we want to quantify the relationship:

  • For each humidity band (e.g. 0–5%, 5–10%, … 95–100%):
    • Probability of rain (fraction of records with Prec > 0)
    • Mean precipitation when it rains.
In [9]:
# -----------------------------------------
# BINNED RELATION: HUMIDITY → RAIN PROBABILITY & MEAN PREC
# -----------------------------------------

# Define humidity bins (0–100% in steps of 5)
bins = np.arange(0, 105, 5)  # 0,5,10,...,100
df["hum_bin"] = pd.cut(df["Hum"], bins=bins, include_lowest=True)

# For each bin, compute:
# - rain_freq: fraction of times with Prec > 0
# - prec_mean: mean Prec (including zeros)
# - count: number of samples in that bin
stats = (
    df.groupby("hum_bin", observed=False)      # observed=False keeps empty humidity bins (and avoids the pandas FutureWarning)
    .agg(
        rain_freq=("Prec", lambda x: (x > 0).mean()),
        prec_mean=("Prec", "mean"),
        count=("Prec", "size")
    )
    .reset_index()
)

# Compute bin centres for plotting
bin_centers = np.array([interval.mid for interval in stats["hum_bin"]])

print(stats.head())

fig, ax1 = plt.subplots(figsize=(8, 5))

# Left axis: probability of rain vs humidity
ax1.plot(bin_centers, stats["rain_freq"], marker="o", linewidth=2, label="P(Prec > 0)")
ax1.set_xlabel("Humidity (%)")
ax1.set_ylabel("Rain probability", color="tab:blue")
ax1.tick_params(axis="y", labelcolor="tab:blue")
ax1.set_ylim(0, 1)
ax1.grid(True, axis="both", alpha=0.3)

# Right axis: mean precipitation vs humidity
ax2 = ax1.twinx()
ax2.plot(bin_centers, stats["prec_mean"], marker="s", linewidth=2, color="tab:red", label="Mean Prec (mm)")
ax2.set_ylabel("Mean precipitation (mm)", color="tab:red")
ax2.tick_params(axis="y", labelcolor="tab:red")

plt.title("Relation between humidity and precipitation")
plt.tight_layout()
plt.show()
         hum_bin  rain_freq  prec_mean  count
0  (-0.001, 5.0]        NaN        NaN      0
1    (5.0, 10.0]        NaN        NaN      0
2   (10.0, 15.0]        NaN        NaN      0
3   (15.0, 20.0]        0.0        0.0      1
4   (20.0, 25.0]        0.0        0.0     20
[Figure: rain probability and mean precipitation per humidity bin]
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------
# 1. LOAD DATA AND EXTRACT A 1D TEMPERATURE SIGNAL
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Parse time
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Temp"]).copy()
df = df.sort_values("UTC").reset_index(drop=True)

# Optional: restrict to one specific year (e.g. 2019) for a cleaner segment
df["YEAR"] = df["UTC"].dt.year
df_year = df[df["YEAR"] == 2019].copy()

# Extract a contiguous segment of N points from the temperature series
N = 512  # length of segment (power of 2 works nicely with FFT)
temp_series = df_year["Temp"].values

if len(temp_series) < N:
    raise ValueError(f"Not enough samples in selected year for N={N}")

signal = temp_series[:N]   # take the first N samples
t = np.arange(N)           # time index (sample index)

print("Segment length:", N)

# ----------------------------------------------------
# 2. FFT: TRANSFORM TO FREQUENCY DOMAIN
# ----------------------------------------------------
# Real FFT: we work with complex frequency coefficients
F = np.fft.rfft(signal)           # shape ~ (N/2+1,)
freqs = np.fft.rfftfreq(N, d=1.0) # not strictly needed, but good to have

# ----------------------------------------------------
# 3. COMPRESSED REPRESENTATION: KEEP ONLY K LARGEST COEFFICIENTS
# ----------------------------------------------------
K = 20  # number of non-zero frequency components to keep
print(f"Keeping only the {K} largest frequency coefficients (by magnitude).")

# Indices of coefficients sorted by magnitude (descending)
idx_sorted = np.argsort(np.abs(F))[::-1]

# Build a sparse spectrum: zeros except the K largest coefficients
F_sparse = np.zeros_like(F, dtype=complex)
F_sparse[idx_sorted[:K]] = F[idx_sorted[:K]]

# ----------------------------------------------------
# 4. RECONSTRUCTION FROM SPARSE COEFFICIENTS (INVERSE FFT)
# ----------------------------------------------------
signal_recon = np.fft.irfft(F_sparse, n=N)

# ----------------------------------------------------
# 5. PLOTS: ORIGINAL VS COMPRESSED-SENSING-STYLE RECONSTRUCTION
# ----------------------------------------------------
fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)

# Original signal
axes[0].plot(t, signal, label="Original Temp")
axes[0].set_ylabel("Temp (°C)")
axes[0].set_title("Original temperature segment")
axes[0].grid(True, alpha=0.3)

# Reconstructed signal (from K coefficients)
axes[1].plot(t, signal_recon, color="tab:orange", label="Reconstructed Temp (K sparse)")
axes[1].set_ylabel("Temp (°C)")
axes[1].set_title(f"Reconstruction from {K} largest Fourier coefficients")
axes[1].grid(True, alpha=0.3)

# Error (difference)
axes[2].plot(t, signal - signal_recon)
axes[2].set_xlabel("Sample index")
axes[2].set_ylabel("Error")
axes[2].set_title("Reconstruction error (Original - Reconstructed)")
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
Segment length: 512
Keeping only the 20 largest frequency coefficients (by magnitude).
[Figure: original temperature segment, reconstruction from the 20 largest Fourier coefficients, and reconstruction error]

Compressed sensing–style sparse reconstruction of temperature¶

To illustrate the idea of compressed sensing and sparse representations, I took a 1D segment of the hourly temperature signal Temp (length N = 512) and transformed it to the frequency domain using a real FFT. In the Fourier domain, each coefficient represents a sinusoidal component of a specific frequency.

Instead of using all frequency coefficients, I kept only the K coefficients with the largest magnitude (e.g. K = 20) and set all the others to zero. This produces a sparse representation of the signal in the Fourier basis. I then reconstructed the time-domain signal using the inverse FFT applied to this sparse spectrum.

The resulting plots show:

  • The original temperature segment.
  • The reconstructed signal using only K non-zero Fourier coefficients.
  • The reconstruction error (original minus reconstructed).

Even though most of the frequency coefficients are discarded, the reconstructed signal is still a good approximation of the original. This demonstrates the core idea behind compressed sensing: if a signal is sparse (or compressible) in a suitable transform domain (here, the Fourier domain), it can be represented and reconstructed accurately from a relatively small number of coefficients.
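
A quick way to see why so few coefficients suffice is to check how concentrated the spectral energy is. The snippet below is a rough diagnostic (it reuses the signal variable from the cell above and ignores the factor-of-two weighting of the one-sided spectrum): it prints the fraction of spectral energy captured by the K largest coefficients for a few values of K.

import numpy as np

# Assumes `signal` (the N-sample temperature segment) from the cell above is in scope
F = np.fft.rfft(signal)
energy = np.abs(F) ** 2
energy_sorted = np.sort(energy)[::-1]

for K in (5, 10, 20, 50):
    frac = energy_sorted[:K].sum() / energy.sum()
    print(f"K={K:3d}: {100 * frac:.2f}% of one-sided spectral energy retained")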

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------
# 1. LOAD DATA AND EXTRACT HUMIDITY & PRECIP SIGNALS
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

# Parse time
df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Hum", "Prec"]).copy()
df = df.sort_values("UTC").reset_index(drop=True)

# Pick one year to get a reasonably continuous segment
df["YEAR"] = df["UTC"].dt.year
df_year = df[df["YEAR"] == 2019].copy()

hum_series = df_year["Hum"].values
prec_series = df_year["Prec"].values

# Segment length
N = 512  # power of 2 works nicely
N = min(N, len(hum_series), len(prec_series))
if N < 32:
    raise ValueError(f"Not enough samples in 2019 for N={N}")

hum_signal = hum_series[:N]
prec_signal = prec_series[:N]
t = np.arange(N)

print("Segment length used:", N)

# ----------------------------------------------------
# 2. HELPER FUNCTION: SPARSE FFT RECONSTRUCTION
# ----------------------------------------------------
def sparse_fft_reconstruction(signal, K, var_name, y_label):
    """
    signal: 1D numpy array (length N)
    K: number of largest Fourier coefficients to keep
    var_name: string for titles
    y_label: label for y-axis
    """
    N = len(signal)

    # Full FFT (real-valued input -> rfft)
    F = np.fft.rfft(signal)
    freqs = np.fft.rfftfreq(N, d=1.0)

    # Keep only K largest coefficients by magnitude
    idx_sorted = np.argsort(np.abs(F))[::-1]
    F_sparse = np.zeros_like(F, dtype=complex)
    F_sparse[idx_sorted[:K]] = F[idx_sorted[:K]]

    # Reconstruct from sparse spectrum
    recon = np.fft.irfft(F_sparse, n=N)

    # Plot
    fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)

    t = np.arange(N)

    # Original
    axes[0].plot(t, signal)
    axes[0].set_ylabel(y_label)
    axes[0].set_title(f"Original {var_name} segment")
    axes[0].grid(True, alpha=0.3)

    # Reconstructed
    axes[1].plot(t, recon, color="tab:orange")
    axes[1].set_ylabel(y_label)
    axes[1].set_title(f"Reconstruction of {var_name} from {K} largest Fourier coefficients")
    axes[1].grid(True, alpha=0.3)

    # Error
    axes[2].plot(t, signal - recon)
    axes[2].set_xlabel("Sample index")
    axes[2].set_ylabel("Error")
    axes[2].set_title(f"Reconstruction error ({var_name} original - reconstructed)")
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return recon

# ----------------------------------------------------
# 3. APPLY TO HUMIDITY AND PRECIPITATION
# ----------------------------------------------------
K = 20  # number of non-zero Fourier coefficients to keep

print(f"\n=== Humidity (Hum) sparse reconstruction with K={K} ===")
hum_recon = sparse_fft_reconstruction(hum_signal, K, "Humidity", "Hum (%)")

print(f"\n=== Precipitation (Prec) sparse reconstruction with K={K} ===")
prec_recon = sparse_fft_reconstruction(prec_signal, K, "Precipitation", "Prec (mm)")
Segment length used: 512

=== Humidity (Hum) sparse reconstruction with K=20 ===
[Figure: humidity segment, sparse reconstruction (K=20), and reconstruction error]
=== Precipitation (Prec) sparse reconstruction with K=20 ===
[Figure: precipitation segment, sparse reconstruction (K=20), and reconstruction error]

Compressed sensing–style sparse reconstruction of humidity and precipitation¶

I repeated the sparse Fourier reconstruction experiment for the humidity Hum and precipitation Prec signals. Using a segment of length N from 2019, I computed the FFT of each variable and then kept only the K frequency coefficients with the largest magnitude (e.g. K = 20). All other coefficients were set to zero, and I reconstructed the time-domain signal using the inverse FFT.

For humidity, the reconstructed signal is very close to the original one: the main slow variations of Hum can be represented by a relatively small number of Fourier modes, and the reconstruction error remains small. This indicates that humidity is quite compressible in the frequency domain, dominated by low-frequency components such as the daily cycle and slower trends.

For precipitation, the behaviour is different: Prec is much more spiky and intermittent, with many zeros and sudden peaks. A sparse reconstruction with only a few Fourier coefficients can approximate the general level, but it tends to smooth out sharp rainfall events. This contrast illustrates how compressed sensing works better for signals that are naturally sparse or smooth in the chosen transform domain (like humidity and temperature in the Fourier basis) than for highly irregular, bursty signals like precipitation.
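
To make this contrast more quantitative, one option is to sweep K and compare how fast the reconstruction error decays for each variable. The sketch below assumes hum_signal and prec_signal from the cell above are still in scope; errors are normalised by each signal's standard deviation so the two curves can be drawn on the same axis.

import numpy as np
import matplotlib.pyplot as plt

def sparse_fft_nrmse(signal, K):
    """Normalised RMSE of reconstructing `signal` from its K largest rFFT coefficients."""
    F = np.fft.rfft(signal)
    idx = np.argsort(np.abs(F))[::-1]
    F_sparse = np.zeros_like(F)
    F_sparse[idx[:K]] = F[idx[:K]]
    recon = np.fft.irfft(F_sparse, n=len(signal))
    rmse = np.sqrt(np.mean((signal - recon) ** 2))
    return rmse / (np.std(signal) + 1e-12)

# Assumes hum_signal and prec_signal (same length N) from the cell above
Ks = np.arange(1, 101, 5)
hum_curve = [sparse_fft_nrmse(hum_signal, K) for K in Ks]
prec_curve = [sparse_fft_nrmse(prec_signal, K) for K in Ks]

plt.figure(figsize=(7, 4))
plt.plot(Ks, hum_curve, marker="o", label="Humidity")
plt.plot(Ks, prec_curve, marker="s", label="Precipitation")
plt.xlabel("K (Fourier coefficients kept)")
plt.ylabel("RMSE / std of signal")
plt.title("Sparse reconstruction error vs number of coefficients kept")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()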

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Hum", "Prec"]).copy()
df = df.sort_values("UTC").reset_index(drop=True)

df["YEAR"] = df["UTC"].dt.year

print("Years present:", df["YEAR"].unique())

# ----------------------------------------------------
# 2. SELECT YEARS 2019 AND 2024
# ----------------------------------------------------
df_2019 = df[df["YEAR"] == 2019].copy()
df_2024 = df[df["YEAR"] == 2024].copy()

print("Samples 2019:", len(df_2019))
print("Samples 2024:", len(df_2024))

if len(df_2019) == 0 or len(df_2024) == 0:
    raise ValueError("No data for 2019 or 2024 in the CSV.")

# ----------------------------------------------------
# 3. CHOOSE SEGMENT LENGTH N (SAME FOR BOTH YEARS)
# ----------------------------------------------------
N_max = 1024     # desired maximum segment length
N = min(N_max, len(df_2019), len(df_2024))

if N < 64:
    raise ValueError(f"Not enough samples for a meaningful comparison, N={N}")

print("Segment length used for each year:", N)

hum_2019 = df_2019["Hum"].values[:N]
hum_2024 = df_2024["Hum"].values[:N]

prec_2019 = df_2019["Prec"].values[:N]
prec_2024 = df_2024["Prec"].values[:N]

t = np.arange(N)

# ----------------------------------------------------
# 4. HELPER FUNCTION: SPARSE FFT RECONSTRUCTION + ERROR
# ----------------------------------------------------
def sparse_fft_reconstruction(signal, K):
    """
    signal: 1D array of length N
    K: number of largest Fourier coefficients to keep
    Returns: reconstructed_signal, rmse
    """
    N = len(signal)
    # Real FFT
    F = np.fft.rfft(signal)
    # indices sorted by magnitude (descending)
    idx_sorted = np.argsort(np.abs(F))[::-1]
    # sparse spectrum
    F_sparse = np.zeros_like(F, dtype=complex)
    F_sparse[idx_sorted[:K]] = F[idx_sorted[:K]]
    # inverse FFT
    recon = np.fft.irfft(F_sparse, n=N)
    # root mean squared error
    rmse = np.sqrt(np.mean((signal - recon) ** 2))
    return recon, rmse

# ----------------------------------------------------
# 5. APPLY TO HUMIDITY AND PRECIPITATION (2019 vs 2024)
# ----------------------------------------------------
K = 20  # number of non-zero Fourier coefficients to keep
print(f"Using K={K} largest Fourier coefficients for each signal/year")

# Humidity
hum_2019_recon, hum_2019_rmse = sparse_fft_reconstruction(hum_2019, K)
hum_2024_recon, hum_2024_rmse = sparse_fft_reconstruction(hum_2024, K)

print(f"Humidity RMSE 2019: {hum_2019_rmse:.3f}")
print(f"Humidity RMSE 2024: {hum_2024_rmse:.3f}")

# Precipitation
prec_2019_recon, prec_2019_rmse = sparse_fft_reconstruction(prec_2019, K)
prec_2024_recon, prec_2024_rmse = sparse_fft_reconstruction(prec_2024, K)

print(f"Precipitation RMSE 2019: {prec_2019_rmse:.3f}")
print(f"Precipitation RMSE 2024: {prec_2024_rmse:.3f}")

# ----------------------------------------------------
# 6. PLOTS: HUMIDITY 2019 vs 2024
# ----------------------------------------------------
fig, axes = plt.subplots(3, 2, figsize=(12, 8), sharex=True)

# Column 0: 2019, Column 1: 2024
# Row 0: original, Row 1: reconstructed, Row 2: error

# --- 2019 HUMIDITY ---
axes[0, 0].plot(t, hum_2019)
axes[0, 0].set_title("Humidity 2019 - Original")
axes[0, 0].set_ylabel("Hum (%)")
axes[0, 0].grid(True, alpha=0.3)

axes[1, 0].plot(t, hum_2019_recon)
axes[1, 0].set_title(f"Humidity 2019 - Reconstructed (K={K})")
axes[1, 0].set_ylabel("Hum (%)")
axes[1, 0].grid(True, alpha=0.3)

axes[2, 0].plot(t, hum_2019 - hum_2019_recon)
axes[2, 0].set_title(f"Humidity 2019 - Error (RMSE={hum_2019_rmse:.2f})")
axes[2, 0].set_xlabel("Sample index")
axes[2, 0].set_ylabel("Error")
axes[2, 0].grid(True, alpha=0.3)

# --- 2024 HUMIDITY ---
axes[0, 1].plot(t, hum_2024)
axes[0, 1].set_title("Humidity 2024 - Original")
axes[0, 1].grid(True, alpha=0.3)

axes[1, 1].plot(t, hum_2024_recon)
axes[1, 1].set_title(f"Humidity 2024 - Reconstructed (K={K})")
axes[1, 1].grid(True, alpha=0.3)

axes[2, 1].plot(t, hum_2024 - hum_2024_recon)
axes[2, 1].set_title(f"Humidity 2024 - Error (RMSE={hum_2024_rmse:.2f})")
axes[2, 1].set_xlabel("Sample index")
axes[2, 1].grid(True, alpha=0.3)

plt.suptitle("Sparse Fourier reconstruction of humidity (2019 vs 2024)", y=1.02)
plt.tight_layout()
plt.show()

# ----------------------------------------------------
# 7. PLOTS: PRECIPITATION 2019 vs 2024
# ----------------------------------------------------
fig, axes = plt.subplots(3, 2, figsize=(12, 8), sharex=True)

# --- 2019 PRECIP ---
axes[0, 0].plot(t, prec_2019)
axes[0, 0].set_title("Precipitation 2019 - Original")
axes[0, 0].set_ylabel("Prec (mm)")
axes[0, 0].grid(True, alpha=0.3)

axes[1, 0].plot(t, prec_2019_recon)
axes[1, 0].set_title(f"Precipitation 2019 - Reconstructed (K={K})")
axes[1, 0].set_ylabel("Prec (mm)")
axes[1, 0].grid(True, alpha=0.3)

axes[2, 0].plot(t, prec_2019 - prec_2019_recon)
axes[2, 0].set_title(f"Precipitation 2019 - Error (RMSE={prec_2019_rmse:.2f})")
axes[2, 0].set_xlabel("Sample index")
axes[2, 0].set_ylabel("Error")
axes[2, 0].grid(True, alpha=0.3)

# --- 2024 PRECIP ---
axes[0, 1].plot(t, prec_2024)
axes[0, 1].set_title("Precipitation 2024 - Original")
axes[0, 1].grid(True, alpha=0.3)

axes[1, 1].plot(t, prec_2024_recon)
axes[1, 1].set_title(f"Precipitation 2024 - Reconstructed (K={K})")
axes[1, 1].grid(True, alpha=0.3)

axes[2, 1].plot(t, prec_2024 - prec_2024_recon)
axes[2, 1].set_title(f"Precipitation 2024 - Error (RMSE={prec_2024_rmse:.2f})")
axes[2, 1].set_xlabel("Sample index")
axes[2, 1].grid(True, alpha=0.3)

plt.suptitle("Sparse Fourier reconstruction of precipitation (2019 vs 2024)", y=1.02)
plt.tight_layout()
plt.show()
Years present: [2019 2020 2021 2022 2023 2024]
Samples 2019: 7198
Samples 2024: 1763
Segment length used for each year: 1024
Using K=20 largest Fourier coefficients for each signal/year
Humidity RMSE 2019: 8.851
Humidity RMSE 2024: 5.980
Precipitation RMSE 2019: 0.367
Precipitation RMSE 2024: 0.967
[Figure: sparse Fourier reconstruction of humidity, 2019 vs 2024]
[Figure: sparse Fourier reconstruction of precipitation, 2019 vs 2024]

Compressed sensing–style comparison between 2019 and 2024 (humidity and precipitation)¶

To compare how compressible the humidity Hum and precipitation Prec signals are in different years, I took equally long segments of length N from 2019 and 2024 and applied a sparse Fourier reconstruction to each of them. For each variable and year, I computed the FFT, kept only the K largest Fourier coefficients (e.g. K = 20) and reconstructed the signal using the inverse FFT.

The resulting figures show, for both 2019 and 2024:

  • the original time series segment,
  • the reconstructed signal using only K non-zero Fourier coefficients,
  • and the reconstruction error together with the RMSE.

Humidity is relatively smooth in time, so both years can be approximated reasonably well with a small number of Fourier modes. Precipitation is more intermittent and spiky, so the sparse reconstruction tends to smooth out individual rainfall events and the RMSE is larger. Comparing 2019 and 2024 side by side highlights how the same compressed-sensing style method behaves under different climatic conditions, even though the reconstruction algorithm is exactly the same.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Hum", "Prec"]).copy()
df = df.sort_values("UTC").reset_index(drop=True)

df["YEAR"] = df["UTC"].dt.year

# Restrict to 2019–2024
years = list(range(2019, 2025))
df = df[df["YEAR"].isin(years)].copy()

print("Years present:", sorted(df["YEAR"].unique()))
print("Total rows:", len(df))

# ----------------------------------------------------
# 2. DEFINE "FOG-LIKE" HUMID CONDITIONS (same criterion as before)
#    Hum ≥ 95 % and -2 °C ≤ Temp ≤ 15 °C
# ----------------------------------------------------
if "Temp" not in df.columns:
    raise KeyError("Column 'Temp' not found in dataframe; check CSV columns.")

fog_mask = (
    (df["Hum"] >= 95.0) &
    (df["Temp"] >= -2.0) &
    (df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_mask

# Also define "rainy" as Prec > 0 for reference
df["RAIN_FLAG"] = df["Prec"] > 0

# ----------------------------------------------------
# 3. YEARLY SUMMARY STATISTICS
# ----------------------------------------------------
summary = (
    df.groupby("YEAR")
    .agg(
        hum_mean=("Hum", "mean"),
        hum_median=("Hum", "median"),
        hum_p95=("Hum", lambda x: np.percentile(x, 95)),
        fog_hours=("FOG_FLAG", "sum"),
        total_hours=("FOG_FLAG", "size"),
        rainy_hours=("RAIN_FLAG", "sum"),
        prec_mean=("Prec", "mean")
    )
    .reset_index()
)

summary["fog_pct"] = 100.0 * summary["fog_hours"] / summary["total_hours"]
summary["rainy_pct"] = 100.0 * summary["rainy_hours"] / summary["total_hours"]

print(summary)

# ----------------------------------------------------
# 4. PLOTS: HUMIDITY + FOG PERCENTAGE 2019–2024
# ----------------------------------------------------
years_arr = summary["YEAR"].values

fig, ax1 = plt.subplots(figsize=(9, 5))

# --- Left axis: mean humidity per year ---
ax1.plot(years_arr, summary["hum_mean"], marker="o", linewidth=2, label="Mean Humidity")
ax1.fill_between(
    years_arr,
    summary["hum_median"],
    summary["hum_p95"],
    alpha=0.15,
    label="Median–95th percentile range"
)
ax1.set_xlabel("Year")
ax1.set_ylabel("Humidity (%)")
ax1.grid(True, axis="both", alpha=0.3)
ax1.set_xticks(years_arr)

# Mark the power plant shutdown year (2023)
ax1.axvline(x=2023, color="gray", linestyle="--", alpha=0.7)
ax1.text(2023 + 0.05, ax1.get_ylim()[1] * 0.95,
         "Power plant shutdown\n(cooling towers off)",
         fontsize=9, va="top", ha="left", color="gray")

# --- Right axis: percentage of fog-like hours ---
ax2 = ax1.twinx()
ax2.plot(
    years_arr,
    summary["fog_pct"],
    marker="s",
    linewidth=2,
    color="tab:red",
    label="Fog-like hours (%)"
)
ax2.set_ylabel("Fog-like hours (%)", color="tab:red")
ax2.tick_params(axis="y", labelcolor="tab:red")

# Combine legends from both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

plt.title("Humidity and fog-like conditions per year (2019–2024)")
plt.tight_layout()
plt.show()

# ----------------------------------------------------
# 5. OPTIONAL: PRECIPITATION / RAINY HOURS PER YEAR
# ----------------------------------------------------
fig, ax3 = plt.subplots(figsize=(9, 4))

ax3.bar(years_arr - 0.15, summary["rainy_pct"], width=0.3, label="Rainy hours (%)")
ax3.bar(years_arr + 0.15, summary["prec_mean"], width=0.3, label="Mean Prec (mm)")
ax3.set_xticks(years_arr)
ax3.set_xlabel("Year")
ax3.set_ylabel("Value")
ax3.grid(True, axis="y", alpha=0.3)
ax3.axvline(x=2023, color="gray", linestyle="--", alpha=0.7)
ax3.set_title("Rain statistics per year (2019–2024)")
ax3.legend()
plt.tight_layout()
plt.show()
Years present: [np.int32(2019), np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
Total rows: 43343
   YEAR   hum_mean  hum_median  hum_p95  fog_hours  total_hours  rainy_hours  \
0  2019  83.189219        89.0    100.0       2015         7198         1145   
1  2020  85.372612        90.0    100.0       2916         8690         1501   
2  2021  84.572478        90.0    100.0       2811         8713         1573   
3  2022  82.986023        87.0     99.0       1910         8228         1116   
4  2023  83.006056        88.0     99.0       1926         8751         1436   
5  2024  85.655133        88.0     99.0        410         1763          477   

   prec_mean    fog_pct  rainy_pct  
0   0.191803  27.993887  15.907196  
1   0.195075  33.555811  17.272727  
2   0.209388  32.262137  18.053483  
3   0.151920  23.213418  13.563442  
4   0.252063  22.008913  16.409553  
5   0.416563  23.255814  27.056154  
[Figure: humidity and fog-like conditions per year, 2019–2024]
[Figure: rain statistics per year, 2019–2024]

Yearly comparison 2019–2024: humidity, fog and the power plant shutdown¶

To highlight the impact of the local power plant on humidity and fog, I summarised my hourly weather data year by year from 2019 to 2024. For each year I computed:

  • the mean relative humidity,
  • the median and 95th percentile of Hum (to characterise the upper tail of very humid situations),
  • the percentage of fog-like hours, defined as records with Hum ≥ 95 % and −2 °C ≤ Temp ≤ 15 °C,
  • and, for reference, the percentage of rainy hours and the mean precipitation.

The main plot shows mean humidity per year together with a shaded band between the median and the 95th percentile, and a second curve with the percentage of fog-like hours. A vertical dashed line marks 2023, the year when the thermal power plant was shut down and the cooling towers stopped injecting additional moisture into the local atmosphere.

This year-by-year view makes it easier to see whether there is a systematic drop in high-humidity and fog conditions after 2023. If the cooling towers were significantly increasing local humidity, we would expect:

  • lower mean humidity and/or a lower 95th percentile in 2023–2024 compared to previous years, and
  • a reduction in the percentage of fog-like hours after the shutdown.

Although this analysis is purely observational and does not prove causality by itself, it provides a clear visual summary of how the humidity and fog regime evolved between 2019 and 2024 around the power plant shutdown.

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------
df = pd.read_csv("datasets/1363X-20190215-20200416.csv", sep=";")

df["UTC"] = pd.to_datetime(df["UTC"], errors="coerce", utc=True)
df = df.dropna(subset=["UTC", "Hum", "Temp", "Prec"]).copy()
df = df.sort_values("UTC").reset_index(drop=True)

df["YEAR"] = df["UTC"].dt.year
df["HOUR"] = df["UTC"].dt.hour

# Restrict to 2019–2024
years = list(range(2019, 2025))
df = df[df["YEAR"].isin(years)].copy()

print("Years present:", sorted(df["YEAR"].unique()))
print("Total rows:", len(df))

# ----------------------------------------------------
# 2. DEFINE FOG-LIKE CONDITIONS
#    same criterion as before: Hum ≥ 95 %, -2 °C ≤ Temp ≤ 15 °C
# ----------------------------------------------------
fog_mask = (
    (df["Hum"] >= 95.0) &
    (df["Temp"] >= -2.0) &
    (df["Temp"] <= 15.0)
)
df["FOG_FLAG"] = fog_mask
df["RAIN_FLAG"] = df["Prec"] > 0

# ----------------------------------------------------
# 3. YEARLY SUMMARY (ALL HOURS)
# ----------------------------------------------------
summary_all = (
    df.groupby("YEAR")
    .agg(
        hum_mean=("Hum", "mean"),
        hum_median=("Hum", "median"),
        hum_p95=("Hum", lambda x: np.percentile(x, 95)),
        fog_hours=("FOG_FLAG", "sum"),
        total_hours=("FOG_FLAG", "size"),
        rainy_hours=("RAIN_FLAG", "sum"),
        prec_mean=("Prec", "mean"),
    )
    .reset_index()
)

summary_all["fog_pct"] = 100.0 * summary_all["fog_hours"] / summary_all["total_hours"]
summary_all["rainy_pct"] = 100.0 * summary_all["rainy_hours"] / summary_all["total_hours"]

print("\n=== Yearly summary (all hours) ===")
print(summary_all)

# ----------------------------------------------------
# 4. YEARLY SUMMARY (DAWN HOURS ONLY: 00–07)
# ----------------------------------------------------
dawn_mask = df["HOUR"].between(0, 7)  # 0,1,2,3,4,5,6,7
df_dawn = df[dawn_mask].copy()

summary_dawn = (
    df_dawn.groupby("YEAR")
    .agg(
        hum_mean=("Hum", "mean"),
        hum_median=("Hum", "median"),
        hum_p95=("Hum", lambda x: np.percentile(x, 95)),
        fog_hours=("FOG_FLAG", "sum"),
        total_hours=("FOG_FLAG", "size"),
        rainy_hours=("RAIN_FLAG", "sum"),
        prec_mean=("Prec", "mean"),
    )
    .reset_index()
)

summary_dawn["fog_pct"] = 100.0 * summary_dawn["fog_hours"] / summary_dawn["total_hours"]
summary_dawn["rainy_pct"] = 100.0 * summary_dawn["rainy_hours"] / summary_dawn["total_hours"]

print("\n=== Yearly summary (dawn hours 00–07) ===")
print(summary_dawn)

# ----------------------------------------------------
# 5. SIMPLE PRE vs POST AGGREGATION
#    pre: 2019–2022, post: 2023–2024
# ----------------------------------------------------
pre_years = [2019, 2020, 2021, 2022]
post_years = [2023, 2024]

def aggregate_period(summary_df, years, label):
    sub = summary_df[summary_df["YEAR"].isin(years)]
    out = {
        "period": label,
        "years": f"{years[0]}–{years[-1]}",
        "hum_mean": sub["hum_mean"].mean(),
        "hum_p95": sub["hum_p95"].mean(),
        "fog_pct": sub["fog_pct"].mean(),
        "rainy_pct": sub["rainy_pct"].mean(),
        "prec_mean": sub["prec_mean"].mean(),
    }
    return out

# Global (all hours)
pre_all = aggregate_period(summary_all, pre_years, "pre_all")
post_all = aggregate_period(summary_all, post_years, "post_all")

comparison_all = pd.DataFrame([pre_all, post_all])
print("\n=== Pre vs Post comparison (ALL HOURS) ===")
print(comparison_all)

# Dawn only
pre_dawn = aggregate_period(summary_dawn, pre_years, "pre_dawn")
post_dawn = aggregate_period(summary_dawn, post_years, "post_dawn")

comparison_dawn = pd.DataFrame([pre_dawn, post_dawn])
print("\n=== Pre vs Post comparison (DAWN HOURS 00–07) ===")
print(comparison_dawn)

# ----------------------------------------------------
# 6. PLOTS: GLOBAL AND DAWN COMPARISON (HUMIDITY + FOG%)
# ----------------------------------------------------
years_arr = summary_all["YEAR"].values

# Global
fig, ax1 = plt.subplots(figsize=(9, 5))
ax1.plot(years_arr, summary_all["hum_mean"], marker="o", linewidth=2, label="Mean Humidity (all hours)")
ax1.set_xlabel("Year")
ax1.set_ylabel("Humidity (%)")
ax1.grid(True, alpha=0.3)
ax1.set_xticks(years_arr)

ax2 = ax1.twinx()
ax2.plot(years_arr, summary_all["fog_pct"], marker="s", linewidth=2, color="tab:red",
         label="Fog-like hours (%) (all hours)")
ax2.set_ylabel("Fog-like hours (%)", color="tab:red")
ax2.tick_params(axis="y", labelcolor="tab:red")

ax1.axvline(x=2023, color="gray", linestyle="--", alpha=0.7)
ax1.text(2023 + 0.05, ax1.get_ylim()[1] * 0.95,
         "Power plant shutdown\ncooling towers off (2023)",
         fontsize=9, va="top", ha="left", color="gray")

lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

plt.title("2019–2024: humidity and fog-like hours (all hours)")
plt.tight_layout()
plt.show()

# Dawn-only plot
years_arr_dawn = summary_dawn["YEAR"].values

fig, ax3 = plt.subplots(figsize=(9, 5))
ax3.plot(years_arr_dawn, summary_dawn["hum_mean"], marker="o", linewidth=2,
         label="Mean Humidity (dawn 00–07)")
ax3.set_xlabel("Year")
ax3.set_ylabel("Humidity (%)")
ax3.grid(True, alpha=0.3)
ax3.set_xticks(years_arr_dawn)

ax4 = ax3.twinx()
ax4.plot(years_arr_dawn, summary_dawn["fog_pct"], marker="s", linewidth=2, color="tab:red",
         label="Fog-like hours (%) (dawn 00–07)")
ax4.set_ylabel("Fog-like hours (%)", color="tab:red")
ax4.tick_params(axis="y", labelcolor="tab:red")

ax3.axvline(x=2023, color="gray", linestyle="--", alpha=0.7)
ax3.text(2023 + 0.05, ax3.get_ylim()[1] * 0.95,
         "Power plant shutdown (2023)",
         fontsize=9, va="top", ha="left", color="gray")

lines3, labels3 = ax3.get_legend_handles_labels()
lines4, labels4 = ax4.get_legend_handles_labels()
ax3.legend(lines3 + lines4, labels3 + labels4, loc="upper left")

plt.title("2019–2024: humidity and fog-like hours (dawn 00–07)")
plt.tight_layout()
plt.show()
Years present: [np.int32(2019), np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
Total rows: 43343

=== Yearly summary (all hours) ===
   YEAR   hum_mean  hum_median  hum_p95  fog_hours  total_hours  rainy_hours  \
0  2019  83.189219        89.0    100.0       2015         7198         1145   
1  2020  85.372612        90.0    100.0       2916         8690         1501   
2  2021  84.572478        90.0    100.0       2811         8713         1573   
3  2022  82.986023        87.0     99.0       1910         8228         1116   
4  2023  83.006056        88.0     99.0       1926         8751         1436   
5  2024  85.655133        88.0     99.0        410         1763          477   

   prec_mean    fog_pct  rainy_pct  
0   0.191803  27.993887  15.907196  
1   0.195075  33.555811  17.272727  
2   0.209388  32.262137  18.053483  
3   0.151920  23.213418  13.563442  
4   0.252063  22.008913  16.409553  
5   0.416563  23.255814  27.056154  

=== Yearly summary (dawn hours 00–07) ===
   YEAR   hum_mean  hum_median  hum_p95  fog_hours  total_hours  rainy_hours  \
0  2019  93.409532        95.0    100.0       1184         2371          380   
1  2020  94.368949        96.0    100.0       1696         2892          539   
2  2021  93.401721        96.0    100.0       1622         2905          551   
3  2022  91.895560        94.0    100.0       1196         2748          390   
4  2023  92.030822        94.0    100.0       1146         2920          510   
5  2024  90.705387        92.0     99.0        210          594          156   

   prec_mean    fog_pct  rainy_pct  
0   0.173260  49.936736  16.026993  
1   0.205325  58.644537  18.637621  
2   0.210396  55.834768  18.967298  
3   0.154585  43.522562  14.192140  
4   0.239315  39.246575  17.465753  
5   0.353199  35.353535  26.262626  

=== Pre vs Post comparison (ALL HOURS) ===
     period      years   hum_mean  hum_p95    fog_pct  rainy_pct  prec_mean
0   pre_all  2019–2022  84.030083    99.75  29.256313  16.199212   0.187047
1  post_all  2023–2024  84.330595    99.00  22.632364  21.732854   0.334313

=== Pre vs Post comparison (DAWN HOURS 00–07) ===
      period      years   hum_mean  hum_p95    fog_pct  rainy_pct  prec_mean
0   pre_dawn  2019–2022  93.268941    100.0  51.984650  16.956013   0.185892
1  post_dawn  2023–2024  91.368105     99.5  37.300055  21.864190   0.296257
[Figure: humidity and fog-like hours per year, all hours]
[Figure: humidity and fog-like hours per year, dawn hours 00–07]

Simple pre/post comparison around the power plant shutdown¶

To quantify the potential impact of the thermal power plant and its cooling towers on local humidity and fog, I split the data into two periods:

  • Pre-shutdown: 2019–2022
  • Post-shutdown: 2023–2024

For each period, I computed simple averages over the years:

  • mean relative humidity Hum,
  • 95th percentile of humidity (to characterise very humid conditions),
  • percentage of fog-like hours, defined as Hum ≥ 95 % and −2 °C ≤ Temp ≤ 15 °C,
  • percentage of rainy hours and mean precipitation Prec (for context).

I did this both for all hours and for dawn hours only (00:00–07:00), as fog is especially sensitive to humidity during the night and early morning.

The pre/post tables and plots show how these quantities change between the two periods. In particular, I looked for:

  • any drop in mean humidity and in the 95th percentile after 2023,
  • and any reduction in the fraction of fog-like hours, especially during dawn.

These trends are consistent with the qualitative expectation that, once the cooling towers stop operating, the additional moisture input to the local atmosphere should decrease.

However, this analysis has an important limitation: I did not have access to detailed production or operation data for the power plant. In reality, the different units were progressively shut down between 2022 and 2023, so the effect on humidity is likely gradual rather than instantaneous. The pre/post comparison in this assignment should therefore be interpreted as an exploratory signal in the meteorological data, not as a definitive causal proof of the impact of the plant.
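
As a rough sanity check on the size of the pre/post difference (explicitly not a causal test), the fog-hour fractions can be compared with a two-proportion z-test. The sketch below reuses summary_all, pre_years and post_years from the cell above; it treats hourly records as independent, which they are not, so the p-value should only be read as an order-of-magnitude indication.

import numpy as np
from scipy import stats

# Assumes summary_all, pre_years and post_years from the cell above are in scope
pre = summary_all[summary_all["YEAR"].isin(pre_years)]
post = summary_all[summary_all["YEAR"].isin(post_years)]

fog_pre, n_pre = pre["fog_hours"].sum(), pre["total_hours"].sum()
fog_post, n_post = post["fog_hours"].sum(), post["total_hours"].sum()

p_pre, p_post = fog_pre / n_pre, fog_post / n_post

# Pooled two-proportion z-test (hourly records assumed independent, which is optimistic)
p_pool = (fog_pre + fog_post) / (n_pre + n_post)
se = np.sqrt(p_pool * (1 - p_pool) * (1 / n_pre + 1 / n_post))
z = (p_pre - p_post) / se
p_value = 2 * stats.norm.sf(abs(z))

print(f"Fog-like fraction pre (2019-2022):  {p_pre:.3f} ({fog_pre}/{n_pre})")
print(f"Fog-like fraction post (2023-2024): {p_post:.3f} ({fog_post}/{n_post})")
print(f"Two-proportion z = {z:.2f}, p ~ {p_value:.2g}")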

After almost 47 years in operation, the As Pontes thermal power plant stopped producing electricity. Beyond its industrial role, the plant and its cooling towers were a defining element of the local landscape and microclimate. In this analysis I use meteorological data from 2019–2024 to explore whether the shutdown and progressive phase-out of the units between 2022 and 2023 left a detectable signal in humidity and fog conditions.
