Week 3: Density Estimation¶
December 4, 2025
In previous classes I was lost but managed to figure things out; in this one I feel completely lost. Neil already warned us about that at the beginning of the class. So, with the few notes I took in class, I'll ask my friend ChatGPT to help me complete this assignment.
The first example is:
- Expectation-Maximization (E-M): Seemingly circular logic. Use the model to update the latent (hidden) variables; use the latent variables to update the model. Iteration maximizes the likelihood. Momentum or random restarts can help with local optima.
Another example Neil discussed is k-means:
K-means: you can choose the number of clusters by plotting the total distance from the points to their cluster centers as a function of the number of clusters and looking for an "elbow".
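To make sure I understood the elbow idea, here is a small sketch (my own addition, assuming scikit-learn is installed; X here is placeholder random data, not the Renfe dataset):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
X = np.random.randn(200, 2)  # placeholder data; any (N, d) array works
inertias = []
ks = range(1, 10)
for k in ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    inertias.append(km.inertia_)  # total squared distance to the nearest center
plt.plot(ks, inertias, marker="o")
plt.xlabel("Number of clusters k")
plt.ylabel("Total within-cluster distance (inertia)")
plt.show()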
Kernel Density Estimation
- Gaussian mixture models: Closely related to kernel density estimation (KDE is the limiting case with one Gaussian kernel on every data point). Soft E-M clustering with Gaussian distributions. Clusters with low mixture weight can be pruned.
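Since the heading is kernel density estimation, here is a tiny KDE sketch I added for myself (made-up 1D data; scipy's gaussian_kde picks a bandwidth with Scott's rule by default):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Placeholder sample: two bumps, so the density is clearly bimodal
data = np.concatenate([np.random.normal(-2, 0.5, 100),
                       np.random.normal(3, 1.0, 100)])
kde = gaussian_kde(data)  # one Gaussian kernel per data point
xs = np.linspace(data.min() - 1, data.max() + 1, 300)
plt.hist(data, bins=30, density=True, alpha=0.5)
plt.plot(xs, kde(xs))
plt.show()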
December 7, 2025
Expectation-Maximization (E-M)¶
It is used when we have:
- A model with hidden (latent) variables
- Example: to which "peak hour group" each data point belongs
The algorithm iterates:
🔹 E-step (Expectation) With the current model, we estimate:
- The probability that each data point belongs to each Gaussian distribution
These are the latent variables.
🔹 M-step (Maximization) With these probabilities:
- We update the model (means, variances, and weights)
Each iteration:
✅ Increases the likelihood ❌ May get stuck in a local optimum ✅ Can be improved with momentum, random restarts, etc.
This is the "seemingly circular logic":
- the model improves the hidden variables, and the hidden variables improve the model.
Prompt: Create a practical example of Density Estimation with Expectation–Maximization (EM) applied to your travelers by time slot in Barcelona.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ============================================
# 1. Load and prepare data
# ============================================
df = pd.read_csv(
"datasets/barcelona_viajeros_por_franja_csv.csv",
encoding="latin-1",
sep=";"
)
# We're keeping only the core of Barcelona
df = df[df["NUCLEO_CERCANIAS"] == "BARCELONA"].copy()
def time_to_min(tramo):
    inicio = tramo.split("-")[0].strip()
    h, m = map(int, inicio.split(":"))
    return h * 60 + m
df["MINUTOS"] = df["TRAMO_HORARIO"].apply(time_to_min)
# Aggregate travelers by time slot
df_agg = df.groupby("MINUTOS")[["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]].sum()
df_agg["VIAJEROS_TOTALES"] = df_agg["VIAJEROS_SUBIDOS"] + df_agg["VIAJEROS_BAJADOS"]
# Variable we are going to model (one dimension): total travelers per slot
X = df_agg["VIAJEROS_TOTALES"].values.astype(float)
# We normalize (very important for numerical stability)
X = (X - X.mean()) / X.std()
N = X.shape[0]
# ============================================
# 2. Initialize GMM (2 Gaussians)
# ============================================
K = 2
np.random.seed(0)
mu = np.random.randn(K)  # initial means
sigma = np.ones(K)       # initial standard deviations
pi = np.ones(K) / K      # initial mixture weights
# ============================================
# 3. Auxiliary functions
# ============================================
def gaussian(x, mu, sigma):
    """Density of a normal N(mu, sigma^2) evaluated at x."""
    return (1.0 / (np.sqrt(2*np.pi) * sigma)) * np.exp(-0.5 * ((x - mu) / sigma)**2)
def log_likelihood(X, mu, sigma, pi):
    ll = 0.0
    for x in X:
        comp = 0.0
        for k in range(K):
            comp += pi[k] * gaussian(x, mu[k], sigma[k])
        ll += np.log(comp + 1e-12)
    return ll
# ============================================
# 4. EM Algorithm
# ============================================
num_iters = 100
loglik = []
for it in range(num_iters):
    # ---------- E-step ----------
    gamma = np.zeros((N, K))  # responsibilities
    for i, x in enumerate(X):
        for k in range(K):
            gamma[i, k] = pi[k] * gaussian(x, mu[k], sigma[k])
        gamma[i, :] /= gamma[i, :].sum()
    Nk = gamma.sum(axis=0)
    # ---------- M-step ----------
    for k in range(K):
        mu[k] = np.sum(gamma[:, k] * X) / Nk[k]
        sigma[k] = np.sqrt(
            np.sum(gamma[:, k] * (X - mu[k])**2) / Nk[k] + 1e-12
        )
        pi[k] = Nk[k] / N
    ll = log_likelihood(X, mu, sigma, pi)
    loglik.append(ll)
# ============================================
# 5. Density Estimation: histogram + estimated density
# ============================================
x_plot = np.linspace(X.min(), X.max(), 300)
pdf = np.zeros_like(x_plot)
for k in range(K):
pdf += pi[k] * gaussian(x_plot, mu[k], sigma[k])
plt.figure(figsize=(10, 5))
plt.hist(X, bins=15, density=True, alpha=0.6, label="Actual (normalized) data")
plt.plot(x_plot, pdf, label="Estimated density (GMM + EM)")
plt.xlabel("Total travelers (normalized)")
plt.ylabel("Density")
plt.title("Density Estimation con EM (Gaussian Mixture, K=2)")
plt.legend()
plt.tight_layout()
plt.show()
# ============================================
# 6. Convergence: log-likelihood
# ============================================
plt.figure(figsize=(6, 4))
plt.plot(loglik)
plt.xlabel("Iteration")
plt.ylabel("Log-likelihood")
plt.title("EM Convergence")
plt.tight_layout()
plt.show()
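A quick check I added on my own after the loop: print the learned parameters, mapping the normalized means back to traveler counts (recomputed from df_agg, since X was normalized in place):
# My addition: inspect the learned parameters, denormalized back to traveler counts
vals = df_agg["VIAJEROS_TOTALES"].values.astype(float)
orig_mean, orig_std = vals.mean(), vals.std()
for k in range(K):
    print(f"Component {k}: weight={pi[k]:.2f}, "
          f"mean={mu[k] * orig_std + orig_mean:.0f} travelers, "
          f"std={sigma[k] * orig_std:.0f}")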
Kernel Density Estimation¶
Gaussian mixture models¶
Prompt: Can you describe a little bit how the Gaussian Mixture Model works?
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes that the data is generated from a mixture of several Gaussian (normal) distributions. Each Gaussian represents a hidden subgroup (cluster) inside the data.
Instead of assigning each data point to exactly one cluster (as in k-means), a GMM performs soft clustering: each point has a probability of belonging to each Gaussian component.
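To make the "soft" part concrete, here is a tiny sketch I added (the weights, means, and standard deviations are made-up numbers, not from the dataset): the responsibility of each component for a point x is its weighted density divided by the total.
import numpy as np
from scipy.stats import norm
# Two made-up Gaussian components: weights pi_k, means mu_k, std devs sigma_k
pi = np.array([0.6, 0.4])
mu = np.array([0.0, 4.0])
sigma = np.array([1.0, 1.5])
x = 1.5
# gamma_k = pi_k * N(x | mu_k, sigma_k^2) / sum_j pi_j * N(x | mu_j, sigma_j^2)
weighted = pi * norm.pdf(x, loc=mu, scale=sigma)
gamma = weighted / weighted.sum()
print(gamma)  # probabilities sum to 1; this point belongs mostly to the first Gaussian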
# =========================================================
# 0. IMPORTS
# =========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # for 3D histograms
# If you don't have scikit-learn, run the following first:
# !pip install scikit-learn
from sklearn.mixture import GaussianMixture
# =========================================================
# 1. LOAD AND PREPARE DATA
# =========================================================
df = pd.read_csv(
"datasets/barcelona_viajeros_por_franja_csv.csv",
encoding="latin-1",
sep=";"
)
# We're keeping only the core of Barcelona
df = df[df["NUCLEO_CERCANIAS"] == "BARCELONA"].copy()
# Convert TRAMO_HORARIO to minutes since midnight
def time_to_min(tramo):
    inicio = tramo.split("-")[0].strip()
    h, m = map(int, inicio.split(":"))
    return h * 60 + m
df["MINUTOS"] = df["TRAMO_HORARIO"].apply(time_to_min)
# Aggregate by time slot (the whole core)
df_agg = df.groupby("MINUTOS")[["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]].sum()
df_agg["VIAJEROS_TOTALES"] = df_agg["VIAJEROS_SUBIDOS"] + df_agg["VIAJEROS_BAJADOS"]
df_agg = df_agg.reset_index().sort_values("MINUTOS")
# Variables for the model
# We use 2D: [MINUTES, TOTAL_TRAVELERS]
X = df_agg[["MINUTOS", "VIAJEROS_TOTALES"]].values.astype(float)
# Normalize (helps the GMM numerically)
X_mean = X.mean(axis=0)
X_std = X.std(axis=0) + 1e-8
X_norm = (X - X_mean) / X_std
# =========================================================
# 2. FIT A GAUSSIAN MIXTURE MODEL (GMM) WITH EM
# =========================================================
n_components = 3 # For example: morning / noon / afternoon
gmm = GaussianMixture(
n_components=n_components,
covariance_type="full",
random_state=0
)
gmm.fit(X_norm) # EM soft clustering
# Weights of each component (to prune unlikely clusters)
weights = gmm.weights_ # size K
print("Pesos de mezcla de los clusters:", weights)
# Pruning: we keep clusters whose weight > threshold
weight_threshold = 0.05
mask_clusters = weights > weight_threshold
idx_clusters_validos = np.where(mask_clusters)[0]
print("Clusters válidos (no podados):", idx_clusters_validos)
# Membership probabilities (soft EM clustering)
responsabilidades = gmm.predict_proba(X_norm)  # shape (N, K)
# Map the argmax over the kept columns back to the original cluster indices
cluster_suave = idx_clusters_validos[responsabilidades[:, idx_clusters_validos].argmax(axis=1)]
# =========================================================
# 3. 2D HISTOGRAM (MINUTES vs TOTAL TRAVELERS)
# =========================================================
plt.figure(figsize=(8, 6))
plt.hist2d(
df_agg["MINUTOS"],
df_agg["VIAJEROS_TOTALES"],
bins=15
)
plt.colorbar(label="Frequency")
plt.xlabel("Minutes since midnight")
plt.ylabel("Total travelers")
plt.title("2D Histogram – MINUTES vs TOTAL TRAVELERS")
plt.tight_layout()
plt.show()
# =========================================================
# 4. 3D HISTOGRAM (MINUTES vs TOTAL TRAVELERS)
# =========================================================
# We use a 2D histogram and extrude it into 3D bars
x = df_agg["MINUTOS"].values
y = df_agg["VIAJEROS_TOTALES"].values
# Define the bin grid
xbins = 10
ybins = 10
H, xedges, yedges = np.histogram2d(x, y, bins=[xbins, ybins])
# We construct the coordinates of the bars
xpos, ypos = np.meshgrid(
(xedges[:-1] + xedges[1:]) / 2,
(yedges[:-1] + yedges[1:]) / 2,
indexing="ij"
)
xpos = xpos.ravel()
ypos = ypos.ravel()
zpos = np.zeros_like(xpos)
dx = (xedges[1] - xedges[0]) * np.ones_like(zpos)
dy = (yedges[1] - yedges[0]) * np.ones_like(zpos)
dz = H.ravel()
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection="3d")
ax.bar3d(xpos, ypos, zpos, dx, dy, dz)
ax.set_xlabel("Minutes since midnight")
ax.set_ylabel("Total travelers")
ax.set_zlabel("Frequency")
ax.set_title("3D Histogram – MINUTES vs TOTAL TRAVELERS")
plt.tight_layout()
plt.show()
# =========================================================
# 5. VISUALIZE GMM CLUSTERS IN 2D
# =========================================================
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
X[:, 0], # MINUTES
X[:, 1], # TOTAL_TRAVELERS
c=cluster_suave,
cmap="viridis"
)
plt.xlabel("Minutes since midnight")
plt.ylabel("Total travelers")
plt.title("Gaussian Mixture Model – soft clustering (valid clusters)")
plt.colorbar(scatter, label="Cluster")
plt.tight_layout()
plt.show()
Cluster mixture weights: [0.23206772 0.4925547 0.27537758] Valid (unpruned) clusters: [0 1 2]
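Something I might try next (my own sketch, reusing X_norm from the code above): instead of fixing n_components=3 and pruning afterwards, let BIC pick the number of components (lower BIC is better):
# My addition: choose the number of GMM components by BIC
import numpy as np
from sklearn.mixture import GaussianMixture
ks = range(1, 7)
bics = [GaussianMixture(n_components=k, covariance_type="full",
                        random_state=0).fit(X_norm).bic(X_norm)
        for k in ks]
best_k = ks[int(np.argmin(bics))]
print("BIC per k:", bics, "-> best k:", best_k)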