Adrian Torres - Fab Lab León - Fab Futures - Data Science

Week 3: Probability¶

December 2, 2025

After attending Neil's class, my goal for the probability week is to quantify uncertainty.

To do this, Neil showed us several examples. I would highlight the covariance matrix, the variance matrix, and entropy in its different forms. I also want to create histograms.

So now, with my "friend" ChatGPT, I am going to work with data on travelers boarding and alighting in the Barcelona core.

Prompt: I need to create a series of histograms using a multidimensional distribution's covariance matrix. I want to use this data file. I need the program to run on my Jupyter server.

December 3, 2025

Covariance matrix¶

Multidimensional analysis with a covariance matrix, using the Barcelona file

The idea:

  • Use three variables:
    • MINUTOS (minutes from midnight, derived from TRAMO_HORARIO)
    • VIAJEROS_SUBIDOS (travelers boarding)
    • VIAJEROS_BAJADOS (travelers alighting)
  • Calculate the mean and covariance matrix of the 3D vector (see the definition after this list).
  • Generate synthetic samples from a multivariate normal distribution with these parameters.
  • Draw histograms for each dimension (real data vs. simulated data).
  • Visualize the covariance matrix as a heat map.
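
For reference, with observations $x_k \in \mathbb{R}^3$ and mean vector $\bar{x}$, the sample covariance matrix that np.cov computes (with its default ddof=1) is

$$\Sigma_{ij} = \frac{1}{N-1} \sum_{k=1}^{N} (x_{k,i} - \bar{x}_i)(x_{k,j} - \bar{x}_j)$$

where the diagonal entries are the variances of the individual variables and the off-diagonal entries measure how pairs of variables vary together.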
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ============================================
# 1. Load data
# ============================================
df = pd.read_csv(
    "datasets/barcelona_viajeros_por_franja_csv.csv",
    encoding="latin-1",
    sep=";"
)

# Optional: we make sure to work only with the Barcelona core
df = df[df["NUCLEO_CERCANIAS"] == "BARCELONA"].copy()

# ============================================
# 2. Convert TIME_SECTION to minutes from midnight
# ============================================
def time_to_min(tramo):
    """
    Convierte "HH:MM - HH:MM" en minutos desde medianoche usando el inicio del tramo.
    """
    inicio = tramo.split("-")[0].strip()
    h, m = map(int, inicio.split(":"))
    return h * 60 + m

df["MINUTOS"] = df["TRAMO_HORARIO"].apply(time_to_min)

# ============================================
# 3. Select numerical variables for the multidimensional model
# ============================================
# Feature vector: [MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS]
cols_feat = ["MINUTOS", "VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]

# Drop rows with NaN just in case
df_clean = df[cols_feat].dropna().copy()

X = df_clean.values.astype(float)   # shape (N, 3)

# ============================================
# 4. Calculate mean and covariance matrix
# ============================================
# Mean (3-dimensional vector)
mean_vec = X.mean(axis=0)

# Covariance matrix (3x3): rows = observations, columns = variables -> rowvar=False
cov_mat = np.cov(X, rowvar=False)

print("Media (MINUTOS, SUBIDOS, BAJADOS):")
print(mean_vec)
print("\nMatriz de covarianza:")
print(cov_mat)

# ============================================
# 5. Generate synthetic samples from the multivariate normal distribution
# ============================================
num_samples = 5000  # you can change this number

X_synth = np.random.multivariate_normal(
    mean=mean_vec,
    cov=cov_mat,
    size=num_samples
)  # shape (num_samples, 3)

# ============================================
# 6. Histograms: real vs simulated data
# ============================================
labels = ["MINUTES", "PASSENGERS_BOARDING", "PASSENGERS_OFF"]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, ax in enumerate(axes):
    # Histogram of real data
    ax.hist(X[:, i], bins=30, alpha=0.6, label="Real", density=True)
    # Histogram of synthetic data
    ax.hist(X_synth[:, i], bins=30, alpha=0.4, label="Synthetic (MVN)", density=True)
    ax.set_title(labels[i])
    ax.set_xlabel(labels[i])
    ax.set_ylabel("Densidad")
    ax.legend()

plt.suptitle("Histograms by dimension – Real data vs multidimensional distribution")
plt.tight_layout()
plt.show()

# ============================================
# 7. Visualize the covariance matrix as a heat map
# ============================================
fig, ax = plt.subplots(figsize=(4, 4))
im = ax.imshow(cov_mat)

# Axis labels
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_yticklabels(labels)

# We added a color bar
cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Covariance")

plt.title("Covariance Matrix")
plt.tight_layout()
plt.show()
Mean (MINUTOS, SUBIDOS, BAJADOS):
[836.40085187  87.42948415  87.42948415]

Covariance matrix:
[[115821.19691288  -2823.9142121    -737.45504051]
 [ -2823.9142121   34545.56011509  27361.6121861 ]
 [  -737.45504051  27361.6121861   43711.01999675]]
[Figure: "Histograms by dimension – Real data vs multidimensional distribution"]
[Figure: "Covariance Matrix" heat map]
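
Since each variable has its own units (minutes vs. traveler counts), the raw covariances are hard to compare across cells. As a complementary check, not part of the original prompt, here is a minimal sketch that rescales the covariance matrix into a correlation matrix, reusing the X matrix from the cell above:

import numpy as np

# Correlation = covariance normalized by the standard deviations,
# so every entry lands in [-1, 1] regardless of each variable's units.
corr_mat = np.corrcoef(X, rowvar=False)  # X is the (N, 3) data matrix from above
print(np.round(corr_mat, 3))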

Variance matrix¶

In this case I am going to calculate the variance of each dimension and assemble the variance matrix: the diagonal matrix that keeps the variances and discards the off-diagonal covariances.

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ============================================
# 1. Load data
# ============================================
# Adjust the path if the CSV is in a different folder
df = pd.read_csv(
    "datasets/barcelona_viajeros_por_franja_csv.csv",
    encoding="latin-1",
    sep=";"
)

# Optional: ensure that we only work with the Barcelona core
df = df[df["NUCLEO_CERCANIAS"] == "BARCELONA"].copy()

# ============================================
# 2. Convert TIME_SECTION to minutes from midnight
# ============================================
def time_to_min(tramo):
    """
    Convert a string like 'HH:MM - HH:MM'
    to minutes from midnight using the start of the slot.
    """
    inicio = tramo.split("-")[0].strip()
    h, m = map(int, inicio.split(":"))
    return h * 60 + m

df["MINUTOS"] = df["TRAMO_HORARIO"].apply(time_to_min)

# ============================================
# 3. Select numeric variables
# ============================================
# Multidimensional vector: [MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS]
cols_feat = ["MINUTOS", "VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]

# We clean up potential NaNs
df_clean = df[cols_feat].dropna().copy()

# Data matrix (N, 3)
X = df_clean.values.astype(float)

# ============================================
# 4. Calculate means, variances, and variance matrix
# ============================================
# Mean of each dimension
mean_vec = X.mean(axis=0)

# Variance of each dimension (ddof=0 -> population variance)
var_vec = X.var(axis=0, ddof=0)

# Variance matrix (diagonal)
var_mat = np.diag(var_vec)

print("Media (MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS):")
print(mean_vec)

print("\nVector de varianzas (MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS):")
print(var_vec)

print("\nVariance matrix (matriz de varianza, diagonal):")
print(var_mat)

# ============================================
# 5. Histograms for each dimension (series of histograms)
# ============================================
labels = ["MINUTES", "PASSENGERS_BOARDING", "PASSENGERS_OFF"]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, ax in enumerate(axes):
    ax.hist(X[:, i], bins=30, alpha=0.8)
    ax.set_title(f"Histogram of {labels[i]}")
    ax.set_xlabel(labels[i])
    ax.set_ylabel("Frequency")

plt.suptitle("Histograms of the multidimensional distribution (by dimension)")
plt.tight_layout()
plt.show()

# ============================================
# 6. (Optional) Compare with synthetic samples
# from a multivariate normal with this mean and variance matrix
# ============================================
num_samples = 5000

# We generate each dimension as an independent Normal(mean, variance)
# (variance matrix -> no off-diagonal covariances)
X_synth = np.zeros((num_samples, 3))
for i in range(3):
    X_synth[:, i] = np.random.normal(
        loc=mean_vec[i],
        scale=np.sqrt(var_vec[i]),
        size=num_samples
    )

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, ax in enumerate(axes):
    ax.hist(X[:, i], bins=30, alpha=0.6, density=True, label="Real")
    ax.hist(X_synth[:, i], bins=30, alpha=0.4, density=True, label="Synthetic (variance matrix)")
    ax.set_title(labels[i])
    ax.set_xlabel(labels[i])
    ax.set_ylabel("Density")
    ax.legend()

plt.suptitle("Real vs synthetic distributions (variance matrix)")
plt.tight_layout()
plt.show()
Mean (MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS):
[836.40085187  87.42948415  87.42948415]

Variance vector (MINUTOS, VIAJEROS_SUBIDOS, VIAJEROS_BAJADOS):
[115793.79009866  34537.38558596  43700.67664133]

Variance matrix (diagonal):
[[115793.79009866      0.              0.        ]
 [     0.          34537.38558596      0.        ]
 [     0.              0.          43700.67664133]]
[Figure: "Histograms of the multidimensional distribution (by dimension)"]
[Figure: "Real vs synthetic distributions (variance matrix)"]
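
One detail worth noting: the diagonal of the covariance matrix in the previous section (e.g. 115821.20 for MINUTOS) differs slightly from the variance vector here (115793.79). The reason is that np.cov defaults to the sample estimator (ddof=1, dividing by N-1), while X.var(axis=0, ddof=0) divides by N. A minimal check, reusing the X matrix from the cell above:

import numpy as np

# With matching ddof, the diagonal of the covariance matrix equals
# the per-dimension variance vector exactly.
cov_pop = np.cov(X, rowvar=False, ddof=0)  # population covariance (divide by N)
print(np.allclose(np.diag(cov_pop), X.var(axis=0, ddof=0)))  # expected: True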

Entropy¶

I'm going to construct probability distributions over the time slots and compare their entropy in four cases:

  1. Empirical (based on the actual data)
  2. Uniform (all time slots equally likely)
  3. Discrete Gaussian (peaked around the busiest hours)
  4. One-hot (all the weight on the time slot with the most travelers)

I'm going to use the following as the base quantity: VIAJEROS_TOTALES = VIAJEROS_SUBIDOS + VIAJEROS_BAJADOS per time slot.
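
Before running this on the real data, a quick sanity check on the limiting cases: Shannon entropy $H(p) = -\sum_i p_i \log_2 p_i$ is maximal, $\log_2 N$ bits, for a uniform distribution over N outcomes, and exactly 0 for a one-hot distribution. A minimal sketch:

import numpy as np

def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]  # avoid log(0)
    return -np.sum(p * np.log2(p))

print(entropy(np.ones(4) / 4))        # uniform over 4 outcomes: log2(4) = 2.0 bits
print(entropy([1.0, 0.0, 0.0, 0.0]))  # one-hot: -0.0, i.e. exactly 0 bits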

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ============================================
# 1. Load data
# ============================================
df = pd.read_csv(
    "datasets/barcelona_viajeros_por_franja_csv.csv",
    encoding="latin-1",
    sep=";"
)

# We only work with the Barcelona core area (in case there are more)
df = df[df["NUCLEO_CERCANIAS"] == "BARCELONA"].copy()

# ============================================
# 2. Convert TIME_SECTION to minutes from midnight
# ============================================
def time_to_min(tramo):
    """
    Convert 'HH:MM - HH:MM' to minutes from midnight using the start of the slot.
    """
    inicio = tramo.split("-")[0].strip()
    h, m = map(int, inicio.split(":"))
    return h * 60 + m

df["MINUTOS"] = df["TRAMO_HORARIO"].apply(time_to_min)

# ============================================
# 3. Aggregate travelers by time slot (across the entire core area)
# ============================================
df_agg = df.groupby(["TRAMO_HORARIO", "MINUTOS"])[
    ["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]
].sum().reset_index()

# Total number of travelers per time slot
df_agg["VIAJEROS_TOTALES"] = df_agg["VIAJEROS_SUBIDOS"] + df_agg["VIAJEROS_BAJADOS"]

# Sort by time
df_agg = df_agg.sort_values("MINUTOS").reset_index(drop=True)

# Vector of times (minutes) and travelers
t = df_agg["MINUTOS"].values.astype(float)
counts = df_agg["VIAJEROS_TOTALES"].values.astype(float)

# ============================================
# 4. Constructing Probability Distributions
# ============================================
# Shannon entropy function, in bits
def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]  # avoid log(0)
    return -np.sum(p * np.log2(p))

# (a) Empirical distribution
total_counts = counts.sum()
p_emp = counts / total_counts  # distribution over the time slots

# (b) Uniform distribution over the same time slots
N = len(counts)
p_uni = np.ones(N) / N

# (c) Discrete Gaussian distribution over time
# We fit the mean and standard deviation from the data
mu = np.average(t, weights=counts)  # traveler-weighted average
sigma = np.sqrt(np.average((t - mu)**2, weights=counts) + 1e-8)

p_gauss = np.exp(-0.5 * ((t - mu) / sigma)**2)
p_gauss /= p_gauss.sum()  # normalize to 1

# (d) One-hot distribution: all the weight on the slot with the most travelers
idx_max = np.argmax(counts)
p_onehot = np.zeros(N)
p_onehot[idx_max] = 1.0

# ============================================
# 5. Calculate entropies
# ============================================
H_emp    = entropy(p_emp)
H_uni    = entropy(p_uni)
H_gauss  = entropy(p_gauss)
H_onehot = entropy(p_onehot)

print("Entropías (en bits):")
print(f"  Empírica  : {H_emp:.3f}")
print(f"  Uniforme  : {H_uni:.3f}")
print(f"  Gaussiana : {H_gauss:.3f}")
print(f"  One-hot   : {H_onehot:.3f}")

# ============================================
# 6. Visualize the distributions
# ============================================
labels_tramo = df_agg["TRAMO_HORARIO"].values

x = np.arange(N)  # slot index for the X-axis

plt.figure(figsize=(12, 6))
plt.bar(x - 0.3, p_emp, width=0.2, label="Empirical")
plt.bar(x - 0.1, p_uni, width=0.2, label="Uniform")
plt.bar(x + 0.1, p_gauss, width=0.2, label="Discrete Gaussian")
plt.bar(x + 0.3, p_onehot, width=0.2, label="One-hot")

plt.xticks(x, labels_tramo, rotation=90)
plt.xlabel("Time slot")
plt.ylabel("Probability")
plt.title("Distributions over time slots (entropic perspective)")
plt.legend()
plt.tight_layout()
plt.show()

# ============================================
# 7. Visualize entropies in a separate graph
# ============================================
names = ["Empirical", "Uniform", "Gaussian", "One-hot"]
H_vals = [H_emp, H_uni, H_gauss, H_onehot]

plt.figure(figsize=(6,4))
plt.bar(names, H_vals)
plt.ylabel("Entropy (bits)")
plt.title("Comparison of entropy between distributions")
plt.tight_layout()
plt.show()
Entropies (in bits):
  Empirical : 5.052
  Uniform   : 5.358
  Gaussian  : 5.111
  One-hot   : -0.000
[Figure: "Distributions over time slots (entropic perspective)" bar chart]
[Figure: "Comparison of entropy between distributions" bar chart]
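
As expected, the uniform distribution attains the maximum entropy, 5.358 bits, which is log2 of the number of time slots (2^5.358 ≈ 41), and the one-hot distribution has exactly 0 bits; the printed -0.000 is just a floating-point negative zero. The empirical distribution (5.052 bits) is close to the uniform maximum, meaning travelers are spread fairly evenly across the day rather than concentrated in a few slots.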