import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 1. Load the CSV (semicolon-delimited, Latin-1 encoded)
df = pd.read_csv("datasets/barcelona_viajeros_por_franja_csv.csv", sep=";", encoding="latin-1")

# 2. Force the time-slot label to text, then order the rows by that label
df["TRAMO_HORARIO"] = df["TRAMO_HORARIO"].astype(str)
df_sorted = df.sort_values(by="TRAMO_HORARIO")

# 3. Aggregate passengers per time slot: both counters are summed over every
#    row that shares the same slot label
agg = (
    df_sorted
    .groupby("TRAMO_HORARIO")[["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]]
    .sum()
    .reset_index()
)

# 4. Convert "HH:MM - HH:MM" → minutes since midnight (the start of the slot is used)
def time_to_min(tramo):
    """Return the start of a time-slot label as minutes since midnight.

    Only the text before the first "-" is read; after trimming whitespace it
    must look like "H:M" (exactly two integer fields separated by ":").
    """
    start, _, _ = tramo.partition("-")
    hours, minutes = (int(field) for field in start.strip().split(":"))
    return 60 * hours + minutes

# Start of each slot expressed as minutes since midnight
agg["minutos"] = agg["TRAMO_HORARIO"].apply(time_to_min)

# 5. Data for the fit: slot start (minutes) vs. boarded passengers
x = agg["minutos"].values
y = agg["VIAJEROS_SUBIDOS"].values

# 6. High-degree polynomial fit (deliberately overfitted).
# Minutes since midnight run up to ~1440, so powers as high as x**grado make
# the least-squares system badly conditioned when it is solved directly in x
# (which is what np.polyfit does). Polynomial.fit solves the same
# least-squares problem after mapping x linearly onto [-1, 1], which keeps
# the fit numerically stable as the degree grows.
grado = 12  # You can play with this value
poly = np.polynomial.Polynomial.fit(x, y, grado)  # callable on raw minutes
# Coefficients expressed in raw minutes, highest power first (the layout
# np.polyfit returns); kept so code that reads `coef` keeps working.
coef = poly.convert().coef[::-1]

# Dense grid over the observed range to draw a smooth curve
x_fit = np.linspace(x.min(), x.max(), 300)
y_fit = poly(x_fit)

# 7. Plot
plt.figure(figsize=(14, 6))

# Real data
plt.plot(x, y, "o", label="Actual data (passengers boarded)")

# Overfitted curve
plt.plot(x_fit, y_fit, label=f"Overfitted model (grade {grado})")

# X-axis: one tick per slot, labelled with the original slot text
plt.xticks(
    ticks=agg["minutos"],
    labels=agg["TRAMO_HORARIO"],
    rotation=90
)

plt.xlabel("Time slot")
plt.ylabel("Number of passengers boarded")
plt.title("Volume of travelers per time slot (Barcelona)")
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 1. Read the CSV file (Latin-1 text, ";" as field separator)
df = pd.read_csv(
    "datasets/barcelona_viajeros_por_franja_csv.csv", encoding="latin-1", sep=";"
)

# 2. Cast the time-slot column to str and sort the rows on it
df["TRAMO_HORARIO"] = df["TRAMO_HORARIO"].astype(str)
df_sorted = df.sort_values(by="TRAMO_HORARIO")

# 3. Total boarded / alighted passengers for each time slot
agg = (
    df_sorted.groupby("TRAMO_HORARIO")[["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]]
    .sum()
    .reset_index()
)

# 4. Convert "HH:MM - HH:MM" → minutes since midnight (taken from the slot's start time)
def time_to_min(tramo):
    """Convert the start time of a slot label into minutes since midnight.

    The label is cut at the first "-"; the left part, stripped of
    whitespace, is parsed as two ":"-separated integers (hours, minutes).
    """
    start_label = tramo.split("-", 1)[0].strip()
    hour_text, minute_text = start_label.split(":")
    return int(hour_text) * 60 + int(minute_text)

# Start of each slot expressed as minutes since midnight
agg["minutos"] = agg["TRAMO_HORARIO"].apply(time_to_min)

# 5. Data for the fit: slot start (minutes) vs. boarded passengers
x = agg["minutos"].values
y = agg["VIAJEROS_SUBIDOS"].values

# 6. Polynomial fit (degree 2 here, i.e. a low-degree fit; the degree is
# meant to be changed). Polynomial.fit maps x linearly onto [-1, 1] before
# solving the least-squares problem, so the fit stays numerically stable if
# grado is raised; np.polyfit on raw minutes (values up to ~1440) becomes
# badly conditioned at high degrees.
grado = 2  # You can play with this value
poly = np.polynomial.Polynomial.fit(x, y, grado)  # callable on raw minutes
# Coefficients expressed in raw minutes, highest power first (the layout
# np.polyfit returns); kept so code that reads `coef` keeps working.
coef = poly.convert().coef[::-1]

# Dense grid over the observed range to draw a smooth curve
x_fit = np.linspace(x.min(), x.max(), 300)
y_fit = poly(x_fit)

# 7. Plot
plt.figure(figsize=(14, 6))

# Real data
plt.plot(x, y, "o", label="Actual data (passengers boarded)")

# Fitted curve
# NOTE(review): the legend text says "Overfitted" although grado is 2 in this
# cell — confirm the wording is intended.
plt.plot(x_fit, y_fit, label=f"Overfitted model (grade {grado})")

# X-axis: one tick per slot, labelled with the original slot text
plt.xticks(
    ticks=agg["minutos"],
    labels=agg["TRAMO_HORARIO"],
    rotation=90
)

plt.xlabel("Time slot")
plt.ylabel("Number of passengers boarded")
plt.title("Volume of travelers per time slot (Barcelona)")
plt.legend()
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 1. Load the passenger CSV (";"-separated, Latin-1)
df = pd.read_csv("datasets/barcelona_viajeros_por_franja_csv.csv", sep=";", encoding="latin-1")

# 2. Time-slot labels as text; rows ordered by label
df["TRAMO_HORARIO"] = df["TRAMO_HORARIO"].astype(str)
df_sorted = df.sort_values(by="TRAMO_HORARIO")

# 3. Sum both passenger counters within each time slot
agg = (
    df_sorted
    .groupby("TRAMO_HORARIO")[["VIAJEROS_SUBIDOS", "VIAJEROS_BAJADOS"]]
    .sum()
    .reset_index()
)

# 4. Convert "HH:MM - HH:MM" → minutes since midnight (based on the beginning of the slot)
def time_to_min(tramo):
    """Map a slot label to the minute of the day at which the slot starts.

    Everything from the first "-" onwards is ignored; the remaining text is
    trimmed and read as "hours:minutes", both parsed as integers.
    """
    start_text = tramo.split("-")[0]
    fields = [int(piece) for piece in start_text.strip().split(":")]
    hours, minutes = fields
    return hours * 60 + minutes

# Start of each slot expressed as minutes since midnight
agg["minutos"] = agg["TRAMO_HORARIO"].apply(time_to_min)

# 5. Data for the fit: slot start (minutes) vs. boarded passengers
x = agg["minutos"].values
y = agg["VIAJEROS_SUBIDOS"].values

# 6. Polynomial fit (degree 5 here; the degree is meant to be changed).
# Minutes since midnight run up to ~1440, so high powers of raw x make the
# least-squares system badly conditioned when solved directly in x (which is
# what np.polyfit does). Polynomial.fit solves the same problem after mapping
# x linearly onto [-1, 1], which keeps the fit stable as the degree grows.
grado = 5  # You can play with this value
poly = np.polynomial.Polynomial.fit(x, y, grado)  # callable on raw minutes
# Coefficients expressed in raw minutes, highest power first (the layout
# np.polyfit returns); kept so code that reads `coef` keeps working.
coef = poly.convert().coef[::-1]

# Dense grid over the observed range to draw a smooth curve
x_fit = np.linspace(x.min(), x.max(), 300)
y_fit = poly(x_fit)

# 7. Plot
plt.figure(figsize=(14, 6))

# Real data
plt.plot(x, y, "o", label="Actual data (passengers boarded)")

# Fitted curve
plt.plot(x_fit, y_fit, label=f"Overfitted model (grade {grado})")

# X-axis: one tick per slot, labelled with the original slot text
plt.xticks(
    ticks=agg["minutos"],
    labels=agg["TRAMO_HORARIO"],
    rotation=90
)

plt.xlabel("Time slot")
plt.ylabel("Number of passengers boarded")
plt.title("Volume of travelers per time slot (Barcelona)")
plt.legend()
plt.tight_layout()
plt.show()

# Week 2: Fitting

## Overfitting