Week 4: PCA & FastICA
Libraries¶
sklearn.preprocessing.StandardScaler: Standardize features
sklearn.decomposition.PCA: Principal Component Analysis
sklearn.decomposition.FastICA: Independent Component Analysis
Data Preparation¶
Same steps as from "Week 02: FastF1 Visual Exploration and Modeling"
import fastf1
import pandas as pd
import numpy as np
from typing import Tuple
def load_session(year: int = 2025, gp: str = "Monza", session_name: str = "R"):
    '''
    Load an F1 session using FastF1.

    Inputs
    ------
    year : int
        Championship year, e.g. 2025
    gp : str
        Grand Prix name, e.g. "Monza"
    session_name : str
        Session code: "FP1", "FP2", "FP3", "Q", "R"

    Output
    ------
    session : fastf1.core.Session
        A FastF1 session object with timing + lap data.
    '''
    # Cache makes reruns *much* faster after the first download.
    fastf1.Cache.enable_cache("fastf1_cache_dir")
    session = fastf1.get_session(year, gp, session_name)
    session.load()  # downloads timing data (first time only)
    return session
def build_model_table(session) -> pd.DataFrame:
    '''
    Turn FastF1 lap data into a clean ML table.

    We purposely do NOT use pick_quicklaps(). We keep *all* laps that have
    a valid LapTime + sector times + the required features.

    Inputs
    ------
    session : fastf1.core.Session

    Output
    ------
    df : pd.DataFrame
        Clean modeling table with:
        - Driver
        - TyreLife
        - Compound
        - TrackStatus
        - Sector1TimeSeconds, Sector2TimeSeconds, Sector3TimeSeconds
        - LapTimeSeconds (target)
    '''
    laps = session.laps.copy()  # all laps available

    # Convert timedelta columns to seconds (float)
    def to_seconds(series):
        return series.dt.total_seconds()

    df = pd.DataFrame({
        "Driver": laps["Driver"],
        "TyreLife": laps["TyreLife"],
        "Compound": laps["Compound"],
        "TrackStatus": laps["TrackStatus"],
        "Sector1TimeSeconds": to_seconds(laps["Sector1Time"]),
        "Sector2TimeSeconds": to_seconds(laps["Sector2Time"]),
        "Sector3TimeSeconds": to_seconds(laps["Sector3Time"]),
        "LapTimeSeconds": to_seconds(laps["LapTime"]),
    })

    # Basic cleaning: keep rows where the model can actually learn
    df = df.dropna(subset=[
        "TyreLife", "Compound", "TrackStatus",
        "Sector1TimeSeconds", "Sector2TimeSeconds", "Sector3TimeSeconds", "LapTimeSeconds",
    ]).reset_index(drop=True)

    # Keep only the 3 dry compounds asked for (some sessions also have INTERMEDIATE/WET)
    df = df[df["Compound"].isin(["SOFT", "MEDIUM", "HARD"])].reset_index(drop=True)

    # TrackStatus is usually a string that *looks* like a number.
    # We keep it numeric so models can use it.
    df["TrackStatus"] = pd.to_numeric(df["TrackStatus"], errors="coerce")
    df = df.dropna(subset=["TrackStatus"]).reset_index(drop=True)
    df["TrackStatus"] = df["TrackStatus"].astype(int)

    return df
Loading data¶
session = load_session(year=2025, gp="Monza", session_name="R")
df = build_model_table(session)
df.shape
core INFO  Loading data for Italian Grand Prix - Race [v3.7.0]
req INFO   Using cached data for session_info
req INFO   Using cached data for driver_info
req INFO   Using cached data for session_status_data
req INFO   Using cached data for lap_count
req INFO   Using cached data for track_status_data
req INFO   Using cached data for _extended_timing_data
req INFO   Using cached data for timing_app_data
core INFO  Processing timing data...
req INFO   Using cached data for car_data
req INFO   Using cached data for position_data
req INFO   Using cached data for weather_data
req INFO   Using cached data for race_control_messages
core INFO  Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '44', '23', '5', '12', '6', '55', '87', '22', '30', '31', '10', '43', '18', '14', '27']
(955, 8)
Preprocessing¶
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
feature_cols = ["TyreLife","Compound","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","TrackStatus"]
X = df[feature_cols].copy()
num_cols = ["TyreLife","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","TrackStatus"]
cat_cols = ["Compound"]
prep = ColumnTransformer([
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
X_num = prep.fit_transform(X)
X_num.shape
# Earlier approach (kept for reference): encode Tyre Compound as numbers so it can be used in PCA/ICA
# compound_map = {
# "SOFT":0,
# "MEDIUM":1,
# "HARD":2}
# features["CompoundEncoded"] = features["Compound"].map(compound_map)
# # Now features used for decomposition:
# numeric_features = ["TyreLife", "LapTime", "Sector1Time", "Sector2Time", "Sector3Time", "CompoundEncoded"]
(955, 8)
# scaler = StandardScaler()
# scaled_features = scaler.fit_transform(features[numeric_features])
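Quick sanity check (my own addition): the transformed matrix has 8 columns because the 5 numeric features are kept as-is (just scaled) and Compound becomes 3 one-hot columns. Assuming scikit-learn >= 1.0, the fitted ColumnTransformer can list the output column names:
# List the output columns of the fitted ColumnTransformer (assumes sklearn >= 1.0)
print(prep.get_feature_names_out())
# expected something like:
# ['num__TyreLife' 'num__Sector1TimeSeconds' ... 'cat__Compound_HARD' 'cat__Compound_MEDIUM' 'cat__Compound_SOFT']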
PCA - Principal Component Analysis¶
PCA finds the directions in the data that explain the most variation.
"Think of it as "summarizing" the data with fewer numbers while keeping as much information as possible"
# pca = PCA(n_components=2) # Reduce to 2 dimensions for easy plotting
# pca_components = pca.fit_transform(scaled_features)
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# The one-hot output can be a sparse matrix; PCA needs a dense array
dense = X_num.toarray() if hasattr(X_num, "toarray") else X_num
# Keep all components so we can plot the full explained-variance curve
pca = PCA(random_state=7).fit(dense)
explained = pca.explained_variance_ratio_
cum = np.cumsum(explained)
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(np.arange(1, len(explained)+1), cum, marker="o")
ax.set_title("PCA cumulative explained variance")
ax.set_xlabel("number of components")
ax.set_ylabel("cumulative variance explained")
ax.set_ylim(0, 1.01)
plt.show()
Explanation¶
Oh boy, oh boy. The PCA concept went over my head a few times, but I kind of get what it means and does now.
PCA: "Takes a big, complicated dataset and compresses it into fewer pieces while keeping the most important patterns." In my example it takes all the columns (TyreLife, Sector1/2/3TimeSeconds, Compound) and rotates them (I looked it up on ChatGPT: it means "changing the viewpoint, not changing the data itself"). That is, the axes rotate to find how the features vary together and relate to each other (note: PCA only looks at the features, not the lap-time target).
Graph Explanation: This graph shows how much information from the data is kept as you add more PCA components (which are like simplified versions of the original data - the data seen from new viewpoints). The x-axis is how many components you use, and the y-axis is how much of the original data's variation is explained. The graph shows that with just 1 component, you keep about 25% of the information, and by 5 components, you keep about 90%. When the line flattens out after 6 components, it means that not much information can be gained by adding more.
The main takeaway is that most of the important information is captured with only a few components, so you can simplify the data a lot without losing much detail.
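Two quick follow-ups on that takeaway (my own sketch, reusing dense and prep from above): PCA can choose the number of components for a target amount of variance, and the loadings in pca.components_ show how much each original column contributes to each new axis.
# Ask PCA for however many components are needed to keep ~90% of the variance
pca90 = PCA(n_components=0.90, random_state=7).fit(dense)
print("components needed for 90% variance:", pca90.n_components_)

# Loadings: one row per component, one weight per (transformed) input column.
# Big absolute weights mean that column matters a lot for that component.
loadings = pd.DataFrame(pca90.components_, columns=prep.get_feature_names_out())
print(loadings.round(2))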
ICA - Independent Component Analysis¶
ICA finds signals that are statistically independent.
"Imagine listening to multiple songs mixed together and separating them."
# ica = FastICA(n_components=2, random_state=42)
# ica_components = ica.fit_transform(scaled_features)
from sklearn.decomposition import FastICA
# Take the data and compresses it into 2 independent components
# Each point now has coordinates (IC1, IC2)
ica = FastICA(n_components=2, random_state=7, max_iter=2000)
Xica = ica.fit_transform(dense)
comp = df["Compound"].to_numpy()  # compound label per row (same order as dense/Xica)
fig, ax = plt.subplots(figsize=(7,5))
for c in ["SOFT","MEDIUM","HARD"]:
    m = comp == c  # boolean mask: laps done on this compound
    ax.scatter(Xica[m,0], Xica[m,1], s=10, alpha=0.6, label=c)
ax.set_title("ICA 2D projection (colored by Compound)")
ax.set_xlabel("IC1")
ax.set_ylabel("IC2")
ax.legend()
plt.show()
Explanation:¶
- SOFT/MEDIUM/HARD overlap heavily
- There are small regions dominated by HARD and some SOFT
Understanding what overlap means:
- If colors cluster → compound info is embedded in the data
- If colors overlap → compound alone doesn’t dominate the signal
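One way to dig into what IC1 and IC2 are actually made of (my own addition): the fitted FastICA object stores a mixing_ matrix with one row per input column and one column per independent component, so it can be labeled with the feature names from the preprocessor.
# mixing_ has shape (n_features, n_components): how the original columns
# are mixed together to produce each independent component
mixing = pd.DataFrame(
    ica.mixing_,
    index=prep.get_feature_names_out(),
    columns=["IC1", "IC2"],
)
print(mixing.round(2))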
Correlation and Causation¶
- Correlation means two things move together.
- Causation means one thing actually causes the other.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Tyre life vs lap time
# Prepare data for regression
X = df["TyreLife"].to_numpy().reshape(-1, 1)   # feature (must be 2D)
lap_time = df["LapTimeSeconds"].to_numpy()     # lap time in seconds
y = lap_time                                   # target
# Fit linear regression model
model = LinearRegression().fit(X, y)
# Generate smooth line for plotting
x_grid = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)
y_pred = model.predict(x_grid)
# Correlation check
# TyreLife - tyre age in laps
# lap_time - lap time in seconds
# np.corrcoef() returns a 2x2 correlation matrix; [0, 1] picks the off-diagonal
# entry, i.e. the Pearson correlation between the two variables (explained below)
corr_tyre = np.corrcoef(df["TyreLife"], lap_time)[0, 1]
# Plot data + regression line
plt.figure(figsize=(7, 5))
plt.scatter(df["TyreLife"], lap_time, s=10, alpha=0.4)
plt.plot(x_grid, y_pred, linewidth=2)
plt.title(f"Tyre Life vs Lap Time (corr={corr_tyre:.3f}) — correlation ≠causation")
plt.xlabel("Tyre Life (laps)")
plt.ylabel("Lap Time (sec)")
plt.show()
Explanation:¶
Regression plot (Tyre Life vs Lap Time)
- Looks at how lap times change as tyres age
- Fits a straight line to show the trend
- Highlights that even with correlation, tyre life alone doesn't prove causation (many other factors such as fuel load, traffic, clean air, weather, etc. are involved)
np.corrcoef()
- This computes the Pearson correlation coefficient, but it doesn't return a single number. Instead it returns a 2x2 matrix, and the [0, 1] entry is the correlation between the two inputs.
**So, I am curious... What is a Pearson Correlation Coefficient?** (ChatGPT): "how much do two things move together" - it measures the strength and direction of a linear relationship between two variables.
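To make that concrete, here is a tiny hand-rolled version of Pearson's r (my own sketch) checked against np.corrcoef, using the same two columns as the plot above:
# Pearson r = covariance(x, y) / (std(x) * std(y))
x = df["TyreLife"].to_numpy(dtype=float)
y_lap = df["LapTimeSeconds"].to_numpy(dtype=float)
r_manual = ((x - x.mean()) * (y_lap - y_lap.mean())).mean() / (x.std() * y_lap.std())
print(round(r_manual, 3), round(np.corrcoef(x, y_lap)[0, 1], 3))  # the two numbers should match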
What 'corr_tyre' means:
- Positive value: tyres get older means lap times get slower
- Negative value: tyres get older means lap times get faster (usually unlikely, but it's what I'm seeing in my project)
- Near 0: no clear linear relationship
Also, I had to debug a warning. It turned out to be a feature-name mismatch that sklearn was complaining about (see the sketch below).
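For reference, my guess at what triggers that kind of warning (not verified against my exact code): scikit-learn warns when a model fitted on a DataFrame (which records column names) is later given a plain NumPy array, or vice versa. Keeping the input type consistent makes it go away:
# Hypothetical reproduction of the "X does not have valid feature names" warning
model_named = LinearRegression().fit(df[["TyreLife"]], df["LapTimeSeconds"])  # fitted with column names
model_named.predict(np.array([[10.0]]))                   # plain array -> sklearn warns about feature names
model_named.predict(pd.DataFrame({"TyreLife": [10.0]}))   # matching DataFrame -> no warning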

Insights¶
- PCA: compress and visualize the data while keeping most of its variation.
- ICA: separate the data into statistically independent (hidden) signals.
- Correlation is a clue, not proof.