import fastf1
import pandas as pd
import numpy as np

from typing import Tuple

def load_session(year: int = 2025, gp: str = "Monza", session_name: str = "R"):
    '''
    Load an F1 session using FastF1.

    Inputs
    ------
    year : int
        Championship year, e.g. 2025
    gp : str
        Grand Prix name, e.g. "Monza"
    session_name : str
        Session code: "FP1", "FP2", "FP3", "Q", "R"

    Output
    ------
    session : fastf1.core.Session
        The FastF1 session object after ``session.load()`` has been called.
    '''
    import os

    cache_dir = "fastf1_cache_dir"

    # FastF1 documents that the cache path must already exist, so create it
    # on first use instead of failing on a fresh checkout.
    os.makedirs(cache_dir, exist_ok=True)

    # Following the FastF1 docs: the cache makes reruns *much* faster after
    # the first download.
    fastf1.Cache.enable_cache(cache_dir)

    session = fastf1.get_session(year, gp, session_name)
    session.load()  # downloads timing data (first time only)
    return session


def build_model_table(session) -> pd.DataFrame:
    '''
    Build a per-lap modelling table from a FastF1 session.

    pick_quicklaps() is deliberately NOT used: every lap is considered, and
    a lap is kept only when all required fields are present, its compound is
    SOFT/MEDIUM/HARD, and its TrackStatus parses as a number.

    Inputs
    ------
    session : fastf1.core.Session
        Session object exposing a ``laps`` table.

    Output
    ------
    df : pd.DataFrame
        One row per retained lap, indexed from 0, with columns:
        - Driver
        - TyreLife
        - Compound
        - TrackStatus (cast to int)
        - Sector1TimeSeconds, Sector2TimeSeconds, Sector3TimeSeconds
        - LapTimeSeconds (target)
    '''
    raw_laps = session.laps.copy()  # work on a copy of every available lap

    # Columns copied through unchanged, in output order.
    copied_as_is = ("Driver", "TyreLife", "Compound", "TrackStatus")
    # Timedelta columns -> name of the float-seconds column derived from them.
    seconds_columns = {
        "Sector1Time": "Sector1TimeSeconds",
        "Sector2Time": "Sector2TimeSeconds",
        "Sector3Time": "Sector3TimeSeconds",
        "LapTime": "LapTimeSeconds",
    }

    columns = {name: raw_laps[name] for name in copied_as_is}
    for source_name, seconds_name in seconds_columns.items():
        columns[seconds_name] = raw_laps[source_name].dt.total_seconds()
    table = pd.DataFrame(columns)

    # Drop laps missing any value the model needs (Driver is not required).
    required = [
        "TyreLife", "Compound", "TrackStatus",
        "Sector1TimeSeconds", "Sector2TimeSeconds", "Sector3TimeSeconds",
        "LapTimeSeconds",
    ]
    table = table.dropna(subset=required)

    # Keep only the 3 compounds asked for (some sessions have INTER/WET).
    kept_compounds = ["SOFT", "MEDIUM", "HARD"]
    table = table.loc[table["Compound"].isin(kept_compounds)]

    # TrackStatus usually arrives as a numeric-looking string; anything that
    # does not parse becomes NaN and the lap is discarded before the int cast.
    table = table.assign(
        TrackStatus=pd.to_numeric(table["TrackStatus"], errors="coerce")
    )
    table = table.dropna(subset=["TrackStatus"]).reset_index(drop=True)
    table = table.astype({"TrackStatus": int})

    return table

# Load the 2025 Monza race ("R") and turn its laps into the modelling table.
# `session` and `df` are notebook-level globals reused by the cells below.
session = load_session(year=2025, gp="Monza", session_name="R")
df = build_model_table(session)

# Cell output: the first rows of the table together with its (rows, columns) shape.
df.head(), df.shape

core           INFO 	Loading data for Italian Grand Prix - Race [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '44', '23', '5', '12', '6', '55', '87', '22', '30', '31', '10', '43', '18', '14', '27']

(  Driver  TyreLife Compound  TrackStatus  Sector1TimeSeconds  \
 0    VER       2.0   MEDIUM            1              28.457   
 1    VER       3.0   MEDIUM            1              27.212   
 2    VER       4.0   MEDIUM            1              27.375   
 3    VER       5.0   MEDIUM            1              27.520   
 4    VER       6.0   MEDIUM            1              27.434   
 
    Sector2TimeSeconds  Sector3TimeSeconds  LapTimeSeconds  
 0              28.843              27.559          84.859  
 1              28.713              27.587          83.512  
 2              28.455              27.432          83.262  
 3              28.427              27.641          83.588  
 4              28.496              27.646          83.576  ,
 (955, 8))

import matplotlib.pyplot as plt
import seaborn as sns    # better design template

sns.set_theme()   # apply the seaborn theme to every figure drawn below

# Histograms of single columns: (column, number of bins, x-axis label).
histogram_specs = [
    ("LapTimeSeconds", 40, "seconds"),
    ("TyreLife", 30, "laps"),
]
for column, n_bins, unit_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(8,4))
    ax.hist(df[column], bins=n_bins)
    ax.set_title(f"{column} distribution")
    ax.set_xlabel(unit_label)
    ax.set_ylabel("count")
    plt.show()

# Lap time against one predictor at a time: (x column, unit shown on the x-axis).
for x_column, x_unit in [("TyreLife", "laps"), ("Sector1TimeSeconds", "sec")]:
    fig, ax = plt.subplots(figsize=(7,5))
    ax.scatter(df[x_column], df["LapTimeSeconds"], s=10)
    ax.set_title(f"{x_column} vs LapTimeSeconds")
    ax.set_xlabel(f"{x_column} ({x_unit})")
    ax.set_ylabel("LapTimeSeconds (sec)")
    plt.show()

# Pairwise scatter matrix on at most 2000 laps (fixed seed so the sample is repeatable).
pairplot_columns = ["TyreLife","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","LapTimeSeconds"]
sample_size = min(2000, len(df))
small = df[pairplot_columns].sample(sample_size, random_state=7)
sns.pairplot(small, corner=True)
plt.show()

# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr_columns = ["TyreLife","TrackStatus","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","LapTimeSeconds"]
corr = df[corr_columns].corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(7,5))
sns.heatmap(corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation heatmap")
plt.show()

# Lap-time spread for each tyre compound.
fig, ax = plt.subplots(figsize=(7,5))
sns.boxplot(data=df, x="Compound", y="LapTimeSeconds", ax=ax)
ax.set_title("LapTimeSeconds by Compound")
plt.show()

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Single-feature design matrix (one column: TyreLife) and the target vector.
x = df[["TyreLife"]].to_numpy()
y = df["LapTimeSeconds"].to_numpy()

# Straight-line fit: LapTimeSeconds ~ TyreLife.
lin = LinearRegression()
lin.fit(x, y)

# Quadratic fit: expand TyreLife to [TyreLife, TyreLife^2] (no bias column,
# the regression supplies its own intercept) and fit a second linear model.
poly = PolynomialFeatures(degree=2, include_bias=False)
x2 = poly.fit_transform(x)
quad = LinearRegression()
quad.fit(x2, y)

# Evaluate both fits on 200 evenly spaced TyreLife values spanning the data.
tyre_life = df["TyreLife"]
x_grid = np.linspace(tyre_life.min(), tyre_life.max(), 200)[:, np.newaxis]
y_lin = lin.predict(x_grid)
y_quad = quad.predict(poly.transform(x_grid))

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(7,5))
ax.scatter(df["TyreLife"], df["LapTimeSeconds"], s=8, alpha=0.35, label="data")
for fitted_curve, curve_label in ((y_lin, "linear fit"), (y_quad, "quadratic fit")):
    ax.plot(x_grid.ravel(), fitted_curve, linewidth=2, label=curve_label)
ax.set_title("TyreLife → LapTimeSeconds (simple fits)")
ax.set_xlabel("TyreLife (laps)")
ax.set_ylabel("LapTimeSeconds (sec)")
ax.legend()
plt.show()

# Cell output: slope/intercept of the line, then coefficients/intercept of the quadratic.
lin.coef_, lin.intercept_, quad.coef_, quad.intercept_

(array([-0.06613968]),
 np.float64(85.25861865157495),
 array([-0.3076687 ,  0.00564496]),
 np.float64(86.99328853297702))

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

feature_cols = ["TyreLife","Compound","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","TrackStatus"]
target_col = "LapTimeSeconds"

# NOTE(review): in the rows printed by the load cell, the three sector times
# add up exactly to LapTimeSeconds (e.g. 28.457 + 28.843 + 27.559 = 84.859).
# With all three sectors as features a linear model can reproduce the target
# almost exactly, so near-zero errors here reflect that identity rather than
# predictive skill. Drop the sector columns to measure real forecasting power.
X = df[feature_cols].copy()
y = df[target_col].copy()

num_cols = ["TyreLife","Sector1TimeSeconds","Sector2TimeSeconds","Sector3TimeSeconds","TrackStatus"]
cat_cols = ["Compound"]

# Standardise the numeric columns and one-hot encode the compound; categories
# not seen during fit are encoded as all zeros instead of raising.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Preprocessing and regression are chained so the scaler/encoder are fitted
# only on whatever data the pipeline is fitted on.
model = Pipeline(steps=[
    ("prep", preprocess),
    ("reg", LinearRegression())
])

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

model.fit(X_train, y_train)
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
# mean_squared_error returns the MSE (seconds squared); take the square root
# so `rmse` really is a root-mean-squared error in seconds, comparable to MAE.
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

# Cell output: hold-out MAE (s), RMSE (s) and R^2.
mae, rmse, r2

(8.77948092352794e-15, 1.2687857072455226e-28, 1.0)

# splits data into multiple chunks
# trains and tests the model multiple times automatically

from sklearn.model_selection import KFold, cross_val_score  # tools for cross-validation

# Create a 5-fold splitter:
# shuffle=True     - shuffle the rows before cutting them into folds
# random_state=7   - fix the shuffle so the folds are the same on every run
cv = KFold(n_splits=5, shuffle=True, random_state=7)

# Run cross-validation:
# model   - the full pipeline (preprocessing + regression) defined earlier
# X, y    - the feature table and target used for the hold-out split
# cv=cv   - use the 5-fold splitter defined above
# scoring - "neg_mean_absolute_error" reports MAE with its sign flipped
scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_absolute_error")

# Flip the sign back so each entry is an ordinary (positive) MAE in seconds
mae_scores = -scores

# Cell output:
# the MAE of each of the 5 folds
# their mean (typical error)
# their standard deviation (how much the error varies between folds)
mae_scores, mae_scores.mean(), mae_scores.std()

(array([4.01772856e-15, 4.98495951e-15, 2.23207142e-15, 1.75589618e-14,
        1.04163333e-14]),
 np.float64(7.842010926608855e-15),
 np.float64(5.5732498699917915e-15))

import matplotlib.pyplot as plt
import numpy as np

# --- Figure 1: predicted vs actual lap time on the hold-out laps ---

# Range covered by either the actual or the predicted values, so the
# reference diagonal spans everything that is plotted.
mn = min(y_test.min(), pred.min())
mx = max(y_test.max(), pred.max())

# Square canvas: both axes are in seconds.
fig, ax = plt.subplots(figsize=(6,6))

# One dot per lap: actual on the x-axis, prediction on the y-axis.
ax.scatter(y_test, pred, s=10, alpha=0.5)

# y = x reference line; a perfect prediction would sit exactly on it.
diagonal = [mn, mx]
ax.plot(diagonal, diagonal, linewidth=2)

ax.set_title("Predicted vs Actual LapTimeSeconds")
ax.set_xlabel("Actual (sec)")
ax.set_ylabel("Predicted (sec)")

plt.show()

# --- Figure 2: distribution of the prediction errors ---

# Signed error per lap: positive means the model predicted too slow a lap,
# negative means too fast.
errors = pred - y_test.values

fig, ax = plt.subplots(figsize=(8,4))

# Histogram of the errors: how often each error size occurs.
ax.hist(errors, bins=40)

ax.set_title("Prediction errors (pred - actual)")
ax.set_xlabel("seconds")
ax.set_ylabel("count")

plt.show()

Week 02: FastF1 Visual Exploration & Modeling

What you’ll need¶

Data¶

Note:¶

Visuals¶

Explanation:¶

Load the Data¶

What’s inside `df`?¶

Quick sanity checks (recommended by ChatGPT: first steps in analyzing a cleaned data set)¶

Explanation:¶

Histogram: LapTimeSeconds Distribution¶

Histogram: TyreLife Distribution¶

Scatter plots¶

Scatter Plot: TyreLife vs LapTimeSeconds¶

Scatter Plot: Sector1TimeSeconds vs LapTimeSeconds¶

Pairplot¶

Correlation heatmap¶

Explanation: Correlation Heatmap¶

Boxplot: Compound vs Lap Time¶

Explanation: Boxplot¶

Fit a Line and Curve: Linear and Quadratic Function¶

Linear Regression with preprocessing¶

Explanation (ChatGPT to the rescue):¶

Cross-validation¶

Explanation:¶

Interpretation:¶

Interpretation:¶

Insights¶

Week 02: FastF1 Visual Exploration & Modeling

What you’ll need¶

Data¶

Note:¶

Visuals¶

Explanation:¶

Load the Data¶

What’s inside df?¶

Quick sanity checks (recommended by ChatGPT: first steps in analyzing a cleaned data set)¶

Explanation:¶

Histogram: LapTimeSeconds Distribution¶

Histogram: TyreLife Distribution¶

Scatter plots¶

Scatter Plot: TyreLife vs LapTimeSeconds¶

Scatter Plot: Sector1TimeSeconds vs LapTimeSeconds¶

Pairplot¶

Correlation heatmap¶

Explanation: Correlation Heatmap¶

Boxplot: Compound vs Lap Time¶

Explanation: Boxplot¶

Fit a Line and Curve: Linear and Quadratic Function¶

Linear Regression with preprocessing¶

Explanation (ChatGPT to the rescue):¶

Cross-validation¶

Explanation:¶

Interpretation:¶

Interpretation:¶

Insights¶

What’s inside `df`?¶