[Your-Name-Here] - Fab Futures - Data Science
Home About

Most underratted players¶

Goals¶

By combining multiple datasets, I'm exploring how different attributes can be used to create a model for finding underrated players.

Data loading¶

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# --- DATA PREPARATION (Reused from previous step) ---
DATA_PATH = 'datasets/'

Data filtering¶

  • Only the last 4 seasons of data are used. The reason for that is that a) valuation trends change fast b) if we use older data, a more complex formula needs to be added to calculate inflation changes in valuation.
  • Players with insufficient data are removed, the assumption is that they are probably not the "hidden gems" anyway.
In [2]:
def load_and_prep_data():
    print("Loading and preparing data...")
    try:
        games = pd.read_csv(os.path.join(DATA_PATH, 'games.csv'))
        app = pd.read_csv(os.path.join(DATA_PATH, 'appearances.csv'))
        players = pd.read_csv(os.path.join(DATA_PATH, 'players.csv'))
    except FileNotFoundError:
        print("Error: Datasets not found.")
        return None

    # Filter Seasons (2020-2023)
    games['date'] = pd.to_datetime(games['date'])
    games = games[(games['season'] >= 2020) & (games['season'] != 2024)]
    valid_games = games['game_id'].unique()
    app = app[app['game_id'].isin(valid_games)]

    # Filter Insufficient Data (< 450 mins)
    player_mins = app.groupby('player_id')['minutes_played'].sum()
    valid_players = player_mins[player_mins >= 450].index
    app = app[app['player_id'].isin(valid_players)]
    players = players[players['player_id'].isin(valid_players)]
    players = players[players['market_value_in_eur'] > 0].dropna(subset=['market_value_in_eur'])    # Feature Engineering
    
    # Opponent Strength Weighting
    games_cols = ['game_id', 'home_club_id', 'away_club_id', 'home_club_position', 'away_club_position']
    merged = pd.merge(app, games[games_cols], on='game_id', how='left')
    merged[['home_club_position', 'away_club_position']] = merged[['home_club_position', 'away_club_position']].fillna(10)
    
    conditions = [merged['player_club_id'] == merged['home_club_id'], merged['player_club_id'] == merged['away_club_id']]
    choices = [merged['away_club_position'], merged['home_club_position']]
    merged['opponent_rank'] = np.select(conditions, choices, default=10)
    
    merged['weighted_performance'] = ((merged['goals'] * 2) + merged['assists']) * (21 - merged['opponent_rank'])

    # Aggregation
    stats = merged.groupby('player_id').agg(
        total_minutes=('minutes_played', 'sum'),
        total_goals=('goals', 'sum'),
        total_assists=('assists', 'sum'),
        avg_weighted_score=('weighted_performance', 'mean')
    ).reset_index()

    stats['goals_per_90'] = (stats['total_goals'] / stats['total_minutes']) * 90
    stats['assists_per_90'] = (stats['total_assists'] / stats['total_minutes']) * 90

    # Final Merge
    df = pd.merge(stats, players[['player_id', 'name', 'position', 'date_of_birth', 'market_value_in_eur', 'current_club_name']], on='player_id')
    
    # Age Calculation // Note: maybe I should filter out players who are over a certian age. 
    ### idea: find a way to understand/predict if a player over 30 has his final contract or there is a potential renewal with another club
    reference_date = datetime.now()
    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
   
    # Vectorized calculation which handles NaNs gracefully 
    df['age'] = reference_date.year - df['date_of_birth'].dt.year
    
    return df

df = load_and_prep_data()
Loading and preparing data...

ML Experiements¶

In [3]:
# --- ML EXPERIMENTS ---
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

def run_ml_experiments(df):
    print(f"\nRunning ML Experiments on {len(df)} players...")
    
    # Data Preprocessing for ML
    # Encode 'position' (Attack, Defender, etc.)
    le = LabelEncoder()
    df['position_encoded'] = le.fit_transform(df['position'])
    
    features = ['age', 'goals_per_90', 'assists_per_90', 'avg_weighted_score', 'position_encoded']
    target = 'market_value_in_eur'
    
    # Drop rows with missing features to ensure data quality.
    initial_count = len(df)

    df = df.dropna(subset=features).copy()
    dropped_count = initial_count - len(df)
    
    if dropped_count > 0:
        print(f"Cleaned Data: Dropped {dropped_count} players due to missing values (NaN).")
        print(f"Remaining players: {len(df)}")
    
    X = df[features]
    y = df[target]
    
    # Split Data / 80% Train, 20% Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- EXPERIMENT 1: DECISION TREE ---
    print("\n--- Model 1: Decision Tree Regressor ---")
    dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
    dt_model.fit(X_train, y_train)
    dt_preds = dt_model.predict(X_test)
    
    print(f"R2 Score (Accuracy): {r2_score(y_test, dt_preds):.3f}")
    print(f"Mean Absolute Error: €{mean_absolute_error(y_test, dt_preds):,.0f}")

    # --- EXPERIMENT 2: RANDOM FOREST ---
    print("\n--- Model 2: Random Forest Regressor ---")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    
    print(f"R2 Score (Accuracy): {r2_score(y_test, rf_preds):.3f}")
    print(f"Mean Absolute Error: €{mean_absolute_error(y_test, rf_preds):,.0f}")
    
    # Calculate Feature Importance
    importances = pd.DataFrame({
        'feature': features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nKey Valuation Drivers:")
    print(importances.to_string(index=False))

    # --- EXPERIMENT 3: K-MEANS CLUSTERING (Unsupervised) ---
    print("\n--- Model 3: K-Means Clustering ---")
    # Scale data for clustering
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kmeans = KMeans(n_clusters=4, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    df['cluster'] = clusters
    
    print(f"Silhouette Score (Cluster Quality): {silhouette_score(X_scaled, clusters):.3f}")
    
    # Analyze Clusters
    cluster_summary = df.groupby('cluster')[['market_value_in_eur', 'goals_per_90', 'age']].mean()
    print("\nCluster Profiles:")
    print(cluster_summary)

    return df, dt_model

df, best_model = run_ml_experiments(df)
Running ML Experiments on 9369 players...
Cleaned Data: Dropped 1 players due to missing values (NaN).
Remaining players: 9368

--- Model 1: Decision Tree Regressor ---
R2 Score (Accuracy): 0.138
Mean Absolute Error: €4,657,739

--- Model 2: Random Forest Regressor ---
R2 Score (Accuracy): 0.127
Mean Absolute Error: €4,678,607

Key Valuation Drivers:
           feature  importance
avg_weighted_score    0.292128
    assists_per_90    0.278165
      goals_per_90    0.222845
               age    0.171816
  position_encoded    0.035047

--- Model 3: K-Means Clustering ---
Silhouette Score (Cluster Quality): 0.268

Cluster Profiles:
         market_value_in_eur  goals_per_90        age
cluster                                              
0               4.464888e+06      0.090252  29.046494
1               3.005240e+06      0.030689  30.154388
2               1.023422e+07      0.455310  29.708245
3               4.412621e+06      0.180205  28.031652

Prediction model / find underrated players using the selected ML model¶

In [4]:
def find_underrated_players(df, model, features):
    """
    Identifies players where Predicted Value >> Actual Value.
    Adds predictions to the main dataframe for visualization.
    """
    print("\n--- IDENTIFYING UNDERRATED PLAYERS ---")
    
    # Predict value for ALL players using the trained model
    X_all = df[features]
    df['predicted_value'] = model.predict(X_all)
    
    # Calculate 'Undervalued' metric
    df['value_diff'] = df['predicted_value'] - df['market_value_in_eur']
    df['underrated_ratio'] = df['predicted_value'] / df['market_value_in_eur']
    
    # Filter: Must be worth at least €5M predicted, and priced at least 50% below prediction
    candidates = df[
        (df['predicted_value'] > 5_000_000) & 
        (df['value_diff'] > 0)
    ].sort_values('value_diff', ascending=False)
    
    # Added KPI columns to output for visualization
    return candidates[[
        'name', 'current_club_name', 'age', 'position', 
        'goals_per_90', 'assists_per_90', 
        'market_value_in_eur', 'predicted_value', 'value_diff'
    ]]
    
features = ['age', 'goals_per_90', 'assists_per_90', 'avg_weighted_score', 'position_encoded']
underrated = find_underrated_players(df, best_model, features)

print("\nTop 10 Most Underrated Players (Model Prediction vs Market Price):")
print(underrated.head(10).to_string(index=False))
--- IDENTIFYING UNDERRATED PLAYERS ---

Top 10 Most Underrated Players (Model Prediction vs Market Price):
                name             current_club_name  age position  goals_per_90  assists_per_90  market_value_in_eur  predicted_value   value_diff
      Rory Whittaker       Hibernian Football Club 18.0 Defender      0.000000        0.166052             150000.0     3.507500e+07 3.492500e+07
      Marokhy Ndione     Viborg Fodsports Forening 26.0   Attack      0.795053        0.318021             175000.0     3.010192e+07 2.992692e+07
           Jack Vale      Motherwell Football Club 24.0   Attack      0.369863        0.246575             500000.0     3.010192e+07 2.960192e+07
      Sergiy Buletsa               FC Oleksandriya 26.0 Midfield      0.242308        0.276923             600000.0     3.010192e+07 2.950192e+07
     Danny Armstrong      Kilmarnock Football Club 28.0   Attack      0.244344        0.309502             800000.0     3.010192e+07 2.930192e+07
Mads Frökjaer-Jensen               Odense Boldklub 26.0 Midfield      0.246085        0.246085             800000.0     3.010192e+07 2.930192e+07
   Aleksey Kashtanov             FK Fakel Voronezh 29.0   Attack      0.328707        0.284879             800000.0     3.010192e+07 2.930192e+07
       Alioune Ndour              SV Zulte Waregem 28.0   Attack      0.450751        0.300501             900000.0     3.010192e+07 2.920192e+07
                 Duk Club Deportivo Leganés S.A.D. 25.0   Attack      0.433735        0.238554            1200000.0     3.010192e+07 2.890192e+07
          Umut Bozok              Eyüp Spor Kulübü 29.0   Attack      0.615174        0.246070            1400000.0     3.010192e+07 2.870192e+07

Visualization of the top 20 players / using selected model¶

*** The R score is pretty low and I decided to change the approach, more in a separate notebook

In [6]:
def visualize_top_20_details(candidates):
    
    print("\n--- GENERATING TOP 20 VISUALIZATION ---")

    top_gems = candidates.head(20).copy()
    top_gems = top_gems.sort_values('value_diff', ascending=True)
    
    # Setup Plot
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(14, 12))
    
    # Y-axis positions
    y_range = np.arange(len(top_gems))
    
    # Gap Lines
    ax.hlines(
        y=y_range, 
        xmin=top_gems['market_value_in_eur'], 
        xmax=top_gems['predicted_value'], 
        color='grey', 
        alpha=0.4, 
        linewidth=3
    )
    
    # Plot Current Value (Grey Dot)
    ax.scatter(
        top_gems['market_value_in_eur'], 
        y_range, 
        color='#95a5a6', 
        s=120, 
        label='Current Market Value', 
        zorder=3
    )
    
    # Plot Potential Value (Green Dot - Moneyball Target)
    ax.scatter(
        top_gems['predicted_value'], 
        y_range, 
        color='#27ae60', 
        s=120, 
        label='Model Potential Value', 
        zorder=3
    )
    
    # Add KPI Annotations
    for i, (_, row) in enumerate(top_gems.iterrows()):
        stats_text = (f" Position: {row['position']} | "
                      f" Age: {row['age']:.0f} | "
                      f"G/90: {row['goals_per_90']:.2f} | "
                      f"A/90: {row['assists_per_90']:.2f}")
        ax.text(
            row['predicted_value'] + (top_gems['predicted_value'].max() * 0.02), 
            i, 
            stats_text, 
            va='center', 
            fontsize=10, 
            color='#34495e',
            fontweight='medium'
        )

    # Labels and titles
    ax.set_yticks(y_range)
    ax.set_yticklabels(top_gems['name'], fontsize=11, fontweight='bold')
    ax.set_xlabel('Value (€)', fontsize=12)
    ax.set_title('Top 20 "Moneyball" Candidates: Current vs. Potential Value', fontsize=16, fontweight='bold', pad=20)
    
    # Format X-axis to millions
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'€{x/1e6:.0f}M'))
    
    # Add legend
    plt.legend(loc='lower right', frameon=True, framealpha=0.9)
    
    plt.tight_layout()
    
    try:
        plt.show()
    except:
        pass

visualize_top_20_details(underrated)
--- GENERATING TOP 20 VISUALIZATION ---
No description has been provided for this image