Most underrated players¶
Goals¶
By combining multiple datasets, I'm exploring how different attributes can be used to create a model for finding underrated players.
Data loading¶
In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# --- DATA PREPARATION (Reused from previous step) ---
DATA_PATH = 'datasets/'
Data filtering¶
- Only the last 4 seasons of data (2020–2023) are used, because a) valuation trends change fast, and b) older data would require an extra adjustment for inflation in valuations (a rough sketch of such an adjustment follows below).
- Players with insufficient playing time (fewer than 450 minutes) are removed; the assumption is that they are probably not the "hidden gems" anyway.
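The inflation adjustment mentioned above is out of scope for this notebook, but a minimal sketch of what it could look like, using a toy valuations frame (the real valuation history is not loaded here), might be:

import pandas as pd

# Hypothetical sketch, not applied below: deflate each season's market values by that
# season's median so older seasons become comparable to the latest one.
valuations = pd.DataFrame({'season': [2018, 2018, 2023, 2023],
'market_value_in_eur': [2_000_000, 8_000_000, 3_000_000, 12_000_000]})
season_median = valuations.groupby('season')['market_value_in_eur'].median()
deflator = season_median / season_median.loc[season_median.index.max()]
valuations['value_adjusted'] = valuations['market_value_in_eur'] / valuations['season'].map(deflator)
print(valuations)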
In [2]:
def load_and_prep_data():
print("Loading and preparing data...")
try:
games = pd.read_csv(os.path.join(DATA_PATH, 'games.csv'))
app = pd.read_csv(os.path.join(DATA_PATH, 'appearances.csv'))
players = pd.read_csv(os.path.join(DATA_PATH, 'players.csv'))
except FileNotFoundError:
print("Error: Datasets not found.")
return None
# Filter Seasons (2020-2023)
games['date'] = pd.to_datetime(games['date'])
games = games[(games['season'] >= 2020) & (games['season'] <= 2023)]
valid_games = games['game_id'].unique()
app = app[app['game_id'].isin(valid_games)]
# Filter Insufficient Data (< 450 mins)
player_mins = app.groupby('player_id')['minutes_played'].sum()
valid_players = player_mins[player_mins >= 450].index
app = app[app['player_id'].isin(valid_players)]
players = players[players['player_id'].isin(valid_players)]
players = players[players['market_value_in_eur'] > 0].dropna(subset=['market_value_in_eur'])
# Feature Engineering
# Opponent Strength Weighting
games_cols = ['game_id', 'home_club_id', 'away_club_id', 'home_club_position', 'away_club_position']
merged = pd.merge(app, games[games_cols], on='game_id', how='left')
merged[['home_club_position', 'away_club_position']] = merged[['home_club_position', 'away_club_position']].fillna(10)
conditions = [merged['player_club_id'] == merged['home_club_id'], merged['player_club_id'] == merged['away_club_id']]
choices = [merged['away_club_position'], merged['home_club_position']]
merged['opponent_rank'] = np.select(conditions, choices, default=10)
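# Weighting idea: a goal counts double an assist, and the sum is scaled by
# (21 - opponent league position), so production against strong opponents counts more.
# Example: 1 goal + 1 assist vs. the league leader (rank 1) -> (2*1 + 1) * (21 - 1) = 60.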
merged['weighted_performance'] = ((merged['goals'] * 2) + merged['assists']) * (21 - merged['opponent_rank'])
# Aggregation
stats = merged.groupby('player_id').agg(
total_minutes=('minutes_played', 'sum'),
total_goals=('goals', 'sum'),
total_assists=('assists', 'sum'),
avg_weighted_score=('weighted_performance', 'mean')
).reset_index()
stats['goals_per_90'] = (stats['total_goals'] / stats['total_minutes']) * 90
stats['assists_per_90'] = (stats['total_assists'] / stats['total_minutes']) * 90
# Final Merge
df = pd.merge(stats, players[['player_id', 'name', 'position', 'date_of_birth', 'market_value_in_eur', 'current_club_name']], on='player_id')
# Age calculation. Note: maybe I should filter out players who are over a certain age.
### idea: find a way to understand/predict if a player over 30 has his final contract or there is a potential renewal with another club
reference_date = datetime.now()
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
# Vectorized calculation which handles NaNs gracefully
df['age'] = reference_date.year - df['date_of_birth'].dt.year
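# Note: this is only a calendar-year difference, so it can be off by up to a year
# depending on the birthday; good enough for coarse filtering, not for exact ages.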
return df
df = load_and_prep_data()
Loading and preparing data...
ML Experiments¶
In [3]:
# --- ML EXPERIMENTS ---
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, silhouette_score
from sklearn.cluster import KMeans
def run_ml_experiments(df):
print(f"\nRunning ML Experiments on {len(df)} players...")
# Data Preprocessing for ML
# Encode 'position' (Attack, Defender, etc.)
le = LabelEncoder()
df['position_encoded'] = le.fit_transform(df['position'])
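# LabelEncoder assigns positions arbitrary integer codes; tree-based models can handle
# that, but one-hot encoding would avoid implying an ordering between positions.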
features = ['age', 'goals_per_90', 'assists_per_90', 'avg_weighted_score', 'position_encoded']
target = 'market_value_in_eur'
# Drop rows with missing features to ensure data quality.
initial_count = len(df)
df = df.dropna(subset=features).copy()
dropped_count = initial_count - len(df)
if dropped_count > 0:
print(f"Cleaned Data: Dropped {dropped_count} players due to missing values (NaN).")
print(f"Remaining players: {len(df)}")
X = df[features]
y = df[target]
# Split Data / 80% Train, 20% Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# --- EXPERIMENT 1: DECISION TREE ---
print("\n--- Model 1: Decision Tree Regressor ---")
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)
print(f"R2 Score (Accuracy): {r2_score(y_test, dt_preds):.3f}")
print(f"Mean Absolute Error: €{mean_absolute_error(y_test, dt_preds):,.0f}")
# --- EXPERIMENT 2: RANDOM FOREST ---
print("\n--- Model 2: Random Forest Regressor ---")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print(f"R2 Score (Accuracy): {r2_score(y_test, rf_preds):.3f}")
print(f"Mean Absolute Error: €{mean_absolute_error(y_test, rf_preds):,.0f}")
# Calculate Feature Importance
importances = pd.DataFrame({
'feature': features,
'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
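# Note: impurity-based importances from a random forest tend to favour continuous
# features over low-cardinality ones like position_encoded; treat the ranking as indicative.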
print("\nKey Valuation Drivers:")
print(importances.to_string(index=False))
# --- EXPERIMENT 3: K-MEANS CLUSTERING (Unsupervised) ---
print("\n--- Model 3: K-Means Clustering ---")
# Scale data for clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df['cluster'] = clusters
print(f"Silhouette Score (Cluster Quality): {silhouette_score(X_scaled, clusters):.3f}")
# Analyze Clusters
cluster_summary = df.groupby('cluster')[['market_value_in_eur', 'goals_per_90', 'age']].mean()
print("\nCluster Profiles:")
print(cluster_summary)
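# The decision tree edged out the random forest on R² in my runs, so it is returned as the 'best' model.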
return df, dt_model
df, best_model = run_ml_experiments(df)
Running ML Experiments on 9369 players...
Cleaned Data: Dropped 1 players due to missing values (NaN).
Remaining players: 9368
--- Model 1: Decision Tree Regressor ---
R2 Score (Accuracy): 0.138
Mean Absolute Error: €4,657,739
--- Model 2: Random Forest Regressor ---
R2 Score (Accuracy): 0.127
Mean Absolute Error: €4,678,607
Key Valuation Drivers:
feature importance
avg_weighted_score 0.292128
assists_per_90 0.278165
goals_per_90 0.222845
age 0.171816
position_encoded 0.035047
--- Model 3: K-Means Clustering ---
Silhouette Score (Cluster Quality): 0.268
Cluster Profiles:
market_value_in_eur goals_per_90 age
cluster
0 4.464888e+06 0.090252 29.046494
1 3.005240e+06 0.030689 30.154388
2 1.023422e+07 0.455310 29.708245
3 4.412621e+06 0.180205 28.031652
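The silhouette score of 0.268 points to fairly weak cluster separation, so k=4 is somewhat arbitrary. A minimal sketch of scanning k instead, assuming the scaled matrix X_scaled built inside run_ml_experiments were exposed (as written it is local to the function):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Hypothetical k-scan: pick the cluster count with the best silhouette score.
for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X_scaled)
    print(f"k={k}: silhouette={silhouette_score(X_scaled, labels):.3f}")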
Prediction model: finding underrated players with the selected ML model¶
In [4]:
def find_underrated_players(df, model, features):
"""
Identifies players where Predicted Value >> Actual Value.
Adds predictions to the main dataframe for visualization.
"""
print("\n--- IDENTIFYING UNDERRATED PLAYERS ---")
# Predict value for ALL players using the trained model
X_all = df[features]
df['predicted_value'] = model.predict(X_all)
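# Caveat: the model was trained on ~80% of these rows, so predictions here are partly
# in-sample; out-of-fold predictions would give a fairer picture of the value gap.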
# Calculate 'Undervalued' metric
df['value_diff'] = df['predicted_value'] - df['market_value_in_eur']
df['underrated_ratio'] = df['predicted_value'] / df['market_value_in_eur']
# Filter: predicted value of at least €5M and a market price below the prediction
candidates = df[
(df['predicted_value'] > 5_000_000) &
(df['value_diff'] > 0)
].sort_values('value_diff', ascending=False)
# Added KPI columns to output for visualization
return candidates[[
'name', 'current_club_name', 'age', 'position',
'goals_per_90', 'assists_per_90',
'market_value_in_eur', 'predicted_value', 'value_diff'
]]
features = ['age', 'goals_per_90', 'assists_per_90', 'avg_weighted_score', 'position_encoded']
underrated = find_underrated_players(df, best_model, features)
print("\nTop 10 Most Underrated Players (Model Prediction vs Market Price):")
print(underrated.head(10).to_string(index=False))
--- IDENTIFYING UNDERRATED PLAYERS ---
Top 10 Most Underrated Players (Model Prediction vs Market Price):
name current_club_name age position goals_per_90 assists_per_90 market_value_in_eur predicted_value value_diff
Rory Whittaker Hibernian Football Club 18.0 Defender 0.000000 0.166052 150000.0 3.507500e+07 3.492500e+07
Marokhy Ndione Viborg Fodsports Forening 26.0 Attack 0.795053 0.318021 175000.0 3.010192e+07 2.992692e+07
Jack Vale Motherwell Football Club 24.0 Attack 0.369863 0.246575 500000.0 3.010192e+07 2.960192e+07
Sergiy Buletsa FC Oleksandriya 26.0 Midfield 0.242308 0.276923 600000.0 3.010192e+07 2.950192e+07
Danny Armstrong Kilmarnock Football Club 28.0 Attack 0.244344 0.309502 800000.0 3.010192e+07 2.930192e+07
Mads Frökjaer-Jensen Odense Boldklub 26.0 Midfield 0.246085 0.246085 800000.0 3.010192e+07 2.930192e+07
Aleksey Kashtanov FK Fakel Voronezh 29.0 Attack 0.328707 0.284879 800000.0 3.010192e+07 2.930192e+07
Alioune Ndour SV Zulte Waregem 28.0 Attack 0.450751 0.300501 900000.0 3.010192e+07 2.920192e+07
Duk Club Deportivo Leganés S.A.D. 25.0 Attack 0.433735 0.238554 1200000.0 3.010192e+07 2.890192e+07
Umut Bozok Eyüp Spor Kulübü 29.0 Attack 0.615174 0.246070 1400000.0 3.010192e+07 2.870192e+07
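Ranking by the absolute gap lets players with tiny market values dominate the list, as the table above shows. A quick alternative sketch that reuses the underrated_ratio column already added to df by find_underrated_players:

# Alternative ranking sketch: sort by the predicted/actual ratio instead of the absolute gap.
by_ratio = df[df['predicted_value'] > 5_000_000].sort_values('underrated_ratio', ascending=False)
cols = ['name', 'current_club_name', 'market_value_in_eur', 'predicted_value', 'underrated_ratio']
print(by_ratio[cols].head(10).to_string(index=False))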
Visualization of the top 20 players using the selected model¶
*** The R² score is quite low, so I decided to change the approach; more on that in a separate notebook.
In [6]:
def visualize_top_20_details(candidates):
print("\n--- GENERATING TOP 20 VISUALIZATION ---")
top_gems = candidates.head(20).copy()
top_gems = top_gems.sort_values('value_diff', ascending=True)
# Setup Plot
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(14, 12))
# Y-axis positions
y_range = np.arange(len(top_gems))
# Gap Lines
ax.hlines(
y=y_range,
xmin=top_gems['market_value_in_eur'],
xmax=top_gems['predicted_value'],
color='grey',
alpha=0.4,
linewidth=3
)
# Plot Current Value (Grey Dot)
ax.scatter(
top_gems['market_value_in_eur'],
y_range,
color='#95a5a6',
s=120,
label='Current Market Value',
zorder=3
)
# Plot Potential Value (Green Dot - Moneyball Target)
ax.scatter(
top_gems['predicted_value'],
y_range,
color='#27ae60',
s=120,
label='Model Potential Value',
zorder=3
)
# Add KPI Annotations
for i, (_, row) in enumerate(top_gems.iterrows()):
stats_text = (f" Position: {row['position']} | "
f" Age: {row['age']:.0f} | "
f"G/90: {row['goals_per_90']:.2f} | "
f"A/90: {row['assists_per_90']:.2f}")
ax.text(
row['predicted_value'] + (top_gems['predicted_value'].max() * 0.02),
i,
stats_text,
va='center',
fontsize=10,
color='#34495e',
fontweight='medium'
)
# Labels and titles
ax.set_yticks(y_range)
ax.set_yticklabels(top_gems['name'], fontsize=11, fontweight='bold')
ax.set_xlabel('Value (€)', fontsize=12)
ax.set_title('Top 20 "Moneyball" Candidates: Current vs. Potential Value', fontsize=16, fontweight='bold', pad=20)
# Format X-axis to millions
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'€{x/1e6:.0f}M'))
# Add legend
plt.legend(loc='lower right', frameon=True, framealpha=0.9)
plt.tight_layout()
try:
plt.show()
except Exception:
# Ignore display errors (e.g. when running headless without a GUI backend)
pass
visualize_top_20_details(underrated)
--- GENERATING TOP 20 VISUALIZATION ---