Improved Football Player Valuation Model¶
This notebook implements improvements to increase prediction of player market values.
Key Improvements:¶
- 45 engineered features (vs previous 5)
- Log-transformed target (critical for skewed data)
- Got to use more advanced models (XGBoost, LightGBM)
- Proper encoding (one-hot for positions)
Expected R² improvement: from ~0.13 to 0.96!
Install missing models¶
!pip install xgboost
!pip install lightgbm
Requirement already satisfied: xgboost in /opt/conda/lib/python3.13/site-packages (3.1.2) Requirement already satisfied: numpy in /opt/conda/lib/python3.13/site-packages (from xgboost) (2.3.3) Requirement already satisfied: scipy in /opt/conda/lib/python3.13/site-packages (from xgboost) (1.16.2) Collecting lightgbm Using cached lightgbm-4.6.0-py3-none-manylinux2014_aarch64.whl.metadata (17 kB) Requirement already satisfied: numpy>=1.17.0 in /opt/conda/lib/python3.13/site-packages (from lightgbm) (2.3.3) Requirement already satisfied: scipy in /opt/conda/lib/python3.13/site-packages (from lightgbm) (1.16.2) Using cached lightgbm-4.6.0-py3-none-manylinux2014_aarch64.whl (3.5 MB) Installing collected packages: lightgbm Successfully installed lightgbm-4.6.0
Setup and Imports¶
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Extra imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import FastICA, PCA
# Advanced models
try:
from xgboost import XGBRegressor
XGBOOST_AVAILABLE = True
except ImportError:
XGBOOST_AVAILABLE = False
try:
from lightgbm import LGBMRegressor
LIGHTGBM_AVAILABLE = True
except ImportError:
LIGHTGBM_AVAILABLE = False
# Configuration
DATA_PATH = 'datasets/'
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# Plotting style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', None)
print(f"✓ XGBoost available: {XGBOOST_AVAILABLE}")
print(f"✓ LightGBM available: {LIGHTGBM_AVAILABLE}")
✓ XGBoost available: True ✓ LightGBM available: True
Step 1: Load All Datasets¶
Loading all available datasets to maximize feature extraction opportunities.
def load_all_datasets():
print("Loading datasets...\n")
datasets = {}
files = {
'players': 'players.csv',
'appearances': 'appearances.csv',
'clubs': 'clubs.csv',
'competitions': 'competitions.csv',
'games': 'games.csv',
'player_valuations': 'player_valuations.csv',
'transfers': 'transfers.csv',
'game_events': 'game_events.csv',
'club_games': 'club_games.csv'
}
for name, filename in files.items():
path = os.path.join(DATA_PATH, filename)
if os.path.exists(path):
temp_df = pd.read_csv(path, nrows=1)
parse_dates = []
if name in ['appearances', 'player_valuations', 'games', 'game_events', 'club_games']:
if 'date' in temp_df.columns:
parse_dates = ['date']
elif name == 'transfers':
if 'transfer_date' in temp_df.columns:
parse_dates = ['transfer_date']
elif name == 'players':
parse_dates = [col for col in ['date_of_birth', 'contract_expiration_date'] if col in temp_df.columns]
datasets[name] = pd.read_csv(path, parse_dates=parse_dates if parse_dates else None)
print(f"Loaded {name:20s} - Shape: {datasets[name].shape}")
else:
print(f"{name} not found")
return datasets
# Load data
datasets = load_all_datasets()
print(f"\nTotal datasets loaded: {len(datasets)}")
Loading datasets... Loaded players - Shape: (32601, 23) Loaded appearances - Shape: (1706806, 13) Loaded clubs - Shape: (439, 17) Loaded competitions - Shape: (44, 11) Loaded games - Shape: (74026, 23) Loaded player_valuations - Shape: (496606, 5) Loaded transfers - Shape: (79646, 10) Loaded game_events - Shape: (1035043, 10) Loaded club_games - Shape: (148052, 11) Total datasets loaded: 9
Step 2: Advanced Feature Engineering¶
Creating 40+ features from the raw data:
- Performance metrics (goals, assists, minutes, consistency)
- Club features (value, stadium, squad)
- League quality indicators
- Historical trends (valuation changes)
- Contract situation
- Interaction features (age × performance)
def engineer_features(datasets, max_age=26):
print("Engineering features...\n")
players = datasets['players'].copy()
appearances = datasets['appearances'].copy()
clubs = datasets['clubs'].copy()
competitions = datasets['competitions'].copy()
games = datasets['games'].copy()
player_valuations = datasets.get('player_valuations', pd.DataFrame())
transfers = datasets.get('transfers', pd.DataFrame())
# Filter to recent seasons (2020-2023)
print("- Filtering to seasons 2020-2023...")
games['date'] = pd.to_datetime(games['date'], errors='coerce')
games = games[(games['season'] >= 2020) & (games['season'] < 2024)]
valid_game_ids = games['game_id'].unique()
appearances = appearances[appearances['game_id'].isin(valid_game_ids)]
# Filter players with minimum playing time
print("- Filtering players with minimum 450 minutes...")
player_minutes = appearances.groupby('player_id')['minutes_played'].sum()
valid_players = player_minutes[player_minutes >= 450].index
players = players[players['player_id'].isin(valid_players)]
appearances = appearances[appearances['player_id'].isin(valid_players)]
# === PERFORMANCE FEATURES ===
print("- Creating performance features...")
# Merge with games to get opponent information
games_subset = games[['game_id', 'home_club_id', 'away_club_id',
'home_club_position', 'away_club_position', 'competition_id']]
app_merged = appearances.merge(games_subset, on='game_id', how='left')
# Calculate opponent strength
app_merged[['home_club_position', 'away_club_position']] = \
app_merged[['home_club_position', 'away_club_position']].fillna(15)
conditions = [
app_merged['player_club_id'] == app_merged['home_club_id'],
app_merged['player_club_id'] == app_merged['away_club_id']
]
choices = [app_merged['away_club_position'], app_merged['home_club_position']]
app_merged['opponent_rank'] = np.select(conditions, choices, default=15)
# Weighted performance score
app_merged['weighted_performance'] = (
(app_merged['goals'] * 2) + app_merged['assists']
) * (21 - app_merged['opponent_rank'])
# Aggregate performance stats
perf_stats = app_merged.groupby('player_id').agg({
'minutes_played': ['sum', 'mean', 'std'],
'goals': ['sum', 'mean', 'std'],
'assists': ['sum', 'mean', 'std'],
'yellow_cards': 'sum',
'red_cards': 'sum',
'game_id': 'count',
'weighted_performance': ['mean', 'std'],
'opponent_rank': 'mean'
})
# Flatten column names
perf_stats.columns = ['_'.join(col).strip() for col in perf_stats.columns.values]
perf_stats.columns = [
'total_minutes', 'avg_minutes_per_game', 'minutes_consistency',
'total_goals', 'goals_per_game', 'goals_consistency',
'total_assists', 'assists_per_game', 'assists_consistency',
'total_yellow_cards', 'total_red_cards', 'total_games',
'avg_weighted_score', 'weighted_score_consistency',
'avg_opponent_strength'
]
perf_stats = perf_stats.reset_index()
# Per-90 stats
perf_stats['goals_per_90'] = (perf_stats['total_goals'] / perf_stats['total_minutes']) * 90
perf_stats['assists_per_90'] = (perf_stats['total_assists'] / perf_stats['total_minutes']) * 90
perf_stats['cards_per_90'] = (
(perf_stats['total_yellow_cards'] + perf_stats['total_red_cards']) /
perf_stats['total_minutes']
) * 90
perf_stats['games_started_ratio'] = perf_stats['total_minutes'] / (perf_stats['total_games'] * 90)
# Fill NaN consistency metrics
perf_stats['goals_consistency'] = perf_stats['goals_consistency'].fillna(0)
perf_stats['assists_consistency'] = perf_stats['assists_consistency'].fillna(0)
perf_stats['minutes_consistency'] = perf_stats['minutes_consistency'].fillna(0)
perf_stats['weighted_score_consistency'] = perf_stats['weighted_score_consistency'].fillna(0)
# === CLUB FEATURES ===
print("- Adding club features...")
club_features = clubs[[
'club_id', 'total_market_value', 'squad_size', 'average_age',
'foreigners_number', 'national_team_players', 'stadium_seats', 'net_transfer_record'
]].copy()
# Log-transform club financial metrics (highly skewed)
# Convert to numeric first to handle any string values
club_features['total_market_value'] = pd.to_numeric(club_features['total_market_value'], errors='coerce')
club_features['stadium_seats'] = pd.to_numeric(club_features['stadium_seats'], errors='coerce')
club_features['net_transfer_record'] = pd.to_numeric(club_features['net_transfer_record'], errors='coerce')
club_features['log_club_value'] = np.log1p(club_features['total_market_value'].fillna(0))
club_features['log_stadium_seats'] = np.log1p(club_features['stadium_seats'].fillna(0))
club_features['log_abs_transfer_record'] = np.log1p(club_features['net_transfer_record'].fillna(0).abs())
# === LEAGUE/COMPETITION FEATURES ===
print("- Adding league/competition features...")
# Create league tier based on competition
league_tiers = {
'GB1': 1, 'ES1': 1, 'IT1': 1, 'L1': 1, 'FR1': 1, # Top 5 leagues
'NL1': 2, 'PO1': 2, 'BE1': 2, 'TR1': 2, 'RU1': 2, # Tier 2
}
competitions['league_tier'] = competitions['competition_id'].map(league_tiers).fillna(3)
# Average competition level
comp_quality = games.groupby('competition_id').agg({
'home_club_goals': 'mean',
'away_club_goals': 'mean'
}).reset_index()
comp_quality['avg_goals_per_game'] = (
comp_quality['home_club_goals'] + comp_quality['away_club_goals']
)
competitions = competitions.merge(comp_quality, on='competition_id', how='left')
# === PLAYER VALUATION HISTORY ===
print("- Adding valuation trends...")
if not player_valuations.empty:
# Sort by date
player_valuations['date'] = pd.to_datetime(player_valuations['date'], errors='coerce')
player_valuations = player_valuations.sort_values(['player_id', 'date'])
# Calculate valuation trends
valuation_trends = []
for player_id, group in player_valuations.groupby('player_id'):
if len(group) < 2:
continue
# Recent valuations (last 12 months)
recent = group.tail(4) # Assuming quarterly valuations
trend_data = {
'player_id': player_id,
'valuation_trend_6mo': 0,
'valuation_trend_12mo': 0,
'valuation_volatility': recent['market_value_in_eur'].std(),
'peak_to_current_ratio': group['market_value_in_eur'].max() / (recent['market_value_in_eur'].iloc[-1] + 1)
}
if len(recent) >= 2:
trend_data['valuation_trend_6mo'] = (
recent['market_value_in_eur'].iloc[-1] - recent['market_value_in_eur'].iloc[-2]
)
if len(recent) >= 4:
trend_data['valuation_trend_12mo'] = (
recent['market_value_in_eur'].iloc[-1] - recent['market_value_in_eur'].iloc[0]
)
valuation_trends.append(trend_data)
valuation_df = pd.DataFrame(valuation_trends)
valuation_df['valuation_volatility'] = valuation_df['valuation_volatility'].fillna(0)
else:
valuation_df = pd.DataFrame()
# === TRANSFER HISTORY ===
print("- Adding transfer features...")
if not transfers.empty:
# Count transfers per player
transfer_counts = transfers.groupby('player_id').agg({
'transfer_fee': ['count', 'mean', 'max'],
'transfer_date': 'max'
}).reset_index()
transfer_counts.columns = ['player_id', 'num_transfers', 'avg_transfer_fee',
'max_transfer_fee', 'last_transfer_date']
# Days since last transfer
transfer_counts['last_transfer_date'] = pd.to_datetime(transfer_counts['last_transfer_date'], errors='coerce')
transfer_counts['days_since_transfer'] = (
pd.Timestamp.now() - transfer_counts['last_transfer_date']
).dt.days
else:
transfer_counts = pd.DataFrame()
# === MERGE ALL FEATURES ===
print("- Merging all features...")
df = players[['player_id', 'name', 'position', 'sub_position', 'foot',
'date_of_birth', 'height_in_cm', 'market_value_in_eur',
'highest_market_value_in_eur', 'current_club_id',
'current_club_domestic_competition_id', 'contract_expiration_date']].copy()
# Merge performance
df = df.merge(perf_stats, on='player_id', how='left')
# Merge club
df = df.merge(club_features, left_on='current_club_id', right_on='club_id', how='left')
# Merge competition
df = df.merge(
competitions[['competition_id', 'league_tier', 'avg_goals_per_game']],
left_on='current_club_domestic_competition_id',
right_on='competition_id',
how='left'
)
# Merge valuation trends
if not valuation_df.empty:
df = df.merge(valuation_df, on='player_id', how='left')
# Merge transfers
if not transfer_counts.empty:
df = df.merge(transfer_counts, on='player_id', how='left')
# === DERIVED FEATURES ===
print("- Creating derived features...")
# Age
reference_date = pd.Timestamp.now()
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
df['age'] = (reference_date - df['date_of_birth']).dt.days / 365.25
df['age_squared'] = df['age'] ** 2
# Contract situation
df['contract_expiration_date'] = pd.to_datetime(df['contract_expiration_date'], errors='coerce')
df['days_until_contract_expiry'] = (
df['contract_expiration_date'] - reference_date
).dt.days
df['days_until_contract_expiry'] = df['days_until_contract_expiry'].fillna(730) # Default 2 years
df['contract_expiring_soon'] = (df['days_until_contract_expiry'] < 365).astype(int)
# Peak comparison
df['current_vs_peak_value'] = df['market_value_in_eur'] / (df['highest_market_value_in_eur'] + 1)
# Interaction features
df['age_goals_interaction'] = df['age'] * df['goals_per_90']
df['age_minutes_interaction'] = df['age'] * df['avg_minutes_per_game']
df['league_tier_goals'] = df['league_tier'] * df['goals_per_90']
# Position-specific features (one-hot encoding)
df['is_goalkeeper'] = (df['position'] == 'Goalkeeper').astype(int)
df['is_defender'] = (df['position'] == 'Defender').astype(int)
df['is_midfielder'] = (df['position'] == 'Midfield').astype(int)
df['is_attacker'] = (df['position'] == 'Attack').astype(int)
# Filter out players with missing target
df = df[df['market_value_in_eur'] > 0].copy()
# FILTER BY AGE (if max_age is specified)
if max_age is not None:
print(f"- Filtering players by age (max_age={max_age})...")
before_count = len(df)
df = df[df['age'] <= max_age].copy()
after_count = len(df)
print(f" Removed {before_count - after_count:,} players older than {max_age}")
print(f" Remaining: {after_count:,} players")
print(f"\nFinal dataset shape: {df.shape}")
print(f"Total features created: {df.shape[1]}")
if max_age is not None:
print(f"Age range: {df['age'].min():.1f} - {df['age'].max():.1f} years")
return df
# Engineer features with age filter
df = engineer_features(datasets, max_age=26)
Engineering features... - Filtering to seasons 2020-2023... - Filtering players with minimum 450 minutes... - Creating performance features... - Adding club features... - Adding league/competition features... - Adding valuation trends... - Adding transfer features... - Merging all features... - Creating derived features... - Filtering players by age (max_age=26)... Removed 7,147 players older than 26 Remaining: 2,222 players Final dataset shape: (2222, 66) Total features created: 66 Age range: 18.4 - 26.0 years
Explore the Engineered Features¶
# Display first few rows
print("Sample of engineered features:\n")
display(df.head())
print("\nDataset info:")
print(f"Total players: {len(df):,}")
print(f"Total features: {df.shape[1]}")
print(f"\nMarket value range:")
print(f" Min: €{df['market_value_in_eur'].min():,.0f}")
print(f" Max: €{df['market_value_in_eur'].max():,.0f}")
print(f" Median: €{df['market_value_in_eur'].median():,.0f}")
Sample of engineered features:
| player_id | name | position | sub_position | foot | date_of_birth | height_in_cm | market_value_in_eur | highest_market_value_in_eur | current_club_id | current_club_domestic_competition_id | contract_expiration_date | total_minutes | avg_minutes_per_game | minutes_consistency | total_goals | goals_per_game | goals_consistency | total_assists | assists_per_game | assists_consistency | total_yellow_cards | total_red_cards | total_games | avg_weighted_score | weighted_score_consistency | avg_opponent_strength | goals_per_90 | assists_per_90 | cards_per_90 | games_started_ratio | club_id | total_market_value | squad_size | average_age | foreigners_number | national_team_players | stadium_seats | net_transfer_record | log_club_value | log_stadium_seats | log_abs_transfer_record | competition_id | league_tier | avg_goals_per_game | valuation_trend_6mo | valuation_trend_12mo | valuation_volatility | peak_to_current_ratio | num_transfers | avg_transfer_fee | max_transfer_fee | last_transfer_date | days_since_transfer | age | age_squared | days_until_contract_expiry | contract_expiring_soon | current_vs_peak_value | age_goals_interaction | age_minutes_interaction | league_tier_goals | is_goalkeeper | is_defender | is_midfielder | is_attacker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3181 | 211847 | Osher Davida | Attack | Right Winger | right | 2001-02-18 | NaN | 1000000.0 | 1300000.0 | 3057 | BE1 | 2026-06-30 | 490 | 23.333333 | 20.404248 | 0 | 0.000000 | 0.000000 | 0 | 0.000000 | 0.000000 | 2 | 0 | 21 | 0.000000 | 0.000000 | 10.380952 | 0.000000 | 0.000000 | 0.367347 | 0.259259 | 3057 | NaN | 28 | 24.1 | 17 | 9 | 27221 | NaN | 0.0 | 10.211781 | 0.0 | BE1 | 2.0 | 2.949050 | -300000.0 | 0.0 | 173205.080757 | 1.299999 | NaN | NaN | NaN | NaT | NaN | 24.829569 | 616.507486 | 193.0 | 1 | 0.769230 | 0.000000 | 579.356605 | 0.000000 | 0 | 0 | 0 | 1 |
| 3395 | 227347 | Valentin Antov | Defender | Centre-Back | right | 2000-11-09 | 187.0 | 800000.0 | 2000000.0 | 2919 | IT1 | 2026-06-30 | 1618 | 59.925926 | 34.383500 | 0 | 0.000000 | 0.000000 | 0 | 0.000000 | 0.000000 | 5 | 0 | 27 | 0.000000 | 0.000000 | 11.518519 | 0.000000 | 0.000000 | 0.278121 | 0.665844 | 2919 | NaN | 31 | 26.5 | 11 | 3 | 17102 | NaN | 0.0 | 9.747009 | 0.0 | IT1 | 1.0 | 2.775000 | 0.0 | -200000.0 | 100000.000000 | 2.499997 | NaN | NaN | NaN | NaT | NaN | 25.106092 | 630.315841 | 193.0 | 1 | 0.400000 | 0.000000 | 1504.505793 | 0.000000 | 0 | 1 | 0 | 0 |
| 3855 | 256942 | Hajdin Salihu | Defender | Centre-Back | right | 2002-01-18 | 188.0 | 700000.0 | 700000.0 | 63007 | UKR1 | 2026-12-31 | 1305 | 87.000000 | 11.618950 | 3 | 0.200000 | 0.414039 | 1 | 0.066667 | 0.258199 | 1 | 0 | 15 | 7.200000 | 14.924572 | 9.600000 | 0.206897 | 0.068966 | 0.068966 | 0.966667 | 63007 | NaN | 26 | 25.8 | 11 | 3 | 10321 | NaN | 0.0 | 9.242033 | 0.0 | UKR1 | 3.0 | 2.545342 | 100000.0 | 350000.0 | 149303.940560 | 0.999999 | 8.0 | 38750.0 | 250000.0 | 2024-01-31 | 687.0 | 23.915127 | 571.933282 | 377.0 | 0 | 0.999999 | 4.947957 | 2080.616016 | 0.620690 | 0 | 1 | 0 | 0 |
| 3945 | 261780 | Artem Shulyanskyi | Attack | Left Winger | right | 2001-04-11 | 173.0 | 500000.0 | 600000.0 | 18303 | UKR1 | 2027-05-31 | 2299 | 51.088889 | 30.663735 | 6 | 0.133333 | 0.343776 | 3 | 0.066667 | 0.252262 | 1 | 0 | 45 | 4.155556 | 9.793216 | 8.755556 | 0.234885 | 0.117442 | 0.039147 | 0.567654 | 18303 | NaN | 29 | 26.3 | 5 | 1 | 7000 | NaN | 0.0 | 8.853808 | 0.0 | UKR1 | 3.0 | 2.545342 | -100000.0 | 300000.0 | 170782.512766 | 1.199998 | 1.0 | 0.0 | 0.0 | 2022-07-04 | 1263.0 | 24.687201 | 609.457871 | 528.0 | 0 | 0.833332 | 5.798646 | 1261.241646 | 0.704654 | 0 | 0 | 0 | 1 |
| 4753 | 316760 | Vincent Thill | Midfield | Attacking Midfield | left | 2000-02-04 | 170.0 | 750000.0 | 3000000.0 | 2740 | UKR1 | 2025-06-30 | 1089 | 57.315789 | 25.931647 | 0 | 0.000000 | 0.000000 | 3 | 0.157895 | 0.374634 | 6 | 0 | 19 | 1.210526 | 3.441236 | 11.631579 | 0.000000 | 0.247934 | 0.495868 | 0.636842 | 2740 | NaN | 26 | 25.5 | 5 | 1 | 23842 | NaN | 0.0 | 10.079246 | 0.0 | UKR1 | 3.0 | 2.545342 | -150000.0 | -250000.0 | 110867.789130 | 3.999995 | NaN | NaN | NaN | NaT | NaN | 25.869952 | 669.254421 | -172.0 | 1 | 0.250000 | 0.000000 | 1482.756728 | 0.000000 | 0 | 0 | 1 | 0 |
Dataset info: Total players: 2,222 Total features: 66 Market value range: Min: €50,000 Max: €200,000,000 Median: €2,000,000
Step 3: Prepare Features for Modeling¶
CRITICAL IMPROVEMENT: Log-transform the target variable
def prepare_features(df):
# Prepare feature matrix with proper encoding and scaling.
print("Preparing features...\n")
# Define feature groups
numeric_features = [
# Performance
'total_minutes', 'avg_minutes_per_game', 'minutes_consistency',
'total_goals', 'goals_per_90', 'goals_consistency',
'total_assists', 'assists_per_90', 'assists_consistency',
'total_games', 'games_started_ratio',
'avg_weighted_score', 'weighted_score_consistency',
'avg_opponent_strength', 'cards_per_90',
# Age
'age', 'age_squared',
# Club
'log_club_value', 'squad_size', 'average_age', 'foreigners_number',
'national_team_players', 'log_stadium_seats', 'log_abs_transfer_record',
# League
'league_tier', 'avg_goals_per_game',
# Physical
'height_in_cm',
# Contract
'days_until_contract_expiry', 'contract_expiring_soon',
# Valuation
'current_vs_peak_value',
# Interactions
'age_goals_interaction', 'age_minutes_interaction', 'league_tier_goals',
# Position indicators
'is_goalkeeper', 'is_defender', 'is_midfielder', 'is_attacker'
]
# Add optional features if they exist
optional_features = [
'valuation_trend_6mo', 'valuation_trend_12mo', 'valuation_volatility',
'peak_to_current_ratio', 'num_transfers', 'avg_transfer_fee',
'max_transfer_fee', 'days_since_transfer'
]
for feat in optional_features:
if feat in df.columns:
numeric_features.append(feat)
# Filter to existing features
numeric_features = [f for f in numeric_features if f in df.columns]
print(f"Using {len(numeric_features)} numeric features")
# Create feature matrix
X = df[numeric_features].copy()
# Fill missing values with median
X = X.fillna(X.median())
X = X.replace([np.inf, -np.inf], 0)
# Target variable - LOG-TRANSFORMED (CRITICAL!)
y = np.log1p(df['market_value_in_eur'])
print(f"\nFeature matrix shape: {X.shape}")
print(f"Target transformed: log1p(market_value)")
print(f"\n Original target range: €{df['market_value_in_eur'].min():.0f} - €{df['market_value_in_eur'].max():.0f}")
print(f" Log-transformed range: {y.min():.2f} - {y.max():.2f}")
return X, y, numeric_features
# Prepare features
X, y, feature_names = prepare_features(df)
print(f"\nFeature list ({len(feature_names)} total):")
for i, feat in enumerate(feature_names, 1):
print(f" {i:2d}. {feat}")
Preparing features... Using 45 numeric features Feature matrix shape: (2222, 45) Target transformed: log1p(market_value) Original target range: €50000 - €200000000 Log-transformed range: 10.82 - 19.11 Feature list (45 total): 1. total_minutes 2. avg_minutes_per_game 3. minutes_consistency 4. total_goals 5. goals_per_90 6. goals_consistency 7. total_assists 8. assists_per_90 9. assists_consistency 10. total_games 11. games_started_ratio 12. avg_weighted_score 13. weighted_score_consistency 14. avg_opponent_strength 15. cards_per_90 16. age 17. age_squared 18. log_club_value 19. squad_size 20. average_age 21. foreigners_number 22. national_team_players 23. log_stadium_seats 24. log_abs_transfer_record 25. league_tier 26. avg_goals_per_game 27. height_in_cm 28. days_until_contract_expiry 29. contract_expiring_soon 30. current_vs_peak_value 31. age_goals_interaction 32. age_minutes_interaction 33. league_tier_goals 34. is_goalkeeper 35. is_defender 36. is_midfielder 37. is_attacker 38. valuation_trend_6mo 39. valuation_trend_12mo 40. valuation_volatility 41. peak_to_current_ratio 42. num_transfers 43. avg_transfer_fee 44. max_transfer_fee 45. days_since_transfer
Step 4: Train/Test Split¶
Using 80/20 split for evaluation.
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=RANDOM_STATE
)
print(f"Training set size: {len(X_train):,} players")
print(f"Test set size: {len(X_test):,} players")
print(f"\nTest set represents {len(X_test)/len(X)*100:.1f}% of data")
Training set size: 1,777 players Test set size: 445 players Test set represents 20.0% of data
Step 5: Train Multiple Models¶
Training and comparing:
- Random Forest (baseline)
- Gradient Boosting
- XGBoost
- LightGBM
models = {}
results = {}
print("Training models...\n")
print("=" * 80)
Training models... ================================================================================
Model 1: Random Forest¶
print("\n[1/4] Training Random Forest...")
rf = RandomForestRegressor(
n_estimators=300,
max_depth=15,
min_samples_split=5,
min_samples_leaf=2,
random_state=RANDOM_STATE,
n_jobs=-1,
verbose=0
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
models['Random Forest'] = rf
results['Random Forest'] = {
'r2': r2_score(y_test, pred_rf),
'mae': mean_absolute_error(y_test, pred_rf),
'rmse': np.sqrt(mean_squared_error(y_test, pred_rf)),
'predictions': pred_rf
}
print(f" R² Score: {results['Random Forest']['r2']:.4f}")
print(f" MAE (log scale): {results['Random Forest']['mae']:.4f}")
print(f" MAE (€): €{np.expm1(results['Random Forest']['mae']):,.0f}")
[1/4] Training Random Forest... R² Score: 0.9556 MAE (log scale): 0.2206 MAE (€): €0
Model 2: Gradient Boosting¶
print("\n[2/4] Training Gradient Boosting...")
gb = GradientBoostingRegressor(
n_estimators=300,
learning_rate=0.05,
max_depth=6,
random_state=RANDOM_STATE,
verbose=0
)
gb.fit(X_train, y_train)
pred_gb = gb.predict(X_test)
models['Gradient Boosting'] = gb
results['Gradient Boosting'] = {
'r2': r2_score(y_test, pred_gb),
'mae': mean_absolute_error(y_test, pred_gb),
'rmse': np.sqrt(mean_squared_error(y_test, pred_gb)),
'predictions': pred_gb
}
print(f" R² Score: {results['Gradient Boosting']['r2']:.4f}")
print(f" MAE (log scale): {results['Gradient Boosting']['mae']:.4f}")
print(f" MAE (€): €{np.expm1(results['Gradient Boosting']['mae']):,.0f}")
[2/4] Training Gradient Boosting... R² Score: 0.9650 MAE (log scale): 0.2055 MAE (€): €0
Model 3: XGBoost¶
if XGBOOST_AVAILABLE:
print("\n[3/4] Training XGBoost...")
xgb = XGBRegressor(
n_estimators=300,
learning_rate=0.05,
max_depth=6,
subsample=0.8,
colsample_bytree=0.8,
random_state=RANDOM_STATE,
n_jobs=-1,
verbosity=0
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
models['XGBoost'] = xgb
results['XGBoost'] = {
'r2': r2_score(y_test, pred_xgb),
'mae': mean_absolute_error(y_test, pred_xgb),
'rmse': np.sqrt(mean_squared_error(y_test, pred_xgb)),
'predictions': pred_xgb
}
print(f" R² Score: {results['XGBoost']['r2']:.4f}")
print(f" MAE (log scale): {results['XGBoost']['mae']:.4f}")
print(f" MAE (€): €{np.expm1(results['XGBoost']['mae']):,.0f}")
else:
print("\n[3/4] XGBoost not available - skipping")
[3/4] Training XGBoost... R² Score: 0.9667 MAE (log scale): 0.2071 MAE (€): €0
Model 4: LightGBM¶
if LIGHTGBM_AVAILABLE:
print("\n[4/4] Training LightGBM...")
lgb = LGBMRegressor(
n_estimators=300,
learning_rate=0.05,
max_depth=6,
random_state=RANDOM_STATE,
n_jobs=-1,
verbose=-1
)
lgb.fit(X_train, y_train)
pred_lgb = lgb.predict(X_test)
models['LightGBM'] = lgb
results['LightGBM'] = {
'r2': r2_score(y_test, pred_lgb),
'mae': mean_absolute_error(y_test, pred_lgb),
'rmse': np.sqrt(mean_squared_error(y_test, pred_lgb)),
'predictions': pred_lgb
}
print(f" R² Score: {results['LightGBM']['r2']:.4f}")
print(f" MAE (log scale): {results['LightGBM']['mae']:.4f}")
print(f" MAE (€): €{np.expm1(results['LightGBM']['mae']):,.0f}")
else:
print("\n[4/4] LightGBM not available - skipping")
[4/4] Training LightGBM... R² Score: 0.9659 MAE (log scale): 0.1991 MAE (€): €0
Step 6: Model Comparison¶
Compare all models to find the best performer.
# Find best model
best_model_name = max(results.keys(), key=lambda k: results[k]['r2'])
best_model = models[best_model_name]
print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS")
print("=" * 80)
# Create comparison dataframe
comparison_df = pd.DataFrame({
'Model': list(results.keys()),
'R² Score': [results[m]['r2'] for m in results.keys()],
'MAE (log)': [results[m]['mae'] for m in results.keys()],
'MAE (€)': [np.expm1(results[m]['mae']) for m in results.keys()],
'RMSE (log)': [results[m]['rmse'] for m in results.keys()],
'RMSE (€)': [np.expm1(results[m]['rmse']) for m in results.keys()]
}).sort_values('R² Score', ascending=False)
display(comparison_df.style.format({
'R² Score': '{:.4f}',
'MAE (log)': '{:.4f}',
'MAE (€)': '€{:,.0f}',
'RMSE (log)': '{:.4f}',
'RMSE (€)': '€{:,.0f}'
}).background_gradient(subset=['R² Score'], cmap='Greens'))
print(f"\n BEST MODEL: {best_model_name}")
print(f" R² Score: {results[best_model_name]['r2']:.4f}")
print(f" MAE: €{np.expm1(results[best_model_name]['mae']):,.0f}")
print(f" RMSE: €{np.expm1(results[best_model_name]['rmse']):,.0f}")
================================================================================ MODEL COMPARISON RESULTS ================================================================================
| Model | R² Score | MAE (log) | MAE (€) | RMSE (log) | RMSE (€) | |
|---|---|---|---|---|---|---|
| 2 | XGBoost | 0.9667 | 0.2071 | €0 | 0.2949 | €0 |
| 3 | LightGBM | 0.9659 | 0.1991 | €0 | 0.2983 | €0 |
| 1 | Gradient Boosting | 0.9650 | 0.2055 | €0 | 0.3023 | €0 |
| 0 | Random Forest | 0.9556 | 0.2206 | €0 | 0.3403 | €0 |
BEST MODEL: XGBoost R² Score: 0.9667 MAE: €0 RMSE: €0
Visualization 1: Model Comparison Bar Charts¶
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# R² comparison
model_names = list(results.keys())
r2_scores = [results[m]['r2'] for m in model_names]
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12'][:len(model_names)]
bars1 = axes[0].barh(model_names, r2_scores, color=colors)
axes[0].set_xlabel('R² Score', fontsize=12, fontweight='bold')
axes[0].set_title('Model Performance Comparison (R² Score)', fontsize=14, fontweight='bold')
axes[0].axvline(x=0.5, color='gray', linestyle='--', alpha=0.5, label='Target: 0.50')
axes[0].grid(axis='x', alpha=0.3)
axes[0].legend()
for i, (name, score) in enumerate(zip(model_names, r2_scores)):
axes[0].text(score + 0.01, i, f'{score:.4f}', va='center', fontweight='bold', fontsize=11)
# MAE comparison (in original euros)
mae_scores = [np.expm1(results[m]['mae']) for m in model_names]
bars2 = axes[1].barh(model_names, mae_scores, color=colors)
axes[1].set_xlabel('Mean Absolute Error (€)', fontsize=12, fontweight='bold')
axes[1].set_title('Model Error Comparison (MAE)', fontsize=14, fontweight='bold')
axes[1].xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'€{x/1e6:.1f}M'))
axes[1].grid(axis='x', alpha=0.3)
for i, (name, mae) in enumerate(zip(model_names, mae_scores)):
axes[1].text(mae + mae*0.02, i, f'€{mae/1e6:.2f}M', va='center', fontweight='bold', fontsize=11)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
print("Saved: model_comparison.png")
plt.show()
Saved: model_comparison.png
Visualization 2: Actual vs Predicted (Best Model)¶
# Transform back to original scale
best_predictions = results[best_model_name]['predictions']
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(best_predictions)
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_test_actual, y_pred_actual, alpha=0.4, s=30, color='steelblue', edgecolors='navy', linewidth=0.5)
# Perfect prediction line
max_val = max(y_test_actual.max(), y_pred_actual.max())
ax.plot([0, max_val], [0, max_val], 'r--', lw=3, label='Perfect Prediction', alpha=0.8)
ax.set_xlabel('Actual Market Value (€)', fontsize=14, fontweight='bold')
ax.set_ylabel('Predicted Market Value (€)', fontsize=14, fontweight='bold')
ax.set_title(f'Actual vs Predicted Market Value\n{best_model_name} (R² = {results[best_model_name]["r2"]:.4f})',
fontsize=16, fontweight='bold', pad=20)
# Log scale for better visualization
ax.set_xscale('log')
ax.set_yscale('log')
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'€{x/1e6:.1f}M'))
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'€{x/1e6:.1f}M'))
ax.legend(fontsize=12, loc='upper left')
ax.grid(alpha=0.3, linestyle='--')
# Add stats box
stats_text = f"R² = {results[best_model_name]['r2']:.4f}\nMAE = €{np.expm1(results[best_model_name]['mae'])/1e6:.2f}M\nRMSE = €{np.expm1(results[best_model_name]['rmse'])/1e6:.2f}M"
ax.text(0.98, 0.02, stats_text, transform=ax.transAxes,
fontsize=12, verticalalignment='bottom', horizontalalignment='right',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
plt.tight_layout()
#plt.savefig('actual_vs_predicted.png', dpi=150, bbox_inches='tight')
#print("Saved: actual_vs_predicted.png")
plt.show()
Visualization 3: Feature Importance¶
if hasattr(best_model, 'feature_importances_'):
importances = pd.DataFrame({
'feature': feature_names,
'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False).head(20)
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=importances, y='feature', x='importance', palette='viridis', ax=ax)
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
ax.set_title(f'Top 20 Feature Importances - {best_model_name}',
fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='x', alpha=0.3)
# Add values on bars
for i, (idx, row) in enumerate(importances.iterrows()):
ax.text(row['importance'] + row['importance']*0.01, i,
f"{row['importance']:.4f}", va='center', fontsize=9)
plt.tight_layout()
#plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
#print("Saved: feature_importance.png")
plt.show()
#print("\nTop 10 Most Important Features:")
#display(importances.head(10))
else:
print("Feature importance not available for this model type.")
Step 7: Find Undervalued Players¶
Identify players where predicted value significantly exceeds market value.
print("Finding undervalued players...\n")
# Predict for all players
predictions_log = best_model.predict(X)
predictions = np.expm1(predictions_log)
# Add to dataframe
df_subset = df.iloc[:len(predictions)].copy()
df_subset['predicted_value'] = predictions
df_subset['value_diff'] = df_subset['predicted_value'] - df_subset['market_value_in_eur']
df_subset['underrated_ratio'] = df_subset['predicted_value'] / df_subset['market_value_in_eur']
# Get club names from the clubs dataset
if 'clubs' in datasets:
clubs_names = datasets['clubs'][['club_id', 'name']].rename(columns={'name': 'club_name'})
df_subset = df_subset.merge(clubs_names, left_on='current_club_id', right_on='club_id', how='left', suffixes=('', '_club'))
# Get competition/league names
if 'competitions' in datasets:
comp_names = datasets['competitions'][['competition_id', 'name', 'country_name']].rename(
columns={'name': 'league_name', 'country_name': 'league_country'}
)
df_subset = df_subset.merge(comp_names, left_on='current_club_domestic_competition_id',
right_on='competition_id', how='left', suffixes=('', '_comp'))
# Filter undervalued candidates
candidates = df_subset[
(df_subset['predicted_value'] > 3_000_000) & # Minimum €3M predicted
(df_subset['value_diff'] > 0) & # Predicted > actual
(df_subset['underrated_ratio'] > 1.2) # At least 20% undervalued
].sort_values('value_diff', ascending=False).copy()
# Reset index to start from 1
candidates.reset_index(drop=True, inplace=True)
candidates.index = candidates.index + 1
print(f"Found {len(candidates):,} undervalued players\n")
print("=" * 120)
print("TOP 20 MOST UNDERVALUED PLAYERS")
print("=" * 120)
### Found only 39 players with the new features and filters ###
### CHANGED from TOP50 to TOP20, but too lazy to change the variable names in the code ###
### Chart looks better with fewer players ###
# Select columns for display
display_cols = ['name', 'age', 'position', 'club_name', 'league_name', 'league_country',
'market_value_in_eur', 'predicted_value', 'value_diff', 'underrated_ratio']
# Filter to only existing columns
display_cols = [col for col in display_cols if col in candidates.columns]
top_50 = candidates[display_cols].head(20)
# Format for display
top_50_display = top_50.copy()
top_50_display['age'] = top_50_display['age'].round(1)
top_50_display['market_value_in_eur'] = top_50_display['market_value_in_eur'].apply(lambda x: f"€{x/1e6:.2f}M")
top_50_display['predicted_value'] = top_50_display['predicted_value'].apply(lambda x: f"€{x/1e6:.2f}M")
top_50_display['value_diff'] = top_50_display['value_diff'].apply(lambda x: f"€{x/1e6:.2f}M")
top_50_display['underrated_ratio'] = top_50_display['underrated_ratio'].apply(lambda x: f"{x:.2f}x")
# Rename columns for display
column_mapping = {
'name': 'Player',
'age': 'Age',
'position': 'Position',
'club_name': 'Club',
'league_name': 'League',
'league_country': 'Country',
'market_value_in_eur': 'Current Value',
'predicted_value': 'Predicted Value',
'value_diff': 'Upside',
'underrated_ratio': 'Ratio'
}
# Only rename columns that exist
rename_dict = {k: v for k, v in column_mapping.items() if k in top_50_display.columns}
top_50_display = top_50_display.rename(columns=rename_dict)
display(top_50_display)
print("\n" + "=" * 120)
print(f"Showing top 20 players (ID 1-20)")
print("=" * 120)
Finding undervalued players... Found 42 undervalued players ======================================================================================================================== TOP 20 MOST UNDERVALUED PLAYERS ========================================================================================================================
| Player | Age | Position | Club | League | Country | Current Value | Predicted Value | Upside | Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Dušan Vlahović | 25.9 | Attack | Juventus Football Club | serie-a | Italy | €45.00M | €71.99M | €26.99M | 1.60x |
| 2 | Charles De Ketelaere | 24.8 | Midfield | Atalanta Bergamasca Calcio S.p.a. | serie-a | Italy | €38.00M | €61.33M | €23.33M | 1.61x |
| 3 | Jurrien Timber | 24.5 | Defender | Arsenal Football Club | premier-league | England | €50.00M | €67.23M | €17.23M | 1.34x |
| 4 | Moise Kean | 25.8 | Attack | Associazione Calcio Fiorentina | serie-a | Italy | €40.00M | €53.88M | €13.88M | 1.35x |
| 5 | Yan Couto | 23.5 | Defender | Borussia Dortmund | bundesliga | Germany | €20.00M | €33.47M | €13.47M | 1.67x |
| 6 | Ansu Fati | 23.1 | Attack | Futbol Club Barcelona | laliga | Spain | €5.00M | €18.05M | €13.05M | 3.61x |
| 7 | Abdukodir Khusanov | 21.8 | Defender | Manchester City Football Club | premier-league | England | €35.00M | €46.94M | €11.94M | 1.34x |
| 8 | Arda Güler | 20.8 | Attack | Real Madrid Club de Fútbol | laliga | Spain | €45.00M | €55.59M | €10.59M | 1.24x |
| 9 | Amine Gouiri | 25.8 | Attack | Olympique de Marseille | ligue-1 | France | €20.00M | €29.84M | €9.84M | 1.49x |
| 10 | Mohamed Simakan | 25.6 | Defender | RasenBallsport Leipzig | bundesliga | Germany | €35.00M | €44.41M | €9.41M | 1.27x |
| 11 | Yunus Akgün | 25.4 | Attack | Galatasaray Spor Kulübü | super-lig | Turkey | €12.00M | €21.03M | €9.03M | 1.75x |
| 12 | Giacomo Raspadori | 25.8 | Attack | Società Sportiva Calcio Napoli | serie-a | Italy | €20.00M | €27.70M | €7.70M | 1.39x |
| 13 | Tino Livramento | 23.1 | Defender | Newcastle United Football Club | premier-league | England | €35.00M | €42.53M | €7.53M | 1.22x |
| 14 | Rocco Reitz | 23.6 | Midfield | Borussia Verein für Leibesübungen 1900 Mönchen... | bundesliga | Germany | €15.00M | €19.49M | €4.49M | 1.30x |
| 15 | Valentín Carboni | 20.8 | Midfield | Football Club Internazionale Milano S.p.A. | serie-a | Italy | €12.00M | €15.08M | €3.08M | 1.26x |
| 16 | Jake O'Brien | 24.6 | Defender | Everton Football Club | premier-league | England | €15.00M | €18.00M | €3.00M | 1.20x |
| 17 | Lucas Gourna-Douath | 22.4 | Midfield | Associazione Sportiva Roma | serie-a | Italy | €8.00M | €10.80M | €2.80M | 1.35x |
| 18 | Maxime Estève | 23.6 | Defender | Burnley FC | premier-league | England | €7.00M | €9.56M | €2.56M | 1.37x |
| 19 | Arnau Martínez | 22.7 | Defender | Girona Fútbol Club S. A. D. | laliga | Spain | €10.00M | €12.24M | €2.24M | 1.22x |
| 20 | Noah Weißhaupt | 24.2 | Attack | Fußball-Club St. Pauli von 1910 | bundesliga | Germany | €5.00M | €7.12M | €2.12M | 1.42x |
======================================================================================================================== Showing top 20 players (ID 1-20) ========================================================================================================================
Sunburst chart of top undervalued players¶
Initially I thought to use Hierarchical Packed Bubble Chart, but I found that Sunburst chart is better for this purpose as it provides a clearer structure.
I also decided to normalize the scale of the players and clubs, and only focus to show the leagues proportionally to the number of undervalued players in them.
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import colorsys
import hashlib
FIG_W = 1500
FIG_H = 1500
sb = top_50.copy()
# DEFENSIVE CHECK: Verify we have data
if len(sb) == 0:
raise ValueError("top_50 DataFrame is empty!")
print(f"Working with {len(sb)} players")
needed = ['name', 'position', 'club_name', 'league_name', 'value_diff', 'underrated_ratio',
'market_value_in_eur', 'predicted_value', 'age']
missing = [c for c in needed if c not in sb.columns]
if missing:
raise ValueError(f"Missing columns in top_50: {missing}")
# RESET INDEX to ensure clean integer indexing
sb = sb.reset_index(drop=True)
# 3 tiers by quantiles (balanced counts)
try:
sb["tier"] = pd.qcut(sb["underrated_ratio"], q=3, labels=[1, 2, 3], duplicates='drop').astype(int)
except ValueError:
sb["tier"] = pd.cut(sb["underrated_ratio"], bins=3, labels=[1, 2, 3]).astype(int)
def eur_m(x):
return f"€{x/1e6:.2f}M"
LEAGUE_ANCHORS = {
"premier-league": "#80B1D3",
"laliga": "#FFED6F",
"bundesliga": "#FB8072",
"serie-a": "#BEBADA",
"ligue-1": "#B3DE69",
"eredivisie": "#FDB462",
"liga-portugal": "#8DD3C7",
"super-lig": "#BC80BD",
"jupiler-pro-league": "#CCEBC5",
"superligaen": "#FCCDE5",
"premier-liga": "#D9D9D9",
"mls / other": "#FFFFB3",
}
def league_key(name: str) -> str:
s = str(name).strip().lower()
# direct slug matches
if s in LEAGUE_ANCHORS:
return s
# pretty-name / variants
if "english" in s and "premier league" in s: return "premier-league"
if "spanish" in s and ("laliga" in s or "la liga" in s): return "laliga"
if "german" in s and "bundesliga" in s: return "bundesliga"
if "italian" in s and "serie" in s: return "serie-a"
if "french" in s and "ligue" in s: return "ligue-1"
if "dutch" in s and "eredivisie" in s: return "eredivisie"
if "portuguese" in s and ("primeira" in s or "portugal" in s): return "liga-portugal"
if "turkish" in s and ("super lig" in s or "süper lig" in s or "super-lig" in s): return "super-lig"
if "belgian" in s and ("pro league" in s or "jupiler" in s): return "jupiler-pro-league"
if "russian" in s and ("premier" in s and "league" in s): return "premier-liga"
if "superligaen" in s: return "superligaen"
# fallback
return "mls / other"
# ------------------------------------------------------------
# Player text styling based on tiers
# ------------------------------------------------------------
sb["Player"] = np.where(
sb["tier"] == 1,
"<b>" + sb['name'].astype(str) + "</b> (" + np.floor(sb["age"]).astype(int).astype(str) + ")<br>" + sb['position'].astype(str) + "<br><br>Undervalued by " + sb["value_diff"].apply(eur_m) + "<br>" + "Ratio: " + sb["underrated_ratio"].map(lambda v: f"{v:.2f}x"),
np.where(
sb["tier"] == 2,
"<b>" + sb['name'].astype(str) + "</b> (" + np.floor(sb["age"]).astype(int).astype(str) + ")<br>" + sb['position'].astype(str) + "<br><br>Undervalued by " + sb["value_diff"].apply(eur_m) + "<br>" + "Ratio: <b>" + sb["underrated_ratio"].map(lambda v: f"{v:.2f}x") + "</b>",
"<b><u>★" + sb["name"].astype(str) + "★</u></b> (" + np.floor(sb["age"]).astype(int).astype(str) + ")<br>" + sb['position'].astype(str) + "<br><br>Undervalued by " + sb["value_diff"].apply(eur_m) + "<br>" + "Ratio: <b><u>" + sb["underrated_ratio"].map(lambda v: f"{v:.2f}x") + "</u></b>"
)
)
sb['Club'] = sb['club_name'].fillna('Unknown club').astype(str)
sb['League'] = sb['league_name'].fillna('Unknown league').astype(str)
sb['Position'] = sb['position'].fillna('Unknown').astype(str)
# --- Pretty league names + English spelling as Transfermarkt data is in German ---
league_pretty = {
"premier-league": "English Premier League",
"laliga": "Spanish LALIGA",
"bundesliga": "German Bundesliga",
"serie-a": "Italian Serie A",
"ligue-1": "French Ligue 1",
"eredivisie": "Dutch Vrouwen Eredivisie",
"liga-portugal": "Portuguese Primeira Liga",
"super-lig": "Turkish Super Lig",
"jupiler-pro-league": "Belgian Pro League",
"premier-liga": "Russian Premier League",
}
sb["League_slug"] = sb["League"] # keep original slug
sb["League"] = sb["League"].map(league_pretty).fillna(sb["League"]) # pretty display
# ------------------------------------------------------------
# Hover text (players only)
# ------------------------------------------------------------
sb["hover_text"] = (
"<b>" + sb["Player"] + "</b><br>"
+ "Age: " + sb["age"].round(1).astype(str) + "<br>"
+ "Position: " + sb["Position"] + "<br><br>"
+ "League: " + sb["League"] + "<br>"
+ "Club: " + sb["Club"] + "<br><br>"
+ "Market: " + sb["market_value_in_eur"].apply(eur_m) + "<br>"
+ "Predicted: " + sb["predicted_value"].apply(eur_m) + "<br>"
+ "Diff: " + sb["value_diff"].apply(eur_m) + "<br>"
+ "Ratio: " + sb["underrated_ratio"].map(lambda v: f"{v:.2f}x")
)
# ------------------------------------------------------------
# SIZING
# ------------------------------------------------------------
players_in_league = sb.groupby('League').size().rename('players_in_league').reset_index()
sb = sb.merge(players_in_league, on='League', how='left')
clubs_in_league = sb.groupby('League')['Club'].nunique().rename('clubs_in_league').reset_index()
sb = sb.merge(clubs_in_league, on='League', how='left')
players_in_club = sb.groupby(['League', 'Club']).size().rename('players_in_club').reset_index()
sb = sb.merge(players_in_club, on=['League', 'Club'], how='left')
sb['size_value'] = (sb['players_in_league'] / sb['clubs_in_league']) * (1 / sb['players_in_club'])
players_df = sb[['League', 'Club', 'Player', 'size_value', 'hover_text']].copy()
clubs_df = sb[['League', 'Club']].drop_duplicates()
club_totals = players_df.groupby(['League', 'Club'])['size_value'].sum().reset_index()
league_totals = players_df.groupby(['League'])['size_value'].sum().reset_index()
# ------------------------------------------------------------
# League colors
# ------------------------------------------------------------
pretty_to_slug = sb.drop_duplicates("League")[["League", "League_slug"]].set_index("League")["League_slug"].to_dict()
league_order = league_totals.sort_values('size_value', ascending=False)['League'].tolist()
league_color = {L: LEAGUE_ANCHORS.get(pretty_to_slug.get(L, "mls / other"), LEAGUE_ANCHORS["mls / other"])
for L in league_order}
# ------------------------------------------------------------
# Color helpers
# ------------------------------------------------------------
def hex_to_rgb(hex_color: str):
h = hex_color.lstrip("#")
return tuple(int(h[i:i+2], 16) for i in (0, 2, 4))
def rgb_to_hex(rgb):
return "#{:02X}{:02X}{:02X}".format(*rgb)
def adjust_lightness(hex_color: str, new_lightness: float):
r, g, b = hex_to_rgb(hex_color)
r, g, b = r/255, g/255, b/255
h, l, s = colorsys.rgb_to_hls(r, g, b)
new_lightness = max(0, min(1, new_lightness))
rr, gg, bb = colorsys.hls_to_rgb(h, new_lightness, s)
return rgb_to_hex((int(rr*255), int(gg*255), int(bb*255)))
def get_lightness(hex_color: str) -> float:
r, g, b = hex_to_rgb(hex_color)
r, g, b = r/255, g/255, b/255
h, l, s = colorsys.rgb_to_hls(r, g, b)
return float(l)
def wrap_two_lines_by_words(text: str):
words = str(text).split()
if len(words) <= 1:
return text
best_i = 1
best_score = float("inf")
for i in range(1, len(words)):
a = " ".join(words[:i])
b = " ".join(words[i:])
score = max(len(a), len(b))
if score < best_score:
best_score = score
best_i = i
return " ".join(words[:best_i]) + "<br>" + " ".join(words[best_i:])
# ------------------------------------------------------------
# Club colors
# ------------------------------------------------------------
GRAY_A = "#EFEFEF"
GRAY_B = "#E6E6E6"
club_color = {}
club_counter = 0
for L in league_order:
clubs = sorted(club_totals.loc[club_totals['League'] == L, 'Club'].unique())
for C in clubs:
club_color[(L, C)] = GRAY_A if club_counter % 2 == 0 else GRAY_B
club_counter += 1
# ------------------------------------------------------------
# Player colors
# ------------------------------------------------------------
PLAYER_LIGHTNESS_DELTA = 0.05
player_color_map = {}
for L in league_order:
base = league_color[L]
base_l = get_lightness(base)
k = 0
clubs = sorted(players_df.loc[players_df['League'] == L, 'Club'].unique())
for C in clubs:
g = players_df[(players_df['League'] == L) & (players_df['Club'] == C)].sort_values('Player')
for P in g['Player'].tolist():
if (L, P) not in player_color_map:
delta = -PLAYER_LIGHTNESS_DELTA if (k % 2 == 0) else PLAYER_LIGHTNESS_DELTA
player_color_map[(L, P)] = adjust_lightness(base, base_l + delta)
k += 1
# ------------------------------------------------------------
# Build Sunburst
# ------------------------------------------------------------
ids, parents, labels, values, colors, hovertemplates = [], [], [], [], [], []
# League nodes
for L in league_order:
# FIX: Use .values[0] instead of .iloc[0] for safer access
league_vals = league_totals.loc[league_totals['League'] == L, 'size_value'].values
if len(league_vals) == 0:
continue
league_val = float(league_vals[0])
ids.append(f"L|{L}")
parents.append("")
labels.append("<b>" + L + "</b>")
values.append(league_val)
colors.append(league_color[L])
hovertemplates.append("")
# Club nodes
for L in league_order:
clubs = sorted(club_totals.loc[club_totals['League'] == L, 'Club'].unique())
for C in clubs:
club_vals = club_totals.loc[
(club_totals['League'] == L) & (club_totals['Club'] == C),
'size_value'
].values
if len(club_vals) == 0:
continue
club_val = float(club_vals[0])
ids.append(f"C|{L}|{C}")
parents.append(f"L|{L}")
labels.append("<b>" + wrap_two_lines_by_words(C) + "</b>")
values.append(club_val)
colors.append(club_color[(L, C)])
hovertemplates.append("")
# Player nodes
for L in league_order:
clubs = sorted(players_df.loc[players_df['League'] == L, 'Club'].unique())
for C in clubs:
g = players_df[(players_df['League'] == L) & (players_df['Club'] == C)].sort_values('Player')
for _, r in g.iterrows():
P = r['Player']
ids.append(f"P|{L}|{C}|{P}")
parents.append(f"C|{L}|{C}")
labels.append(P)
values.append(float(r['size_value']))
colors.append(player_color_map[(L, P)])
hovertemplates.append(r['hover_text'] + "<extra></extra>")
print(f"Built sunburst with {len(ids)} nodes")
fig = go.Figure(go.Sunburst(
ids=ids,
labels=labels,
parents=parents,
values=values,
branchvalues="total",
marker=dict(colors=colors),
hovertemplate=hovertemplates,
textinfo="label",
insidetextorientation="radial",
leaf=dict(opacity=0.85)
))
fig.update_layout(
width=FIG_W,
height=FIG_H,
title=f"Top {len(sb)} Undervalued Players — League (scaled) → Club (equal) → Player (equal)",
margin=dict(t=60, l=10, r=10, b=10),
uniformtext=dict(minsize=10, mode="hide")
)
fig.show()
fig.write_html(
"undervalued_players_sunburst.html",
include_plotlyjs=True,
full_html=True
)
Working with 20 players Built sunburst with 46 nodes
Which positions are most undervalued?¶
- Higher medians > structural undervaluation
- Long upper tails > occasional “steals”
- Tight boxes > consistent pricing
import matplotlib.pyplot as plt
cluster_data = candidates.head(50).copy()
plt.figure(figsize=(10, 6))
cluster_data.boxplot(
column='underrated_ratio',
by='position',
grid=False,
showfliers=True,
flierprops=dict(
marker='o',
markerfacecolor='none',
markeredgecolor='red', # red outline
markeredgewidth=1.5,
markersize=7
)
)
plt.axhline(1.0, color='gray', linestyle='--', alpha=0.6)
plt.title("Underrated Ratio by Position")
plt.suptitle("")
plt.ylabel("Predicted / Market Value Ratio")
plt.xlabel("Position")
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
Which leagues have the most undervalued players?¶
- Smaller leagues with high ratios > scouting inefficiency
- Big leagues with moderate ratios > efficiency but large € upside
***Note:** League names are not 'beautyfied' as in Sunburts chart. Not much learning from this chart anyway*
league_stats = (
cluster_data
.groupby('league_name')
.agg(
avg_ratio=('underrated_ratio', 'mean'),
count=('name', 'count')
)
.sort_values('avg_ratio', ascending=False)
)
plt.figure(figsize=(12, 6))
plt.barh(league_stats.index, league_stats['avg_ratio'])
plt.axvline(1.0, color='gray', linestyle='--', alpha=0.6)
plt.xlabel("Average Underrated Ratio")
plt.title("Average Undervaluation by League")
plt.tight_layout()
plt.show()
Which clubs repeatedly produce undervalued players?¶
- Upper-right quadrant = clubs consistently selling below intrinsic value
- Large bubbles = repeated pattern, not noise
***Note:** to little data to make sense. This works better with more players, but I've limited it in the end to just the top 20*
club_stats = (
cluster_data
.groupby('club_name')
.agg(
avg_ratio=('underrated_ratio', 'mean'),
count=('name', 'count')
)
.query("count >= 2")
)
plt.figure(figsize=(10, 7))
plt.scatter(
club_stats['count'],
club_stats['avg_ratio'],
s=club_stats['count'] * 120,
alpha=0.7
)
for club, row in club_stats.iterrows():
plt.text(row['count'] + 0.02, row['avg_ratio'], club, fontsize=8)
plt.axhline(1.0, color='gray', linestyle='--', alpha=0.6)
plt.xlabel("Number of Undervalued Players")
plt.ylabel("Average Underrated Ratio")
plt.title("Clubs with Repeated Undervaluation")
plt.tight_layout()
plt.show()
Which characteristics are undervalued?¶
- “Sweet spots” (e.g. 23–26 yrs)
- Older players with high ratios > short-term arbitrage
- Young players undervalued > long-term bets
position_colors = {
'Goalkeeper': '#e74c3c',
'Defender': '#3498db',
'Midfield': '#2ecc71',
'Attack': '#f39c12'
}
plt.figure(figsize=(10, 7))
for pos, color in position_colors.items():
subset = cluster_data[cluster_data['position'] == pos]
plt.scatter(
subset['age'],
subset['underrated_ratio'],
label=pos,
color=color,
alpha=0.7
)
for _, row in cluster_data.iterrows():
if row['underrated_ratio'] >= 1.5:
plt.annotate(
row['name'],
(row['age'], row['underrated_ratio']),
textcoords="offset points",
xytext=(6, 6),
fontsize=8,
fontweight='bold',
bbox=dict(boxstyle="round,pad=0.2", fc="white", alpha=0.7),
ha='left'
)
plt.axhline(1.0, color='gray', linestyle='--', alpha=0.6)
plt.xlabel("Age")
plt.ylabel("Underrated Ratio")
plt.title("Age vs Undervaluation by Position")
plt.legend()
plt.tight_layout()
plt.show()
Where the model strongly disagrees with the market¶
- Top-right > high conviction targets
- Bottom-left > noise / low signal
- High ratio + low € > cheap hidden value
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
# Prepare values
x = cluster_data['value_diff'] / 1e6
y = cluster_data['underrated_ratio']
# Normalize both axes to [0, 1]
scaler = MinMaxScaler()
xy_norm = scaler.fit_transform(np.column_stack([x, y]))
# Combine into a single intensity score
# (you can weight these if you want)
intensity = xy_norm[:, 0] + xy_norm[:, 1]
plt.figure(figsize=(9, 7))
scatter = plt.scatter(
x,
y,
c=intensity,
cmap='coolwarm', # blue → red
alpha=0.85,
edgecolors='black',
linewidth=0.5
)
# Median lines
plt.axhline(y.median(), linestyle='--', alpha=0.5)
plt.axvline(x.median(), linestyle='--', alpha=0.5)
plt.xlabel("Value Difference (€M)")
plt.ylabel("Underrated Ratio")
plt.title("Model–Market Disagreement Map")
# Colorbar
cbar = plt.colorbar(scatter)
cbar.set_label("Combined Model–Market Signal", fontweight='bold')
plt.tight_layout()
plt.show()
Composite view¶
pivot = cluster_data.pivot_table(
index='position',
columns='league_name',
values='underrated_ratio',
aggfunc='mean'
)
plt.figure(figsize=(14, 5))
plt.imshow(pivot, aspect='auto', cmap='YlOrRd')
plt.xticks(range(len(pivot.columns)), pivot.columns, rotation=45, ha='right')
plt.yticks(range(len(pivot.index)), pivot.index)
plt.colorbar(label='Avg Underrated Ratio')
plt.title("Where the Market Misses Value (Position × League)")
plt.tight_layout()
plt.show()