In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
# Configuration
DATA_DIR = 'datasets/'
sns.set_theme(style="whitegrid")
pd.set_option('display.max_columns', None)
1. Data Loading and Preparation¶
Loading necessary datasets and merging them into a master dataframe for analysis.
In [2]:
def load_data():
files = {
'players': 'players.csv',
'appearances': 'appearances.csv',
'clubs': 'clubs.csv',
'player_valuations': 'player_valuations.csv',
'transfers': 'transfers.csv',
'competitions': 'competitions.csv',
'club_games': 'club_games.csv'
}
data = {}
for name, file in files.items():
path = os.path.join(DATA_DIR, file)
if os.path.exists(path):
print(f"Loading {name}...")
# Parse dates where applicable
parse_dates = []
if name in ['appearances', 'player_valuations']:
parse_dates = ['date']
elif name == 'transfers':
parse_dates = ['transfer_date']
elif name == 'players':
parse_dates = ['date_of_birth', 'contract_expiration_date']
data[name] = pd.read_csv(path, parse_dates=parse_dates)
return data
print(f"Loaded {name}: {data[players].shape}")
data = load_data()
Loading players... Loading appearances... Loading clubs... Loading player_valuations... Loading transfers... Loading competitions... Loading club_games...
Feature Engineering & Merging¶
In [3]:
# Aggregate appearances (Total goals, assists, minutes, cards)
if 'appearances' in data:
perf_agg = data['appearances'].groupby('player_id').agg({
'goals': 'sum',
'assists': 'sum',
'minutes_played': 'sum',
'yellow_cards': 'sum',
'red_cards': 'sum',
'game_id': 'count' # Total games played
}).rename(columns={'game_id': 'total_games'}).reset_index()
else:
perf_agg = pd.DataFrame()
# Merge with Players
df = data['players'].copy()
if not perf_agg.empty:
df = df.merge(perf_agg, on='player_id', how='left')
# Merge with Club info
if 'clubs' in data:
clubs_subset = data['clubs'][['club_id', 'total_market_value', 'stadium_seats', 'coach_name', 'national_team_players']]
df = df.merge(clubs_subset, left_on='current_club_id', right_on='club_id', how='left', suffixes=('', '_club'))
# Calculate Age
now = pd.Timestamp.now()
df['age'] = (now - df['date_of_birth']).dt.days / 365.25
# Filter for active players with market value
df_active = df[df['market_value_in_eur'] > 0].copy()
print(f"Active Players with Market Value: {len(df_active)}")
df_active.head()
Active Players with Market Value: 31078
Out[3]:
| player_id | first_name | last_name | name | last_season | current_club_id | player_code | country_of_birth | city_of_birth | country_of_citizenship | date_of_birth | sub_position | position | foot | height_in_cm | contract_expiration_date | agent_name | image_url | url | current_club_domestic_competition_id | current_club_name | market_value_in_eur | highest_market_value_in_eur | goals | assists | minutes_played | yellow_cards | red_cards | total_games | club_id | total_market_value | stadium_seats | coach_name | national_team_players | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | Miroslav | Klose | Miroslav Klose | 2015 | 398 | miroslav-klose | Poland | Opole | Germany | 1978-06-09 | Centre-Forward | Attack | right | 184.0 | NaT | ASBW Sport Marketing | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/miroslav-klose... | IT1 | Società Sportiva Lazio S.p.A. | 1000000.0 | 30000000.0 | 48.0 | 25.0 | 8808.0 | 19.0 | 0.0 | 136.0 | 398 | NaN | 70634 | NaN | 13 | 47.520876 |
| 1 | 26 | Roman | Weidenfeller | Roman Weidenfeller | 2017 | 16 | roman-weidenfeller | Germany | Diez | Germany | 1980-08-06 | Goalkeeper | Goalkeeper | left | 190.0 | NaT | Neubauer 13 GmbH | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/roman-weidenfe... | L1 | Borussia Dortmund | 750000.0 | 8000000.0 | 0.0 | 0.0 | 13508.0 | 4.0 | 2.0 | 152.0 | 16 | NaN | 81365 | NaN | 18 | 45.360712 |
| 2 | 65 | Dimitar | Berbatov | Dimitar Berbatov | 2015 | 1091 | dimitar-berbatov | Bulgaria | Blagoevgrad | Bulgaria | 1981-01-30 | Centre-Forward | Attack | NaN | NaN | NaT | CSKA-AS-23 Ltd. | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/dimitar-berbat... | GR1 | Panthessalonikios Athlitikos Omilos Konstantin... | 1000000.0 | 34500000.0 | 38.0 | 13.0 | 8788.0 | 11.0 | 1.0 | 122.0 | 1091 | NaN | 28703 | NaN | 11 | 44.876112 |
| 3 | 77 | NaN | Lúcio | Lúcio | 2012 | 506 | lucio | Brazil | BrasÃlia | Brazil | 1978-05-08 | Centre-Back | Defender | NaN | NaN | NaT | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/lucio/profil/s... | IT1 | Juventus Football Club | 200000.0 | 24500000.0 | 0.0 | 0.0 | 307.0 | 0.0 | 0.0 | 4.0 | 506 | NaN | 41507 | NaN | 19 | 47.608487 |
| 4 | 80 | Tom | Starke | Tom Starke | 2017 | 27 | tom-starke | East Germany (GDR) | Freital | Germany | 1981-03-18 | Goalkeeper | Goalkeeper | right | 194.0 | NaT | IFM | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/tom-starke/pro... | L1 | FC Bayern München | 100000.0 | 3000000.0 | 0.0 | 0.0 | 1080.0 | 0.0 | 0.0 | 12.0 | 27 | NaN | 75000 | NaN | 17 | 44.747433 |
In [5]:
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.scatterplot(data=df_active, x='goals', y='market_value_in_eur', alpha=0.5)
plt.title('Goals vs Market Value')
plt.xscale('log')
plt.yscale('log')
plt.subplot(1, 3, 2)
sns.scatterplot(data=df_active, x='assists', y='market_value_in_eur', alpha=0.5)
plt.title('Assists vs Market Value')
plt.xscale('log')
plt.yscale('log')
plt.subplot(1, 3, 3)
sns.scatterplot(data=df_active, x='minutes_played', y='market_value_in_eur', alpha=0.5)
plt.title('Minutes Played vs Market Value')
plt.yscale('log')
plt.tight_layout()
plt.show()
# Correlation Matrix
cols = ['market_value_in_eur', 'goals', 'assists', 'minutes_played', 'total_games', 'age']
corr = df_active[cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation: Performance vs Value')
plt.show()
Market Dynamics: Age Curves¶
How does valuation change with age?
In [6]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_active, x='age', y='market_value_in_eur', estimator='median')
plt.title('Median Market Value by Age')
plt.show()
Transfer Fee vs Market Value¶
Are players generally over or undervalued compared to their transfer fees?
*** Not happy with the results... to be continued
In [4]:
if 'transfers' in data:
# Merge transfers with current market value (approximate comparison)
# Note: Ideally we should compare transfer fee with market value AT THE TIME of transfer.
# Here we look at the 'market_value_in_eur' column in the transfers table itself which records value at transfer time.
tf = data['transfers'].dropna(subset=['transfer_fee', 'market_value_in_eur'])
tf = tf[tf['transfer_fee'] > 0] # Filter out free transfers for this plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=tf, x='market_value_in_eur', y='transfer_fee', alpha=0.3)
# Add y=x line
max_val = max(tf['market_value_in_eur'].max(), tf['transfer_fee'].max())
plt.plot([0, max_val], [0, max_val], 'r--', label='Fair Value (Fee = MV)')
plt.title('Transfer Fee vs Market Value at Time of Transfer')
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.show()