Data Preperation¶
Initialize python and dependencies¶
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Configuration
DATA_DIR = 'datasets/'
# Set plot style
sns.set_theme(style="whitegrid")
pd.set_option('display.max_columns', None)
1. Data Loading¶
Load all CSV files from the data folder.
In [2]:
files = [
'appearances.csv',
'club_games.csv',
'clubs.csv',
'competitions.csv',
'game_events.csv',
'game_lineups.csv',
'games.csv',
'player_valuations.csv',
'players.csv',
'transfers.csv'
]
data = {}
for file in files:
name = file.split('.')[0]
file_path = os.path.join(DATA_DIR, file)
if os.path.exists(file_path):
print(f"Loading {file}...")
# Try parsing dates for common date columns to save time later
parse_dates = []
if 'date' in pd.read_csv(file_path, nrows=1).columns:
parse_dates = ['date']
elif 'transfer_date' in pd.read_csv(file_path, nrows=1).columns:
parse_dates = ['transfer_date']
data[name] = pd.read_csv(file_path, parse_dates=parse_dates)
print(f"Loaded {name}: {data[name].shape}")
else:
print(f"Warning: {file} not found at {file_path}")
Loading appearances.csv... Loaded appearances: (1706806, 13) Loading club_games.csv... Loaded club_games: (148052, 11) Loading clubs.csv... Loaded clubs: (439, 17) Loading competitions.csv... Loaded competitions: (44, 11) Loading game_events.csv... Loaded game_events: (1035043, 10) Loading game_lineups.csv... Loaded game_lineups: (2191911, 10) Loading games.csv... Loaded games: (74026, 23) Loading player_valuations.csv... Loaded player_valuations: (496606, 5) Loading players.csv... Loaded players: (32601, 23) Loading transfers.csv... Loaded transfers: (79646, 10)
2. Data Structure & Missing Values¶
Checking first rows and check for missing values in each dataset
In [3]:
for name, df in data.items():
print(f"\n--- {name} ---")
display(df.head(3))
print("\nMissing Values:")
missing = df.isnull().sum()
if missing.sum() > 0:
print(missing[missing > 0])
print("\nSample rows with missing values:")
# Display the first 5 rows that have at least one missing value
display(df[df.isnull().any(axis=1)].head(10))
else:
print("No missing values.")
print("-" * 30)
--- appearances ---
| appearance_id | game_id | player_id | player_club_id | player_current_club_id | date | player_name | competition_id | yellow_cards | red_cards | goals | assists | minutes_played | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2231978_38004 | 2231978 | 38004 | 853 | 235 | 2012-07-03 | Aurélien Joachim | CLQ | 0 | 0 | 2 | 0 | 90 |
| 1 | 2233748_79232 | 2233748 | 79232 | 8841 | 2698 | 2012-07-05 | Ruslan Abyshov | ELQ | 0 | 0 | 0 | 0 | 90 |
| 2 | 2234413_42792 | 2234413 | 42792 | 6251 | 465 | 2012-07-05 | Sander Puri | ELQ | 0 | 0 | 0 | 0 | 45 |
Missing Values: player_name 6 dtype: int64 Sample rows with missing values:
| appearance_id | game_id | player_id | player_club_id | player_current_club_id | date | player_name | competition_id | yellow_cards | red_cards | goals | assists | minutes_played | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 799042 | 3084062_380365 | 3084062 | 380365 | 16486 | -1 | 2018-09-05 | NaN | CDR | 1 | 0 | 0 | 0 | 90 |
| 799117 | 3084059_411294 | 3084059 | 411294 | 3302 | -1 | 2018-09-11 | NaN | CDR | 0 | 0 | 0 | 0 | 90 |
| 799150 | 3084057_255495 | 3084057 | 255495 | 11596 | -1 | 2018-09-12 | NaN | CDR | 0 | 0 | 0 | 0 | 90 |
| 799165 | 3102749_380365 | 3102749 | 380365 | 16486 | -1 | 2018-09-12 | NaN | CDR | 0 | 0 | 0 | 0 | 120 |
| 816192 | 3106648_255495 | 3106648 | 255495 | 11596 | -1 | 2018-10-17 | NaN | CDR | 0 | 0 | 0 | 0 | 90 |
| 840806 | 3118604_411294 | 3118604 | 411294 | 3302 | -1 | 2018-12-05 | NaN | CDR | 0 | 0 | 0 | 0 | 49 |
------------------------------ --- club_games ---
| game_id | club_id | own_goals | own_position | own_manager_name | opponent_id | opponent_goals | opponent_position | opponent_manager_name | hosting | is_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2320450 | 1468.0 | 0.0 | NaN | Holger Bachthaler | 24.0 | 2.0 | NaN | Armin Veh | Home | 0 |
| 1 | 2320454 | 222.0 | 0.0 | NaN | Volkan Uluc | 79.0 | 2.0 | NaN | Bruno Labbadia | Home | 0 |
| 2 | 2320460 | 1.0 | 3.0 | NaN | Jürgen Luginger | 86.0 | 1.0 | NaN | Robin Dutt | Home | 1 |
Missing Values: club_id 18 own_goals 24 own_position 44934 own_manager_name 1656 opponent_id 18 opponent_goals 24 opponent_position 44934 opponent_manager_name 1656 dtype: int64 Sample rows with missing values:
| game_id | club_id | own_goals | own_position | own_manager_name | opponent_id | opponent_goals | opponent_position | opponent_manager_name | hosting | is_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2320450 | 1468.0 | 0.0 | NaN | Holger Bachthaler | 24.0 | 2.0 | NaN | Armin Veh | Home | 0 |
| 1 | 2320454 | 222.0 | 0.0 | NaN | Volkan Uluc | 79.0 | 2.0 | NaN | Bruno Labbadia | Home | 0 |
| 2 | 2320460 | 1.0 | 3.0 | NaN | Jürgen Luginger | 86.0 | 1.0 | NaN | Robin Dutt | Home | 1 |
| 3 | 2320472 | 2036.0 | 4.0 | NaN | Frank Schmidt | 72.0 | 5.0 | NaN | Alexander Schmidt | Home | 0 |
| 20 | 2321976 | 10604.0 | 1.0 | NaN | Branko Nisevic | 448.0 | 4.0 | NaN | Marko Nikolic | Home | 0 |
| 21 | 2321990 | 1257.0 | 2.0 | NaN | Dan Theis | 2275.0 | 0.0 | NaN | Marko Rajamäki | Home | 1 |
| 22 | 2322019 | 11107.0 | 2.0 | NaN | José Gomes | 10883.0 | 1.0 | NaN | Vojo Pejovic | Home | 1 |
| 23 | 2322023 | 282.0 | 1.0 | NaN | Husref Musemic | 10616.0 | 0.0 | NaN | Ezio Fortunato | Home | 1 |
| 24 | 2322035 | 3811.0 | 4.0 | NaN | Janis Intenbergs | 29160.0 | 6.0 | NaN | Lee Jones | Home | 0 |
| 25 | 2323263 | 10734.0 | 1.0 | NaN | Carl Darlington | 255.0 | 3.0 | NaN | Jan Urban | Home | 0 |
------------------------------ --- clubs ---
| club_id | club_code | name | domestic_competition_id | total_market_value | squad_size | average_age | foreigners_number | foreigners_percentage | national_team_players | stadium_name | stadium_seats | net_transfer_record | coach_name | last_season | filename | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 105 | sv-darmstadt-98 | SV Darmstadt 98 | L1 | NaN | 27 | 25.6 | 13 | 48.1 | 1 | Merck-Stadion am Böllenfalltor | 17810 | +€3.05m | NaN | 2023 | ../data/raw/transfermarkt-scraper/2023/clubs.j... | https://www.transfermarkt.co.uk/sv-darmstadt-9... |
| 1 | 11127 | ural-ekaterinburg | Ural Yekaterinburg | RU1 | NaN | 30 | 26.5 | 11 | 36.7 | 3 | Yekaterinburg Arena | 23000 | +€880k | NaN | 2023 | ../data/raw/transfermarkt-scraper/2023/clubs.j... | https://www.transfermarkt.co.uk/ural-ekaterinb... |
| 2 | 114 | besiktas-istanbul | Beşiktaş Jimnastik Kulübü | TR1 | NaN | 30 | 26.6 | 15 | 50.0 | 8 | Beşiktaş Park | 42445 | €-25.26m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/besiktas-istan... |
Missing Values: total_market_value 439 average_age 38 foreigners_percentage 49 coach_name 439 dtype: int64 Sample rows with missing values:
| club_id | club_code | name | domestic_competition_id | total_market_value | squad_size | average_age | foreigners_number | foreigners_percentage | national_team_players | stadium_name | stadium_seats | net_transfer_record | coach_name | last_season | filename | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 105 | sv-darmstadt-98 | SV Darmstadt 98 | L1 | NaN | 27 | 25.6 | 13 | 48.1 | 1 | Merck-Stadion am Böllenfalltor | 17810 | +€3.05m | NaN | 2023 | ../data/raw/transfermarkt-scraper/2023/clubs.j... | https://www.transfermarkt.co.uk/sv-darmstadt-9... |
| 1 | 11127 | ural-ekaterinburg | Ural Yekaterinburg | RU1 | NaN | 30 | 26.5 | 11 | 36.7 | 3 | Yekaterinburg Arena | 23000 | +€880k | NaN | 2023 | ../data/raw/transfermarkt-scraper/2023/clubs.j... | https://www.transfermarkt.co.uk/ural-ekaterinb... |
| 2 | 114 | besiktas-istanbul | Beşiktaş Jimnastik Kulübü | TR1 | NaN | 30 | 26.6 | 15 | 50.0 | 8 | Beşiktaş Park | 42445 | €-25.26m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/besiktas-istan... |
| 3 | 12 | as-rom | Associazione Sportiva Roma | IT1 | NaN | 26 | 26.3 | 18 | 69.2 | 17 | Olimpico di Roma | 70634 | €-76.90m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/as-rom/startse... |
| 4 | 148 | tottenham-hotspur | Tottenham Hotspur Football Club | GB1 | NaN | 30 | 25.5 | 21 | 70.0 | 18 | Tottenham Hotspur Stadium | 62850 | €-120.05m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/tottenham-hots... |
| 5 | 157 | kaa-gent | Koninklijke Atletiek Associatie Gent | BE1 | NaN | 30 | 25.2 | 19 | 63.3 | 5 | Planet Group Arena | 20185 | +€5.63m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/kaa-gent/start... |
| 6 | 1894 | hvidovre-if | Hvidovre IF | DK1 | NaN | 25 | 26.4 | 3 | 12.0 | 2 | PRO VENTILATION Arena | 12000 | +€225k | NaN | 2023 | ../data/raw/transfermarkt-scraper/2023/clubs.j... | https://www.transfermarkt.co.uk/hvidovre-if/st... |
| 7 | 190 | fc-kopenhagen | Football Club København | DK1 | NaN | 28 | 26.1 | 19 | 67.9 | 9 | Parken | 38065 | +€35.06m | NaN | 2024 | ../data/raw/transfermarkt-scraper/2024/clubs.j... | https://www.transfermarkt.co.uk/fc-kopenhagen/... |
| 8 | 192 | roda-jc-kerkrade | Roda JC Kerkrade | NL1 | NaN | 25 | 24.0 | 9 | 36.0 | 0 | Parkstad Limburg Stadion | 19979 | +€1.30m | NaN | 2017 | ../data/raw/transfermarkt-scraper/2017/clubs.j... | https://www.transfermarkt.co.uk/roda-jc-kerkra... |
| 9 | 19789 | yeni-malatyaspor | Yeni Malatyaspor | TR1 | NaN | 10 | 22.9 | 1 | 10.0 | 0 | Yeni Malatya Stadyumu | 25745 | +€778k | NaN | 2021 | ../data/raw/transfermarkt-scraper/2021/clubs.j... | https://www.transfermarkt.co.uk/yeni-malatyasp... |
------------------------------ --- competitions ---
| competition_id | competition_code | name | sub_type | type | country_id | country_name | domestic_league_code | confederation | url | is_major_national_league | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CIT | italy-cup | italy-cup | domestic_cup | domestic_cup | 75 | Italy | IT1 | europa | https://www.transfermarkt.co.uk/italy-cup/star... | False |
| 1 | NLSC | johan-cruijff-schaal | johan-cruijff-schaal | domestic_super_cup | other | 122 | Netherlands | NL1 | europa | https://www.transfermarkt.co.uk/johan-cruijff-... | False |
| 2 | GRP | kypello-elladas | kypello-elladas | domestic_cup | domestic_cup | 56 | Greece | GR1 | europa | https://www.transfermarkt.co.uk/kypello-ellada... | False |
Missing Values: country_name 8 domestic_league_code 8 dtype: int64 Sample rows with missing values:
| competition_id | competition_code | name | sub_type | type | country_id | country_name | domestic_league_code | confederation | url | is_major_national_league | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | USC | uefa-super-cup | uefa-super-cup | uefa_super_cup | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/uefa-super-cup... | False |
| 8 | EL | europa-league | europa-league | europa_league | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/europa-league/... | False |
| 17 | UCOL | uefa-conference-league | uefa-conference-league | uefa_europa_conference_league | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/uefa-conferenc... | False |
| 18 | ELQ | europa-league-qualifikation | europa-league-qualifikation | europa_league_qualifying | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/europa-league-... | False |
| 21 | ECLQ | uefa-europa-conference-league-qualifikation | uefa-europa-conference-league-qualifikation | uefa_europa_conference_league_qualifiers | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/uefa-europa-co... | False |
| 27 | CL | uefa-champions-league | uefa-champions-league | uefa_champions_league | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/uefa-champions... | False |
| 32 | KLUB | fifa-klub-wm | fifa-klub-wm | fifa_club_world_cup | other | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/fifa-klub-wm/s... | False |
| 38 | CLQ | uefa-champions-league-qualifikation | uefa-champions-league-qualifikation | uefa_champions_league_qualifying | international_cup | -1 | NaN | NaN | europa | https://www.transfermarkt.co.uk/uefa-champions... | False |
------------------------------ --- game_events ---
| game_event_id | date | game_id | minute | type | club_id | player_id | description | player_in_id | player_assist_id | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2f41da30c471492e7d4a984951671677 | 2012-08-05 | 2211607 | 77 | Cards | 610 | 4425 | 1. Yellow card , Mass confrontation | NaN | NaN |
| 1 | a72f7186d132775f234d3e2f7bc0ed5b | 2012-08-05 | 2211607 | 77 | Cards | 383 | 33210 | 1. Yellow card , Mass confrontation | NaN | NaN |
| 2 | b2d721eaed4692a5c59a92323689ef18 | 2012-08-05 | 2211607 | 3 | Goals | 383 | 36500 | , Header, 1. Tournament Goal Assist: , Corner,... | NaN | 56416.0 |
Missing Values: description 87327 player_in_id 537365 player_assist_id 878284 dtype: int64 Sample rows with missing values:
| game_event_id | date | game_id | minute | type | club_id | player_id | description | player_in_id | player_assist_id | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2f41da30c471492e7d4a984951671677 | 2012-08-05 | 2211607 | 77 | Cards | 610 | 4425 | 1. Yellow card , Mass confrontation | NaN | NaN |
| 1 | a72f7186d132775f234d3e2f7bc0ed5b | 2012-08-05 | 2211607 | 77 | Cards | 383 | 33210 | 1. Yellow card , Mass confrontation | NaN | NaN |
| 2 | b2d721eaed4692a5c59a92323689ef18 | 2012-08-05 | 2211607 | 3 | Goals | 383 | 36500 | , Header, 1. Tournament Goal Assist: , Corner,... | NaN | 56416.0 |
| 3 | aef768899cedac0c9a650980219075a2 | 2012-08-05 | 2211607 | 53 | Goals | 383 | 36500 | , Right-footed shot, 2. Tournament Goal Assist... | NaN | 146258.0 |
| 4 | 5d6d9533023057b6619ecd145a038bbe | 2012-08-05 | 2211607 | 74 | Substitutions | 383 | 36500 | , Not reported | 49499.0 | NaN |
| 5 | eef9c46dd75c3aa4c6a503225427446e | 2012-08-05 | 2211607 | 11 | Goals | 383 | 38497 | , Right-footed shot, 1. Tournament Goal Assist... | NaN | 33210.0 |
| 6 | 5d5aef7dedcd5dc9d35dea9438ee35e0 | 2012-08-05 | 2211607 | 90 | Cards | 610 | 42710 | 1. Yellow card | NaN | NaN |
| 7 | 7717860e3b0376b86f445f4749fa2ce5 | 2012-08-05 | 2211607 | 44 | Goals | 610 | 42710 | , Header, 1. Tournament Goal Assist: , Corner,... | NaN | 4425.0 |
| 8 | 02c708273f4fa2003873ef5908e4e3a6 | 2012-08-05 | 2211607 | 79 | Cards | 610 | 45509 | 1. Yellow card | NaN | NaN |
| 9 | d1be2ce4bd5f0ca091c1b15a8569301c | 2012-08-05 | 2211607 | 90 | Goals | 383 | 49499 | , Right-footed shot, 1. Tournament Goal Assist... | NaN | 167850.0 |
------------------------------ --- game_lineups ---
| game_lineups_id | date | game_id | player_id | club_id | player_name | type | position | number | team_captain | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | b2dbe01c3656b06c8e23e9de714e26bb | 2013-07-27 | 2317258 | 1443 | 610 | Christian Poulsen | substitutes | Defensive Midfield | 5 | 0 |
| 1 | b50a3ec6d52fd1490aab42042ac4f738 | 2013-07-27 | 2317258 | 5017 | 610 | Niklas Moisander | starting_lineup | Centre-Back | 4 | 0 |
| 2 | 7d890e6d0ff8af84b065839966a0ec81 | 2013-07-27 | 2317258 | 9602 | 1090 | Maarten Martens | substitutes | Left Winger | 11 | 0 |
Missing Values: position 3 dtype: int64 Sample rows with missing values:
| game_lineups_id | date | game_id | player_id | club_id | player_name | type | position | number | team_captain | |
|---|---|---|---|---|---|---|---|---|---|---|
| 242898 | efbdeff16a683e06801be9d64cb6e440 | 2014-08-16 | 2469561 | 336850 | 25418 | Shaun Dunion | starting_lineup | NaN | - | 0 |
| 501768 | 8e0d3d1774f12f52c269a21edc837287 | 2015-08-26 | 2597514 | 415060 | 6601 | Jimmy Theelen | starting_lineup | NaN | - | 0 |
| 501771 | 22014173b77618f434041e511467c704 | 2015-08-26 | 2597514 | 415063 | 6601 | Bart Zeevenhoven | substitutes | NaN | - | 0 |
------------------------------ --- games ---
| game_id | competition_id | season | round | date | home_club_id | away_club_id | home_club_goals | away_club_goals | home_club_position | away_club_position | home_club_manager_name | away_club_manager_name | stadium | attendance | referee | url | home_club_formation | away_club_formation | home_club_name | away_club_name | aggregate | competition_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2321027 | L1 | 2013 | 1. Matchday | 2013-08-11 | 33.0 | 41.0 | 3.0 | 3.0 | 8.0 | 9.0 | Jens Keller | Thorsten Fink | Veltins-Arena | 61973.0 | Manuel Gräfe | https://www.transfermarkt.co.uk/fc-schalke-04_... | 4-2-3-1 | 4-2-3-1 | FC Schalke 04 | Hamburger SV | 3:3 | domestic_league |
| 1 | 2321033 | L1 | 2013 | 1. Matchday | 2013-08-10 | 23.0 | 86.0 | 0.0 | 1.0 | 13.0 | 7.0 | Torsten Lieberknecht | Robin Dutt | EINTRACHT-Stadion | 23000.0 | Deniz Aytekin | https://www.transfermarkt.co.uk/eintracht-brau... | 4-3-2-1 | 4-3-1-2 | Eintracht Braunschweig | Sportverein Werder Bremen von 1899 | 0:1 | domestic_league |
| 2 | 2321044 | L1 | 2013 | 2. Matchday | 2013-08-18 | 16.0 | 23.0 | 2.0 | 1.0 | 1.0 | 15.0 | Jürgen Klopp | Torsten Lieberknecht | SIGNAL IDUNA PARK | 80200.0 | Peter Sippel | https://www.transfermarkt.co.uk/borussia-dortm... | 4-2-3-1 | 4-3-2-1 | Borussia Dortmund | Eintracht Braunschweig | 2:1 | domestic_league |
Missing Values: home_club_id 9 away_club_id 9 home_club_goals 12 away_club_goals 12 home_club_position 22467 away_club_position 22467 home_club_manager_name 828 away_club_manager_name 828 stadium 250 attendance 9948 referee 652 home_club_formation 6975 away_club_formation 6806 home_club_name 12850 away_club_name 11455 aggregate 12 dtype: int64 Sample rows with missing values:
| game_id | competition_id | season | round | date | home_club_id | away_club_id | home_club_goals | away_club_goals | home_club_position | away_club_position | home_club_manager_name | away_club_manager_name | stadium | attendance | referee | url | home_club_formation | away_club_formation | home_club_name | away_club_name | aggregate | competition_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 120 | 2360139 | POCP | 2013 | Group B | 2013-08-01 | 2423.0 | 7179.0 | 2.0 | 1.0 | NaN | NaN | Carlos Condeço | Vítor Paneira | Estádio de São Miguel | 384.0 | Jorge Ferreira | https://www.transfermarkt.co.uk/spielbericht/i... | 4-3-3 Attacking | 4-2-3-1 | Clube Desportivo Santa Clara | CD Tondela | 2:1 | domestic_cup |
| 121 | 2367096 | CL | 2013 | Group F | 2013-09-18 | 6195.0 | 16.0 | 2.0 | 1.0 | NaN | NaN | Rafael Benítez | Jürgen Klopp | Stadio Diego Armando Maradona | 55766.0 | Pedro Proença | https://www.transfermarkt.co.uk/spielbericht/i... | 4-2-3-1 | 4-2-3-1 | Società Sportiva Calcio Napoli | Borussia Dortmund | 2:1 | international_cup |
| 122 | 2367237 | EL | 2013 | Group E | 2013-10-03 | 339.0 | 430.0 | 1.0 | 2.0 | NaN | NaN | Juande Ramos | Vincenzo Montella | Dnipro-Arena | 25837.0 | Szymon Marciniak | https://www.transfermarkt.co.uk/spielbericht/i... | 4-4-2 double 6 | 3-4-2-1 | Dnipro Dnipropetrovsk (-2020) | Associazione Calcio Fiorentina | 1:2 | international_cup |
| 123 | 2367648 | UKRP | 2013 | Third Round | 2013-09-25 | 16247.0 | 6994.0 | 1.0 | 2.0 | NaN | NaN | Volodymyr Mazyar | Sergiy Puchkov | Metalurg | 1500.0 | Igor Paskhal | https://www.transfermarkt.co.uk/spielbericht/i... | 4-2-3-1 | 4-4-2 double 6 | PFK Stal Kamyanske (-2018) | Metalurg Zaporizhya (-2016) | 1:2 | domestic_cup |
| 124 | 2368466 | CL | 2013 | Group H | 2013-11-26 | 371.0 | 5.0 | 0.0 | 3.0 | NaN | NaN | Neil Lennon | Massimiliano Allegri | Celtic Park | 58619.0 | Cüneyt Cakir | https://www.transfermarkt.co.uk/spielbericht/i... | 4-2-3-1 | 4-3-2-1 | The Celtic Football Club | Associazione Calcio Milan | 0:3 | international_cup |
| 125 | 2368481 | CL | 2013 | Group C | 2013-12-10 | 683.0 | 58.0 | 3.0 | 1.0 | NaN | NaN | Michel | John van den Brom | Stadio Georgios Karaiskakis | 31444.0 | Wolfgang Stark | https://www.transfermarkt.co.uk/spielbericht/i... | 4-4-1-1 | 4-1-4-1 | Olympiakos Syndesmos Filathlon Peiraios | Royal Sporting Club Anderlecht | 3:1 | international_cup |
| 127 | 2382254 | CDR | 2013 | 4th round 2nd leg | 2013-12-17 | 331.0 | 1084.0 | 1.0 | 1.0 | NaN | NaN | Javi Gracia | Bernd Schuster | El Sadar | 10538.0 | Miguel Ángel Ayza Gámez | https://www.transfermarkt.co.uk/spielbericht/i... | 4-2-3-1 | 4-5-1 | Club Atlético Osasuna | Málaga CF | 1:1 | domestic_cup |
| 128 | 2382266 | CDR | 2013 | 4th round 2nd leg | 2013-12-19 | 621.0 | 940.0 | 4.0 | 0.0 | NaN | NaN | Ernesto Valverde | Luis Enrique | San Mamés | NaN | NaN | https://www.transfermarkt.co.uk/spielbericht/i... | 3-4-3 | 2-4-4 | Athletic Club Bilbao | Real Club Celta de Vigo S. A. D. | 4:0 | domestic_cup |
| 129 | 2384520 | CL | 2013 | Group B | 2013-12-11 | 141.0 | 506.0 | 1.0 | 0.0 | NaN | NaN | Roberto Mancini | Antonio Conte | Ali Sami Yen Spor Kompleksi RAMS Park | 37375.0 | Pedro Proença | https://www.transfermarkt.co.uk/spielbericht/i... | 3-5-2 | 3-5-2 flat | Galatasaray Spor Kulübü | Juventus Football Club | 1:0 | international_cup |
| 130 | 2385041 | CL | 2013 | last 16 1st leg | 2014-02-26 | 141.0 | 631.0 | 1.0 | 1.0 | NaN | NaN | Roberto Mancini | José Mourinho | Ali Sami Yen Spor Kompleksi RAMS Park | 52650.0 | Carlos Velasco Carballo | https://www.transfermarkt.co.uk/spielbericht/i... | 4-4-2 double 6 | 4-2-3-1 | Galatasaray Spor Kulübü | Chelsea Football Club | 1:1 | international_cup |
------------------------------ --- player_valuations ---
| player_id | date | market_value_in_eur | current_club_id | player_club_domestic_competition_id | |
|---|---|---|---|---|---|
| 0 | 405973 | 2000-01-20 | 150000 | 3057 | BE1 |
| 1 | 342216 | 2001-07-20 | 100000 | 1241 | SC1 |
| 2 | 3132 | 2003-12-09 | 400000 | 126 | TR1 |
Missing Values: No missing values. ------------------------------ --- players ---
| player_id | first_name | last_name | name | last_season | current_club_id | player_code | country_of_birth | city_of_birth | country_of_citizenship | date_of_birth | sub_position | position | foot | height_in_cm | contract_expiration_date | agent_name | image_url | url | current_club_domestic_competition_id | current_club_name | market_value_in_eur | highest_market_value_in_eur | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | Miroslav | Klose | Miroslav Klose | 2015 | 398 | miroslav-klose | Poland | Opole | Germany | 1978-06-09 00:00:00 | Centre-Forward | Attack | right | 184.0 | NaN | ASBW Sport Marketing | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/miroslav-klose... | IT1 | Società Sportiva Lazio S.p.A. | 1000000.0 | 30000000.0 |
| 1 | 26 | Roman | Weidenfeller | Roman Weidenfeller | 2017 | 16 | roman-weidenfeller | Germany | Diez | Germany | 1980-08-06 00:00:00 | Goalkeeper | Goalkeeper | left | 190.0 | NaN | Neubauer 13 GmbH | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/roman-weidenfe... | L1 | Borussia Dortmund | 750000.0 | 8000000.0 |
| 2 | 65 | Dimitar | Berbatov | Dimitar Berbatov | 2015 | 1091 | dimitar-berbatov | Bulgaria | Blagoevgrad | Bulgaria | 1981-01-30 00:00:00 | Centre-Forward | Attack | NaN | NaN | NaN | CSKA-AS-23 Ltd. | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/dimitar-berbat... | GR1 | Panthessalonikios Athlitikos Omilos Konstantin... | 1000000.0 | 34500000.0 |
Missing Values: first_name 2062 country_of_birth 2799 city_of_birth 2455 country_of_citizenship 383 date_of_birth 47 sub_position 180 foot 2536 height_in_cm 2256 contract_expiration_date 12091 agent_name 16019 market_value_in_eur 1523 highest_market_value_in_eur 1523 dtype: int64 Sample rows with missing values:
| player_id | first_name | last_name | name | last_season | current_club_id | player_code | country_of_birth | city_of_birth | country_of_citizenship | date_of_birth | sub_position | position | foot | height_in_cm | contract_expiration_date | agent_name | image_url | url | current_club_domestic_competition_id | current_club_name | market_value_in_eur | highest_market_value_in_eur | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | Miroslav | Klose | Miroslav Klose | 2015 | 398 | miroslav-klose | Poland | Opole | Germany | 1978-06-09 00:00:00 | Centre-Forward | Attack | right | 184.0 | NaN | ASBW Sport Marketing | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/miroslav-klose... | IT1 | Società Sportiva Lazio S.p.A. | 1000000.0 | 30000000.0 |
| 1 | 26 | Roman | Weidenfeller | Roman Weidenfeller | 2017 | 16 | roman-weidenfeller | Germany | Diez | Germany | 1980-08-06 00:00:00 | Goalkeeper | Goalkeeper | left | 190.0 | NaN | Neubauer 13 GmbH | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/roman-weidenfe... | L1 | Borussia Dortmund | 750000.0 | 8000000.0 |
| 2 | 65 | Dimitar | Berbatov | Dimitar Berbatov | 2015 | 1091 | dimitar-berbatov | Bulgaria | Blagoevgrad | Bulgaria | 1981-01-30 00:00:00 | Centre-Forward | Attack | NaN | NaN | NaN | CSKA-AS-23 Ltd. | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/dimitar-berbat... | GR1 | Panthessalonikios Athlitikos Omilos Konstantin... | 1000000.0 | 34500000.0 |
| 3 | 77 | NaN | Lúcio | Lúcio | 2012 | 506 | lucio | Brazil | Brasília | Brazil | 1978-05-08 00:00:00 | Centre-Back | Defender | NaN | NaN | NaN | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/lucio/profil/s... | IT1 | Juventus Football Club | 200000.0 | 24500000.0 |
| 4 | 80 | Tom | Starke | Tom Starke | 2017 | 27 | tom-starke | East Germany (GDR) | Freital | Germany | 1981-03-18 00:00:00 | Goalkeeper | Goalkeeper | right | 194.0 | NaN | IFM | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/tom-starke/pro... | L1 | FC Bayern München | 100000.0 | 3000000.0 |
| 5 | 109 | NaN | Dedê | Dedê | 2013 | 825 | dede | Brazil | Belo Horizonte | Brazil | 1978-04-18 00:00:00 | Left-Back | Defender | NaN | NaN | NaN | Football Concept | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/dede/profil/sp... | TR1 | Eskisehirspor | 400000.0 | 9500000.0 |
| 6 | 123 | Christoph | Metzelder | Christoph Metzelder | 2012 | 33 | christoph-metzelder | Germany | Haltern | Germany | 1980-11-05 00:00:00 | Centre-Back | Defender | NaN | NaN | NaN | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/christoph-metz... | L1 | FC Schalke 04 | 1500000.0 | 9500000.0 |
| 7 | 132 | Tomas | Rosicky | Tomas Rosicky | 2015 | 11 | tomas-rosicky | CSSR | Praha | Czech Republic | 1980-10-04 00:00:00 | Attacking Midfield | Midfield | both | 179.0 | NaN | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/tomas-rosicky/... | GB1 | Arsenal Football Club | 350000.0 | 17500000.0 |
| 8 | 162 | Marc | Ziegler | Marc Ziegler | 2012 | 79 | marc-ziegler | Germany | Blieskastel | Germany | 1976-06-13 00:00:00 | Goalkeeper | Goalkeeper | right | 193.0 | NaN | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/marc-ziegler/p... | L1 | Verein für Bewegungsspiele Stuttgart 1893 | 200000.0 | 1250000.0 |
| 9 | 215 | Roque | Santa Cruz | Roque Santa Cruz | 2015 | 1084 | roque-santa-cruz | Paraguay | Asunción | Paraguay | 1981-08-16 00:00:00 | Centre-Forward | Attack | right | 193.0 | 2023-12-31 00:00:00 | NaN | https://img.a.transfermarkt.technology/portrai... | https://www.transfermarkt.co.uk/roque-santa-cr... | ES1 | Málaga CF | 250000.0 | 12000000.0 |
------------------------------ --- transfers ---
| player_id | transfer_date | transfer_season | from_club_id | to_club_id | from_club_name | to_club_name | transfer_fee | market_value_in_eur | player_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16136 | 2026-07-01 | 26/27 | 417 | 123 | OGC Nice | Retired | NaN | 500000.0 | Dante |
| 1 | 1138758 | 2026-07-01 | 26/27 | 336 | 631 | Sporting CP | Chelsea | 52140000.0 | 45000000.0 | Geovany Quenda |
| 2 | 195778 | 2026-06-30 | 25/26 | 79 | 27 | VfB Stuttgart | Bayern Munich | 0.0 | 12000000.0 | Alexander Nübel |
Missing Values: transfer_fee 27715 market_value_in_eur 30316 dtype: int64 Sample rows with missing values:
| player_id | transfer_date | transfer_season | from_club_id | to_club_id | from_club_name | to_club_name | transfer_fee | market_value_in_eur | player_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16136 | 2026-07-01 | 26/27 | 417 | 123 | OGC Nice | Retired | NaN | 500000.0 | Dante |
| 21 | 58358 | 2025-08-01 | 25/26 | 27 | 515 | Bayern Munich | Without Club | NaN | 6000000.0 | Thomas Müller |
| 24 | 31067 | 2025-07-01 | 25/26 | 468 | 123 | Sparta R. | Retired | NaN | 100000.0 | Jonathan de Guzmán |
| 25 | 34784 | 2025-07-01 | 25/26 | 467 | 123 | NEC Nijmegen | Retired | NaN | 100000.0 | Lasse Schöne |
| 26 | 39728 | 2025-07-01 | 25/26 | 12 | 123 | AS Roma | Retired | NaN | 2500000.0 | Mats Hummels |
| 27 | 42710 | 2025-07-01 | 25/26 | 1096 | 123 | Royal Antwerp | Retired | NaN | 2200000.0 | Toby Alderweireld |
| 28 | 43250 | 2025-07-01 | 25/26 | 58 | 123 | RSC Anderlecht | Retired | NaN | 1000000.0 | Jan Vertonghen |
| 29 | 44996 | 2025-07-01 | 25/26 | 36 | 515 | Fenerbahce | Without Club | NaN | 100000.0 | Serdar Aziz |
| 30 | 55508 | 2025-07-01 | 25/26 | 269 | 515 | Holstein Kiel | Without Club | NaN | 500000.0 | Lewis Holtby |
| 31 | 57796 | 2025-07-01 | 24/25 | 2036 | 515 | Heidenheim | Without Club | NaN | 250000.0 | Vitus Eicher |
------------------------------
3. Temporal Coverage¶
Verifying the date ranges across datasets.
In [12]:
date_cols = {
'appearances': 'date',
'games': 'date',
'transfers': 'transfer_date',
'player_valuations': 'date',
'game_events': 'date'
}
for name, date_col in date_cols.items():
if name in data:
df = data[name]
if date_col in df.columns:
# Ensure datetime
if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
min_date = df[date_col].min()
max_date = df[date_col].max()
print(f"{name}: {min_date} to {max_date}")
appearances: 2012-07-03 00:00:00 to 2025-04-10 00:00:00 games: 2012-07-03 00:00:00 to 2025-07-13 00:00:00 transfers: 1993-07-01 00:00:00 to 2026-07-01 00:00:00 player_valuations: 2000-01-20 00:00:00 to 2025-04-06 00:00:00 game_events: 2012-07-03 00:00:00 to 2025-04-10 00:00:00
4. Relationships¶
Mapping relationships between tables. Check if foreign keys exist in the referenced tables.
In [13]:
def check_foreign_keys(df_left, col_left, df_right, col_right, left_name, right_name):
if col_left not in df_left.columns or col_right not in df_right.columns:
print(f"Skipping {left_name} -> {right_name}: Columns not found")
return
unique_left = set(df_left[col_left].dropna().unique())
unique_right = set(df_right[col_right].dropna().unique())
missing = unique_left - unique_right
match_pct = 100 * (1 - len(missing) / len(unique_left))
print(f"{left_name}.{col_left} -> {right_name}.{col_right}")
print(f"Match Percentage: {match_pct:.2f}%")
print(f"IDs in {left_name} not in {right_name}: {len(missing)}")
print("-"*30)
# Check relationships
if 'players' in data:
if 'appearances' in data: check_foreign_keys(data['appearances'], 'player_id', data['players'], 'player_id', 'appearances', 'players')
if 'transfers' in data: check_foreign_keys(data['transfers'], 'player_id', data['players'], 'player_id', 'transfers', 'players')
if 'player_valuations' in data: check_foreign_keys(data['player_valuations'], 'player_id', data['players'], 'player_id', 'player_valuations', 'players')
if 'clubs' in data:
if 'players' in data: check_foreign_keys(data['players'], 'current_club_id', data['clubs'], 'club_id', 'players', 'clubs')
if 'games' in data:
check_foreign_keys(data['games'], 'home_club_id', data['clubs'], 'club_id', 'games (home)', 'clubs')
check_foreign_keys(data['games'], 'away_club_id', data['clubs'], 'club_id', 'games (away)', 'clubs')
if 'competitions' in data:
if 'clubs' in data: check_foreign_keys(data['clubs'], 'domestic_competition_id', data['competitions'], 'competition_id', 'clubs', 'competitions')
if 'games' in data: check_foreign_keys(data['games'], 'competition_id', data['competitions'], 'competition_id', 'games', 'competitions')
appearances.player_id -> players.player_id Match Percentage: 99.99% IDs in appearances not in players: 3 ------------------------------ transfers.player_id -> players.player_id Match Percentage: 100.00% IDs in transfers not in players: 0 ------------------------------ player_valuations.player_id -> players.player_id Match Percentage: 100.00% IDs in player_valuations not in players: 0 ------------------------------ players.current_club_id -> clubs.club_id Match Percentage: 100.00% IDs in players not in clubs: 0 ------------------------------ games (home).home_club_id -> clubs.club_id Match Percentage: 17.27% IDs in games (home) not in clubs: 2103 ------------------------------ games (away).away_club_id -> clubs.club_id Match Percentage: 19.78% IDs in games (away) not in clubs: 1780 ------------------------------ clubs.domestic_competition_id -> competitions.competition_id Match Percentage: 100.00% IDs in clubs not in competitions: 0 ------------------------------ games.competition_id -> competitions.competition_id Match Percentage: 100.00% IDs in games not in competitions: 0 ------------------------------
5. Anomaly Detection¶
Scanning for potential anomalies like negative values in fields that should be positive.
In [5]:
numeric_checks = {
'players': ['market_value_in_eur', 'height_in_cm'],
'appearances': ['goals', 'assists', 'minutes_played'],
'transfers': ['transfer_fee', 'market_value_in_eur']
}
for name, cols in numeric_checks.items():
if name in data:
df = data[name]
for col in cols:
if col in df.columns:
neg_count = (df[col] < 0).sum()
if neg_count > 0:
print(f"Anomaly: {name}.{col} has {neg_count} negative values")
else:
print(f"OK: {name}.{col} has no negative values")
OK: players.market_value_in_eur has no negative values OK: players.height_in_cm has no negative values OK: appearances.goals has no negative values OK: appearances.assists has no negative values OK: appearances.minutes_played has no negative values OK: transfers.transfer_fee has no negative values OK: transfers.market_value_in_eur has no negative values
Key Field Analysis¶
Checking market_value_in_eur and performance metrics.
In [4]:
# Check market_value_in_eur in players
if 'players' in data:
print("Players with missing market value:", data['players']['market_value_in_eur'].isnull().sum())
# Distribution of market value
plt.figure(figsize=(25, 6))
sns.histplot(data['players']['market_value_in_eur'].dropna(), bins=50, kde=True)
plt.title('Distribution of Player Market Values')
plt.show()
# Check market_value_in_eur in player_valuations
if 'player_valuations' in data:
print("Valuations with missing market value:", data['player_valuations']['market_value_in_eur'].isnull().sum())
Players with missing market value: 1523
Valuations with missing market value: 0
In [ ]:
In [9]:
def load_data():
print("Loading datasets...")
try:
games = pd.read_csv(os.path.join(DATA_DIR, 'games.csv'))
appearances = pd.read_csv(os.path.join(DATA_DIR, 'appearances.csv'))
players = pd.read_csv(os.path.join(DATA_DIR, 'players.csv'))
print("Datasets loaded successfully.")
return games, appearances, players
except FileNotFoundError as e:
print(f"Error loading files: {e}")
return None, None, None
def analyze_seasons(games, appearances):
"""
Analyzes seasons to find start/end dates and player counts.
Returns a summary dataframe.
"""
print("Analyzing season data...")
# Convert date column to datetime objects
games['date'] = pd.to_datetime(games['date'])
# 1. Game Statistics per Season
season_stats = games.groupby('season').agg(
first_game=('date', 'min'),
last_game=('date', 'max'),
total_games=('game_id', 'count')
).reset_index()
# 2. Player Counts per Season
# merge appearances with games to link players to seasons, using a subset of columns
games_subset = games[['game_id', 'season']]
app_subset = appearances[['game_id', 'player_id']]
merged_data = pd.merge(app_subset, games_subset, on='game_id', how='inner')
# Count unique players per season
player_counts = merged_data.groupby('season')['player_id'].nunique().reset_index()
player_counts.rename(columns={'player_id': 'unique_players'}, inplace=True)
# Merge stats together
summary = pd.merge(season_stats, player_counts, on='season')
# Sort by season descending (newest first)
summary = summary.sort_values('season', ascending=False)
return summary
# --- Main Execution ---
if __name__ == "__main__":
games_df, appearances_df, players_df = load_data()
if games_df is not None:
season_summary = analyze_seasons(games_df, appearances_df)
# Logic to find the latest complete season...
# * A season is typically 'complete' if its last game was before July of the current year
print("\n--- Season Analysis Summary ---")
print(season_summary.to_string(index=False))
latest_season = season_summary.iloc[0]
previous_season = season_summary.iloc[1]
Loading datasets... Datasets loaded successfully. Analyzing season data... --- Season Analysis Summary --- season first_game last_game total_games unique_players 2024 2021-09-22 2025-07-13 5298 5850 2023 2023-06-27 2024-06-02 5839 7130 2022 2022-06-21 2024-02-25 6008 7222 2021 2021-06-22 2022-05-29 5875 6008 2020 2020-08-01 2021-05-29 5513 7106 2019 2019-06-25 2021-04-03 5469 6736 2018 2018-06-26 2019-06-01 5723 6842 2017 2017-06-27 2018-05-26 5596 6642 2016 2016-06-28 2017-06-03 5699 6751 2015 2015-06-30 2016-05-29 5708 6740 2014 2014-07-01 2015-06-07 5836 6455 2013 2013-07-02 2014-05-24 5762 6797 2012 2012-07-03 2013-06-01 5700 6635
In [ ]: