< Home
Week 1: Introduction to Data Science¶
Dataset¶
Jupyterlab¶
In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet, then seaborn's "husl" colour palette.
plt.style.use(style='seaborn-v0_8')
sns.set_palette(palette="husl")
# Load the player table; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='datasets/players_15.csv')
In [2]:
# Plain-text preview of the first five rows (five is also DataFrame.head's default).
print(df.head(n=5))
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
In [3]:
# Bar chart of the 15 most common player nationalities.
# value_counts() orders by frequency (descending), so head(15) keeps the 15 largest groups.
top_nations = df['nationality'].value_counts().head(15)
plt.figure(figsize=(12, 8))
top_nations.plot(kind='bar')
# The data is read from players_15.csv, i.e. the FIFA 15 edition (the title previously said FIFA 21).
plt.title('Top 15 Nationalities in FIFA 15')
plt.xlabel('Nationality')
plt.ylabel('Number of Players')
# Tilt the country names so long labels do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Set up plotting style (repeats the first cell's settings: same style sheet and palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Read the data (same CSV as the first cell)
df = pd.read_csv('datasets/players_15.csv')
# 1. Distribution of Player Overall Ratings
# Histogram of the 'overall' column using 30 bins; bars are semi-transparent with black edges.
plt.figure(figsize=(12, 6))
plt.hist(df['overall'], bins=30, edgecolor='black', alpha=0.7)
# The data is read from players_15.csv, i.e. the FIFA 15 edition (the title previously said FIFA 21).
plt.title('Distribution of Player Overall Ratings in FIFA 15')
plt.xlabel('Overall Rating')
plt.ylabel('Number of Players')
# Faint grid lines to make bar heights easier to read.
plt.grid(True, alpha=0.3)
plt.show()
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Apply the plotting style and load the CSV (same settings and file as the first cell).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
df = pd.read_csv('datasets/players_15.csv')

# Show a preview of the rows, followed by the full list of column names.
print("First few rows:")
print(df.head())
print("\nColumn names:")
print(list(df.columns))

# Collect every column whose name mentions "club" or "team" (case-insensitive);
# these are the candidates for holding club/team information.
club_columns = [
    name for name in df.columns
    if any(keyword in name.lower() for keyword in ('club', 'team'))
]
print("\nPotential club/team columns:", club_columns)

# Print the ten most frequent values for at most the first three matched columns.
# Slicing an empty list yields nothing, so no separate emptiness check is needed.
for col in club_columns[:3]:
    print(f"\nUnique values in '{col}':")
    print(df[col].value_counts().head(10))
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
Column names:
['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'team_position', 'team_jersey_number', 'loaned_from', 'joined', 'contract_valid_until', 'nation_position', 'nation_jersey_number', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']
Potential club/team columns: ['club_name', 'team_position', 'team_jersey_number']
Unique values in 'club_name':
club_name
Sevilla FC 33
Newcastle United 33
Hull City 33
Torino 33
OGC Nice 33
Burnley 33
Stoke City 33
Queens Park Rangers 33
Sporting Club de Bastia 33
Everton 33
Name: count, dtype: int64
Unique values in 'team_position':
team_position
SUB 6906
RES 2663
LCB 577
RCB 577
GK 577
LB 534
RB 534
LM 403
RM 403
ST 366
Name: count, dtype: int64
Unique values in 'team_jersey_number':
team_jersey_number
7.0 542
8.0 536
10.0 533
5.0 526
11.0 521
6.0 520
1.0 511
9.0 501
4.0 497
17.0 489
Name: count, dtype: int64
In [ ]: