import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Plot appearance: the 'seaborn-v0_8' matplotlib stylesheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")
# Load the player CSV (path is relative to the working directory) and preview the first rows
df = pd.read_csv(filepath_or_buffer='datasets/players_21.csv')
print(df.head(n=5))
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 200389 https://sofifa.com/player/200389/jan-oblak/210002
3 188545 https://sofifa.com/player/188545/robert-lewand...
4 190871 https://sofifa.com/player/190871/neymar-da-sil...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 33 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 35 1985-02-05
2 J. Oblak Jan Oblak 27 1993-01-07
3 R. Lewandowski Robert Lewandowski 31 1988-08-21
4 Neymar Jr Neymar da Silva Santos Júnior 28 1992-02-05
height_cm weight_kg nationality club_name ... lwb ldm \
0 170 72 Argentina FC Barcelona ... 66+3 65+3
1 187 83 Portugal Juventus ... 65+3 61+3
2 188 87 Slovenia Atlético Madrid ... 32+3 36+3
3 184 80 Poland FC Bayern München ... 64+3 65+3
4 175 68 Brazil Paris Saint-Germain ... 67+3 62+3
cdm rdm rwb lb lcb cb rcb rb
0 65+3 65+3 66+3 62+3 52+3 52+3 52+3 62+3
1 61+3 61+3 65+3 61+3 54+3 54+3 54+3 61+3
2 36+3 36+3 32+3 32+3 33+3 33+3 33+3 32+3
3 65+3 65+3 64+3 61+3 60+3 60+3 60+3 61+3
4 62+3 62+3 67+3 62+3 49+3 49+3 49+3 62+3
[5 rows x 106 columns]
# Banner line, then the DataFrame summary (index range, column count, dtypes, memory usage)
print("/--Data info", end="\n")
df.info(buf=None)  # buf=None writes the summary to standard output
/--Data info <class 'pandas.core.frame.DataFrame'> RangeIndex: 18944 entries, 0 to 18943 Columns: 106 entries, sofifa_id to rb dtypes: float64(18), int64(44), object(44) memory usage: 15.3+ MB
# IPython shell escape: recursively list the current working directory and its subfolders
!ls -R
.: about.ipynb datasets images theme data_vis.ipynb home.ipynb players_21.csv week01.ipynb ./datasets: ./images: nk.jpg ./theme: conf.json index.html.j2 sidebar.js style.css
# IPython shell escapes: search the whole filesystem (starting at /) for entries whose
# names begin with "readme" and then "theme". The match is case-sensitive, and
# directories the notebook user cannot read are reported as "Permission denied".
!find / -name "readme*"
!find / -name "theme*"
/opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs/brands/readme.svg /opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs-full/brands/readme.svg /opt/conda/lib/python3.13/site-packages/matplotlib/mpl-data/fonts/pdfcorefonts/readme.txt find: ‘/proc/tty/driver’: Permission denied /usr/share/doc/texlive-base/readme-html.dir /usr/share/doc/texlive-base/readme-txt.dir /usr/share/texlive/readme-html.dir /usr/share/texlive/readme-html.dir/readme.es.html /usr/share/texlive/readme-html.dir/readme.zh-cn.html /usr/share/texlive/readme-html.dir/readme.vi.html /usr/share/texlive/readme-html.dir/readme.ru.html /usr/share/texlive/readme-html.dir/readme.fr.html /usr/share/texlive/readme-html.dir/readme.sk.html /usr/share/texlive/readme-html.dir/readme.it.html /usr/share/texlive/readme-html.dir/readme.en.html /usr/share/texlive/readme-html.dir/readme.pl.html /usr/share/texlive/readme-html.dir/readme.sr.html /usr/share/texlive/readme-html.dir/readme.de.html /usr/share/texlive/readme-html.dir/readme.pt-br.html /usr/share/texlive/readme-html.dir/readme.cs.html /usr/share/texlive/readme-html.dir/readme.ja.html /usr/share/texlive/readme-txt.dir find: ‘/var/cache/apt/archives/partial’: Permission denied find: ‘/var/cache/ldconfig’: Permission denied find: ‘/root’: Permission denied find: ‘/etc/ssl/private’: Permission denied /opt/conda/share/jupyter/nbconvert/templates/lab/static/theme-dark.css /opt/conda/share/jupyter/nbconvert/templates/lab/static/theme-light.css /opt/conda/share/jupyter/lab/themes /opt/conda/share/jupyter/lab/themes/@jupyterlab/theme-dark-high-contrast-extension /opt/conda/share/jupyter/lab/themes/@jupyterlab/theme-light-extension /opt/conda/share/jupyter/lab/themes/@jupyterlab/theme-dark-extension /opt/conda/share/jupyter/lab/schemas/@jupyterlab/apputils-extension/themes.json /opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs/brands/themeisle.svg 
/opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs/brands/themeco.svg /opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs-full/brands/themeisle.svg /opt/conda/share/jupyterhub/static/components/@fortawesome/fontawesome-free/svgs-full/brands/themeco.svg /opt/conda/lib/python3.13/site-packages/pip/_vendor/rich/theme.py /opt/conda/lib/python3.13/site-packages/pip/_vendor/rich/themes.py /opt/conda/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/theme.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/pip/_vendor/rich/__pycache__/themes.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/altair/theme.py /opt/conda/lib/python3.13/site-packages/altair/vegalite/v5/theme.py /opt/conda/lib/python3.13/site-packages/altair/vegalite/v5/__pycache__/theme.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/altair/__pycache__/theme.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/distributed/dashboard/theme.yaml /opt/conda/lib/python3.13/site-packages/openpyxl/writer/theme.py /opt/conda/lib/python3.13/site-packages/openpyxl/writer/__pycache__/theme.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/bokeh/themes /opt/conda/lib/python3.13/site-packages/bokeh/themes/theme.py /opt/conda/lib/python3.13/site-packages/bokeh/themes/__pycache__/theme.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/bokeh/server/static/js/lib/api/themes.d.ts /opt/conda/lib/python3.13/site-packages/jupyterlab_server/__pycache__/themes_handler.cpython-313.pyc /opt/conda/lib/python3.13/site-packages/jupyterlab_server/test_data/schemas/@jupyterlab/apputils-extension/themes.json /opt/conda/lib/python3.13/site-packages/jupyterlab_server/themes_handler.py /opt/conda/lib/python3.13/site-packages/jupyterlab/themes /opt/conda/lib/python3.13/site-packages/jupyterlab/themes/@jupyterlab/theme-dark-high-contrast-extension /opt/conda/lib/python3.13/site-packages/jupyterlab/themes/@jupyterlab/theme-light-extension 
/opt/conda/lib/python3.13/site-packages/jupyterlab/themes/@jupyterlab/theme-dark-extension /opt/conda/lib/python3.13/site-packages/jupyterlab/schemas/@jupyterlab/apputils-extension/themes.json /opt/conda/lib/python3.13/site-packages/nbclassic/static/components/jquery-ui/themes /opt/conda/lib/python3.13/site-packages/nbclassic/static/components/jquery-ui/dist/themes /opt/conda/lib/python3.13/site-packages/nbclassic/static/components/codemirror/theme /opt/conda/lib/python3.13/site-packages/nbclassic/static/components/bootstrap/less/theme.less find: ‘/proc/tty/driver’: Permission denied /usr/share/texlive/texmf-dist/tex/latex/fixme/themes /usr/share/pandoc/data/docx/word/theme /usr/share/pandoc/data/docx/word/theme/theme1.xml /usr/share/pandoc/data/pptx/ppt/theme /usr/share/pandoc/data/pptx/ppt/theme/theme1.xml /usr/share/pandoc/data/pptx/ppt/theme/theme2.xml find: ‘/var/cache/apt/archives/partial’: Permission denied find: ‘/var/cache/ldconfig’: Permission denied find: ‘/root’: Permission denied find: ‘/etc/ssl/private’: Permission denied /home/jovyan/work/nikhil-pradhan/theme /home/jovyan/work/class/theme
# Side-by-side histograms: overall rating on the left, age on the right
fig, (rating_ax, age_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: overall ratings in 30 bins
df['overall'].hist(ax=rating_ax, bins=30, alpha=0.7)
rating_ax.set_title('Distribution of Overall Ratings')
rating_ax.set_xlabel('Overall Rating')
rating_ax.set_ylabel('Frequency')

# Right panel: ages in 30 bins
df['age'].hist(ax=age_ax, bins=30, alpha=0.7)
age_ax.set_title('Distribution of Ages')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()
# The 20 highest-rated players, kept with a few identifying columns
player_columns = ['short_name', 'overall', 'club_name', 'age']
top_players = df.nlargest(n=20, columns='overall')[player_columns]

# Horizontal bars, one per player name
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_players['short_name'], top_players['overall'])
ax.set_title('Top 20 Players by Overall Rating')
ax.set_xlabel('Overall Rating')
ax.invert_yaxis()  # flip the axis so the first (highest-rated) row is drawn at the top
fig.tight_layout()
plt.show()
# Mean overall rating per club, highest first, limited to the best 15
mean_by_club = df.groupby('club_name')['overall'].mean()
club_ratings = mean_by_club.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(12, 8))
club_ratings.plot.barh(ax=ax)
ax.set_title('Top 15 Clubs by Average Player Rating')
ax.set_xlabel('Average Overall Rating')
ax.invert_yaxis()  # flip the axis so the highest average is drawn at the top
fig.tight_layout()
plt.show()
# Player counts per nationality; value_counts already sorts descending, so keep the first 15
nation_counts = df['nationality'].value_counts()
top_nations = nation_counts.head(15)

fig, ax = plt.subplots(figsize=(12, 8))
top_nations.plot.bar(ax=ax)
ax.set_title('Top 15 Nationalities in FIFA 21')
ax.set_xlabel('Nationality')
ax.set_ylabel('Number of Players')
# Tilt the country names so neighbouring labels do not collide
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
# Columns that take part in the pairwise correlation
numeric_cols = [
    'overall', 'potential', 'age', 'height_cm', 'weight_kg',
    'value_eur', 'wage_eur', 'pace', 'shooting', 'passing',
    'dribbling', 'defending', 'physic',
]
numeric_frame = df[numeric_cols]
corr_matrix = numeric_frame.corr()

# Annotated heatmap with the colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.2f', cmap='coolwarm', center=0)
ax.set_title('Correlation Matrix of Player Attributes')
fig.tight_layout()
plt.show()
# Three scatter panels, each plotting one column against the overall rating.
# Each entry is (x column, x-axis label, panel title).
scatter_panels = [
    ('age', 'Age', 'Age vs Overall Rating'),
    ('value_eur', 'Value (EUR)', 'Value vs Overall Rating'),
    ('potential', 'Potential', 'Potential vs Overall Rating'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (x_column, x_label, panel_title) in zip(axes, scatter_panels):
    ax.scatter(df[x_column], df['overall'], alpha=0.5)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Overall Rating')
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()
# Treat the first entry of the comma-separated position list as the primary position
position_lists = df['player_positions'].str.split(',')
df['primary_position'] = position_lists.str.get(0)

# Mean overall rating per primary position, highest first
mean_by_position = df.groupby('primary_position')['overall'].mean()
position_ratings = mean_by_position.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
position_ratings.plot.bar(ax=ax)
ax.set_title('Average Rating by Primary Position')
ax.set_xlabel('Position')
ax.set_ylabel('Average Overall Rating')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
# Age against potential, with each point coloured by the player's overall rating
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df['age'],
    df['potential'],
    c=df['overall'],
    cmap='viridis',
    alpha=0.6,
    s=50,
)
# The colour bar maps the point colours back to overall-rating values
fig.colorbar(scatter, ax=ax, label='Overall Rating')
ax.set_title('Age vs Potential (colored by Overall Rating)')
ax.set_xlabel('Age')
ax.set_ylabel('Potential')
fig.tight_layout()
plt.show()
# Mean overall rating per league, highest first, limited to the best 10
mean_by_league = df.groupby('league_name')['overall'].mean()
league_ratings = mean_by_league.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(12, 6))
league_ratings.plot.bar(ax=ax)
ax.set_title('Top 10 Leagues by Average Player Rating')
ax.set_xlabel('League')
ax.set_ylabel('Average Overall Rating')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
# Keep only players valued below EUR 50 million so the most expensive ones
# do not stretch the value axis
below_cap = df['value_eur'].lt(50000000)
value_filtered = df.loc[below_cap]

# One box per primary position (requires the 'primary_position' column on df)
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=value_filtered, x='primary_position', y='value_eur', ax=ax)
ax.set_title('Player Value Distribution by Position')
ax.set_xlabel('Position')
ax.set_ylabel('Value (EUR)')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()
# The 10 highest-rated players, restricted to the columns worth showing
summary_columns = ['short_name', 'overall', 'age', 'club_name', 'value_eur']
top_10 = df.nlargest(n=10, columns='overall').loc[:, summary_columns]
print("Top 10 Players:")
# A bare expression as the last line of a notebook cell renders the table
top_10
Top 10 Players:
| | short_name | overall | age | club_name | value_eur |
|---|---|---|---|---|---|
| 0 | L. Messi | 93 | 33 | FC Barcelona | 67500000 |
| 1 | Cristiano Ronaldo | 92 | 35 | Juventus | 46000000 |
| 2 | J. Oblak | 91 | 27 | Atlético Madrid | 75000000 |
| 3 | R. Lewandowski | 91 | 31 | FC Bayern München | 80000000 |
| 4 | Neymar Jr | 91 | 28 | Paris Saint-Germain | 90000000 |
| 5 | K. De Bruyne | 91 | 29 | Manchester City | 87000000 |
| 6 | K. Mbappé | 90 | 21 | Paris Saint-Germain | 105500000 |
| 7 | M. ter Stegen | 90 | 28 | FC Barcelona | 69500000 |
| 8 | V. van Dijk | 90 | 28 | Liverpool | 75500000 |
| 9 | Alisson | 90 | 27 | Liverpool | 62500000 |
**BEST PLAYERS**
**Simple explanation**
- Simple Breakdown: df.nlargest(10, 'overall')
Translation: "From all players, find the 10 with the highest 'overall' rating"
Like saying: "Show me the 10 best-rated players in the game"
[['short_name', 'overall', 'age', 'club_name', 'value_eur']]
Translation: "But only show me these 5 columns for each player"
Like saying: "Just tell me their name, rating, age, club, and value"
# Histogram of player ages in 30 bins, drawn in light blue
fig, ax = plt.subplots(figsize=(10, 6))
df['age'].hist(ax=ax, bins=30, alpha=0.7, color='skyblue')
ax.set_title('Age Distribution of Players')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Players')
plt.show()
AGE DISTRIBUTION
- Simple Breakdown: plt.figure(figsize=(10, 6))
Translation: "Create a drawing canvas that's 10 inches wide and 6 inches tall"
Like saying: "Get me a big piece of paper to draw on"
df['age'].hist(bins=30, alpha=0.7, color='skyblue')
Translation: "Make a histogram (bar chart) showing how many players are at each age"
bins=30: "Divide the ages into 30 groups"
alpha=0.7: "Make the bars slightly transparent"
color='skyblue': "Color the bars light blue"
plt.title('Age Distribution of Players')
Translation: "Put this title at the top of the chart"
plt.xlabel('Age') and plt.ylabel('Number of Players')
Translation: "Label the bottom as 'Age' and the side as 'Number of Players'"
plt.show()
# The 10 clubs with the highest mean overall rating
club_means = df.groupby('club_name')['overall'].mean()
top_clubs = club_means.nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_clubs.plot.barh(ax=ax)
ax.set_title('Top 10 Clubs by Average Player Rating')
ax.set_xlabel('Average Rating')
ax.invert_yaxis()  # flip the axis so the highest average is drawn at the top
fig.tight_layout()
plt.show()
CLUB DISTRIBUTION
- Simple Breakdown: df.groupby('club_name')['overall'].mean()
Translation: "For each club, calculate the average rating of all their players"
Like saying: "What's the typical player quality for each team?"
.nlargest(10)
Translation: "Take only the 10 clubs with the highest averages"
Like saying: "Show me only the top 10 best clubs"
top_clubs.plot(kind='barh')
Translation: "Make a horizontal bar chart"
Like saying: "Draw bars going sideways instead of up and down"
plt.gca().invert_yaxis()
Translation: "Put the best club at the TOP of the chart"
Like saying: "Make #1 appear at the top, #10 at the bottom"
plt.tight_layout()
Translation: "Make everything fit nicely without overlapping"