Clustering FIFA 21 players with K-Means
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide warnings that matter for this analysis, such as
# scikit-learn convergence warnings raised while fitting K-Means.
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Set style for better visuals
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load the player table from the datasets folder (path is relative to the
# notebook's working directory)
df = pd.read_csv('datasets/players_21.csv')
print(f"Total players in dataset: {len(df)}")
print(f"Total attributes: {len(df.columns)}")
print("\nFirst 3 players:")
# Last expression of the cell: rendered as a table by the notebook
df[['short_name', 'overall', 'age', 'club_name']].head(3)
Total players in dataset: 18944 Total attributes: 106 First 3 players:
Out[2]:
| short_name | overall | age | club_name | |
|---|---|---|---|---|
| 0 | L. Messi | 93 | 33 | FC Barcelona |
| 1 | Cristiano Ronaldo | 92 | 35 | Juventus |
| 2 | J. Oblak | 91 | 27 | Atlético Madrid |
Select features and clean data¶
In [3]:
# Candidate attributes to cluster on
features = [
    'overall', 'potential', 'pace', 'shooting', 'passing',
    'dribbling', 'defending', 'physic', 'age', 'height_cm',
    'weight_kg', 'value_eur', 'wage_eur',
]

# Keep only the candidates that are actually present in the loaded table
available_features = [name for name in features if name in df.columns]
print(f"Using {len(available_features)} features for clustering: {available_features}")

# Work on an independent copy so the raw frame is left untouched
clustering_data = df[available_features].copy()

# Report how many values are missing per feature
missing_per_feature = clustering_data.isnull().sum()
print("\nMissing values in each feature:")
print(missing_per_feature)

# Impute every gap with the median of its own column.
# NOTE(review): the recorded output shows 2083 missing values in each of the
# six skill columns (pace ... physic); presumably these rows are goalkeepers.
# After this fill they all carry identical median skill values - confirm that
# this is the intended treatment before interpreting the clusters.
column_medians = clustering_data.median()
clustering_data = clustering_data.fillna(column_medians)

print(f"\nData ready for clustering: {clustering_data.shape}")
print("First few rows:")
clustering_data.head()
Using 13 features for clustering: ['overall', 'potential', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'age', 'height_cm', 'weight_kg', 'value_eur', 'wage_eur'] Missing values in each feature: overall 0 potential 0 pace 2083 shooting 2083 passing 2083 dribbling 2083 defending 2083 physic 2083 age 0 height_cm 0 weight_kg 0 value_eur 0 wage_eur 0 dtype: int64 Data ready for clustering: (18944, 13) First few rows:
Out[3]:
| overall | potential | pace | shooting | passing | dribbling | defending | physic | age | height_cm | weight_kg | value_eur | wage_eur | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 93 | 93 | 85.0 | 92.0 | 91.0 | 95.0 | 38.0 | 65.0 | 33 | 170 | 72 | 67500000 | 560000 |
| 1 | 92 | 92 | 89.0 | 93.0 | 81.0 | 89.0 | 35.0 | 77.0 | 35 | 187 | 83 | 46000000 | 220000 |
| 2 | 91 | 93 | 68.0 | 54.0 | 58.0 | 64.0 | 56.0 | 66.0 | 27 | 188 | 87 | 75000000 | 125000 |
| 3 | 91 | 91 | 78.0 | 91.0 | 78.0 | 85.0 | 43.0 | 82.0 | 31 | 184 | 80 | 80000000 | 240000 |
| 4 | 91 | 91 | 91.0 | 85.0 | 86.0 | 94.0 | 36.0 | 59.0 | 28 | 175 | 68 | 90000000 | 270000 |
Scale the data¶
In [4]:
# Standardize the features so that large-valued columns (e.g. value_eur) do
# not dominate the distance computations used by K-Means.
#
# The feature columns are selected explicitly: a later cell adds a 'cluster'
# label column to clustering_data in place, and re-running this cell after
# that must not feed the label into the scaler.
feature_matrix = clustering_data[available_features]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_matrix)

# Create DataFrame with scaled data (same column names and row index as the input)
scaled_df = pd.DataFrame(
    scaled_data,
    columns=feature_matrix.columns,
    index=feature_matrix.index,
)
print("Data scaled successfully!")
print(f"Scaled data shape: {scaled_df.shape}")
Data scaled successfully! Scaled data shape: (18944, 13)
Find optimal number of clusters (Elbow Method)¶
In [5]:
# Fit K-Means once per candidate k and record the inertia of each fit
k_range = range(1, 11)
inertia = []
print("Calculating inertia for different k values...")
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    inertia.append(kmeans.inertia_)
    print(f"k={k}: inertia = {kmeans.inertia_:.2f}")

# Draw inertia against k (the elbow curve)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('Inertia (Sum of squared distances)', fontsize=12)
ax.set_title('Elbow Method: Finding Optimal Number of Clusters', fontsize=14, fontweight='bold')
ax.set_xticks(list(k_range))
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Reading hint for the plot above
print("\nLook for the 'elbow' point where the line starts to bend less steeply.")
print("This is usually the optimal number of clusters.")
Calculating inertia for different k values... k=1: inertia = 246272.00 k=2: inertia = 198044.71 k=3: inertia = 167813.90 k=4: inertia = 149764.42 k=5: inertia = 136162.92 k=6: inertia = 126206.17 k=7: inertia = 119336.93 k=8: inertia = 113589.15 k=9: inertia = 108563.37 k=10: inertia = 104105.72
Look for the 'elbow' point where the line starts to bend less steeply. This is usually the optimal number of clusters.
Apply K-Means (using k=4 as an example)
In [6]:
# Number of clusters. This is a hard-coded choice (nothing is asked of the
# user); change it here after inspecting the elbow plot.
optimal_k = 4
print(f"\nUsing k = {optimal_k} clusters")

# Fit K-Means on the standardized features and get one label per row
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10, max_iter=300)
clusters = kmeans.fit_predict(scaled_data)

# Attach the labels to the feature table (in place) and to a copy of the
# full player table restricted to the same rows
clustering_data['cluster'] = clusters
df_filtered = df.loc[clustering_data.index].copy()
df_filtered['cluster'] = clusters

print("\n✅ Clustering completed!")
print("\nNumber of players in each cluster:")
cluster_counts = df_filtered['cluster'].value_counts().sort_index()
total_players = len(df_filtered)
for cluster_id, count in cluster_counts.items():
    share = count / total_players * 100
    print(f"Cluster {cluster_id}: {count} players ({share:.1f}%)")
Using k = 4 clusters ✅ Clustering completed! Number of players in each cluster: Cluster 0: 5860 players (30.9%) Cluster 1: 2039 players (10.8%) Cluster 2: 7637 players (40.3%) Cluster 3: 3408 players (18.0%)
Visualize clusters in 2D¶
In [8]:
# Project the standardized features onto two principal components so the
# clusters can be drawn on a flat scatter plot.
print("\nReducing dimensions with PCA for visualization...")
# Seed PCA with the same value as the K-Means fits: depending on the
# scikit-learn version, the automatically selected SVD solver may be the
# randomized one, and an unseeded projection could then differ between runs.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_data)

# Create visualization dataframe (2D coordinates plus label, rating and name)
viz_df = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1],
    'cluster': clusters,
    'overall': df_filtered['overall'],
    'name': df_filtered['short_name']
})

# Plot clusters, one colour per cluster label
plt.figure(figsize=(12, 8))
scatter = plt.scatter(viz_df['PC1'], viz_df['PC2'],
                      c=viz_df['cluster'], cmap='tab10',
                      alpha=0.6, s=50, edgecolors='w', linewidth=0.5)
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)', fontsize=12)
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)', fontsize=12)
plt.title(f'FIFA 21 Players Clustering (k={optimal_k})', fontsize=14, fontweight='bold')
# Cluster ids are categorical, so tick the colorbar only at the integer ids
# instead of letting it show fractional values between clusters.
plt.colorbar(scatter, label='Cluster', ticks=list(range(optimal_k)))
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Reducing dimensions with PCA for visualization...
Analyze cluster characteristics¶
In [10]:
print("\n" + "="*60)
print("CLUSTER ANALYSIS")
print("="*60)

# Mean of every column of the feature table per cluster, rounded to 2 decimals
cluster_stats = clustering_data.groupby('cluster').mean().round(2)

# Display a readable subset of the statistics. Feature selection drops
# candidate columns that are absent from the dataset, so keep only the summary
# columns that actually exist instead of raising KeyError on a missing one.
summary_columns = ['overall', 'age', 'value_eur', 'pace', 'shooting', 'defending']
shown_columns = [col for col in summary_columns if col in cluster_stats.columns]
print("\nAverage attributes for each cluster:")
print(cluster_stats[shown_columns])
============================================================
CLUSTER ANALYSIS
============================================================
Average attributes for each cluster:
overall age value_eur pace shooting defending
cluster
0 61.52 22.32 668513.65 71.62 53.31 39.75
1 77.33 26.42 12344433.55 74.24 66.11 58.05
2 67.26 27.37 1395350.01 67.57 56.27 55.60
3 62.30 24.71 705042.55 57.36 34.33 60.45
Show top players in each cluster¶
In [11]:
section_rule = "=" * 60
print("\n" + section_rule)
print("TOP PLAYERS IN EACH CLUSTER")
print(section_rule)

# Columns shown in the per-cluster "top players" table
display_columns = ['short_name', 'overall', 'age', 'player_positions', 'club_name', 'value_eur']

for cluster_num in range(optimal_k):
    cluster_players = df_filtered[df_filtered['cluster'] == cluster_num]
    print(f"\n📊 CLUSTER {cluster_num} ({len(cluster_players)} players)")
    print("-" * 40)

    # Five highest-rated members of the cluster
    top_players = cluster_players.nlargest(5, 'overall')[display_columns]

    # Render the market value in millions with an 'M' suffix
    value_in_millions = (top_players['value_eur'] / 1000000).round(1).astype(str) + 'M'
    top_players_display = top_players.copy()
    top_players_display['value_eur'] = value_in_millions
    print("Top 5 players:")
    print(top_players_display.to_string(index=False))

    # Headline averages for the whole cluster
    avg_overall = cluster_players['overall'].mean()
    avg_age = cluster_players['age'].mean()
    avg_value = cluster_players['value_eur'].mean() / 1000000
    print("\nCluster characteristics:")
    print(f" • Average rating: {avg_overall:.1f}")
    print(f" • Average age: {avg_age:.1f}")
    print(f" • Average value: €{avg_value:.1f}M")

    # Tally every listed position; entries that did not split into a list
    # (e.g. missing values) are skipped
    split_positions = cluster_players['player_positions'].str.split(', ')
    positions = [
        position
        for pos_list in split_positions
        if isinstance(pos_list, list)
        for position in pos_list
    ]
    if positions:
        top_positions = pd.Series(positions).value_counts().head(3)
        print(f" • Top positions: {', '.join(top_positions.index.tolist())}")
============================================================
TOP PLAYERS IN EACH CLUSTER
============================================================
📊 CLUSTER 0 (5860 players)
----------------------------------------
Top 5 players:
short_name overall age player_positions club_name value_eur
M. Moralez 75 33 CAM, CM New York City FC 4.0M
M. Barrios 75 29 RM, RW, ST FC Dallas 6.5M
E. Barco 74 21 LW, CF Atlanta United 9.5M
M. van Bergen 74 20 RW SC Heerenveen 8.0M
M. Okugawa 74 24 RM, CAM, ST FC Red Bull Salzburg 7.5M
Cluster characteristics:
• Average rating: 61.5
• Average age: 22.3
• Average value: €0.7M
• Top positions: ST, RM, CM
📊 CLUSTER 1 (2039 players)
----------------------------------------
Top 5 players:
short_name overall age player_positions club_name value_eur
L. Messi 93 33 RW, ST, CF FC Barcelona 67.5M
Cristiano Ronaldo 92 35 ST, LW Juventus 46.0M
J. Oblak 91 27 GK Atlético Madrid 75.0M
R. Lewandowski 91 31 ST FC Bayern München 80.0M
Neymar Jr 91 28 LW, CAM Paris Saint-Germain 90.0M
Cluster characteristics:
• Average rating: 77.3
• Average age: 26.4
• Average value: €12.3M
• Top positions: CM, CAM, ST
📊 CLUSTER 2 (7637 players)
----------------------------------------
Top 5 players:
short_name overall age player_positions club_name value_eur
G. Buffon 82 42 GK Juventus 2.2M
Pepe 81 37 CB FC Porto 3.6M
L. Fabiański 81 35 GK West Ham United 4.8M
Marcano 81 33 CB FC Porto 8.5M
Juiano Mestres 81 24 CB, CDM NaN 0.0M
Cluster characteristics:
• Average rating: 67.3
• Average age: 27.4
• Average value: €1.4M
• Top positions: GK, CM, CDM
📊 CLUSTER 3 (3408 players)
----------------------------------------
Top 5 players:
short_name overall age player_positions club_name value_eur
Unai Núñez 78 23 CB Athletic Club de Bilbao 13.0M
Domingos Duarte 78 25 CB Granada CF 12.0M
M. Škrtel 78 35 CB Medipol Başakşehir FK 3.1M
José Fonte 78 36 CB LOSC Lille 2.1M
I. Diop 77 23 CB West Ham United 11.5M
Cluster characteristics:
• Average rating: 62.3
• Average age: 24.7
• Average value: €0.7M
• Top positions: CB, CDM, RB
Create radar chart for cluster profiles¶
In [12]:
# Attributes drawn on the radar chart (one spoke each)
radar_features = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
# Keep only the attributes for which per-cluster means are available
available_radar = [f for f in radar_features if f in cluster_stats.columns]

if available_radar:
    # Per-cluster means restricted to the radar attributes
    cluster_means = cluster_stats[available_radar]
    # Number of variables
    N = len(available_radar)
    # Evenly spaced spoke angles; the first is repeated so each outline closes
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    # Create radar chart on polar axes
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

    # Use the same cluster -> colour mapping as the PCA scatter plot, which
    # min-max normalises the cluster ids (0 .. k-1) onto the tab10 colormap.
    # Sampling at i / k instead would pick different tab10 entries, so the
    # same cluster would change colour between the two figures.
    colors = plt.cm.tab10(np.arange(optimal_k) / max(optimal_k - 1, 1))

    # Plot each cluster as a closed, lightly filled outline
    for cluster_num in range(optimal_k):
        values = cluster_means.loc[cluster_num].values.flatten().tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=2, linestyle='solid',
                color=colors[cluster_num], label=f'Cluster {cluster_num}')
        ax.fill(angles, values, color=colors[cluster_num], alpha=0.1)

    # Add labels: attribute names on the spokes, fixed radial scale 0-100
    plt.xticks(angles[:-1], available_radar, fontsize=11)
    ax.set_rlabel_position(0)
    plt.yticks([20, 40, 60, 80, 100], ["20", "40", "60", "80", "100"], color="grey", size=9)
    plt.ylim(0, 100)
    plt.title('Player Attribute Profiles by Cluster', size=16, fontweight='bold', y=1.1)
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0), fontsize=10)
    plt.tight_layout()
    plt.show()
Final summary report¶
In [13]:
report_rule = "=" * 70
print("\n" + report_rule)
print("FINAL CLUSTERING REPORT")
print(report_rule)

# (minimum average rating, label), checked from the highest band down;
# anything below every floor is labelled "Average"
rating_bands = [(85, "Elite"), (80, "High Quality"), (75, "Good")]
# (maximum average age, label), checked from the youngest band up;
# anything above every ceiling is labelled "Veteran"
age_bands = [(23, "Young"), (28, "Prime")]

total_clustered = len(df_filtered)

# One short description per cluster
for cluster_num in range(optimal_k):
    cluster_data = df_filtered[df_filtered['cluster'] == cluster_num]

    # Key metrics of the cluster
    size = len(cluster_data)
    avg_rating = cluster_data['overall'].mean()
    avg_age = cluster_data['age'].mean()
    avg_value = cluster_data['value_eur'].mean() / 1000000

    # Map the averages onto coarse descriptive labels
    rating_desc = next(
        (label for floor, label in rating_bands if avg_rating >= floor),
        "Average",
    )
    age_desc = next(
        (label for ceiling, label in age_bands if avg_age <= ceiling),
        "Veteran",
    )

    print(f"\n🏆 CLUSTER {cluster_num}:")
    print(f" • Players: {size} ({size/total_clustered*100:.1f}% of total)")
    print(f" • Type: {rating_desc} {age_desc} Players")
    print(f" • Avg Rating: {avg_rating:.1f}")
    print(f" • Avg Age: {avg_age:.1f}")
    print(f" • Avg Value: €{avg_value:.1f}M")

    # Three highest-rated members as examples
    examples = cluster_data.nlargest(3, 'overall')['short_name'].tolist()
    print(f" • Examples: {', '.join(examples)}")

print("\n" + report_rule)
print(f"Total players clustered: {total_clustered}")
print(report_rule)
====================================================================== FINAL CLUSTERING REPORT ====================================================================== 🏆 CLUSTER 0: • Players: 5860 (30.9% of total) • Type: Average Young Players • Avg Rating: 61.5 • Avg Age: 22.3 • Avg Value: €0.7M • Examples: M. Moralez, M. Barrios, E. Barco 🏆 CLUSTER 1: • Players: 2039 (10.8% of total) • Type: Good Prime Players • Avg Rating: 77.3 • Avg Age: 26.4 • Avg Value: €12.3M • Examples: L. Messi, Cristiano Ronaldo, J. Oblak 🏆 CLUSTER 2: • Players: 7637 (40.3% of total) • Type: Average Prime Players • Avg Rating: 67.3 • Avg Age: 27.4 • Avg Value: €1.4M • Examples: G. Buffon, Pepe, L. Fabiański 🏆 CLUSTER 3: • Players: 3408 (18.0% of total) • Type: Average Prime Players • Avg Rating: 62.3 • Avg Age: 24.7 • Avg Value: €0.7M • Examples: Unai Núñez, Domingos Duarte, M. Škrtel ====================================================================== Total players clustered: 18944 ======================================================================
In [ ]:
In [ ]: