[Your-Name-Here] - Fab Futures - Data Science
Home About

Clustering¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
# Silence library warnings so they do not clutter the rendered notebook.
# NOTE(review): this hides every warning category for the whole session.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib dark-grid theme first, then seaborn's "husl" palette
# (the palette call comes second so it is the one that sets the color cycle).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the player table from the local datasets folder
df = pd.read_csv('datasets/players_21.csv')

# Rows are players, columns are attributes
n_players, n_attributes = df.shape
print(f"Total players in dataset: {n_players}")
print(f"Total attributes: {n_attributes}")
print("\nFirst 3 players:")

# Last expression of the cell: a small preview rendered as a table
preview_columns = ['short_name', 'overall', 'age', 'club_name']
df.loc[:, preview_columns].head(3)
Total players in dataset: 18944
Total attributes: 106

First 3 players:
Out[2]:
short_name overall age club_name
0 L. Messi 93 33 FC Barcelona
1 Cristiano Ronaldo 92 35 Juventus
2 J. Oblak 91 27 Atlético Madrid

Select features and clean data¶

In [3]:
# Candidate columns to cluster on: ratings, skill scores, physique and market figures
features = [
    'overall', 'potential',
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    'age', 'height_cm', 'weight_kg',
    'value_eur', 'wage_eur',
]

# Keep only the candidates that are actually present in the loaded table
available_features = [name for name in features if name in df.columns]
print(f"Using {len(available_features)} features for clustering: {available_features}")

# Independent copy of the chosen columns, so the raw table stays untouched
selected = df[available_features].copy()

# Report how many values are missing per column before anything is filled in
print("\nMissing values in each feature:")
print(selected.isna().sum())

# Replace every missing value with the median of its column.
# NOTE(review): the six skill columns are all missing for the same number of rows
# (2083 in the recorded output) - presumably goalkeepers, who would then receive
# mid-range outfield values. TODO confirm whether they should be clustered separately.
clustering_data = selected.fillna(selected.median())

print(f"\nData ready for clustering: {clustering_data.shape}")
print("First few rows:")
clustering_data.head()
Using 13 features for clustering: ['overall', 'potential', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'age', 'height_cm', 'weight_kg', 'value_eur', 'wage_eur']

Missing values in each feature:
overall         0
potential       0
pace         2083
shooting     2083
passing      2083
dribbling    2083
defending    2083
physic       2083
age             0
height_cm       0
weight_kg       0
value_eur       0
wage_eur        0
dtype: int64

Data ready for clustering: (18944, 13)
First few rows:
Out[3]:
overall potential pace shooting passing dribbling defending physic age height_cm weight_kg value_eur wage_eur
0 93 93 85.0 92.0 91.0 95.0 38.0 65.0 33 170 72 67500000 560000
1 92 92 89.0 93.0 81.0 89.0 35.0 77.0 35 187 83 46000000 220000
2 91 93 68.0 54.0 58.0 64.0 56.0 66.0 27 188 87 75000000 125000
3 91 91 78.0 91.0 78.0 85.0 43.0 82.0 31 184 80 80000000 240000
4 91 91 91.0 85.0 86.0 94.0 36.0 59.0 28 175 68 90000000 270000

Scale the data¶

In [4]:
# Standardize the selected feature columns (zero mean, unit variance per column).
# The columns are picked explicitly through `available_features` so this cell stays
# idempotent: a later cell appends a 'cluster' column to `clustering_data`, and
# re-running this cell afterwards must not feed those labels back into the scaler
# (and from there into K-Means and PCA) as an extra feature.
feature_frame = clustering_data[available_features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Same values as a DataFrame that keeps the feature names
scaled_df = pd.DataFrame(scaled_data, columns=feature_frame.columns)

print("Data scaled successfully!")
print(f"Scaled data shape: {scaled_df.shape}")
Data scaled successfully!
Scaled data shape: (18944, 13)

Find optimal number of clusters (Elbow Method)¶

In [5]:
# Fit K-Means once per candidate k and record the resulting inertia
k_range = range(1, 11)
inertia = []

print("Calculating inertia for different k values...")
for n_clusters in k_range:
    # Fixed seed and n_init so every k is evaluated under the same conditions
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(scaled_data)
    inertia.append(model.inertia_)
    print(f"k={n_clusters}: inertia = {model.inertia_:.2f}")

# Elbow curve: inertia as a function of k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('Inertia (Sum of squared distances)', fontsize=12)
ax.set_title('Elbow Method: Finding Optimal Number of Clusters', fontsize=14, fontweight='bold')
ax.set_xticks(k_range)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Reading guide printed below the figure
print("\nLook for the 'elbow' point where the line starts to bend less steeply.")
print("This is usually the optimal number of clusters.")
Calculating inertia for different k values...
k=1: inertia = 246272.00
k=2: inertia = 198044.71
k=3: inertia = 167813.90
k=4: inertia = 149764.42
k=5: inertia = 136162.92
k=6: inertia = 126206.17
k=7: inertia = 119336.93
k=8: inertia = 113589.15
k=9: inertia = 108563.37
k=10: inertia = 104105.72
Figure: elbow curve of K-Means inertia versus number of clusters (k = 1 to 10).
Look for the 'elbow' point where the line starts to bend less steeply.
This is usually the optimal number of clusters.

Apply K-Means (let's use k=4 as an example)¶

In [6]:
# Number of clusters to fit. This is a hard-coded choice (no prompt is shown);
# edit the value here after reading the elbow plot.
optimal_k = 4
print(f"\nUsing k = {optimal_k} clusters")

# Fit K-Means on the standardized features and obtain one label per row
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10, max_iter=300)
clusters = kmeans.fit_predict(scaled_data)

# Attach the labels to the feature table (later cells group by this column) ...
clustering_data['cluster'] = clusters
# ... and to a copy of the matching rows of the full player table
df_filtered = df.loc[clustering_data.index].assign(cluster=clusters)

print("\n✅ Clustering completed!")
print("\nNumber of players in each cluster:")

# Cluster sizes in label order, each with its share of all clustered players
cluster_counts = df_filtered['cluster'].value_counts().sort_index()
total_players = len(df_filtered)
for cluster_id, count in cluster_counts.items():
    share = count / total_players * 100
    print(f"Cluster {cluster_id}: {count} players ({share:.1f}%)")
Using k = 4 clusters

✅ Clustering completed!

Number of players in each cluster:
Cluster 0: 5860 players (30.9%)
Cluster 1: 2039 players (10.8%)
Cluster 2: 7637 players (40.3%)
Cluster 3: 3408 players (18.0%)

Visualize clusters in 2D¶

In [8]:
# Project the standardized features onto their first two principal components
# so the clusters can be drawn on a plane.
print("\nReducing dimensions with PCA for visualization...")
# Seeded like the K-Means fits: depending on the solver scikit-learn selects,
# PCA can use a randomized SVD, and the figure should be reproducible on re-run.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_data)

# One row per player: 2D coordinates, cluster label, plus rating and name for lookup
viz_df = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1],
    'cluster': clusters,
    'overall': df_filtered['overall'],
    'name': df_filtered['short_name']
})

# Share of total variance captured by each plotted component, in percent
pc1_share, pc2_share = pca.explained_variance_ratio_[:2] * 100

# Scatter of the projection, colored by cluster label
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(viz_df['PC1'], viz_df['PC2'],
                     c=viz_df['cluster'], cmap='tab10',
                     alpha=0.6, s=50, edgecolors='w', linewidth=0.5)

ax.set_xlabel(f'Principal Component 1 ({pc1_share:.1f}% variance)', fontsize=12)
ax.set_ylabel(f'Principal Component 2 ({pc2_share:.1f}% variance)', fontsize=12)
ax.set_title(f'FIFA 21 Players Clustering (k={optimal_k})', fontsize=14, fontweight='bold')
# Cluster labels are integers, so tick the colorbar at the labels themselves
# instead of the default fractional positions.
fig.colorbar(scatter, ax=ax, label='Cluster', ticks=list(range(optimal_k)))
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
Reducing dimensions with PCA for visualization...
Figure: scatter plot of all players on the first two principal components, colored by cluster (k=4).

Analyze cluster characteristics¶

In [10]:
banner = "=" * 60
print(f"\n{banner}")
print("CLUSTER ANALYSIS")
print(banner)

# Mean of every column per cluster label, rounded to two decimals
cluster_stats = clustering_data.groupby('cluster').mean().round(2)

# Print a readable subset of the averages
summary_columns = ['overall', 'age', 'value_eur', 'pace', 'shooting', 'defending']
print("\nAverage attributes for each cluster:")
print(cluster_stats[summary_columns])
============================================================
CLUSTER ANALYSIS
============================================================

Average attributes for each cluster:
         overall    age    value_eur   pace  shooting  defending
cluster                                                         
0          61.52  22.32    668513.65  71.62     53.31      39.75
1          77.33  26.42  12344433.55  74.24     66.11      58.05
2          67.26  27.37   1395350.01  67.57     56.27      55.60
3          62.30  24.71    705042.55  57.36     34.33      60.45

Show top players in each cluster¶

In [11]:
divider = "=" * 60
print(f"\n{divider}")
print("TOP PLAYERS IN EACH CLUSTER")
print(divider)

# Columns shown in the per-cluster "top players" table
display_columns = ['short_name', 'overall', 'age', 'player_positions', 'club_name', 'value_eur']

for cluster_num in range(optimal_k):
    members = df_filtered.loc[df_filtered['cluster'] == cluster_num]

    print(f"\n📊 CLUSTER {cluster_num} ({len(members)} players)")
    print("-" * 40)

    # Five highest-rated members, with the value column rendered in millions (e.g. "4.0M")
    top_five = members.nlargest(5, 'overall')[display_columns]
    value_in_millions = (top_five['value_eur'] / 1000000).round(1).astype(str) + 'M'
    top_five_display = top_five.assign(value_eur=value_in_millions)

    print("Top 5 players:")
    print(top_five_display.to_string(index=False))

    # Cluster-wide averages (value expressed in millions of euros)
    mean_rating = members['overall'].mean()
    mean_age = members['age'].mean()
    mean_value_m = members['value_eur'].mean() / 1000000

    print("\nCluster characteristics:")
    print(f"  • Average rating: {mean_rating:.1f}")
    print(f"  • Average age: {mean_age:.1f}")
    print(f"  • Average value: €{mean_value_m:.1f}M")

    # Flatten the comma-separated position strings into one list;
    # entries that did not split into a list (e.g. missing values) are skipped.
    all_positions = [
        position
        for position_list in members['player_positions'].str.split(', ')
        if isinstance(position_list, list)
        for position in position_list
    ]

    # Three most frequent positions, if any were found
    if all_positions:
        most_common = pd.Series(all_positions).value_counts().head(3)
        print(f"  • Top positions: {', '.join(most_common.index.tolist())}")
============================================================
TOP PLAYERS IN EACH CLUSTER
============================================================

📊 CLUSTER 0 (5860 players)
----------------------------------------
Top 5 players:
   short_name  overall  age player_positions            club_name value_eur
   M. Moralez       75   33          CAM, CM     New York City FC      4.0M
   M. Barrios       75   29       RM, RW, ST            FC Dallas      6.5M
     E. Barco       74   21           LW, CF       Atlanta United      9.5M
M. van Bergen       74   20               RW        SC Heerenveen      8.0M
   M. Okugawa       74   24      RM, CAM, ST FC Red Bull Salzburg      7.5M

Cluster characteristics:
  • Average rating: 61.5
  • Average age: 22.3
  • Average value: €0.7M
  • Top positions: ST, RM, CM

📊 CLUSTER 1 (2039 players)
----------------------------------------
Top 5 players:
       short_name  overall  age player_positions           club_name value_eur
         L. Messi       93   33       RW, ST, CF        FC Barcelona     67.5M
Cristiano Ronaldo       92   35           ST, LW            Juventus     46.0M
         J. Oblak       91   27               GK     Atlético Madrid     75.0M
   R. Lewandowski       91   31               ST   FC Bayern München     80.0M
        Neymar Jr       91   28          LW, CAM Paris Saint-Germain     90.0M

Cluster characteristics:
  • Average rating: 77.3
  • Average age: 26.4
  • Average value: €12.3M
  • Top positions: CM, CAM, ST

📊 CLUSTER 2 (7637 players)
----------------------------------------
Top 5 players:
    short_name  overall  age player_positions       club_name value_eur
     G. Buffon       82   42               GK        Juventus      2.2M
          Pepe       81   37               CB        FC Porto      3.6M
  L. Fabiański       81   35               GK West Ham United      4.8M
       Marcano       81   33               CB        FC Porto      8.5M
Juiano Mestres       81   24          CB, CDM             NaN      0.0M

Cluster characteristics:
  • Average rating: 67.3
  • Average age: 27.4
  • Average value: €1.4M
  • Top positions: GK, CM, CDM

📊 CLUSTER 3 (3408 players)
----------------------------------------
Top 5 players:
     short_name  overall  age player_positions               club_name value_eur
     Unai Núñez       78   23               CB Athletic Club de Bilbao     13.0M
Domingos Duarte       78   25               CB              Granada CF     12.0M
      M. Škrtel       78   35               CB   Medipol Başakşehir FK      3.1M
     José Fonte       78   36               CB              LOSC Lille      2.1M
        I. Diop       77   23               CB         West Ham United     11.5M

Cluster characteristics:
  • Average rating: 62.3
  • Average age: 24.7
  • Average value: €0.7M
  • Top positions: CB, CDM, RB

Create radar chart for cluster profiles¶

In [12]:
# Skill attributes drawn as the spokes of the radar chart
radar_features = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']

# Only plot attributes that exist in the per-cluster averages
available_radar = [f for f in radar_features if f in cluster_stats.columns]
if available_radar:
    # Per-cluster mean of each plotted attribute
    cluster_means = cluster_stats[available_radar]

    # Number of spokes
    N = len(available_radar)

    # One angle per spoke, evenly spaced; the first angle is repeated at the end
    # so each polygon closes on itself.
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

    # Pick each cluster's color the same way the PCA scatter plot does: labels are
    # rescaled to [0, 1] by their min and max (0 and k-1) before the tab10 lookup.
    # The previous `arange(k) / k` spacing selected different tab10 entries, so a
    # cluster had one color in the scatter plot and another one here.
    label_norm = plt.Normalize(vmin=0, vmax=optimal_k - 1)
    colors = plt.cm.tab10(label_norm(np.arange(optimal_k)))

    # One outlined, lightly filled polygon per cluster
    for cluster_num in range(optimal_k):
        values = cluster_means.loc[cluster_num].tolist()
        values += values[:1]  # close the polygon

        ax.plot(angles, values, linewidth=2, linestyle='solid',
                color=colors[cluster_num], label=f'Cluster {cluster_num}')
        ax.fill(angles, values, color=colors[cluster_num], alpha=0.1)

    # Spoke labels and a fixed 0-100 radial scale
    plt.xticks(angles[:-1], available_radar, fontsize=11)
    ax.set_rlabel_position(0)
    plt.yticks([20, 40, 60, 80, 100], ["20", "40", "60", "80", "100"], color="grey", size=9)
    plt.ylim(0, 100)

    plt.title('Player Attribute Profiles by Cluster', size=16, fontweight='bold', y=1.1)
    # Legend is anchored outside the plot area so it does not cover the polygons
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0), fontsize=10)
    plt.tight_layout()
    plt.show()
Figure: radar chart of mean pace, shooting, passing, dribbling, defending and physic for each cluster.

Final summary report¶

In [13]:
rule = "=" * 70
print(f"\n{rule}")
print("FINAL CLUSTERING REPORT")
print(rule)

# (minimum average rating, label), checked from the highest tier down;
# anything below the last floor is labelled "Average".
rating_tiers = [(85, "Elite"), (80, "High Quality"), (75, "Good")]
# (maximum average age, label), checked from the youngest tier up;
# anything above the last ceiling is labelled "Veteran".
age_tiers = [(23, "Young"), (28, "Prime")]

total_players = len(df_filtered)

# One short description per cluster
for cluster_num in range(optimal_k):
    cluster_data = df_filtered[df_filtered['cluster'] == cluster_num]

    # Key metrics (value expressed in millions of euros)
    size = len(cluster_data)
    avg_rating = cluster_data['overall'].mean()
    avg_age = cluster_data['age'].mean()
    avg_value = cluster_data['value_eur'].mean() / 1000000

    # First tier whose threshold the cluster average satisfies, else the fallback label
    rating_desc = next((label for floor, label in rating_tiers if avg_rating >= floor), "Average")
    age_desc = next((label for ceiling, label in age_tiers if avg_age <= ceiling), "Veteran")

    print(f"\n🏆 CLUSTER {cluster_num}:")
    print(f"   • Players: {size} ({size / total_players * 100:.1f}% of total)")
    print(f"   • Type: {rating_desc} {age_desc} Players")
    print(f"   • Avg Rating: {avg_rating:.1f}")
    print(f"   • Avg Age: {avg_age:.1f}")
    print(f"   • Avg Value: €{avg_value:.1f}M")

    # Three highest-rated members as examples
    examples = cluster_data.nlargest(3, 'overall')['short_name'].tolist()
    print(f"   • Examples: {', '.join(examples)}")

print(f"\n{rule}")
print(f"Total players clustered: {total_players}")
print(rule)
======================================================================
FINAL CLUSTERING REPORT
======================================================================

🏆 CLUSTER 0:
   • Players: 5860 (30.9% of total)
   • Type: Average Young Players
   • Avg Rating: 61.5
   • Avg Age: 22.3
   • Avg Value: €0.7M
   • Examples: M. Moralez, M. Barrios, E. Barco

🏆 CLUSTER 1:
   • Players: 2039 (10.8% of total)
   • Type: Good Prime Players
   • Avg Rating: 77.3
   • Avg Age: 26.4
   • Avg Value: €12.3M
   • Examples: L. Messi, Cristiano Ronaldo, J. Oblak

🏆 CLUSTER 2:
   • Players: 7637 (40.3% of total)
   • Type: Average Prime Players
   • Avg Rating: 67.3
   • Avg Age: 27.4
   • Avg Value: €1.4M
   • Examples: G. Buffon, Pepe, L. Fabiański

🏆 CLUSTER 3:
   • Players: 3408 (18.0% of total)
   • Type: Average Prime Players
   • Avg Rating: 62.3
   • Avg Age: 24.7
   • Avg Value: €0.7M
   • Examples: Unai Núñez, Domingos Duarte, M. Škrtel

======================================================================
Total players clustered: 18944
======================================================================
In [ ]:
 
In [ ]: