[Pema-Norbu] - Fab Futures - Data Science
Home About
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde, entropy
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load the data
df = pd.read_csv('players_15.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nBasic Info:")
# DataFrame.info() prints its report itself and returns None, so it is
# called bare; wrapping it in print() would emit an extra "None" line.
df.info()
Dataset Shape: (16155, 106)

First few rows:
   sofifa_id                                         player_url  \
0     158023  https://sofifa.com/player/158023/lionel-messi/...   
1      20801  https://sofifa.com/player/20801/c-ronaldo-dos-...   
2       9014  https://sofifa.com/player/9014/arjen-robben/15...   
3      41236  https://sofifa.com/player/41236/zlatan-ibrahim...   
4     167495  https://sofifa.com/player/167495/manuel-neuer/...   

          short_name                            long_name  age         dob  \
0           L. Messi       Lionel Andrés Messi Cuccittini   27  1987-06-24   
1  Cristiano Ronaldo  Cristiano Ronaldo dos Santos Aveiro   29  1985-02-05   
2          A. Robben                         Arjen Robben   30  1984-01-23   
3     Z. Ibrahimović                   Zlatan Ibrahimović   32  1981-10-03   
4           M. Neuer                         Manuel Neuer   28  1986-03-27   

   height_cm  weight_kg  nationality            club_name  ...   lwb   ldm  \
0        169         67    Argentina         FC Barcelona  ...  62+3  62+3   
1        185         80     Portugal          Real Madrid  ...  63+3  63+3   
2        180         80  Netherlands    FC Bayern München  ...  64+3  64+3   
3        195         95       Sweden  Paris Saint-Germain  ...  61+3  65+3   
4        193         92      Germany    FC Bayern München  ...  36+3  40+3   

    cdm   rdm   rwb    lb   lcb    cb   rcb    rb  
0  62+3  62+3  62+3  54+3  45+3  45+3  45+3  54+3  
1  63+3  63+3  63+3  57+3  52+3  52+3  52+3  57+3  
2  64+3  64+3  64+3  55+3  46+3  46+3  46+3  55+3  
3  65+3  65+3  61+3  56+3  55+3  55+3  55+3  56+3  
4  40+3  40+3  36+3  36+3  38+3  38+3  38+3  36+3  

[5 rows x 106 columns]

Columns: ['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'team_position', 'team_jersey_number', 'loaned_from', 'joined', 'contract_valid_until', 'nation_position', 'nation_jersey_number', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16155 entries, 0 to 16154
Columns: 106 entries, sofifa_id to rb
dtypes: float64(18), int64(44), object(44)
memory usage: 13.1+ MB
None
In [2]:
# Collect the labels of every numeric column in the raw frame
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"Number of numerical columns: {len(numeric_cols)}")

# Attributes the distribution analysis concentrates on
key_attributes = [
    'age', 'height_cm', 'weight_kg',
    'overall', 'potential',
    'value_eur', 'wage_eur',
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
]

# Keep only rows where every key attribute is present, then narrow the frame
# to just those columns.
# NOTE(review): the dropped rows are presumably goalkeepers, who lack the
# outfield ratings (pace, shooting, ...) -- confirm before generalising results.
df_clean = df.dropna(subset=key_attributes).loc[:, key_attributes]
print(f"Dataset size after cleaning: {df_clean.shape}")
Number of numerical columns: 62
Dataset size after cleaning: (14380, 13)
In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde, entropy
import warnings
warnings.filterwarnings('ignore')

# Plot styling (same settings as the first cell)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Re-read the raw player table so this cell can run on its own
df = pd.read_csv('players_15.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")

# Attributes the distribution analysis concentrates on
key_attributes = [
    'age', 'height_cm', 'weight_kg',
    'overall', 'potential',
    'value_eur', 'wage_eur',
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
]

# Keep only rows where every key attribute is present, then narrow the frame
# to just those columns.
df_clean = df.dropna(subset=key_attributes).loc[:, key_attributes]
print(f"\nDataset size after cleaning: {df_clean.shape}")

# The analysis helpers are defined next, further down in this same cell.

def analyze_distribution(data, column_name, bins=30):
    """
    Summarise the empirical distribution of a single numeric column.

    Prints a banner followed by the mean, median, standard deviation,
    skewness, kurtosis and range of the column, and returns the same
    figures in a dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``. Missing values in that column
        are dropped before the statistics are computed.
    column_name : str
        Name of the column to analyse.
    bins : int, default 30
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    dict
        Keys ``column``, ``mean``, ``median``, ``std``, ``skew``,
        ``kurtosis`` and ``is_normal``. ``is_normal`` is a rough heuristic:
        True when ``|skew| < 0.5`` and ``|kurtosis| < 1``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``data``.
    """
    print(f"\n{'='*60}")
    print(f"ANALYSIS FOR: {column_name}")
    print('='*60)

    # Read from the frame that was passed in, not from a notebook-level
    # global, so the function analyses whatever the caller supplies.
    data_series = data[column_name].dropna()

    # Basic statistics
    mean_val = data_series.mean()
    median_val = data_series.median()
    std_val = data_series.std()
    skew_val = data_series.skew()
    kurt_val = data_series.kurtosis()

    print(f"Mean: {mean_val:.2f}")
    print(f"Median: {median_val:.2f}")
    print(f"Standard Deviation: {std_val:.2f}")
    print(f"Skewness: {skew_val:.2f}")
    print(f"Kurtosis: {kurt_val:.2f}")
    print(f"Range: {data_series.min():.2f} to {data_series.max():.2f}")

    return {
        'column': column_name,
        'mean': mean_val,
        'median': median_val,
        'std': std_val,
        'skew': skew_val,
        'kurtosis': kurt_val,
        # bool() turns the NumPy comparison result into a plain Python bool
        'is_normal': bool(abs(skew_val) < 0.5 and abs(kurt_val) < 1)
    }

def plot_comprehensive_distribution(data, column_name):
    """
    Draw a 2x3 grid of distribution diagnostics for one numeric column.

    Panels: histogram with a KDE overlay, horizontal box plot, normal Q-Q
    plot, empirical CDF, horizontal violin plot, and KDE curves for three
    bandwidth settings. The figure is displayed with ``plt.show()``;
    nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``. Missing values in that column
        are dropped before plotting.
    column_name : str
        Name of the column to visualise.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``data``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle(f'Distribution Analysis: {column_name}', fontsize=16, fontweight='bold')

    # Read from the frame that was passed in, not from a notebook-level
    # global, so the function plots whatever the caller supplies.
    data_series = data[column_name].dropna()

    # 1. Histogram with KDE
    axes[0, 0].hist(data_series, bins=30, density=True, alpha=0.6, color='skyblue', edgecolor='black')
    axes[0, 0].set_title(f'Histogram with KDE: {column_name}')
    axes[0, 0].set_xlabel(column_name)
    axes[0, 0].set_ylabel('Density')

    # Add KDE (default bandwidth), evaluated on a grid spanning the data range
    kde = gaussian_kde(data_series)
    x_range = np.linspace(data_series.min(), data_series.max(), 1000)
    axes[0, 0].plot(x_range, kde(x_range), 'r-', linewidth=2, label='KDE')
    axes[0, 0].legend()

    # 2. Box Plot
    axes[0, 1].boxplot(data_series, vert=False)
    axes[0, 1].set_title(f'Box Plot: {column_name}')
    axes[0, 1].set_xlabel(column_name)

    # 3. Q-Q Plot (for normality check)
    stats.probplot(data_series, dist="norm", plot=axes[0, 2])
    axes[0, 2].set_title(f'Q-Q Plot: {column_name}')
    # Restyle the first line drawn on this axis (the sample points)
    axes[0, 2].get_lines()[0].set_markerfacecolor('blue')
    axes[0, 2].get_lines()[0].set_markersize(5)

    # 4. Empirical CDF
    sorted_data = np.sort(data_series)
    n_obs = len(sorted_data)
    # Use i/n for i = 1..n: the curve ends at exactly 1, the smallest value
    # gets probability 1/n, and a single observation does not divide by zero.
    yvals = np.arange(1, n_obs + 1) / n_obs
    axes[1, 0].plot(sorted_data, yvals, linewidth=2)
    axes[1, 0].set_title(f'Empirical CDF: {column_name}')
    axes[1, 0].set_xlabel(column_name)
    axes[1, 0].set_ylabel('Cumulative Probability')
    axes[1, 0].grid(True, alpha=0.3)

    # 5. Violin Plot
    axes[1, 1].violinplot(data_series, vert=False)
    axes[1, 1].set_title(f'Violin Plot: {column_name}')
    axes[1, 1].set_xlabel(column_name)

    # 6. Kernel Density Estimation with bandwidth variations
    # (scalar bw_method values, drawn on the same grid as panel 1)
    for bw in [0.1, 0.5, 1.0]:
        kde = gaussian_kde(data_series, bw_method=bw)
        axes[1, 2].plot(x_range, kde(x_range), label=f'BW={bw}')
    axes[1, 2].set_title(f'KDE with different bandwidths: {column_name}')
    axes[1, 2].set_xlabel(column_name)
    axes[1, 2].set_ylabel('Density')
    axes[1, 2].legend()
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# ===== EXECUTION PART =====
# Run the numeric summary and the six-panel figure for a short demo subset.

print("\n" + "=" * 80, "STARTING DISTRIBUTION ANALYSIS", "=" * 80, sep="\n")

# Only the first three key attributes are processed here to keep the output short.
for attribute in key_attributes[:3]:
    stats_result = analyze_distribution(df_clean, attribute)  # prints and returns the summary dict
    plot_comprehensive_distribution(df_clean, attribute)  # renders the diagnostic figure

print("\n" + "=" * 80, "ANALYSIS COMPLETE!", "=" * 80, sep="\n")

# Additional analysis for more attributes if needed.
# This loop waits on input(). When no interactive stdin is available
# (for example a headless "Run All" / nbconvert execution) it ends cleanly
# instead of raising, so the rest of the notebook can still run.
def _prompt(message):
    """Return the whitespace-stripped reply to ``message``, or None if stdin is unavailable."""
    try:
        return input(message).strip()
    except (EOFError, NotImplementedError):
        # EOFError: stdin is closed. NotImplementedError is a base class of
        # IPython's StdinNotImplementedError, raised by kernels whose
        # frontend cannot service input requests.
        return None


while True:
    user_input = _prompt("\nDo you want to analyze another attribute? (yes/no): ")
    if user_input is None:
        print("No interactive input available. Exiting analysis.")
        break
    user_input = user_input.lower()
    if user_input == 'yes':
        print(f"\nAvailable attributes: {key_attributes}")
        attr_name = _prompt("Enter attribute name to analyze: ")
        if attr_name is None:
            print("No interactive input available. Exiting analysis.")
            break
        if attr_name in key_attributes:
            stats_result = analyze_distribution(df_clean, attr_name)
            plot_comprehensive_distribution(df_clean, attr_name)
        else:
            print(f"Error: '{attr_name}' not found in available attributes.")
    elif user_input == 'no':
        print("Exiting analysis.")
        break
    else:
        print("Please enter 'yes' or 'no'.")
Dataset Shape: (16155, 106)

First few rows:
   sofifa_id                                         player_url  \
0     158023  https://sofifa.com/player/158023/lionel-messi/...   
1      20801  https://sofifa.com/player/20801/c-ronaldo-dos-...   
2       9014  https://sofifa.com/player/9014/arjen-robben/15...   
3      41236  https://sofifa.com/player/41236/zlatan-ibrahim...   
4     167495  https://sofifa.com/player/167495/manuel-neuer/...   

          short_name                            long_name  age         dob  \
0           L. Messi       Lionel Andrés Messi Cuccittini   27  1987-06-24   
1  Cristiano Ronaldo  Cristiano Ronaldo dos Santos Aveiro   29  1985-02-05   
2          A. Robben                         Arjen Robben   30  1984-01-23   
3     Z. Ibrahimović                   Zlatan Ibrahimović   32  1981-10-03   
4           M. Neuer                         Manuel Neuer   28  1986-03-27   

   height_cm  weight_kg  nationality            club_name  ...   lwb   ldm  \
0        169         67    Argentina         FC Barcelona  ...  62+3  62+3   
1        185         80     Portugal          Real Madrid  ...  63+3  63+3   
2        180         80  Netherlands    FC Bayern München  ...  64+3  64+3   
3        195         95       Sweden  Paris Saint-Germain  ...  61+3  65+3   
4        193         92      Germany    FC Bayern München  ...  36+3  40+3   

    cdm   rdm   rwb    lb   lcb    cb   rcb    rb  
0  62+3  62+3  62+3  54+3  45+3  45+3  45+3  54+3  
1  63+3  63+3  63+3  57+3  52+3  52+3  52+3  57+3  
2  64+3  64+3  64+3  55+3  46+3  46+3  46+3  55+3  
3  65+3  65+3  61+3  56+3  55+3  55+3  55+3  56+3  
4  40+3  40+3  36+3  36+3  38+3  38+3  38+3  36+3  

[5 rows x 106 columns]

Dataset size after cleaning: (14380, 13)

================================================================================
STARTING DISTRIBUTION ANALYSIS
================================================================================

============================================================
ANALYSIS FOR: age
============================================================
Mean: 24.66
Median: 24.00
Standard Deviation: 4.49
Skewness: 0.40
Kurtosis: -0.51
Range: 16.00 to 41.00
No description has been provided for this image
============================================================
ANALYSIS FOR: height_cm
============================================================
Mean: 180.26
Median: 180.00
Standard Deviation: 6.34
Skewness: -0.03
Kurtosis: -0.17
Range: 155.00 to 204.00
No description has been provided for this image
============================================================
ANALYSIS FOR: weight_kg
============================================================
Mean: 74.66
Median: 75.00
Standard Deviation: 6.51
Skewness: 0.18
Kurtosis: 0.17
Range: 50.00 to 110.00
No description has been provided for this image
================================================================================
ANALYSIS COMPLETE!
================================================================================
Available attributes: ['age', 'height_cm', 'weight_kg', 'overall', 'potential', 'value_eur', 'wage_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
============================================================
ANALYSIS FOR: age
============================================================
Mean: 24.66
Median: 24.00
Standard Deviation: 4.49
Skewness: 0.40
Kurtosis: -0.51
Range: 16.00 to 41.00
No description has been provided for this image
Exiting analysis.
In [5]:
def detect_multimodal_distribution(data, column_name, n_components=5):
    """
    Fit a Gaussian Mixture Model to one column and visualise its components.

    Fits a ``GaussianMixture`` with ``n_components`` components, shows a
    two-panel figure (histogram coloured by assigned component, and the
    fitted mixture density with each component's weighted contribution),
    then refits models with 1 to 6 components and prints the component
    counts that minimise AIC and BIC.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``. Missing values in that column
        are dropped before fitting.
    column_name : str
        Name of the numeric column to model.
    n_components : int, default 5
        Number of mixture components for the plotted model.

    Returns
    -------
    dict
        ``n_components`` (as passed in) plus the flattened ``means`` and
        ``covariances`` and the ``weights`` of the plotted model.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``data``.
    """
    from sklearn.mixture import GaussianMixture

    # Read from the frame that was passed in, not from a notebook-level
    # global, so the function models whatever the caller supplies.
    values = data[column_name].dropna().to_numpy(dtype=float)
    # The estimator is fitted on a 2-D column vector: one row per observation.
    data_series = values.reshape(-1, 1)

    # Fit Gaussian Mixture Model (fixed seed so repeated runs agree)
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(data_series)

    # Predict component for each data point
    labels = gmm.predict(data_series)

    # Plot the results
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(f'Multi-Modal Analysis: {column_name}', fontsize=14, fontweight='bold')

    # Histogram with component colors. All components share one set of bin
    # edges so that bar heights are comparable between components.
    bin_edges = np.histogram_bin_edges(values, bins=30)
    for i in range(n_components):
        component_data = values[labels == i]
        axes[0].hist(component_data, bins=bin_edges, alpha=0.5, label=f'Component {i+1}')

    axes[0].set_title(f'Histogram with {n_components} Gaussian Components')
    axes[0].set_xlabel(column_name)
    axes[0].set_ylabel('Frequency')
    axes[0].legend()

    # Plot the fitted GMM on an evenly spaced grid over the data range
    x_range = np.linspace(values.min(), values.max(), 1000).reshape(-1, 1)
    logprob = gmm.score_samples(x_range)
    responsibilities = gmm.predict_proba(x_range)
    mixture_density = np.exp(logprob)  # computed once, reused for every component

    axes[1].plot(x_range, mixture_density, 'k-', label='Mixture', linewidth=2)

    for i in range(n_components):
        # responsibility * mixture density = this component's share of the density
        pdf = responsibilities[:, i] * mixture_density
        axes[1].plot(x_range, pdf, '--', label=f'Component {i+1}')

    axes[1].set_title(f'Gaussian Mixture Model: {column_name}')
    axes[1].set_xlabel(column_name)
    axes[1].set_ylabel('Probability Density')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Calculate AIC and BIC for different numbers of components
    n_components_range = range(1, 7)
    aics = []
    bics = []

    for n in n_components_range:
        gmm_temp = GaussianMixture(n_components=n, random_state=42)
        gmm_temp.fit(data_series)
        aics.append(gmm_temp.aic(data_series))
        bics.append(gmm_temp.bic(data_series))

    print(f"\nGMM Analysis for {column_name}:")
    print(f"Optimal components (AIC): {n_components_range[np.argmin(aics)]}")
    print(f"Optimal components (BIC): {n_components_range[np.argmin(bics)]}")

    return {
        'n_components': n_components,
        'means': gmm.means_.flatten(),
        'covariances': gmm.covariances_.flatten(),
        'weights': gmm.weights_
    }
In [ ]: