In [1]:
# --- Imports and global setup (all third-party deps for the notebook) ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde, entropy
import warnings

# Silence library warnings so the rendered output stays readable.
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load the data: SoFIFA player table (player_url column points at sofifa.com).
# Path is relative to the notebook's working directory.
df = pd.read_csv('players_15.csv')

# Quick structural overview of the raw frame.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nBasic Info:")
print(df.info())
Dataset Shape: (16155, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
Columns: ['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'team_position', 'team_jersey_number', 'loaned_from', 'joined', 'contract_valid_until', 'nation_position', 'nation_jersey_number', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']
Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16155 entries, 0 to 16154
Columns: 106 entries, sofifa_id to rb
dtypes: float64(18), int64(44), object(44)
memory usage: 13.1+ MB
None
In [2]:
# Select numerical columns for analysis (count only; not used downstream).
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"Number of numerical columns: {len(numeric_cols)}")

# Select key attributes for analysis: demographics, ratings, market value,
# and the six aggregate skill ratings.
key_attributes = [
    'age', 'height_cm', 'weight_kg', 'overall', 'potential',
    'value_eur', 'wage_eur', 'pace', 'shooting', 'passing',
    'dribbling', 'defending', 'physic'
]

# Clean data - drop any row with a NaN in ANY of the key attributes.
# This shrinks 16155 rows to 14380; `df_clean` is the frame all later
# analysis cells operate on.
df_clean = df[key_attributes].dropna()
print(f"Dataset size after cleaning: {df_clean.shape}")
Number of numerical columns: 62 Dataset size after cleaning: (14380, 13)
In [4]:
# NOTE(review): this cell re-imports every library and re-loads/re-cleans the
# CSV, duplicating cells 1 and 2 verbatim. Kept as-is so the cell runs
# standalone, but consider deleting this duplicated setup on cleanup —
# duplicate definitions are a classic out-of-order-editing artifact.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde, entropy
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load the data (same file as cell 1).
df = pd.read_csv('players_15.csv')
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Select key attributes for analysis (same list as cell 2).
key_attributes = [
    'age', 'height_cm', 'weight_kg', 'overall', 'potential',
    'value_eur', 'wage_eur', 'pace', 'shooting', 'passing',
    'dribbling', 'defending', 'physic'
]

# Clean data - remove rows with missing values in key attributes
df_clean = df[key_attributes].dropna()
print(f"\nDataset size after cleaning: {df_clean.shape}")

# Your functions here (they're already defined above)
def analyze_distribution(data, column_name, bins=30):
"""
Analyze the probability distribution of a single column
"""
print(f"\n{'='*60}")
print(f"ANALYSIS FOR: {column_name}")
print('='*60)
# Basic statistics
data_series = df_clean[column_name]
mean_val = data_series.mean()
median_val = data_series.median()
std_val = data_series.std()
skew_val = data_series.skew()
kurt_val = data_series.kurtosis()
print(f"Mean: {mean_val:.2f}")
print(f"Median: {median_val:.2f}")
print(f"Standard Deviation: {std_val:.2f}")
print(f"Skewness: {skew_val:.2f}")
print(f"Kurtosis: {kurt_val:.2f}")
print(f"Range: {data_series.min():.2f} to {data_series.max():.2f}")
return {
'column': column_name,
'mean': mean_val,
'median': median_val,
'std': std_val,
'skew': skew_val,
'kurtosis': kurt_val,
'is_normal': abs(skew_val) < 0.5 and abs(kurt_val) < 1
}
def plot_comprehensive_distribution(data, column_name):
    """
    Draw a 2x3 panel of distribution plots for one numeric column.

    Panels: histogram with KDE overlay, box plot, normal Q-Q plot,
    empirical CDF, violin plot, and KDE curves at several bandwidths.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column_name`` as a numeric column with at
        least two rows (the ECDF divides by len-1).
    column_name : str
        Column to visualize.
    """
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle(f'Distribution Analysis: {column_name}', fontsize=16, fontweight='bold')

    # BUG FIX: the original ignored the `data` argument and read the
    # module-level `df_clean`; use the parameter.
    data_series = data[column_name]

    # 1. Histogram with KDE
    axes[0, 0].hist(data_series, bins=30, density=True, alpha=0.6, color='skyblue', edgecolor='black')
    axes[0, 0].set_title(f'Histogram with KDE: {column_name}')
    axes[0, 0].set_xlabel(column_name)
    axes[0, 0].set_ylabel('Density')
    # Overlay a Gaussian KDE evaluated on a dense grid over the data range.
    kde = gaussian_kde(data_series)
    x_range = np.linspace(data_series.min(), data_series.max(), 1000)
    axes[0, 0].plot(x_range, kde(x_range), 'r-', linewidth=2, label='KDE')
    axes[0, 0].legend()

    # 2. Box Plot
    axes[0, 1].boxplot(data_series, vert=False)
    axes[0, 1].set_title(f'Box Plot: {column_name}')
    axes[0, 1].set_xlabel(column_name)

    # 3. Q-Q Plot (for normality check)
    stats.probplot(data_series, dist="norm", plot=axes[0, 2])
    axes[0, 2].set_title(f'Q-Q Plot: {column_name}')
    axes[0, 2].get_lines()[0].set_markerfacecolor('blue')
    axes[0, 2].get_lines()[0].set_markersize(5)

    # 4. Empirical CDF (y runs 0..1 inclusive via the len-1 denominator)
    sorted_data = np.sort(data_series)
    yvals = np.arange(len(sorted_data))/float(len(sorted_data)-1)
    axes[1, 0].plot(sorted_data, yvals, linewidth=2)
    axes[1, 0].set_title(f'Empirical CDF: {column_name}')
    axes[1, 0].set_xlabel(column_name)
    axes[1, 0].set_ylabel('Cumulative Probability')
    axes[1, 0].grid(True, alpha=0.3)

    # 5. Violin Plot
    axes[1, 1].violinplot(data_series, vert=False)
    axes[1, 1].set_title(f'Violin Plot: {column_name}')
    axes[1, 1].set_xlabel(column_name)

    # 6. KDE with different bandwidth factors (reuses x_range from panel 1).
    for bw in [0.1, 0.5, 1.0]:
        kde = gaussian_kde(data_series, bw_method=bw)
        axes[1, 2].plot(x_range, kde(x_range), label=f'BW={bw}')
    axes[1, 2].set_title(f'KDE with different bandwidths: {column_name}')
    axes[1, 2].set_xlabel(column_name)
    axes[1, 2].set_ylabel('Density')
    axes[1, 2].legend()
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
# ===== EXECUTION PART =====
# Drives the analysis: summary statistics + plots for the first three
# key attributes, then an optional interactive loop for more.
print("\n" + "="*80)
print("STARTING DISTRIBUTION ANALYSIS")
print("="*80)

# Analyze first few attributes to see output
for attribute in key_attributes[:3]:  # Just analyze first 3 for demonstration
    # Call the analyze function
    stats_result = analyze_distribution(df_clean, attribute)
    # Call the plot function
    plot_comprehensive_distribution(df_clean, attribute)

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)

# Additional analysis for more attributes if needed.
# NOTE(review): a blocking input() loop makes "Restart & Run All" hang when
# the notebook is executed non-interactively — consider replacing with a
# config-cell list of attributes (or a widget) so the run is reproducible.
while True:
    user_input = input("\nDo you want to analyze another attribute? (yes/no): ").lower()
    if user_input == 'yes':
        print(f"\nAvailable attributes: {key_attributes}")
        attr_name = input("Enter attribute name to analyze: ")
        if attr_name in key_attributes:
            stats_result = analyze_distribution(df_clean, attr_name)
            plot_comprehensive_distribution(df_clean, attr_name)
        else:
            print(f"Error: '{attr_name}' not found in available attributes.")
    elif user_input == 'no':
        print("Exiting analysis.")
        break
    else:
        # Anything other than exact 'yes'/'no' re-prompts.
        print("Please enter 'yes' or 'no'.")
Dataset Shape: (16155, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
Dataset size after cleaning: (14380, 13)
================================================================================
STARTING DISTRIBUTION ANALYSIS
================================================================================
============================================================
ANALYSIS FOR: age
============================================================
Mean: 24.66
Median: 24.00
Standard Deviation: 4.49
Skewness: 0.40
Kurtosis: -0.51
Range: 16.00 to 41.00
============================================================ ANALYSIS FOR: height_cm ============================================================ Mean: 180.26 Median: 180.00 Standard Deviation: 6.34 Skewness: -0.03 Kurtosis: -0.17 Range: 155.00 to 204.00
============================================================ ANALYSIS FOR: weight_kg ============================================================ Mean: 74.66 Median: 75.00 Standard Deviation: 6.51 Skewness: 0.18 Kurtosis: 0.17 Range: 50.00 to 110.00
================================================================================ ANALYSIS COMPLETE! ================================================================================
Available attributes: ['age', 'height_cm', 'weight_kg', 'overall', 'potential', 'value_eur', 'wage_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
============================================================ ANALYSIS FOR: age ============================================================ Mean: 24.66 Median: 24.00 Standard Deviation: 4.49 Skewness: 0.40 Kurtosis: -0.51 Range: 16.00 to 41.00
Exiting analysis.
In [5]:
def detect_multimodal_distribution(data, column_name, n_components=5):
    """
    Detect multi-modal structure in one column with a Gaussian Mixture Model.

    Fits an ``n_components``-component GMM, plots the per-component
    histograms and the fitted mixture/component densities, then refits
    for 1..6 components and reports the AIC- and BIC-optimal counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column_name`` as a numeric column.
    column_name : str
        Column to analyze.
    n_components : int, optional
        Number of mixture components for the main fit (default 5).

    Returns
    -------
    dict
        Keys: 'n_components', 'means', 'covariances', 'weights' of the
        main fit (arrays flattened to 1-D where applicable).
    """
    from sklearn.mixture import GaussianMixture

    # BUG FIX: the original ignored the `data` argument and read the
    # module-level `df_clean`; use the parameter. GMM expects 2-D input.
    data_series = data[column_name].values.reshape(-1, 1)

    # Fit Gaussian Mixture Model (fixed seed => reproducible fit)
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(data_series)
    # Hard component assignment for each data point (used for coloring).
    labels = gmm.predict(data_series)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(f'Multi-Modal Analysis: {column_name}', fontsize=14, fontweight='bold')

    # Left panel: histogram of the points assigned to each component.
    for i in range(n_components):
        component_data = data_series[labels == i]
        axes[0].hist(component_data, bins=30, alpha=0.5, label=f'Component {i+1}')
    axes[0].set_title(f'Histogram with {n_components} Gaussian Components')
    axes[0].set_xlabel(column_name)
    axes[0].set_ylabel('Frequency')
    axes[0].legend()

    # Right panel: overall mixture density plus each weighted component
    # density (responsibility * mixture pdf) on a dense grid.
    x_range = np.linspace(data_series.min(), data_series.max(), 1000).reshape(-1, 1)
    logprob = gmm.score_samples(x_range)
    responsibilities = gmm.predict_proba(x_range)
    axes[1].plot(x_range, np.exp(logprob), 'k-', label='Mixture', linewidth=2)
    for i in range(n_components):
        pdf = responsibilities[:, i] * np.exp(logprob)
        axes[1].plot(x_range, pdf, '--', label=f'Component {i+1}')
    axes[1].set_title(f'Gaussian Mixture Model: {column_name}')
    axes[1].set_xlabel(column_name)
    axes[1].set_ylabel('Probability Density')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Model selection: refit with 1..6 components and compare AIC / BIC
    # (lower is better for both criteria).
    n_components_range = range(1, 7)
    aics = []
    bics = []
    for n in n_components_range:
        gmm_temp = GaussianMixture(n_components=n, random_state=42)
        gmm_temp.fit(data_series)
        aics.append(gmm_temp.aic(data_series))
        bics.append(gmm_temp.bic(data_series))

    print(f"\nGMM Analysis for {column_name}:")
    print(f"Optimal components (AIC): {n_components_range[np.argmin(aics)]}")
    print(f"Optimal components (BIC): {n_components_range[np.argmin(bics)]}")

    return {
        'n_components': n_components,
        'means': gmm.means_.flatten(),
        'covariances': gmm.covariances_.flatten(),
        'weights': gmm.weights_
    }
In [ ]: