# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')
# Load the data
df = pd.read_csv('datasets/players_15.csv')
# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum().sort_values(ascending=False).head(20))
Dataset Shape: (16155, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
Column names:
['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'team_position', 'team_jersey_number', 'loaned_from', 'joined', 'contract_valid_until', 'nation_position', 'nation_jersey_number', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']
Data types:
sofifa_id int64
player_url object
short_name object
long_name object
age int64
...
lb object
lcb object
cb object
rcb object
rb object
Length: 106, dtype: object
Missing values:
mentality_composure 16155
release_clause_eur 16155
loaned_from 15243
nation_jersey_number 15074
nation_position 15074
player_tags 14919
gk_speed 14380
gk_kicking 14380
gk_handling 14380
gk_diving 14380
gk_positioning 14380
gk_reflexes 14380
player_traits 9556
pace 1775
physic 1775
defending 1775
dribbling 1775
passing 1775
shooting 1775
joined 1151
dtype: int64
understanding the above code
Real-World Analogy Imagine you're organizing a soccer team:
Import tools = Getting your clipboard, pen, calculator, and charts ready
Load data = Opening the folder with player information sheets
Check shape = Counting how many player sheets you have
First few rows = Looking at the first few player profiles
Column names = Seeing what information is recorded (age, goals, position, etc.)
Data types = Checking if height is written as a number or text
Missing values = Finding which players have incomplete information
Why This Is Important This is like exploring a new video game before playing:
You check how many levels there are
You see what buttons do what
You understand the game rules
You find any glitches or missing parts
Without this exploration, you might:
Try to do math with text data
Get errors because of missing information
Not understand what you're working with
Next steps (what you'll do after this):
Clean up the data (fill missing values, fix errors)
Analyze relationships (do taller players earn more?)
Build prediction models (predict player ratings or values)
Create visualizations (charts showing top players by position)
Analysis 1: Predicting Player's Overall Rating¶
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')
# Load the data
df = pd.read_csv('datasets/players_15.csv')
# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
# ========== DATA CLEANING ==========
print("\n" + "="*50)
print("CLEANING THE DATA")
print("="*50)
# Clean column names (remove extra spaces)
df.columns = df.columns.str.strip()
# Create a cleaner copy for analysis
df_clean = df.copy()
# Let's see what columns have missing values
print("\nMissing values summary:")
missing_summary = df_clean.isnull().sum()
print(missing_summary[missing_summary > 0])
# Step 1: Drop columns with too many missing values (> 50%)
threshold = len(df_clean) * 0.5
columns_to_drop = missing_summary[missing_summary > threshold].index.tolist()
print(f"\nDropping columns with more than 50% missing values: {columns_to_drop}")
df_clean = df_clean.drop(columns=columns_to_drop)
# Step 2: Drop columns that are not useful for prediction
columns_to_drop = [
'player_url', 'short_name', 'long_name', 'dob',
'player_tags', 'team_position', 'player_traits',
'loaned_from', 'joined', 'real_face', 'contract_valid_until',
'nation_position', 'team_jersey_number', 'nation_jersey_number'
]
# Only drop columns that exist
for col in columns_to_drop:
if col in df_clean.columns:
df_clean = df_clean.drop(columns=[col])
# Step 3: Select only numeric columns for initial analysis
# Get numeric columns only
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
# Also include some important categorical columns that we'll encode
categorical_cols_to_keep = ['preferred_foot', 'body_type', 'work_rate']
# Keep only numeric columns and selected categorical columns
all_cols_to_keep = numeric_cols + [col for col in categorical_cols_to_keep if col in df_clean.columns]
df_clean = df_clean[all_cols_to_keep]
print(f"\nAfter initial cleaning - shape: {df_clean.shape}")
# Step 4: Handle missing values in numeric columns
print("\nHandling missing values...")
imputer = SimpleImputer(strategy='median') # Use median to handle outliers
numeric_data = df_clean.select_dtypes(include=[np.number])
df_clean[numeric_data.columns] = imputer.fit_transform(numeric_data)
# Step 5: Encode categorical variables
print("\nEncoding categorical variables...")
le = LabelEncoder()
for col in ['preferred_foot', 'body_type']:
if col in df_clean.columns:
# Fill any remaining NaN with the most common value
df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
df_clean[col] = le.fit_transform(df_clean[col])
# Handle work_rate separately as it has format like "High/Low"
if 'work_rate' in df_clean.columns:
# Create two separate columns for attacking and defensive work rates
df_clean[['att_work_rate', 'def_work_rate']] = df_clean['work_rate'].str.split('/', expand=True)
# Encode the work rates
work_rate_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df_clean['att_work_rate'] = df_clean['att_work_rate'].map(work_rate_mapping).fillna(1)
df_clean['def_work_rate'] = df_clean['def_work_rate'].map(work_rate_mapping).fillna(1)
# Drop the original work_rate column
df_clean = df_clean.drop(columns=['work_rate'])
# Step 6: Final check for any remaining NaN values
print("\nFinal check for NaN values:")
print(f"Total NaN values remaining: {df_clean.isnull().sum().sum()}")
print(f"Columns with NaN: {df_clean.columns[df_clean.isnull().any()].tolist()}")
# Drop any remaining rows with NaN (should be very few if any)
initial_rows = len(df_clean)
df_clean = df_clean.dropna()
final_rows = len(df_clean)
print(f"\nRows before dropping NaN: {initial_rows}")
print(f"Rows after dropping NaN: {final_rows}")
print(f"Rows dropped: {initial_rows - final_rows}")
# Step 7: Make sure we have the target column
if 'overall' not in df_clean.columns:
print("\nERROR: 'overall' column not found in cleaned data!")
print(f"Available columns: {df_clean.columns.tolist()}")
else:
print(f"\n✅ Data cleaning complete!")
print(f"Final shape: {df_clean.shape}")
print(f"Number of features: {len(df_clean.columns) - 1}") # minus target
# Display some basic statistics
print("\nBasic statistics of cleaned data:")
print(df_clean[['overall', 'age', 'value_eur', 'wage_eur']].describe())
# ========== ANALYSIS 1: PREDICTING PLAYER'S OVERALL RATING ==========
print("\n" + "="*50)
print("ANALYSIS 1: PREDICTING OVERALL RATING")
print("="*50)
# Let's predict overall rating based on other attributes
# Define features and target
# First, drop non-numeric columns that might still be present
for col in df_clean.columns:
if df_clean[col].dtype == 'object':
print(f"Warning: Column '{col}' is still object type. Dropping it.")
df_clean = df_clean.drop(columns=[col])
# Make sure we have numeric data
df_clean = df_clean.select_dtypes(include=[np.number])
# Define X and y
X = df_clean.drop(columns=['overall', 'sofifa_id'])
y = df_clean['overall']
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
# Check for any infinite values
print(f"\nChecking for infinite values:")
print(f"Infinite values in X: {np.isinf(X.values).sum()}")
print(f"Infinite values in y: {np.isinf(y.values).sum()}")
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"\nData scaled successfully!")
# Train different models
models = {
'Linear Regression': LinearRegression(),
'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=5),
'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
}
results = {}
print("\nModel Performance:")
for name, model in models.items():
print(f"\nTraining {name}...")
try:
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
results[name] = {'RMSE': rmse, 'R²': r2}
print(f" RMSE: {rmse:.4f}")
print(f" R² Score: {r2:.4f}")
except Exception as e:
print(f" Error training {name}: {e}")
print("-" * 50)
# Feature importance from Random Forest
if 'Random Forest' in results:
print("\nFeature Importance Analysis:")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
# Get feature importances
feature_importances = pd.DataFrame({
'feature': X.columns,
'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features for Overall Rating:")
print(feature_importances.head(10))
# Visualize feature importance
plt.figure(figsize=(12, 8))
top_features = feature_importances.head(15)
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 15 Features for Predicting Overall Rating')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
# ========== ADDITIONAL SIMPLE VISUALIZATIONS ==========
print("\n" + "="*50)
print("ADDITIONAL VISUALIZATIONS")
print("="*50)
# 1. Distribution of Overall Ratings
plt.figure(figsize=(10, 6))
plt.hist(y, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
plt.xlabel('Overall Rating')
plt.ylabel('Number of Players')
plt.title('Distribution of Player Overall Ratings')
plt.grid(True, alpha=0.3)
plt.show()
# 2. Top 20 players by overall rating
if 'short_name' in df.columns:
top_players = df[['short_name', 'overall', 'age', 'club_name']].sort_values('overall', ascending=False).head(20)
print("\nTop 20 Players by Overall Rating:")
print(top_players)
else:
print("\nNote: 'short_name' column not available for displaying top players")
# 3. Age vs Overall Rating scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(df['age'], df['overall'], alpha=0.5, s=20, color='green')
plt.xlabel('Age')
plt.ylabel('Overall Rating')
plt.title('Age vs Overall Rating')
plt.grid(True, alpha=0.3)
plt.show()
# 4. Value vs Overall Rating (only if value_eur exists)
if 'value_eur' in df.columns:
plt.figure(figsize=(10, 6))
# Use log scale for value to better visualize
plt.scatter(np.log1p(df['value_eur']), df['overall'], alpha=0.5, s=20, color='purple')
plt.xlabel('Log(Value + 1) in EUR')
plt.ylabel('Overall Rating')
plt.title('Player Value vs Overall Rating (Log Scale)')
plt.grid(True, alpha=0.3)
plt.show()
# ========== SIMPLE CORRELATION ANALYSIS ==========
print("\n" + "="*50)
print("CORRELATION ANALYSIS")
print("="*50)
# Select a few key columns for correlation analysis
key_columns = ['overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg']
key_columns = [col for col in key_columns if col in df.columns]
if len(key_columns) > 1:
corr_matrix = df[key_columns].corr()
# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', square=True)
plt.title('Correlation Matrix of Key Player Attributes')
plt.tight_layout()
plt.show()
# Show top correlations with overall rating
if 'overall' in corr_matrix.columns:
overall_corr = corr_matrix['overall'].sort_values(ascending=False)
print("\nTop Correlations with Overall Rating:")
print(overall_corr.head(10))
print("\n" + "="*50)
print("ANALYSIS COMPLETE!")
print("="*50)
Dataset Shape: (16155, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 9014 https://sofifa.com/player/9014/arjen-robben/15...
3 41236 https://sofifa.com/player/41236/zlatan-ibrahim...
4 167495 https://sofifa.com/player/167495/manuel-neuer/...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 27 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 29 1985-02-05
2 A. Robben Arjen Robben 30 1984-01-23
3 Z. Ibrahimović Zlatan Ibrahimović 32 1981-10-03
4 M. Neuer Manuel Neuer 28 1986-03-27
height_cm weight_kg nationality club_name ... lwb ldm \
0 169 67 Argentina FC Barcelona ... 62+3 62+3
1 185 80 Portugal Real Madrid ... 63+3 63+3
2 180 80 Netherlands FC Bayern München ... 64+3 64+3
3 195 95 Sweden Paris Saint-Germain ... 61+3 65+3
4 193 92 Germany FC Bayern München ... 36+3 40+3
cdm rdm rwb lb lcb cb rcb rb
0 62+3 62+3 62+3 54+3 45+3 45+3 45+3 54+3
1 63+3 63+3 63+3 57+3 52+3 52+3 52+3 57+3
2 64+3 64+3 64+3 55+3 46+3 46+3 46+3 55+3
3 65+3 65+3 61+3 56+3 55+3 55+3 55+3 56+3
4 40+3 40+3 36+3 36+3 38+3 38+3 38+3 36+3
[5 rows x 106 columns]
Column names:
['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_tags', 'team_position', 'team_jersey_number', 'loaned_from', 'joined', 'contract_valid_until', 'nation_position', 'nation_jersey_number', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']
Data types:
sofifa_id int64
player_url object
short_name object
long_name object
age int64
...
lb object
lcb object
cb object
rcb object
rb object
Length: 106, dtype: object
==================================================
CLEANING THE DATA
==================================================
Missing values summary:
club_name 239
league_name 239
league_rank 239
release_clause_eur 16155
player_tags 14919
team_position 239
team_jersey_number 239
loaned_from 15243
joined 1151
contract_valid_until 239
nation_position 15074
nation_jersey_number 15074
pace 1775
shooting 1775
passing 1775
dribbling 1775
defending 1775
physic 1775
gk_diving 14380
gk_handling 14380
gk_kicking 14380
gk_reflexes 14380
gk_speed 14380
gk_positioning 14380
player_traits 9556
mentality_composure 16155
dtype: int64
Dropping columns with more than 50% missing values: ['release_clause_eur', 'player_tags', 'loaned_from', 'nation_position', 'nation_jersey_number', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'player_traits', 'mentality_composure']
After initial cleaning - shape: (16155, 54)
Handling missing values...
Encoding categorical variables...
Final check for NaN values:
Total NaN values remaining: 0
Columns with NaN: []
Rows before dropping NaN: 16155
Rows after dropping NaN: 16155
Rows dropped: 0
✅ Data cleaning complete!
Final shape: (16155, 55)
Number of features: 54
Basic statistics of cleaned data:
overall age value_eur wage_eur
count 16155.000000 16155.000000 1.615500e+04 16155.000000
mean 63.830393 24.776230 1.060882e+06 13056.453110
std 7.169896 4.625321 2.819128e+06 23488.182571
min 40.000000 16.000000 0.000000e+00 0.000000
25% 59.000000 21.000000 1.200000e+05 2000.000000
50% 64.000000 24.000000 3.500000e+05 5000.000000
75% 68.000000 28.000000 8.250000e+05 10000.000000
max 93.000000 44.000000 1.005000e+08 550000.000000
==================================================
ANALYSIS 1: PREDICTING OVERALL RATING
==================================================
Features shape: (16155, 53)
Target shape: (16155,)
Checking for infinite values:
Infinite values in X: 0
Infinite values in y: 0
Training set: (12924, 53)
Testing set: (3231, 53)
Data scaled successfully!
Model Performance:
Training Linear Regression...
RMSE: 1.8073
R² Score: 0.9365
Training Decision Tree...
RMSE: 1.5459
R² Score: 0.9536
Training Random Forest...
RMSE: 0.7002
R² Score: 0.9905
--------------------------------------------------
Feature Importance Analysis:
Top 10 Most Important Features for Overall Rating:
feature importance
5 value_eur 0.803973
6 wage_eur 0.137903
0 age 0.023435
4 potential 0.022868
14 defending 0.002350
29 movement_reactions 0.000599
42 defending_standing_tackle 0.000450
34 power_strength 0.000350
37 mentality_interceptions 0.000334
41 defending_marking 0.000320
================================================== ADDITIONAL VISUALIZATIONS ==================================================
Top 20 Players by Overall Rating:
short_name overall age club_name
0 L. Messi 93 27 FC Barcelona
1 Cristiano Ronaldo 92 29 Real Madrid
2 A. Robben 90 30 FC Bayern München
3 Z. Ibrahimović 90 32 Paris Saint-Germain
4 M. Neuer 90 28 FC Bayern München
5 L. Suárez 89 27 FC Barcelona
6 Iniesta 89 30 FC Barcelona
7 E. Hazard 88 23 Chelsea
8 R. van Persie 88 30 Manchester United
9 B. Schweinsteiger 88 29 FC Bayern München
10 F. Ribéry 88 31 FC Bayern München
11 Falcao 88 28 Manchester United
16 Thiago Silva 87 29 Paris Saint-Germain
18 L. Modrić 87 28 Real Madrid
17 David Silva 87 28 Manchester City
12 G. Bale 87 24 Real Madrid
15 Sergio Ramos 87 28 Real Madrid
14 P. Lahm 87 30 FC Bayern München
13 R. Lewandowski 87 25 FC Bayern München
26 S. Agüero 86 26 Manchester City
================================================== CORRELATION ANALYSIS ==================================================
Top Correlations with Overall Rating: overall 1.000000 potential 0.803952 wage_eur 0.705422 value_eur 0.568540 age 0.436976 weight_kg 0.123992 height_cm 0.050819 Name: overall, dtype: float64 ================================================== ANALYSIS COMPLETE! ==================================================