In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Set visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# 1. Load data
df = pd.read_csv('players_15.csv')
# Select variables
df_clean = df[['overall', 'value_eur']].dropna()
X = df_clean[['overall']]
y = df_clean['value_eur']
print("Data loaded successfully!")
print(f"Number of observations: {len(df_clean)}")
print(f"Overall rating range: {X['overall'].min()} to {X['overall'].max()}")
print(f"Market value range: €{y.min():,.2f} to €{y.max():,.2f}")
# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
# 3. Polynomial Regression Analysis
print("\n" + "="*70)
print("POLYNOMIAL REGRESSION ANALYSIS")
print("="*70)
# Test different polynomial degrees
degrees = [1, 2, 3, 4, 5, 6]
results = []
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()
for i, degree in enumerate(degrees):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Train model
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Make predictions
    y_train_pred = model.predict(X_train_poly)
    y_test_pred = model.predict(X_test_poly)
    # Calculate metrics
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    # Store results
    results.append({
        'degree': degree,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'model': model,
        'poly': poly
    })
    # Plot results
    ax = axes[i]
    # Sort for smooth curve
    X_sorted = np.sort(X_train.values.flatten())
    X_sorted_poly = poly.transform(X_sorted.reshape(-1, 1))
    y_sorted_pred = model.predict(X_sorted_poly)
    # Scatter plot of training data
    ax.scatter(X_train, y_train, alpha=0.3, s=20, label='Training Data', color='blue')
    # Polynomial curve
    ax.plot(X_sorted, y_sorted_pred, color='red', linewidth=3,
            label=f'Degree {degree}')
    ax.set_xlabel('Overall Rating', fontsize=10)
    ax.set_ylabel('Market Value (EUR)', fontsize=10)
    ax.set_title(f'Polynomial Degree {degree}\nTrain R²: {train_r2:.3f}, Test R²: {test_r2:.3f}',
                 fontsize=11, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    # Format y-axis with commas
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.tight_layout()
plt.show()
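# Note on the UserWarning in this cell's output ("X does not have valid feature names"):
# PolynomialFeatures was fitted on a DataFrame, but transform() is called on plain NumPy
# arrays (X_sorted here, and X_range / sample_overall further down). A minimal sketch of
# one way to avoid the warning, assuming the single feature is named 'overall' as above,
# is to rebuild a DataFrame before transforming:
X_sorted_df = pd.DataFrame(X_sorted.reshape(-1, 1), columns=['overall'])
_ = poly.transform(X_sorted_df)  # same values as poly.transform(X_sorted.reshape(-1, 1)), no warning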
# 4. Compare results
print("\n" + "="*70)
print("COMPARISON OF DIFFERENT POLYNOMIAL DEGREES")
print("="*70)
comparison_df = pd.DataFrame(results)
print(comparison_df[['degree', 'train_r2', 'test_r2', 'train_mse', 'test_mse']].round(4))
# 5. Find optimal degree (balance between complexity and performance)
print("\n" + "="*70)
print("FINDING OPTIMAL POLYNOMIAL DEGREE")
print("="*70)
# Plot R² comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# R² comparison
ax1.plot(comparison_df['degree'], comparison_df['train_r2'],
marker='o', linewidth=2, markersize=8, label='Training R²')
ax1.plot(comparison_df['degree'], comparison_df['test_r2'],
marker='s', linewidth=2, markersize=8, label='Testing R²')
ax1.set_xlabel('Polynomial Degree', fontsize=12)
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('R² Score vs Polynomial Degree', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)
# MSE comparison
ax2.plot(comparison_df['degree'], comparison_df['train_mse'],
marker='o', linewidth=2, markersize=8, label='Training MSE')
ax2.plot(comparison_df['degree'], comparison_df['test_mse'],
marker='s', linewidth=2, markersize=8, label='Testing MSE')
ax2.set_xlabel('Polynomial Degree', fontsize=12)
ax2.set_ylabel('Mean Squared Error', fontsize=12)
ax2.set_title('MSE vs Polynomial Degree', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_yscale('log') # Log scale for better visualization
plt.tight_layout()
plt.show()
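# Hedged alternative to hand-picking the degree below: select it from the comparison
# table programmatically, e.g. the degree with the highest test R². This is only a
# sketch for reference; degree 3 is kept below as a deliberate trade-off between fit
# and model complexity.
best_test_degree = int(comparison_df.loc[comparison_df['test_r2'].idxmax(), 'degree'])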
# 6. Detailed analysis with the selected degree (degree 3, chosen as a balance between fit and complexity)
optimal_degree = 3
print(f"\nSelected optimal degree: {optimal_degree}")
# Get the model for optimal degree
optimal_result = results[optimal_degree - 1] # -1 because list starts at degree 1
optimal_model = optimal_result['model']
optimal_poly = optimal_result['poly']
# Transform all data
X_poly = optimal_poly.transform(X)
X_poly_train = optimal_poly.transform(X_train)
X_poly_test = optimal_poly.transform(X_test)
# Make predictions
y_pred_all = optimal_model.predict(X_poly)
y_pred_train = optimal_model.predict(X_poly_train)
y_pred_test = optimal_model.predict(X_poly_test)
# 7. Detailed visualization of optimal model
print("\n" + "="*70)
print(f"DETAILED ANALYSIS: POLYNOMIAL DEGREE {optimal_degree}")
print("="*70)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Subplot 1: Polynomial fit
ax1 = axes[0, 0]
# Generate smooth curve for visualization
X_range = np.linspace(X['overall'].min(), X['overall'].max(), 100).reshape(-1, 1)
X_range_poly = optimal_poly.transform(X_range)
y_range_pred = optimal_model.predict(X_range_poly)
ax1.scatter(X, y, alpha=0.4, s=30, label='Actual Data', color='blue')
ax1.plot(X_range, y_range_pred, color='red', linewidth=3,
label=f'Polynomial (Degree {optimal_degree})')
ax1.set_xlabel('Overall Rating', fontsize=12)
ax1.set_ylabel('Market Value (EUR)', fontsize=12)
ax1.set_title(f'Polynomial Regression Fit (Degree {optimal_degree})\n'
f'Overall R²: {r2_score(y, y_pred_all):.4f}',
fontsize=14, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)
ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
# Subplot 2: Residuals plot
ax2 = axes[0, 1]
residuals = y - y_pred_all
ax2.scatter(y_pred_all, residuals, alpha=0.5, s=30)
ax2.axhline(y=0, color='red', linestyle='--', linewidth=2)
ax2.set_xlabel('Predicted Market Value (EUR)', fontsize=12)
ax2.set_ylabel('Residuals', fontsize=12)
ax2.set_title('Residuals vs Predicted Values', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
# Subplot 3: Distribution of residuals
ax3 = axes[1, 0]
ax3.hist(residuals, bins=40, edgecolor='black', alpha=0.7)
ax3.axvline(x=0, color='red', linestyle='--', linewidth=2)
ax3.set_xlabel('Residuals', fontsize=12)
ax3.set_ylabel('Frequency', fontsize=12)
ax3.set_title('Distribution of Residuals', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)
# Add statistics text
mean_residual = residuals.mean()
std_residual = residuals.std()
textstr = f'Mean: {mean_residual:,.0f}\nStd: {std_residual:,.0f}'
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax3.text(0.95, 0.95, textstr, transform=ax3.transAxes, fontsize=10,
verticalalignment='top', horizontalalignment='right', bbox=props)
# Subplot 4: Actual vs Predicted
ax4 = axes[1, 1]
ax4.scatter(y, y_pred_all, alpha=0.5, s=30)
# Perfect prediction line
max_val = max(y.max(), y_pred_all.max())
min_val = min(y.min(), y_pred_all.min())
ax4.plot([min_val, max_val], [min_val, max_val],
color='red', linestyle='--', linewidth=2, label='Perfect Prediction')
ax4.set_xlabel('Actual Market Value (EUR)', fontsize=12)
ax4.set_ylabel('Predicted Market Value (EUR)', fontsize=12)
ax4.set_title('Actual vs Predicted Values', fontsize=14, fontweight='bold')
ax4.legend()
ax4.grid(True, alpha=0.3)
ax4.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
ax4.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.tight_layout()
plt.show()
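# The residual plots above are easier to judge alongside a couple of shape statistics.
# A small sketch, assuming SciPy is available (it ships alongside scikit-learn installs):
from scipy import stats
residual_skew = stats.skew(residuals)                  # > 0 indicates a long right tail
residual_excess_kurtosis = stats.kurtosis(residuals)   # > 0 indicates heavier-than-normal tails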
# 8. Model coefficients and equation
print("\n" + "="*70)
print("MODEL COEFFICIENTS AND EQUATION")
print("="*70)
coefficients = optimal_model.coef_
intercept = optimal_model.intercept_
print(f"Intercept: {intercept:,.2f}")
print("\nCoefficients:")
for i, coef in enumerate(coefficients):
    if i == 0:
        continue  # Skip the bias-column coefficient (x^0); LinearRegression stores the intercept separately
    print(f" x^{i}: {coef:,.2f}")
# Print the polynomial equation
equation = f"y = {intercept:,.2f}"
for i, coef in enumerate(coefficients[1:], 1):
    if coef != 0:
        equation += f" + {coef:,.2f} * x^{i}"
print(f"\nPolynomial Equation (Degree {optimal_degree}):")
print(equation)
# 9. Statistical analysis with statsmodels
print("\n" + "="*70)
print("STATISTICAL ANALYSIS WITH STATSMODELS")
print("="*70)
# Build the statsmodels design matrix; X_poly already includes a bias column from
# PolynomialFeatures, so add_constant (default has_constant='skip') leaves it unchanged
X_sm_poly = sm.add_constant(X_poly)
# Fit model with statsmodels
model_sm = sm.OLS(y, X_sm_poly).fit()
print(model_sm.summary())
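# The OLS summary above reports very large skew and kurtosis for this fit, which is
# typical when modelling raw EUR values. A hedged sketch of a common alternative (not
# used in the rest of this notebook) is to model the log of market value instead;
# log1p is used so the zero-valued players do not break the transform:
log_model = LinearRegression().fit(X_poly_train, np.log1p(y_train))
log_test_r2 = r2_score(np.log1p(y_test), log_model.predict(X_poly_test))  # R² on the log scale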
# 10. Predictions for specific values
print("\n" + "="*70)
print("PREDICTION EXAMPLES")
print("="*70)
# Sample overall ratings
sample_overall = np.array([[65], [70], [75], [80], [85], [90], [92], [94]])
sample_poly = optimal_poly.transform(sample_overall)
sample_predictions = optimal_model.predict(sample_poly)
print(f"{'Overall Rating':<15} {'Predicted Value (€)':<20}")
print("-" * 35)
for overall, pred in zip(sample_overall.flatten(), sample_predictions):
    print(f"{overall:<15} {pred:>20,.2f}")
# 11. Compare with linear regression
print("\n" + "="*70)
print("COMPARISON WITH LINEAR REGRESSION")
print("="*70)
# Linear regression for comparison
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_linear_pred = linear_model.predict(X_test)
linear_r2 = r2_score(y_test, y_linear_pred)
print(f"Linear Regression R² (test): {linear_r2:.4f}")
print(f"Polynomial Regression R² (test, degree {optimal_degree}): {optimal_result['test_r2']:.4f}")
print(f"Improvement: {(optimal_result['test_r2'] - linear_r2):.4f}")
print(f"Percentage Improvement: {((optimal_result['test_r2'] - linear_r2) / linear_r2 * 100):.2f}%")
# 12. Cross-validation for more reliable results
print("\n" + "="*70)
print("CROSS-VALIDATION RESULTS")
print("="*70)
from sklearn.model_selection import cross_val_score
# Cross-validation for polynomial regression
cv_scores_poly = cross_val_score(
optimal_model, X_poly, y,
cv=5, # 5-fold cross-validation
scoring='r2'
)
# Cross-validation for linear regression
cv_scores_linear = cross_val_score(
linear_model, X, y,
cv=5,
scoring='r2'
)
print(f"Polynomial Regression (Degree {optimal_degree}) Cross-validation R²:")
print(f" Scores: {cv_scores_poly.round(4)}")
print(f" Mean: {cv_scores_poly.mean():.4f}")
print(f" Std: {cv_scores_poly.std():.4f}")
print(f"\nLinear Regression Cross-validation R²:")
print(f" Scores: {cv_scores_linear.round(4)}")
print(f" Mean: {cv_scores_linear.mean():.4f}")
print(f" Std: {cv_scores_linear.std():.4f}")
# 13. Save results
print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)
# Create results dataframe
results_df = pd.DataFrame({
'overall': X['overall'],
'actual_value': y,
'predicted_value_linear': linear_model.predict(X),
'predicted_value_poly': y_pred_all,
'residuals_linear': y - linear_model.predict(X),
'residuals_poly': residuals
})
# Save to CSV
results_df.to_csv('polynomial_regression_results.csv', index=False)
print("Results saved to 'polynomial_regression_results.csv'")
# 14. Final summary
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)
print(f"Dataset: {len(df_clean)} players")
print(f"Optimal polynomial degree: {optimal_degree}")
print(f"\nModel Performance:")
print(f" Training R²: {optimal_result['train_r2']:.4f}")
print(f" Testing R²: {optimal_result['test_r2']:.4f}")
print(f" Cross-validation R² (mean): {cv_scores_poly.mean():.4f}")
print(f"\nComparison with Linear Regression:")
print(f" Linear R²: {linear_r2:.4f}")
print(f" Polynomial R² improvement: {(optimal_result['test_r2'] - linear_r2):.4f}")
# Check for overfitting
if optimal_result['train_r2'] - optimal_result['test_r2'] > 0.1:
    print("\n⚠️ WARNING: Potential overfitting detected!")
    print(f" Gap between train and test R²: {(optimal_result['train_r2'] - optimal_result['test_r2']):.3f}")
else:
    print("\n✓ Good generalization: Train and test performance are similar")
Data loaded successfully!
Number of observations: 16155
Overall rating range: 40 to 93
Market value range: €0.00 to €100,500,000.00

Training set size: 12924
Testing set size: 3231

======================================================================
POLYNOMIAL REGRESSION ANALYSIS
======================================================================
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(
(the warning above is emitted once per polynomial degree, six times in total)
======================================================================
COMPARISON OF DIFFERENT POLYNOMIAL DEGREES
======================================================================
   degree  train_r2  test_r2     train_mse      test_mse
0       1    0.3354   0.2892  4.813830e+12  7.646020e+12
1       2    0.6737   0.6260  2.363396e+12  4.023383e+12
2       3    0.8601   0.8440  1.013520e+12  1.678392e+12
3       4    0.9199   0.9285  5.802198e+11  7.687071e+11
4       5    0.9261   0.9394  5.355409e+11  6.522237e+11
5       6    0.9324   0.9515  4.894018e+11  5.217910e+11

======================================================================
FINDING OPTIMAL POLYNOMIAL DEGREE
======================================================================
Selected optimal degree: 3

======================================================================
DETAILED ANALYSIS: POLYNOMIAL DEGREE 3
======================================================================
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(
======================================================================
MODEL COEFFICIENTS AND EQUATION
======================================================================
Intercept: -292,393,284.39
Coefficients:
x^1: 14,920,580.30
x^2: -251,978.93
x^3: 1,410.06
Polynomial Equation (Degree 3):
y = -292,393,284.39 + 14,920,580.30 * x^1 + -251,978.93 * x^2 + 1,410.06 * x^3
======================================================================
STATISTICAL ANALYSIS WITH STATSMODELS
======================================================================
OLS Regression Results
==============================================================================
Dep. Variable: value_eur R-squared: 0.856
Model: OLS Adj. R-squared: 0.856
Method: Least Squares F-statistic: 3.212e+04
Date: Wed, 10 Dec 2025 Prob (F-statistic): 0.00
Time: 05:09:20 Log-Likelihood: -2.4718e+05
No. Observations: 16155 AIC: 4.944e+05
Df Residuals: 16151 BIC: 4.944e+05
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -3.117e+08 2.68e+06 -116.171 0.000 -3.17e+08 -3.06e+08
x1 1.586e+07 1.26e+05 125.742 0.000 1.56e+07 1.61e+07
x2 -2.671e+05 1959.543 -136.299 0.000 -2.71e+05 -2.63e+05
x3 1489.9646 10.058 148.132 0.000 1470.249 1509.680
==============================================================================
Omnibus: 26126.467 Durbin-Watson: 0.542
Prob(Omnibus): 0.000 Jarque-Bera (JB): 104528424.872
Skew: 9.905 Prob(JB): 0.00
Kurtosis: 396.568 Cond. No. 9.11e+07
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.11e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
======================================================================
PREDICTION EXAMPLES
======================================================================
Overall Rating Predicted Value (€)
-----------------------------------
65 72,202.14
70 1,002,432.42
75 4,139,385.27
80 10,540,608.48
85 21,263,649.83
90 37,366,057.12
92 45,550,133.21
94 54,831,798.55
======================================================================
COMPARISON WITH LINEAR REGRESSION
======================================================================
Linear Regression R² (test): 0.2892
Polynomial Regression R² (test, degree 3): 0.8440
Improvement: 0.5548
Percentage Improvement: 191.82%
======================================================================
CROSS-VALIDATION RESULTS
======================================================================
Polynomial Regression (Degree 3) Cross-validation R²:
Scores: [ 1.63400000e-01 -4.50770000e+00 -1.09956000e+01 -1.68437000e+01
-4.68287011e+04]
Mean: -9372.1769
Std: 18728.2629
Linear Regression Cross-validation R²:
Scores: [-2.8160000e-01 -3.0739300e+01 -4.0040700e+01 -1.0491400e+01
-8.9687326e+03]
Mean: -1810.0571
Std: 3579.3655
======================================================================
SAVING RESULTS
======================================================================
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(
Results saved to 'polynomial_regression_results.csv'

======================================================================
FINAL SUMMARY
======================================================================
Dataset: 16155 players
Optimal polynomial degree: 3

Model Performance:
 Training R²: 0.8601
 Testing R²: 0.8440
 Cross-validation R² (mean): -9372.1769

Comparison with Linear Regression:
 Linear R²: 0.2892
 Polynomial R² improvement: 0.5548

✓ Good generalization: Train and test performance are similar
In [ ]: