[Pema-Norbu] - Fab Futures - Data Science
In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# 1. Load data
df = pd.read_csv('players_15.csv')

# Select variables
df_clean = df[['overall', 'value_eur']].dropna()
X = df_clean[['overall']]
y = df_clean['value_eur']

print("Data loaded successfully!")
print(f"Number of observations: {len(df_clean)}")
print(f"Overall rating range: {X['overall'].min()} to {X['overall'].max()}")
print(f"Market value range: €{y.min():,.2f} to €{y.max():,.2f}")

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# 3. Polynomial Regression Analysis
print("\n" + "="*70)
print("POLYNOMIAL REGRESSION ANALYSIS")
print("="*70)

# Test different polynomial degrees
degrees = [1, 2, 3, 4, 5, 6]
results = []

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for i, degree in enumerate(degrees):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
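    # For this single input column, PolynomialFeatures(degree=d) expands each rating x
    # into the columns [1, x, x**2, ..., x**d]; LinearRegression then fits one weight
    # per column, which is what lets the fitted curve bend.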
    
    # Train model
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train_poly)
    y_test_pred = model.predict(X_test_poly)
    
    # Calculate metrics
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    
    # Store results
    results.append({
        'degree': degree,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'model': model,
        'poly': poly
    })
    
    # Plot results
    ax = axes[i]
    
    # Sort the training ratings for a smooth curve; keep them as a named DataFrame
    # column so the fitted PolynomialFeatures sees the feature name it was trained on
    X_sorted = np.sort(X_train['overall'].values)
    X_sorted_poly = poly.transform(pd.DataFrame({'overall': X_sorted}))
    y_sorted_pred = model.predict(X_sorted_poly)
    
    # Scatter plot of training data
    ax.scatter(X_train, y_train, alpha=0.3, s=20, label='Training Data', color='blue')
    
    # Polynomial curve
    ax.plot(X_sorted, y_sorted_pred, color='red', linewidth=3, 
            label=f'Degree {degree}')
    
    ax.set_xlabel('Overall Rating', fontsize=10)
    ax.set_ylabel('Market Value (EUR)', fontsize=10)
    ax.set_title(f'Polynomial Degree {degree}\nTrain R²: {train_r2:.3f}, Test R²: {test_r2:.3f}', 
                 fontsize=11, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    
    # Format y-axis with commas
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

plt.tight_layout()
plt.show()

# 4. Compare results
print("\n" + "="*70)
print("COMPARISON OF DIFFERENT POLYNOMIAL DEGREES")
print("="*70)

comparison_df = pd.DataFrame(results)
print(comparison_df[['degree', 'train_r2', 'test_r2', 'train_mse', 'test_mse']].round(4))

# 5. Find optimal degree (balance between complexity and performance)
print("\n" + "="*70)
print("FINDING OPTIMAL POLYNOMIAL DEGREE")
print("="*70)

# Plot R² comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# R² comparison
ax1.plot(comparison_df['degree'], comparison_df['train_r2'], 
         marker='o', linewidth=2, markersize=8, label='Training R²')
ax1.plot(comparison_df['degree'], comparison_df['test_r2'], 
         marker='s', linewidth=2, markersize=8, label='Testing R²')
ax1.set_xlabel('Polynomial Degree', fontsize=12)
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('R² Score vs Polynomial Degree', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# MSE comparison
ax2.plot(comparison_df['degree'], comparison_df['train_mse'], 
         marker='o', linewidth=2, markersize=8, label='Training MSE')
ax2.plot(comparison_df['degree'], comparison_df['test_mse'], 
         marker='s', linewidth=2, markersize=8, label='Testing MSE')
ax2.set_xlabel('Polynomial Degree', fontsize=12)
ax2.set_ylabel('Mean Squared Error', fontsize=12)
ax2.set_title('MSE vs Polynomial Degree', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_yscale('log')  # Log scale for better visualization

plt.tight_layout()
plt.show()

# 6. Detailed analysis with the selected degree (degree 3 balances fit quality against model complexity)
optimal_degree = 3
print(f"\nSelected optimal degree: {optimal_degree}")

# Get the model for optimal degree
optimal_result = results[optimal_degree - 1]  # -1 because list starts at degree 1
optimal_model = optimal_result['model']
optimal_poly = optimal_result['poly']

# Transform all data
X_poly = optimal_poly.transform(X)
X_poly_train = optimal_poly.transform(X_train)
X_poly_test = optimal_poly.transform(X_test)

# Make predictions
y_pred_all = optimal_model.predict(X_poly)
y_pred_train = optimal_model.predict(X_poly_train)
y_pred_test = optimal_model.predict(X_poly_test)

# 7. Detailed visualization of optimal model
print("\n" + "="*70)
print(f"DETAILED ANALYSIS: POLYNOMIAL DEGREE {optimal_degree}")
print("="*70)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Subplot 1: Polynomial fit
ax1 = axes[0, 0]
# Generate a smooth curve for visualization, built as a DataFrame so the fitted
# transformer sees the expected 'overall' feature name
X_range = pd.DataFrame({'overall': np.linspace(X['overall'].min(), X['overall'].max(), 100)})
X_range_poly = optimal_poly.transform(X_range)
y_range_pred = optimal_model.predict(X_range_poly)

ax1.scatter(X, y, alpha=0.4, s=30, label='Actual Data', color='blue')
ax1.plot(X_range['overall'], y_range_pred, color='red', linewidth=3,
         label=f'Polynomial (Degree {optimal_degree})')
ax1.set_xlabel('Overall Rating', fontsize=12)
ax1.set_ylabel('Market Value (EUR)', fontsize=12)
ax1.set_title(f'Polynomial Regression Fit (Degree {optimal_degree})\n'
              f'Overall R²: {r2_score(y, y_pred_all):.4f}', 
              fontsize=14, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)
ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

# Subplot 2: Residuals plot
ax2 = axes[0, 1]
residuals = y - y_pred_all
ax2.scatter(y_pred_all, residuals, alpha=0.5, s=30)
ax2.axhline(y=0, color='red', linestyle='--', linewidth=2)
ax2.set_xlabel('Predicted Market Value (EUR)', fontsize=12)
ax2.set_ylabel('Residuals', fontsize=12)
ax2.set_title('Residuals vs Predicted Values', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

# Subplot 3: Distribution of residuals
ax3 = axes[1, 0]
ax3.hist(residuals, bins=40, edgecolor='black', alpha=0.7)
ax3.axvline(x=0, color='red', linestyle='--', linewidth=2)
ax3.set_xlabel('Residuals', fontsize=12)
ax3.set_ylabel('Frequency', fontsize=12)
ax3.set_title('Distribution of Residuals', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)

# Add statistics text
mean_residual = residuals.mean()
std_residual = residuals.std()
textstr = f'Mean: {mean_residual:,.0f}\nStd: {std_residual:,.0f}'
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax3.text(0.95, 0.95, textstr, transform=ax3.transAxes, fontsize=10,
         verticalalignment='top', horizontalalignment='right', bbox=props)

# Subplot 4: Actual vs Predicted
ax4 = axes[1, 1]
ax4.scatter(y, y_pred_all, alpha=0.5, s=30)
# Perfect prediction line
max_val = max(y.max(), y_pred_all.max())
min_val = min(y.min(), y_pred_all.min())
ax4.plot([min_val, max_val], [min_val, max_val], 
         color='red', linestyle='--', linewidth=2, label='Perfect Prediction')
ax4.set_xlabel('Actual Market Value (EUR)', fontsize=12)
ax4.set_ylabel('Predicted Market Value (EUR)', fontsize=12)
ax4.set_title('Actual vs Predicted Values', fontsize=14, fontweight='bold')
ax4.legend()
ax4.grid(True, alpha=0.3)
ax4.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
ax4.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

plt.tight_layout()
plt.show()

# 8. Model coefficients and equation
print("\n" + "="*70)
print("MODEL COEFFICIENTS AND EQUATION")
print("="*70)

coefficients = optimal_model.coef_
intercept = optimal_model.intercept_
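# Note: coefficients[0] pairs with the bias column that PolynomialFeatures adds and is
# effectively zero here, because LinearRegression fits its own intercept; that is why
# the loop below skips it.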

print(f"Intercept: {intercept:,.2f}")
print("\nCoefficients:")
for i, coef in enumerate(coefficients):
    if i == 0:
        continue  # Skip the first coefficient (it's for x^0 = 1)
    print(f"  x^{i}: {coef:,.2f}")

# Print the polynomial equation
equation = f"y = {intercept:,.2f}"
for i, coef in enumerate(coefficients[1:], 1):
    if coef != 0:
        equation += f" + {coef:,.2f} * x^{i}"
print(f"\nPolynomial Equation (Degree {optimal_degree}):")
print(equation)
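# Sanity check (editorial sketch): the printed equation can be verified by evaluating
# the fitted cubic directly at a sample rating, e.g. 80. Using the unrounded
# coefficients reproduces the model's own prediction; the 2-decimal values printed
# above differ only through rounding.
check_rating = 80
manual_value = intercept + sum(coef * check_rating**i for i, coef in enumerate(coefficients))
model_value = optimal_model.predict(optimal_poly.transform(pd.DataFrame({'overall': [check_rating]})))[0]
assert np.isclose(manual_value, model_value)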

# 9. Statistical analysis with statsmodels
print("\n" + "="*70)
print("STATISTICAL ANALYSIS WITH STATSMODELS")
print("="*70)

# Refit the degree-3 model with statsmodels to obtain standard errors and p-values.
# X_poly already contains the constant column from PolynomialFeatures, so add_constant
# (with its default has_constant='skip') leaves it unchanged.
X_sm_poly = sm.add_constant(X_poly, has_constant='skip')

# Fit model with statsmodels
model_sm = sm.OLS(y, X_sm_poly).fit()
print(model_sm.summary())
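# The summary above flags a large condition number (~9e7). That is expected when raw
# ratings in the 40-93 range are raised to the third power, and it points to numerical
# conditioning rather than a flaw in the fit. A minimal sketch of one common remedy,
# using the StandardScaler imported above (the fitted curve is the same cubic, just
# parameterized in standardized units):
from sklearn.pipeline import make_pipeline
scaled_cubic = make_pipeline(StandardScaler(),
                             PolynomialFeatures(degree=optimal_degree),
                             LinearRegression())
scaled_cubic.fit(X_train, y_train)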

# 10. Predictions for specific values
print("\n" + "="*70)
print("PREDICTION EXAMPLES")
print("="*70)

# Sample overall ratings, built as a DataFrame so the fitted transformer sees the
# expected 'overall' feature name
sample_overall = pd.DataFrame({'overall': [65, 70, 75, 80, 85, 90, 92, 94]})
sample_poly = optimal_poly.transform(sample_overall)
sample_predictions = optimal_model.predict(sample_poly)

print(f"{'Overall Rating':<15} {'Predicted Value (€)':<20}")
print("-" * 35)
for overall, pred in zip(sample_overall['overall'], sample_predictions):
    print(f"{overall:<15} {pred:>20,.2f}")

# 11. Compare with linear regression
print("\n" + "="*70)
print("COMPARISON WITH LINEAR REGRESSION")
print("="*70)

# Linear regression for comparison
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_linear_pred = linear_model.predict(X_test)
linear_r2 = r2_score(y_test, y_linear_pred)

print(f"Linear Regression R² (test): {linear_r2:.4f}")
print(f"Polynomial Regression R² (test, degree {optimal_degree}): {optimal_result['test_r2']:.4f}")
print(f"Improvement: {(optimal_result['test_r2'] - linear_r2):.4f}")
print(f"Percentage Improvement: {((optimal_result['test_r2'] - linear_r2) / linear_r2 * 100):.2f}%")

# 12. Cross-validation for more reliable results
print("\n" + "="*70)
print("CROSS-VALIDATION RESULTS")
print("="*70)

from sklearn.model_selection import cross_val_score

# Cross-validation for polynomial regression
cv_scores_poly = cross_val_score(
    optimal_model, X_poly, y, 
    cv=5,  # 5-fold cross-validation
    scoring='r2'
)

# Cross-validation for linear regression
cv_scores_linear = cross_val_score(
    linear_model, X, y,
    cv=5,
    scoring='r2'
)
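# The folds above are taken in file order (cross_val_score defaults to an unshuffled
# KFold for regressors), and players_15.csv is sorted by overall rating, so each fold
# covers a different rating band; that is the main reason the R² scores printed below
# are so strongly negative. A minimal editorial sketch of a fairer estimate, assuming
# shuffled folds and refitting the polynomial expansion inside each fold:
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
shuffled_cv = KFold(n_splits=5, shuffle=True, random_state=42)
poly_pipeline = make_pipeline(PolynomialFeatures(degree=optimal_degree),
                              LinearRegression())
cv_scores_shuffled = cross_val_score(poly_pipeline, X, y, cv=shuffled_cv, scoring='r2')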

print(f"Polynomial Regression (Degree {optimal_degree}) Cross-validation R²:")
print(f"  Scores: {cv_scores_poly.round(4)}")
print(f"  Mean: {cv_scores_poly.mean():.4f}")
print(f"  Std: {cv_scores_poly.std():.4f}")

print(f"\nLinear Regression Cross-validation R²:")
print(f"  Scores: {cv_scores_linear.round(4)}")
print(f"  Mean: {cv_scores_linear.mean():.4f}")
print(f"  Std: {cv_scores_linear.std():.4f}")

# 13. Save results
print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

# Create results dataframe
results_df = pd.DataFrame({
    'overall': X['overall'],
    'actual_value': y,
    'predicted_value_linear': linear_model.predict(X),
    'predicted_value_poly': y_pred_all,
    'residuals_linear': y - linear_model.predict(X),
    'residuals_poly': residuals
})

# Save to CSV
results_df.to_csv('polynomial_regression_results.csv', index=False)
print("Results saved to 'polynomial_regression_results.csv'")

# 14. Final summary
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

print(f"Dataset: {len(df_clean)} players")
print(f"Optimal polynomial degree: {optimal_degree}")
print(f"\nModel Performance:")
print(f"  Training R²: {optimal_result['train_r2']:.4f}")
print(f"  Testing R²:  {optimal_result['test_r2']:.4f}")
print(f"  Cross-validation R² (mean): {cv_scores_poly.mean():.4f}")
print(f"\nComparison with Linear Regression:")
print(f"  Linear R²: {linear_r2:.4f}")
print(f"  Polynomial R² improvement: {(optimal_result['test_r2'] - linear_r2):.4f}")

# Check for overfitting
if optimal_result['train_r2'] - optimal_result['test_r2'] > 0.1:
    print("\n⚠️ WARNING: Potential overfitting detected!")
    print(f"  Gap between train and test R²: {(optimal_result['train_r2'] - optimal_result['test_r2']):.3f}")
else:
    print("\n✓ Good generalization: Train and test performance are similar")
Data loaded successfully!
Number of observations: 16155
Overall rating range: 40 to 93
Market value range: €0.00 to €100,500,000.00

Training set size: 12924
Testing set size: 3231

======================================================================
POLYNOMIAL REGRESSION ANALYSIS
======================================================================
[Figure: market value vs. overall rating with fitted curves for polynomial degrees 1-6; each panel title reports the train and test R²]
======================================================================
COMPARISON OF DIFFERENT POLYNOMIAL DEGREES
======================================================================
   degree  train_r2  test_r2     train_mse      test_mse
0       1    0.3354   0.2892  4.813830e+12  7.646020e+12
1       2    0.6737   0.6260  2.363396e+12  4.023383e+12
2       3    0.8601   0.8440  1.013520e+12  1.678392e+12
3       4    0.9199   0.9285  5.802198e+11  7.687071e+11
4       5    0.9261   0.9394  5.355409e+11  6.522237e+11
5       6    0.9324   0.9515  4.894018e+11  5.217910e+11

======================================================================
FINDING OPTIMAL POLYNOMIAL DEGREE
======================================================================
[Figure: R² score (left) and MSE on a log scale (right) versus polynomial degree, for the training and testing sets]
Selected optimal degree: 3

======================================================================
DETAILED ANALYSIS: POLYNOMIAL DEGREE 3
======================================================================
[Figure: degree-3 polynomial fit, residuals vs. predicted values, residual distribution, and actual vs. predicted market values]
======================================================================
MODEL COEFFICIENTS AND EQUATION
======================================================================
Intercept: -292,393,284.39

Coefficients:
  x^1: 14,920,580.30
  x^2: -251,978.93
  x^3: 1,410.06

Polynomial Equation (Degree 3):
y = -292,393,284.39 + 14,920,580.30 * x^1 + -251,978.93 * x^2 + 1,410.06 * x^3

======================================================================
STATISTICAL ANALYSIS WITH STATSMODELS
======================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              value_eur   R-squared:                       0.856
Model:                            OLS   Adj. R-squared:                  0.856
Method:                 Least Squares   F-statistic:                 3.212e+04
Date:                Wed, 10 Dec 2025   Prob (F-statistic):               0.00
Time:                        05:09:20   Log-Likelihood:            -2.4718e+05
No. Observations:               16155   AIC:                         4.944e+05
Df Residuals:                   16151   BIC:                         4.944e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3.117e+08   2.68e+06   -116.171      0.000   -3.17e+08   -3.06e+08
x1          1.586e+07   1.26e+05    125.742      0.000    1.56e+07    1.61e+07
x2         -2.671e+05   1959.543   -136.299      0.000   -2.71e+05   -2.63e+05
x3          1489.9646     10.058    148.132      0.000    1470.249    1509.680
==============================================================================
Omnibus:                    26126.467   Durbin-Watson:                   0.542
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        104528424.872
Skew:                           9.905   Prob(JB):                         0.00
Kurtosis:                     396.568   Cond. No.                     9.11e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.11e+07. This might indicate that there are
strong multicollinearity or other numerical problems.

======================================================================
PREDICTION EXAMPLES
======================================================================
Overall Rating  Predicted Value (€) 
-----------------------------------
65                         72,202.14
70                      1,002,432.42
75                      4,139,385.27
80                     10,540,608.48
85                     21,263,649.83
90                     37,366,057.12
92                     45,550,133.21
94                     54,831,798.55

======================================================================
COMPARISON WITH LINEAR REGRESSION
======================================================================
Linear Regression R² (test): 0.2892
Polynomial Regression R² (test, degree 3): 0.8440
Improvement: 0.5548
Percentage Improvement: 191.82%

======================================================================
CROSS-VALIDATION RESULTS
======================================================================
Polynomial Regression (Degree 3) Cross-validation R²:
  Scores: [ 1.63400000e-01 -4.50770000e+00 -1.09956000e+01 -1.68437000e+01
 -4.68287011e+04]
  Mean: -9372.1769
  Std: 18728.2629

Linear Regression Cross-validation R²:
  Scores: [-2.8160000e-01 -3.0739300e+01 -4.0040700e+01 -1.0491400e+01
 -8.9687326e+03]
  Mean: -1810.0571
  Std: 3579.3655

======================================================================
SAVING RESULTS
======================================================================
Results saved to 'polynomial_regression_results.csv'

======================================================================
FINAL SUMMARY
======================================================================
Dataset: 16155 players
Optimal polynomial degree: 3

Model Performance:
  Training R²: 0.8601
  Testing R²:  0.8440
  Cross-validation R² (mean): -9372.1769

Comparison with Linear Regression:
  Linear R²: 0.2892
  Polynomial R² improvement: 0.5548

✓ Good generalization: Train and test performance are similar