Fitting¶
In [3]:
# First, let's load the data from the datasets folder
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Load the data from datasets folder
df = pd.read_csv('datasets/players_21.csv')
# Display basic info about the dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
# Let's see what columns are available
print("\nColumn names (first 20):")
print(df.columns.tolist()[:20])
# Check data types
print("\nData types (first 20):")
print(df.dtypes.head(20))
# For linear regression, let's select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumber of numeric columns: {len(numeric_cols)}")
# Example 1: Simple linear regression - Predicting overall rating
# Let's select some key attributes
print("\n" + "="*50)
print("EXAMPLE 1: Predicting Player Overall Rating")
print("="*50)
features = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
target = 'overall'
# Check if our features exist
for feature in features:
    if feature in df.columns:
        print(f"✓ {feature} available")
    else:
        print(f"✗ {feature} NOT found")
# Create a clean dataset
clean_df = df[features + [target]].dropna()
print(f"\nOriginal dataset size: {len(df)}")
print(f"Clean dataset size: {len(clean_df)}")
print(f"Percentage of data kept: {len(clean_df)/len(df)*100:.1f}%")
# Split the data
X = clean_df[features]
y = clean_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print("\nModel Performance:")
print("-" * 30)
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R-squared Score: {r2_score(y_test, y_pred):.4f}")
# Show the coefficients
print("\nFeature Coefficients (Impact on Overall Rating):")
print("-" * 40)
for feature, coef in zip(features, model.coef_):
    print(f"{feature:12}: {coef:+.4f}")
print(f"\nIntercept: {model.intercept_:.4f}")
# Visualize predictions vs actual values
plt.figure(figsize=(12, 5))
# Plot 1: Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.5, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Overall Rating')
plt.ylabel('Predicted Overall Rating')
plt.title('Actual vs Predicted')
plt.grid(True, alpha=0.3)
# Plot 2: Residual plot
plt.subplot(1, 2, 2)
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.5, color='green')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 2: Predicting player's potential
print("\n" + "="*50)
print("EXAMPLE 2: Predicting Player Potential")
print("="*50)
potential_features = ['overall', 'age', 'international_reputation', 'skill_moves', 'weak_foot']
potential_target = 'potential'
# Check if features exist
for feature in potential_features:
    if feature in df.columns:
        print(f"✓ {feature} available")
    else:
        print(f"✗ {feature} NOT found")
# Create clean dataset for potential
potential_df = df[potential_features + [potential_target]].dropna()
print(f"\nData for potential prediction: {len(potential_df)} rows")
X_pot = potential_df[potential_features]
y_pot = potential_df[potential_target]
X_train_pot, X_test_pot, y_train_pot, y_test_pot = train_test_split(
X_pot, y_pot, test_size=0.2, random_state=42
)
# Train model
pot_model = LinearRegression()
pot_model.fit(X_train_pot, y_train_pot)
# Make predictions
y_pred_pot = pot_model.predict(X_test_pot)
# Evaluate
print("\nPotential Prediction Performance:")
print("-" * 30)
print(f"R-squared Score: {r2_score(y_test_pot, y_pred_pot):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_pot, y_pred_pot):.4f}")
# Show coefficients
print("\nFeature Coefficients (Impact on Potential):")
print("-" * 40)
for feature, coef in zip(potential_features, pot_model.coef_):
    print(f"{feature:25}: {coef:+.4f}")
print(f"\nIntercept: {pot_model.intercept_:.4f}")
# Example 3: Quick correlation heatmap
print("\n" + "="*50)
print("EXAMPLE 3: Correlation Heatmap (First 15 numeric columns)")
print("="*50)
# Select first 15 numeric columns for visualization
corr_cols = numeric_cols[:15] if len(numeric_cols) >= 15 else numeric_cols
corr_matrix = df[corr_cols].corr()
plt.figure(figsize=(12, 8))
plt.imshow(corr_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(label='Correlation')
plt.xticks(range(len(corr_cols)), corr_cols, rotation=45, ha='right')
plt.yticks(range(len(corr_cols)), corr_cols)
plt.title('Correlation Heatmap of Player Attributes')
plt.tight_layout()
plt.show()
# Show top correlations with overall rating
if 'overall' in df.columns:
    overall_corr = df[numeric_cols].corrwith(df['overall']).sort_values(ascending=False)
    print("\nTop 10 attributes correlated with Overall Rating:")
    print("-" * 50)
    for i, (attr, corr) in enumerate(overall_corr.head(10).items(), 1):
        print(f"{i:2}. {attr:30}: {corr:.4f}")
Dataset shape: (18944, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 200389 https://sofifa.com/player/200389/jan-oblak/210002
3 188545 https://sofifa.com/player/188545/robert-lewand...
4 190871 https://sofifa.com/player/190871/neymar-da-sil...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 33 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 35 1985-02-05
2 J. Oblak Jan Oblak 27 1993-01-07
3 R. Lewandowski Robert Lewandowski 31 1988-08-21
4 Neymar Jr Neymar da Silva Santos Júnior 28 1992-02-05
height_cm weight_kg nationality club_name ... lwb ldm \
0 170 72 Argentina FC Barcelona ... 66+3 65+3
1 187 83 Portugal Juventus ... 65+3 61+3
2 188 87 Slovenia Atlético Madrid ... 32+3 36+3
3 184 80 Poland FC Bayern München ... 64+3 65+3
4 175 68 Brazil Paris Saint-Germain ... 67+3 62+3
cdm rdm rwb lb lcb cb rcb rb
0 65+3 65+3 66+3 62+3 52+3 52+3 52+3 62+3
1 61+3 61+3 65+3 61+3 54+3 54+3 54+3 61+3
2 36+3 36+3 32+3 32+3 33+3 33+3 33+3 32+3
3 65+3 65+3 64+3 61+3 60+3 60+3 60+3 61+3
4 62+3 62+3 67+3 62+3 49+3 49+3 49+3 62+3
[5 rows x 106 columns]
Column names (first 20):
['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot']
Data types (first 20):
sofifa_id int64
player_url object
short_name object
long_name object
age int64
dob object
height_cm int64
weight_kg int64
nationality object
club_name object
league_name object
league_rank float64
overall int64
potential int64
value_eur int64
wage_eur int64
player_positions object
preferred_foot object
international_reputation int64
weak_foot int64
dtype: object
Number of numeric columns: 62
==================================================
EXAMPLE 1: Predicting Player Overall Rating
==================================================
✓ pace available
✓ shooting available
✓ passing available
✓ dribbling available
✓ defending available
✓ physic available
Original dataset size: 18944
Clean dataset size: 16861
Percentage of data kept: 89.0%
Training set size: 13488
Testing set size: 3373
Model Performance:
------------------------------
Mean Absolute Error: 2.7576
Mean Squared Error: 11.8920
Root Mean Squared Error: 3.4485
R-squared Score: 0.7439
Feature Coefficients (Impact on Overall Rating):
----------------------------------------
pace : +0.0064
shooting : +0.0752
passing : +0.0844
dribbling : +0.3130
defending : +0.1122
physic : +0.2574
Intercept: 14.7282
==================================================
EXAMPLE 2: Predicting Player Potential
==================================================
✓ overall available
✓ age available
✓ international_reputation available
✓ skill_moves available
✓ weak_foot available
Data for potential prediction: 18944 rows
Potential Prediction Performance:
------------------------------
R-squared Score: 0.8056
Mean Absolute Error: 2.0595
Feature Coefficients (Impact on Potential):
----------------------------------------
overall                  : +0.8287
age                      : -0.9567
international_reputation : +1.6677
skill_moves              : -0.3096
weak_foot                : +0.0443
Intercept: 39.5871
==================================================
EXAMPLE 3: Correlation Heatmap (First 15 numeric columns)
==================================================
Top 10 attributes correlated with Overall Rating:
--------------------------------------------------
 1. overall                       : 1.0000
 2. gk_diving                     : 0.9503
 3. gk_reflexes                   : 0.9478
 4. gk_positioning                : 0.9469
 5. gk_handling                   : 0.9293
 6. movement_reactions            : 0.8672
 7. gk_kicking                    : 0.8104
 8. passing                       : 0.7115
 9. mentality_composure           : 0.7053
10. dribbling                     : 0.6411
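The fitted model can also be used for one-off predictions. Here is a minimal sketch, assuming the cell above has been run so that `model` and `features` are still in scope; the attribute values are hypothetical, not a real player from the dataset:

In [ ]:
# Hypothetical attribute profile (pace, shooting, passing, dribbling, defending, physic)
# -- illustrative values only, not taken from the dataset
example_player = pd.DataFrame([[85, 80, 75, 84, 40, 70]], columns=features)
predicted_overall = model.predict(example_player)[0]
print(f"Predicted overall rating: {predicted_overall:.1f}")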
Polynomial Regression¶
In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# Load the data from datasets folder
df = pd.read_csv('datasets/players_21.csv')
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Example 1: Simple Polynomial Regression with 2 features
print("\n" + "="*60)
print("POLYNOMIAL REGRESSION EXAMPLE 1: Overall Rating Prediction")
print("="*60)
# Select features for polynomial regression
features = ['pace', 'shooting'] # Start with just 2 features for visualization
target = 'overall'
# Create clean dataset
clean_df = df[features + [target]].dropna()
print(f"Data points available: {len(clean_df)}")
X = clean_df[features]
y = clean_df[target]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
# Try different polynomial degrees
degrees = [1, 2, 3, 4]
results = []
for degree in degrees:
    print(f"\n--- Polynomial Degree {degree} ---")
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Train linear regression on polynomial features
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Make predictions
    y_pred = model.predict(X_test_poly)
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results.append({
        'degree': degree,
        'r2': r2,
        'mae': mae,
        'rmse': rmse,
        'num_features': X_train_poly.shape[1]
    })
    print(f"Number of polynomial features: {X_train_poly.shape[1]}")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
# Compare results
print("\n" + "="*60)
print("COMPARISON OF DIFFERENT POLYNOMIAL DEGREES")
print("="*60)
results_df = pd.DataFrame(results)
print(results_df[['degree', 'r2', 'mae', 'rmse', 'num_features']])
# Visualize the results
plt.figure(figsize=(14, 5))
# Plot 1: R² Score vs Polynomial Degree
plt.subplot(1, 3, 1)
plt.plot(results_df['degree'], results_df['r2'], 'bo-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('R² Score')
plt.title('R² Score vs Polynomial Degree')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['r2']+0.01, f"{row['r2']:.3f}",
             ha='center', va='bottom')
# Plot 2: RMSE vs Polynomial Degree
plt.subplot(1, 3, 2)
plt.plot(results_df['degree'], results_df['rmse'], 'ro-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('RMSE')
plt.title('RMSE vs Polynomial Degree')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['rmse']+0.05, f"{row['rmse']:.3f}",
             ha='center', va='bottom')
# Plot 3: Number of Features vs Polynomial Degree
plt.subplot(1, 3, 3)
plt.plot(results_df['degree'], results_df['num_features'], 'go-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('Number of Features')
plt.title('Feature Explosion in Polynomial Regression')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['num_features']+5, f"{row['num_features']}",
             ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Example 2: Polynomial Regression with more features
print("\n" + "="*60)
print("POLYNOMIAL REGRESSION EXAMPLE 2: With 4 Features")
print("="*60)
features = ['pace', 'shooting', 'passing', 'dribbling']
target = 'overall'
# Create clean dataset
clean_df = df[features + [target]].dropna()
print(f"Data points available: {len(clean_df)}")
X = clean_df[features]
y = clean_df[target]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Try degree 2 polynomial
degree = 2
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
print(f"\nWith {len(features)} features and degree {degree}:")
print(f"Original features: {len(features)}")
print(f"Polynomial features: {X_train_poly.shape[1]}")
# Train model
model = LinearRegression()
model.fit(X_train_poly, y_train)
# Make predictions
y_pred = model.predict(X_test_poly)
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"\nPerformance Metrics (Degree {degree}):")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
# Visualize predictions vs actual
plt.figure(figsize=(12, 5))
# Plot 1: Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', edgecolors='black', linewidth=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'r--', linewidth=2, label='Perfect Prediction')
plt.xlabel('Actual Overall Rating')
plt.ylabel('Predicted Overall Rating')
plt.title(f'Actual vs Predicted (Degree {degree} Polynomial)')
plt.legend()
plt.grid(True, alpha=0.3)
# Add some statistics to the plot
plt.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}',
transform=plt.gca().transAxes, fontsize=12,
verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# Plot 2: Residuals distribution
plt.subplot(1, 2, 2)
residuals = y_test - y_pred
plt.hist(residuals, bins=30, alpha=0.7, color='green', edgecolor='black')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 3: Check for overfitting with different polynomial degrees
print("\n" + "="*60)
print("EXAMPLE 3: Checking for Overfitting")
print("="*60)
features = ['pace', 'shooting'] # Simple case for visualization
target = 'overall'
clean_df = df[features + [target]].dropna()
X = clean_df[features]
y = clean_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_scores = []
test_scores = []
degrees_range = range(1, 7)
for degree in degrees_range:
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Train model
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Calculate scores
    train_score = model.score(X_train_poly, y_train)
    test_score = model.score(X_test_poly, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"Degree {degree}: Train R² = {train_score:.4f}, Test R² = {test_score:.4f}")
# Plot training vs test scores
plt.figure(figsize=(10, 6))
plt.plot(degrees_range, train_scores, 'bo-', label='Training Score', linewidth=2, markersize=8)
plt.plot(degrees_range, test_scores, 'ro-', label='Testing Score', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('R² Score')
plt.title('Training vs Testing Scores (Overfitting Check)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(degrees_range)
# Highlight potential overfitting
for i, degree in enumerate(degrees_range):
    if i > 0 and test_scores[i] < test_scores[i-1]:
        plt.axvspan(degree-0.5, degree+0.5, alpha=0.2, color='red',
                    label='Potential Overfitting' if i == 3 else "")
plt.tight_layout()
plt.show()
print("\nNote: When test score starts decreasing while train score keeps increasing,")
print("that's a sign of overfitting!")
Dataset loaded successfully!
Shape: (18944, 106)
============================================================
POLYNOMIAL REGRESSION EXAMPLE 1: Overall Rating Prediction
============================================================
Data points available: 16861
Training set: 13488 samples
Testing set: 3373 samples

--- Polynomial Degree 1 ---
Number of polynomial features: 3
R² Score: 0.2255
MAE: 4.7824
RMSE: 5.9972

--- Polynomial Degree 2 ---
Number of polynomial features: 6
R² Score: 0.3234
MAE: 4.3865
RMSE: 5.6051

--- Polynomial Degree 3 ---
Number of polynomial features: 10
R² Score: 0.3534
MAE: 4.2537
RMSE: 5.4797

--- Polynomial Degree 4 ---
Number of polynomial features: 15
R² Score: 0.3642
MAE: 4.2193
RMSE: 5.4335

============================================================
COMPARISON OF DIFFERENT POLYNOMIAL DEGREES
============================================================
   degree        r2       mae      rmse  num_features
0       1  0.225492  4.782364  5.997174             3
1       2  0.323443  4.386485  5.605131             6
2       3  0.353392  4.253653  5.479670            10
3       4  0.364246  4.219285  5.433482            15
============================================================
POLYNOMIAL REGRESSION EXAMPLE 2: With 4 Features
============================================================
Data points available: 16861

With 4 features and degree 2:
Original features: 4
Polynomial features: 15

Performance Metrics (Degree 2):
R² Score: 0.6578
MAE: 2.9401
RMSE: 3.9862
============================================================
EXAMPLE 3: Checking for Overfitting
============================================================
Degree 1: Train R² = 0.2430, Test R² = 0.2255
Degree 2: Train R² = 0.3333, Test R² = 0.3234
Degree 3: Train R² = 0.3610, Test R² = 0.3534
Degree 4: Train R² = 0.3711, Test R² = 0.3642
Degree 5: Train R² = 0.3754, Test R² = 0.3667
Degree 6: Train R² = 0.3763, Test R² = 0.3675
Note: When test score starts decreasing while train score keeps increasing,
that's a sign of overfitting!
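A single train/test split can make the degree comparison noisy. A minimal sketch of a more robust check, assuming `X` and `y` from the cell above are still in scope: 5-fold cross-validation with a pipeline, so the polynomial expansion is fit only on each fold's training portion.

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
for degree in range(1, 7):
    # Chaining the expansion and the fit keeps validation folds unseen
    pipe = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    scores = cross_val_score(pipe, X, y, cv=5, scoring='r2')
    print(f"Degree {degree}: CV R² = {scores.mean():.4f} ± {scores.std():.4f}")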
Professor Neil's fitting code¶
In [6]:
import numpy as np
import matplotlib.pyplot as plt
xmin = 0
xmax = 2
noise = 0.05
npts = 100
np.set_printoptions(precision=3)
np.random.seed(10)
c = [-.3,1,0.5]
print(f"data generation coefficients: {c}")
x = xmin+(xmax-xmin)*np.random.rand(npts) # generate random x
y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts) # evaluate polynomial at x and add noise
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial
coeff2 = np.polyfit(x,y,2) # fit second-order polynomial
xfit = np.linspace(xmin,xmax,npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
plt.figure()
plt.plot(x,y,'o')
plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()
plt.show()
data generation coefficients: [-0.3, 1, 0.5]
first-order fit coefficients: [0.402 0.711]
second-order fit coefficients: [-0.308 1.006 0.507]
In [7]:
import numpy as np
import matplotlib.pyplot as plt
xmin = 0
xmax = 2
noise = 0.05
npts = 100
np.set_printoptions(precision=3)
np.random.seed(10)
c = [-.3,1,0.5]
print(f"data generation coefficients: {c}")
data generation coefficients: [-0.3, 1, 0.5]
In [8]:
x = xmin+(xmax-xmin)*np.random.rand(npts) # generate random x
y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts) # evaluate polynomial at x and add noise
In [9]:
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial
coeff2 = np.polyfit(x,y,2) # fit second-order polynomial
In [10]:
xfit = np.linspace(xmin,xmax,npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
first-order fit coefficients: [0.40152718 0.7111448 ]
In [11]:
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
second-order fit coefficients: [-0.30779691 1.00618689 0.5073186 ]
In [12]:
plt.figure()
plt.plot(x,y,'o')
Out[12]:
[<matplotlib.lines.Line2D at 0xe218f7482990>]
In [13]:
plt.plot(xfit,yfit2,'r-',label='quadratic')
Out[13]:
[<matplotlib.lines.Line2D at 0xe218f77d0050>]
In [14]:
plt.plot(xfit,yfit1,'g-',label='linear')
Out[14]:
[<matplotlib.lines.Line2D at 0xe218f8b53c50>]
In [15]:
plt.plot(x,y,'o')
plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()
plt.show()
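To put a number on the visual difference between the two fits, a small sketch (reusing `x`, `y`, `pfit1`, and `pfit2` from the cells above) that computes each fit's RMSE at the original sample points; the quadratic fit's RMSE should land near the injected noise level of 0.05:

In [ ]:
# RMSE of each fit, evaluated at the original sample points
rmse1 = np.sqrt(np.mean((y - pfit1(x))**2))
rmse2 = np.sqrt(np.mean((y - pfit2(x))**2))
print(f"first-order fit RMSE: {rmse1:.4f}")
print(f"second-order fit RMSE: {rmse2:.4f}")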
Linear vs Quadratic Fitting¶
In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv('datasets/players_21.csv')
print("Dataset loaded!")
print(f"Shape: {df.shape}")
# Example 1: Simple 1D case - Pace vs Overall Rating (for visualization)
print("\n" + "="*60)
print("EXAMPLE 1: Linear vs Quadratic - Pace vs Overall Rating")
print("="*60)
# Select features
feature = 'pace' # Just one feature for clear visualization
target = 'overall'
# Clean the data
clean_df = df[[feature, target]].dropna()
X = clean_df[[feature]].values
y = clean_df[target].values
print(f"Data points: {len(X)}")
# Sort for better visualization
sorted_idx = np.argsort(X.flatten())
X_sorted = X[sorted_idx]
y_sorted = y[sorted_idx]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# LINEAR REGRESSION
print("\n--- LINEAR REGRESSION ---")
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Make predictions
y_pred_linear = linear_model.predict(X_test)
# Calculate metrics
linear_r2 = r2_score(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print(f"R² Score: {linear_r2:.4f}")
print(f"MAE: {linear_mae:.4f}")
print(f"RMSE: {linear_rmse:.4f}")
print(f"Equation: y = {linear_model.intercept_:.2f} + {linear_model.coef_[0]:.2f}x")
# QUADRATIC REGRESSION (Degree 2 Polynomial)
print("\n--- QUADRATIC REGRESSION ---")
# Create polynomial features (degree 2)
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
# Train quadratic model
quadratic_model = LinearRegression()
quadratic_model.fit(X_train_poly, y_train)
# Make predictions
y_pred_quad = quadratic_model.predict(X_test_poly)
# Calculate metrics
quad_r2 = r2_score(y_test, y_pred_quad)
quad_mae = mean_absolute_error(y_test, y_pred_quad)
quad_rmse = np.sqrt(mean_squared_error(y_test, y_pred_quad))
print(f"R² Score: {quad_r2:.4f}")
print(f"MAE: {quad_mae:.4f}")
print(f"RMSE: {quad_rmse:.4f}")
# Show quadratic equation coefficients
coefs = quadratic_model.coef_
intercept = quadratic_model.intercept_
print(f"Equation: y = {intercept:.2f} + {coefs[1]:.2f}x + {coefs[2]:.2f}x²")
# VISUALIZATION - 1D Case
plt.figure(figsize=(14, 10))
# Plot 1: Data with fitted curves
plt.subplot(2, 2, 1)
# Plot data points
plt.scatter(X_sorted, y_sorted, alpha=0.3, s=20, color='gray', label='Data points')
# Create smooth line for predictions
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
# Linear prediction line
y_linear_range = linear_model.predict(X_range)
plt.plot(X_range, y_linear_range, 'b-', linewidth=3, label=f'Linear (R²={linear_r2:.3f})')
# Quadratic prediction line
X_range_poly = poly.transform(X_range)
y_quad_range = quadratic_model.predict(X_range_poly)
plt.plot(X_range, y_quad_range, 'r-', linewidth=3, label=f'Quadratic (R²={quad_r2:.3f})')
plt.xlabel(f'{feature.capitalize()}')
plt.ylabel(f'{target.capitalize()}')
plt.title(f'Linear vs Quadratic Fit: {feature} vs {target}')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: Residuals comparison
plt.subplot(2, 2, 2)
# Calculate residuals
residuals_linear = y_test - y_pred_linear
residuals_quad = y_test - y_pred_quad
# Plot residual histograms
bins = np.linspace(-10, 10, 30)
plt.hist(residuals_linear, bins=bins, alpha=0.5, label=f'Linear (σ={np.std(residuals_linear):.2f})', color='blue')
plt.hist(residuals_quad, bins=bins, alpha=0.5, label=f'Quadratic (σ={np.std(residuals_quad):.2f})', color='red')
plt.axvline(x=0, color='black', linestyle='--')
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Residual Distributions Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 3: Performance metrics comparison
plt.subplot(2, 2, 3)
metrics = ['R² Score', 'MAE', 'RMSE']
linear_scores = [linear_r2, linear_mae, linear_rmse]
quad_scores = [quad_r2, quad_mae, quad_rmse]
x_pos = np.arange(len(metrics))
width = 0.35
plt.bar(x_pos - width/2, linear_scores, width, label='Linear', color='blue', alpha=0.7)
plt.bar(x_pos + width/2, quad_scores, width, label='Quadratic', color='red', alpha=0.7)
plt.ylabel('Score')
plt.title('Performance Metrics Comparison')
plt.xticks(x_pos, metrics)
plt.legend()
# Add value labels on bars
for i, v in enumerate(linear_scores):
    plt.text(i - width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
for i, v in enumerate(quad_scores):
    plt.text(i + width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
plt.grid(True, alpha=0.3, axis='y')
# Plot 4: Prediction vs Actual scatter
plt.subplot(2, 2, 4)
# Linear predictions
plt.scatter(y_test, y_pred_linear, alpha=0.5, color='blue', s=30, label=f'Linear (R²={linear_r2:.3f})')
# Quadratic predictions
plt.scatter(y_test, y_pred_quad, alpha=0.5, color='red', s=30, label=f'Quadratic (R²={quad_r2:.3f})')
# Perfect prediction line
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'k--', linewidth=2, label='Perfect Prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 2: 2D case - Pace and Shooting vs Overall Rating
print("\n" + "="*60)
print("EXAMPLE 2: Linear vs Quadratic - 2 Features")
print("="*60)
features = ['pace', 'shooting']
target = 'overall'
# Clean data
clean_df = df[features + [target]].dropna()
X = clean_df[features].values
y = clean_df[target].values
print(f"Data points: {len(X)}")
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# LINEAR REGRESSION
print("\n--- LINEAR REGRESSION (2 features) ---")
linear_model_2d = LinearRegression()
linear_model_2d.fit(X_train, y_train)
y_pred_linear_2d = linear_model_2d.predict(X_test)
linear_r2_2d = r2_score(y_test, y_pred_linear_2d)
print(f"R² Score: {linear_r2_2d:.4f}")
print(f"Equation: y = {linear_model_2d.intercept_:.2f} + {linear_model_2d.coef_[0]:.2f}x₁ + {linear_model_2d.coef_[1]:.2f}x₂")
# QUADRATIC REGRESSION
print("\n--- QUADRATIC REGRESSION (2 features) ---")
poly_2d = PolynomialFeatures(degree=2)
X_train_poly_2d = poly_2d.fit_transform(X_train)
X_test_poly_2d = poly_2d.transform(X_test)
print(f"Original features: {len(features)}")
print(f"Quadratic features: {X_train_poly_2d.shape[1]}")
quadratic_model_2d = LinearRegression()
quadratic_model_2d.fit(X_train_poly_2d, y_train)
y_pred_quad_2d = quadratic_model_2d.predict(X_test_poly_2d)
quad_r2_2d = r2_score(y_test, y_pred_quad_2d)
print(f"R² Score: {quad_r2_2d:.4f}")
# Calculate improvement
improvement = ((quad_r2_2d - linear_r2_2d) / linear_r2_2d) * 100
print(f"\nImprovement with quadratic terms: {improvement:.1f}%")
# Example 3: Try different features
print("\n" + "="*60)
print("EXAMPLE 3: Linear vs Quadratic with Different Feature Sets")
print("="*60)
feature_sets = [
['pace', 'shooting'],
['passing', 'dribbling'],
['defending', 'physic'],
['pace', 'shooting', 'passing']
]
results = []
for i, features in enumerate(feature_sets, 1):
    print(f"\nFeature Set {i}: {features}")
    # Clean data
    clean_df = df[features + [target]].dropna()
    if len(clean_df) < 100:  # Skip if not enough data
        print(" Not enough data, skipping...")
        continue
    X = clean_df[features].values
    y = clean_df[target].values
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Linear
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_linear = linear_model.predict(X_test)
    linear_r2 = r2_score(y_test, y_pred_linear)
    # Quadratic
    poly = PolynomialFeatures(degree=2)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    quadratic_model = LinearRegression()
    quadratic_model.fit(X_train_poly, y_train)
    y_pred_quad = quadratic_model.predict(X_test_poly)
    quad_r2 = r2_score(y_test, y_pred_quad)
    # Improvement
    improvement = ((quad_r2 - linear_r2) / linear_r2) * 100 if linear_r2 != 0 else 0
    results.append({
        'features': ', '.join(features),
        'linear_r2': linear_r2,
        'quad_r2': quad_r2,
        'improvement': improvement,
        'quad_features': X_train_poly.shape[1]
    })
    print(f" Linear R²: {linear_r2:.4f}")
    print(f" Quadratic R²: {quad_r2:.4f}")
    print(f" Improvement: {improvement:.1f}%")
    print(f" Quadratic features: {X_train_poly.shape[1]}")
# Display results table
print("\n" + "="*60)
print("SUMMARY: Linear vs Quadratic Performance")
print("="*60)
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))
# Plot comparison
plt.figure(figsize=(12, 6))
# Bar plot of R² scores
x_pos = np.arange(len(results))
width = 0.35
linear_scores = [r['linear_r2'] for r in results]
quad_scores = [r['quad_r2'] for r in results]
feature_labels = [r['features'][:20] + '...' if len(r['features']) > 20 else r['features']
for r in results]
plt.subplot(1, 2, 1)
bars1 = plt.bar(x_pos - width/2, linear_scores, width, label='Linear', color='blue', alpha=0.7)
bars2 = plt.bar(x_pos + width/2, quad_scores, width, label='Quadratic', color='red', alpha=0.7)
plt.ylabel('R² Score')
plt.title('Linear vs Quadratic R² Scores')
plt.xticks(x_pos, feature_labels, rotation=45, ha='right')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
# Add value labels
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.3f}', ha='center', va='bottom', fontsize=8)
# Improvement plot
plt.subplot(1, 2, 2)
improvements = [r['improvement'] for r in results]
colors = ['green' if imp > 0 else 'red' for imp in improvements]
bars = plt.bar(x_pos, improvements, color=colors, alpha=0.7)
plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
plt.ylabel('Improvement (%)')
plt.title('Improvement with Quadratic Terms')
plt.xticks(x_pos, feature_labels, rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')
# Add value labels
for bar, imp in zip(bars, improvements):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.,
             height + (1 if height >= 0 else -3),
             f'{imp:.1f}%', ha='center', va='bottom' if height >= 0 else 'top',
             fontsize=9, fontweight='bold')
plt.tight_layout()
plt.show()
print("\nKey Insights:")
print("1. Quadratic regression can capture non-linear relationships")
print("2. Watch for overfitting - quadratic has more parameters")
print("3. Improvement varies based on feature relationships")
print("4. Always check if the complexity is justified by the performance gain")
Dataset loaded!
Shape: (18944, 106)
============================================================
EXAMPLE 1: Linear vs Quadratic - Pace vs Overall Rating
============================================================
Data points: 16861

--- LINEAR REGRESSION ---
R² Score: 0.0371
MAE: 5.3021
RMSE: 6.6870
Equation: y = 57.01 + 0.13x

--- QUADRATIC REGRESSION ---
R² Score: 0.0853
MAE: 5.1695
RMSE: 6.5175
Equation: y = 90.74 + (-0.94)x + (0.01)x²
============================================================
EXAMPLE 2: Linear vs Quadratic - 2 Features
============================================================
Data points: 16861
--- LINEAR REGRESSION (2 features) ---
R² Score: 0.2255
Equation: y = 51.82 + 0.02x₁ + 0.24x₂
--- QUADRATIC REGRESSION (2 features) ---
Original features: 2
Quadratic features: 6
R² Score: 0.3234
Improvement with quadratic terms: 43.4%
============================================================
EXAMPLE 3: Linear vs Quadratic with Different Feature Sets
============================================================
Feature Set 1: ['pace', 'shooting']
Linear R²: 0.2255
Quadratic R²: 0.3234
Improvement: 43.4%
Quadratic features: 6
Feature Set 2: ['passing', 'dribbling']
Linear R²: 0.5055
Quadratic R²: 0.5700
Improvement: 12.8%
Quadratic features: 6
Feature Set 3: ['defending', 'physic']
Linear R²: 0.2964
Quadratic R²: 0.3544
Improvement: 19.6%
Quadratic features: 6
Feature Set 4: ['pace', 'shooting', 'passing']
Linear R²: 0.5001
Quadratic R²: 0.6480
Improvement: 29.6%
Quadratic features: 10
============================================================
SUMMARY: Linear vs Quadratic Performance
============================================================
features linear_r2 quad_r2 improvement quad_features
pace, shooting 0.225492 0.323443 43.438847 6
passing, dribbling 0.505487 0.569983 12.759051 6
defending, physic 0.296367 0.354391 19.578583 6
pace, shooting, passing 0.500144 0.648009 29.564385 10
Key Insights:
1. Quadratic regression can capture non-linear relationships
2. Watch for overfitting - quadratic has more parameters
3. Improvement varies based on feature relationships
4. Always check if the complexity is justified by the performance gain
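One quick way to act on insight 4, checking whether the extra quadratic terms earn their keep, is adjusted R², which penalizes every added parameter. A minimal sketch, assuming `linear_r2`, `quad_r2`, `y_test`, `X_train`, and `X_train_poly` from the last feature set above are still in scope (applying the adjustment to held-out scores is a rough heuristic, not a formal test):

In [ ]:
def adjusted_r2(r2, n, p):
    # n = number of samples, p = number of predictors (excluding intercept)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)
n = len(y_test)  # test-set size from the last feature set above
print(f"Linear adjusted R²:    {adjusted_r2(linear_r2, n, X_train.shape[1]):.4f}")
print(f"Quadratic adjusted R²: {adjusted_r2(quad_r2, n, X_train_poly.shape[1] - 1):.4f}")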
In [ ]: