Fitting¶
In [3]:
# First, let's load the data from the datasets folder
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Load the data from datasets folder
df = pd.read_csv('datasets/players_21.csv')
# Display basic info about the dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
# Let's see what columns are available
print("\nColumn names (first 20):")
print(df.columns.tolist()[:20])
# Check data types
print("\nData types (first 20):")
print(df.dtypes.head(20))
# For linear regression, let's select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumber of numeric columns: {len(numeric_cols)}")
# Example 1: Simple linear regression - Predicting overall rating
# Let's select some key attributes
print("\n" + "="*50)
print("EXAMPLE 1: Predicting Player Overall Rating")
print("="*50)
features = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
target = 'overall'
# Check if our features exist
for feature in features:
    if feature in df.columns:
        print(f"✓ {feature} available")
    else:
        print(f"✗ {feature} NOT found")
# Create a clean dataset
clean_df = df[features + [target]].dropna()
print(f"\nOriginal dataset size: {len(df)}")
print(f"Clean dataset size: {len(clean_df)}")
print(f"Percentage of data kept: {len(clean_df)/len(df)*100:.1f}%")
# Split the data
X = clean_df[features]
y = clean_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print("\nModel Performance:")
print("-" * 30)
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R-squared Score: {r2_score(y_test, y_pred):.4f}")
# Show the coefficients
print("\nFeature Coefficients (Impact on Overall Rating):")
print("-" * 40)
for feature, coef in zip(features, model.coef_):
    print(f"{feature:12}: {coef:+.4f}")
print(f"\nIntercept: {model.intercept_:.4f}")
# Visualize predictions vs actual values
plt.figure(figsize=(12, 5))
# Plot 1: Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.5, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Overall Rating')
plt.ylabel('Predicted Overall Rating')
plt.title('Actual vs Predicted')
plt.grid(True, alpha=0.3)
# Plot 2: Residual plot
plt.subplot(1, 2, 2)
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.5, color='green')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 2: Predicting player's potential
print("\n" + "="*50)
print("EXAMPLE 2: Predicting Player Potential")
print("="*50)
potential_features = ['overall', 'age', 'international_reputation', 'skill_moves', 'weak_foot']
potential_target = 'potential'
# Check if features exist
for feature in potential_features:
    if feature in df.columns:
        print(f"✓ {feature} available")
    else:
        print(f"✗ {feature} NOT found")
# Create clean dataset for potential
potential_df = df[potential_features + [potential_target]].dropna()
print(f"\nData for potential prediction: {len(potential_df)} rows")
X_pot = potential_df[potential_features]
y_pot = potential_df[potential_target]
X_train_pot, X_test_pot, y_train_pot, y_test_pot = train_test_split(
X_pot, y_pot, test_size=0.2, random_state=42
)
# Train model
pot_model = LinearRegression()
pot_model.fit(X_train_pot, y_train_pot)
# Make predictions
y_pred_pot = pot_model.predict(X_test_pot)
# Evaluate
print("\nPotential Prediction Performance:")
print("-" * 30)
print(f"R-squared Score: {r2_score(y_test_pot, y_pred_pot):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_pot, y_pred_pot):.4f}")
# Show coefficients
print("\nFeature Coefficients (Impact on Potential):")
print("-" * 40)
for feature, coef in zip(potential_features, pot_model.coef_):
    print(f"{feature:25}: {coef:+.4f}")
print(f"\nIntercept: {pot_model.intercept_:.4f}")
# Example 3: Quick correlation heatmap
print("\n" + "="*50)
print("EXAMPLE 3: Correlation Heatmap (First 15 numeric columns)")
print("="*50)
# Select first 15 numeric columns for visualization
corr_cols = numeric_cols[:15] if len(numeric_cols) >= 15 else numeric_cols
corr_matrix = df[corr_cols].corr()
plt.figure(figsize=(12, 8))
plt.imshow(corr_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(label='Correlation')
plt.xticks(range(len(corr_cols)), corr_cols, rotation=45, ha='right')
plt.yticks(range(len(corr_cols)), corr_cols)
plt.title('Correlation Heatmap of Player Attributes')
plt.tight_layout()
plt.show()
# Show top correlations with overall rating
if 'overall' in df.columns:
    overall_corr = df[numeric_cols].corrwith(df['overall']).sort_values(ascending=False)
    print("\nTop 10 attributes correlated with Overall Rating:")
    print("-" * 50)
    for i, (attr, corr) in enumerate(overall_corr.head(10).items(), 1):
        print(f"{i:2}. {attr:30}: {corr:.4f}")
Dataset shape: (18944, 106)
First few rows:
sofifa_id player_url \
0 158023 https://sofifa.com/player/158023/lionel-messi/...
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-...
2 200389 https://sofifa.com/player/200389/jan-oblak/210002
3 188545 https://sofifa.com/player/188545/robert-lewand...
4 190871 https://sofifa.com/player/190871/neymar-da-sil...
short_name long_name age dob \
0 L. Messi Lionel Andrés Messi Cuccittini 33 1987-06-24
1 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 35 1985-02-05
2 J. Oblak Jan Oblak 27 1993-01-07
3 R. Lewandowski Robert Lewandowski 31 1988-08-21
4 Neymar Jr Neymar da Silva Santos Júnior 28 1992-02-05
height_cm weight_kg nationality club_name ... lwb ldm \
0 170 72 Argentina FC Barcelona ... 66+3 65+3
1 187 83 Portugal Juventus ... 65+3 61+3
2 188 87 Slovenia Atlético Madrid ... 32+3 36+3
3 184 80 Poland FC Bayern München ... 64+3 65+3
4 175 68 Brazil Paris Saint-Germain ... 67+3 62+3
cdm rdm rwb lb lcb cb rcb rb
0 65+3 65+3 66+3 62+3 52+3 52+3 52+3 62+3
1 61+3 61+3 65+3 61+3 54+3 54+3 54+3 61+3
2 36+3 36+3 32+3 32+3 33+3 33+3 33+3 32+3
3 65+3 65+3 64+3 61+3 60+3 60+3 60+3 61+3
4 62+3 62+3 67+3 62+3 49+3 49+3 49+3 62+3
[5 rows x 106 columns]
Column names (first 20):
['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club_name', 'league_name', 'league_rank', 'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation', 'weak_foot']
Data types (first 20):
sofifa_id int64
player_url object
short_name object
long_name object
age int64
dob object
height_cm int64
weight_kg int64
nationality object
club_name object
league_name object
league_rank float64
overall int64
potential int64
value_eur int64
wage_eur int64
player_positions object
preferred_foot object
international_reputation int64
weak_foot int64
dtype: object
Number of numeric columns: 62
==================================================
EXAMPLE 1: Predicting Player Overall Rating
==================================================
✓ pace available
✓ shooting available
✓ passing available
✓ dribbling available
✓ defending available
✓ physic available
Original dataset size: 18944
Clean dataset size: 16861
Percentage of data kept: 89.0%
Training set size: 13488
Testing set size: 3373
Model Performance:
------------------------------
Mean Absolute Error: 2.7576
Mean Squared Error: 11.8920
Root Mean Squared Error: 3.4485
R-squared Score: 0.7439
Feature Coefficients (Impact on Overall Rating):
----------------------------------------
pace : +0.0064
shooting : +0.0752
passing : +0.0844
dribbling : +0.3130
defending : +0.1122
physic : +0.2574
Intercept: 14.7282
==================================================
EXAMPLE 2: Predicting Player Potential
==================================================
✓ overall available
✓ age available
✓ international_reputation available
✓ skill_moves available
✓ weak_foot available
Data for potential prediction: 18944 rows
Potential Prediction Performance:
------------------------------
R-squared Score: 0.8056
Mean Absolute Error: 2.0595
Feature Coefficients (Impact on Potential):
----------------------------------------
overall                  : +0.8287
age                      : -0.9567
international_reputation : +1.6677
skill_moves              : -0.3096
weak_foot                : +0.0443
Intercept: 39.5871
==================================================
EXAMPLE 3: Correlation Heatmap (First 15 numeric columns)
==================================================
Top 10 attributes correlated with Overall Rating:
--------------------------------------------------
 1. overall                       : 1.0000
 2. gk_diving                     : 0.9503
 3. gk_reflexes                   : 0.9478
 4. gk_positioning                : 0.9469
 5. gk_handling                   : 0.9293
 6. movement_reactions            : 0.8672
 7. gk_kicking                    : 0.8104
 8. passing                       : 0.7115
 9. mentality_composure           : 0.7053
10. dribbling                     : 0.6411
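The fitted model can also be used for one-off predictions. Here is a minimal sketch, assuming the cell above has been run so that `model` and `features` are still in scope; the attribute values are hypothetical, not a real player from the dataset:

In [ ]:
# Hypothetical attribute profile (pace, shooting, passing, dribbling, defending, physic)
# -- illustrative values only, not taken from the dataset
example_player = pd.DataFrame([[85, 80, 75, 84, 40, 70]], columns=features)
predicted_overall = model.predict(example_player)[0]
print(f"Predicted overall rating: {predicted_overall:.1f}")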
Polynomial Regression¶
In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# Load the data from datasets folder
df = pd.read_csv('datasets/players_21.csv')
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Example 1: Simple Polynomial Regression with 2 features
print("\n" + "="*60)
print("POLYNOMIAL REGRESSION EXAMPLE 1: Overall Rating Prediction")
print("="*60)
# Select features for polynomial regression
features = ['pace', 'shooting'] # Start with just 2 features for visualization
target = 'overall'
# Create clean dataset
clean_df = df[features + [target]].dropna()
print(f"Data points available: {len(clean_df)}")
X = clean_df[features]
y = clean_df[target]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
# Try different polynomial degrees
degrees = [1, 2, 3, 4]
results = []
for degree in degrees:
    print(f"\n--- Polynomial Degree {degree} ---")
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Train linear regression on polynomial features
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Make predictions
    y_pred = model.predict(X_test_poly)
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results.append({
        'degree': degree,
        'r2': r2,
        'mae': mae,
        'rmse': rmse,
        'num_features': X_train_poly.shape[1]
    })
    print(f"Number of polynomial features: {X_train_poly.shape[1]}")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
# Compare results
print("\n" + "="*60)
print("COMPARISON OF DIFFERENT POLYNOMIAL DEGREES")
print("="*60)
results_df = pd.DataFrame(results)
print(results_df[['degree', 'r2', 'mae', 'rmse', 'num_features']])
# Visualize the results
plt.figure(figsize=(14, 5))
# Plot 1: R² Score vs Polynomial Degree
plt.subplot(1, 3, 1)
plt.plot(results_df['degree'], results_df['r2'], 'bo-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('R² Score')
plt.title('R² Score vs Polynomial Degree')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['r2']+0.01, f"{row['r2']:.3f}",
             ha='center', va='bottom')
# Plot 2: RMSE vs Polynomial Degree
plt.subplot(1, 3, 2)
plt.plot(results_df['degree'], results_df['rmse'], 'ro-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('RMSE')
plt.title('RMSE vs Polynomial Degree')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['rmse']+0.05, f"{row['rmse']:.3f}",
             ha='center', va='bottom')
# Plot 3: Number of Features vs Polynomial Degree
plt.subplot(1, 3, 3)
plt.plot(results_df['degree'], results_df['num_features'], 'go-', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('Number of Features')
plt.title('Feature Explosion in Polynomial Regression')
plt.grid(True, alpha=0.3)
for i, row in results_df.iterrows():
    plt.text(row['degree'], row['num_features']+5, f"{row['num_features']}",
             ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Example 2: Polynomial Regression with more features
print("\n" + "="*60)
print("POLYNOMIAL REGRESSION EXAMPLE 2: With 4 Features")
print("="*60)
features = ['pace', 'shooting', 'passing', 'dribbling']
target = 'overall'
# Create clean dataset
clean_df = df[features + [target]].dropna()
print(f"Data points available: {len(clean_df)}")
X = clean_df[features]
y = clean_df[target]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Try degree 2 polynomial
degree = 2
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
print(f"\nWith {len(features)} features and degree {degree}:")
print(f"Original features: {len(features)}")
print(f"Polynomial features: {X_train_poly.shape[1]}")
# Train model
model = LinearRegression()
model.fit(X_train_poly, y_train)
# Make predictions
y_pred = model.predict(X_test_poly)
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"\nPerformance Metrics (Degree {degree}):")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
# Visualize predictions vs actual
plt.figure(figsize=(12, 5))
# Plot 1: Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', edgecolors='black', linewidth=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'r--', linewidth=2, label='Perfect Prediction')
plt.xlabel('Actual Overall Rating')
plt.ylabel('Predicted Overall Rating')
plt.title(f'Actual vs Predicted (Degree {degree} Polynomial)')
plt.legend()
plt.grid(True, alpha=0.3)
# Add some statistics to the plot
plt.text(0.05, 0.95, f'R² = {r2:.3f}\nRMSE = {rmse:.3f}',
transform=plt.gca().transAxes, fontsize=12,
verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# Plot 2: Residuals distribution
plt.subplot(1, 2, 2)
residuals = y_test - y_pred
plt.hist(residuals, bins=30, alpha=0.7, color='green', edgecolor='black')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 3: Check for overfitting with different polynomial degrees
print("\n" + "="*60)
print("EXAMPLE 3: Checking for Overfitting")
print("="*60)
features = ['pace', 'shooting'] # Simple case for visualization
target = 'overall'
clean_df = df[features + [target]].dropna()
X = clean_df[features]
y = clean_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_scores = []
test_scores = []
degrees_range = range(1, 7)
for degree in degrees_range:
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Train model
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Calculate scores
    train_score = model.score(X_train_poly, y_train)
    test_score = model.score(X_test_poly, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"Degree {degree}: Train R² = {train_score:.4f}, Test R² = {test_score:.4f}")
# Plot training vs test scores
plt.figure(figsize=(10, 6))
plt.plot(degrees_range, train_scores, 'bo-', label='Training Score', linewidth=2, markersize=8)
plt.plot(degrees_range, test_scores, 'ro-', label='Testing Score', linewidth=2, markersize=8)
plt.xlabel('Polynomial Degree')
plt.ylabel('R² Score')
plt.title('Training vs Testing Scores (Overfitting Check)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(degrees_range)
# Highlight potential overfitting
for i, degree in enumerate(degrees_range):
    if i > 0 and test_scores[i] < test_scores[i-1]:
        plt.axvspan(degree-0.5, degree+0.5, alpha=0.2, color='red',
                    label='Potential Overfitting' if i == 3 else "")
plt.tight_layout()
plt.show()
print("\nNote: When test score starts decreasing while train score keeps increasing,")
print("that's a sign of overfitting!")
Dataset loaded successfully!
Shape: (18944, 106)
============================================================
POLYNOMIAL REGRESSION EXAMPLE 1: Overall Rating Prediction
============================================================
Data points available: 16861
Training set: 13488 samples
Testing set: 3373 samples

--- Polynomial Degree 1 ---
Number of polynomial features: 3
R² Score: 0.2255
MAE: 4.7824
RMSE: 5.9972

--- Polynomial Degree 2 ---
Number of polynomial features: 6
R² Score: 0.3234
MAE: 4.3865
RMSE: 5.6051

--- Polynomial Degree 3 ---
Number of polynomial features: 10
R² Score: 0.3534
MAE: 4.2537
RMSE: 5.4797

--- Polynomial Degree 4 ---
Number of polynomial features: 15
R² Score: 0.3642
MAE: 4.2193
RMSE: 5.4335

============================================================
COMPARISON OF DIFFERENT POLYNOMIAL DEGREES
============================================================
   degree        r2       mae      rmse  num_features
0       1  0.225492  4.782364  5.997174             3
1       2  0.323443  4.386485  5.605131             6
2       3  0.353392  4.253653  5.479670            10
3       4  0.364246  4.219285  5.433482            15
============================================================
POLYNOMIAL REGRESSION EXAMPLE 2: With 4 Features
============================================================
Data points available: 16861

With 4 features and degree 2:
Original features: 4
Polynomial features: 15

Performance Metrics (Degree 2):
R² Score: 0.6578
MAE: 2.9401
RMSE: 3.9862
============================================================
EXAMPLE 3: Checking for Overfitting
============================================================
Degree 1: Train R² = 0.2430, Test R² = 0.2255
Degree 2: Train R² = 0.3333, Test R² = 0.3234
Degree 3: Train R² = 0.3610, Test R² = 0.3534
Degree 4: Train R² = 0.3711, Test R² = 0.3642
Degree 5: Train R² = 0.3754, Test R² = 0.3667
Degree 6: Train R² = 0.3763, Test R² = 0.3675
Note: When test score starts decreasing while train score keeps increasing,
that's a sign of overfitting!
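A single train/test split can make the degree comparison noisy. A minimal sketch of a more robust check, assuming `X` and `y` from the cell above are still in scope: 5-fold cross-validation with a pipeline, so the polynomial expansion is fit only on each fold's training portion.

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
for degree in range(1, 7):
    # Chaining the expansion and the fit keeps validation folds unseen
    pipe = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    scores = cross_val_score(pipe, X, y, cv=5, scoring='r2')
    print(f"Degree {degree}: CV R² = {scores.mean():.4f} ± {scores.std():.4f}")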
Professor Neil's fitting code¶
In [6]:
import numpy as np
import matplotlib.pyplot as plt
xmin = 0
xmax = 2
noise = 0.05
npts = 100
np.set_printoptions(precision=3)
np.random.seed(10)
c = [-.3,1,0.5]
print(f"data generation coefficients: {c}")
x = xmin+(xmax-xmin)*np.random.rand(npts) # generate random x
y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts) # evaluate polynomial at x and add noise
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial
coeff2 = np.polyfit(x,y,2) # fit second-order polynomial
xfit = np.linspace(xmin,xmax,npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
plt.figure()
plt.plot(x,y,'o')
plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()
plt.show()
data generation coefficients: [-0.3, 1, 0.5]
first-order fit coefficients: [0.402 0.711]
second-order fit coefficients: [-0.308 1.006 0.507]
In [7]:
import numpy as np
import matplotlib.pyplot as plt
xmin = 0
xmax = 2
noise = 0.05
npts = 100
np.set_printoptions(precision=3)
np.random.seed(10)
c = [-.3,1,0.5]
print(f"data generation coefficients: {c}")
data generation coefficients: [-0.3, 1, 0.5]
In [8]:
x = xmin+(xmax-xmin)*np.random.rand(npts) # generate random x
y = c[2]+c[1]*x+c[0]*x*x+np.random.normal(0,noise,npts) # evaluate polynomial at x and add noise
In [9]:
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial
coeff2 = np.polyfit(x,y,2) # fit second-order polynomial
In [10]:
xfit = np.linspace(xmin,xmax,npts)
pfit1 = np.poly1d(coeff1)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
first-order fit coefficients: [0.40152718 0.7111448 ]
In [11]:
pfit2 = np.poly1d(coeff2)
yfit2 = pfit2(xfit) # evaluate second-order fit
print(f"second-order fit coefficients: {coeff2}")
second-order fit coefficients: [-0.30779691 1.00618689 0.5073186 ]
In [12]:
plt.figure()
plt.plot(x,y,'o')
Out[12]:
[<matplotlib.lines.Line2D at 0xe218f7482990>]
In [13]:
plt.plot(xfit,yfit2,'r-',label='quadratic')
Out[13]:
[<matplotlib.lines.Line2D at 0xe218f77d0050>]
In [14]:
plt.plot(xfit,yfit1,'g-',label='linear')
Out[14]:
[<matplotlib.lines.Line2D at 0xe218f8b53c50>]
In [15]:
plt.plot(x,y,'o')
plt.plot(xfit,yfit1,'g-',label='linear')
plt.plot(xfit,yfit2,'r-',label='quadratic')
plt.legend()
plt.show()
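To put a number on the visual difference between the two fits, a small sketch (reusing `x`, `y`, `pfit1`, and `pfit2` from the cells above) that computes each fit's RMSE at the original sample points; the quadratic fit's RMSE should land near the injected noise level of 0.05:

In [ ]:
# RMSE of each fit, evaluated at the original sample points
rmse1 = np.sqrt(np.mean((y - pfit1(x))**2))
rmse2 = np.sqrt(np.mean((y - pfit2(x))**2))
print(f"first-order fit RMSE: {rmse1:.4f}")
print(f"second-order fit RMSE: {rmse2:.4f}")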
Linear vs Quadratic Fitting¶
In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv('datasets/players_21.csv')
print("Dataset loaded!")
print(f"Shape: {df.shape}")
# Example 1: Simple 1D case - Pace vs Overall Rating (for visualization)
print("\n" + "="*60)
print("EXAMPLE 1: Linear vs Quadratic - Pace vs Overall Rating")
print("="*60)
# Select features
feature = 'pace' # Just one feature for clear visualization
target = 'overall'
# Clean the data
clean_df = df[[feature, target]].dropna()
X = clean_df[[feature]].values
y = clean_df[target].values
print(f"Data points: {len(X)}")
# Sort for better visualization
sorted_idx = np.argsort(X.flatten())
X_sorted = X[sorted_idx]
y_sorted = y[sorted_idx]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# LINEAR REGRESSION
print("\n--- LINEAR REGRESSION ---")
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Make predictions
y_pred_linear = linear_model.predict(X_test)
# Calculate metrics
linear_r2 = r2_score(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print(f"R² Score: {linear_r2:.4f}")
print(f"MAE: {linear_mae:.4f}")
print(f"RMSE: {linear_rmse:.4f}")
print(f"Equation: y = {linear_model.intercept_:.2f} + {linear_model.coef_[0]:.2f}x")
# QUADRATIC REGRESSION (Degree 2 Polynomial)
print("\n--- QUADRATIC REGRESSION ---")
# Create polynomial features (degree 2)
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
# Train quadratic model
quadratic_model = LinearRegression()
quadratic_model.fit(X_train_poly, y_train)
# Make predictions
y_pred_quad = quadratic_model.predict(X_test_poly)
# Calculate metrics
quad_r2 = r2_score(y_test, y_pred_quad)
quad_mae = mean_absolute_error(y_test, y_pred_quad)
quad_rmse = np.sqrt(mean_squared_error(y_test, y_pred_quad))
print(f"R² Score: {quad_r2:.4f}")
print(f"MAE: {quad_mae:.4f}")
print(f"RMSE: {quad_rmse:.4f}")
# Show quadratic equation coefficients
coefs = quadratic_model.coef_
intercept = quadratic_model.intercept_
print(f"Equation: y = {intercept:.2f} + {coefs[1]:.2f}x + {coefs[2]:.2f}x²")
# VISUALIZATION - 1D Case
plt.figure(figsize=(14, 10))
# Plot 1: Data with fitted curves
plt.subplot(2, 2, 1)
# Plot data points
plt.scatter(X_sorted, y_sorted, alpha=0.3, s=20, color='gray', label='Data points')
# Create smooth line for predictions
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
# Linear prediction line
y_linear_range = linear_model.predict(X_range)
plt.plot(X_range, y_linear_range, 'b-', linewidth=3, label=f'Linear (R²={linear_r2:.3f})')
# Quadratic prediction line
X_range_poly = poly.transform(X_range)
y_quad_range = quadratic_model.predict(X_range_poly)
plt.plot(X_range, y_quad_range, 'r-', linewidth=3, label=f'Quadratic (R²={quad_r2:.3f})')
plt.xlabel(f'{feature.capitalize()}')
plt.ylabel(f'{target.capitalize()}')
plt.title(f'Linear vs Quadratic Fit: {feature} vs {target}')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: Residuals comparison
plt.subplot(2, 2, 2)
# Calculate residuals
residuals_linear = y_test - y_pred_linear
residuals_quad = y_test - y_pred_quad
# Plot residual histograms
bins = np.linspace(-10, 10, 30)
plt.hist(residuals_linear, bins=bins, alpha=0.5, label=f'Linear (σ={np.std(residuals_linear):.2f})', color='blue')
plt.hist(residuals_quad, bins=bins, alpha=0.5, label=f'Quadratic (σ={np.std(residuals_quad):.2f})', color='red')
plt.axvline(x=0, color='black', linestyle='--')
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.title('Residual Distributions Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 3: Performance metrics comparison
plt.subplot(2, 2, 3)
metrics = ['R² Score', 'MAE', 'RMSE']
linear_scores = [linear_r2, linear_mae, linear_rmse]
quad_scores = [quad_r2, quad_mae, quad_rmse]
x_pos = np.arange(len(metrics))
width = 0.35
plt.bar(x_pos - width/2, linear_scores, width, label='Linear', color='blue', alpha=0.7)
plt.bar(x_pos + width/2, quad_scores, width, label='Quadratic', color='red', alpha=0.7)
plt.ylabel('Score')
plt.title('Performance Metrics Comparison')
plt.xticks(x_pos, metrics)
plt.legend()
# Add value labels on bars
for i, v in enumerate(linear_scores):
    plt.text(i - width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
for i, v in enumerate(quad_scores):
    plt.text(i + width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
plt.grid(True, alpha=0.3, axis='y')
# Plot 4: Prediction vs Actual scatter
plt.subplot(2, 2, 4)
# Linear predictions
plt.scatter(y_test, y_pred_linear, alpha=0.5, color='blue', s=30, label=f'Linear (R²={linear_r2:.3f})')
# Quadratic predictions
plt.scatter(y_test, y_pred_quad, alpha=0.5, color='red', s=30, label=f'Quadratic (R²={quad_r2:.3f})')
# Perfect prediction line
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'k--', linewidth=2, label='Perfect Prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Example 2: 2D case - Pace and Shooting vs Overall Rating
print("\n" + "="*60)
print("EXAMPLE 2: Linear vs Quadratic - 2 Features")
print("="*60)
features = ['pace', 'shooting']
target = 'overall'
# Clean data
clean_df = df[features + [target]].dropna()
X = clean_df[features].values
y = clean_df[target].values
print(f"Data points: {len(X)}")
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# LINEAR REGRESSION
print("\n--- LINEAR REGRESSION (2 features) ---")
linear_model_2d = LinearRegression()
linear_model_2d.fit(X_train, y_train)
y_pred_linear_2d = linear_model_2d.predict(X_test)
linear_r2_2d = r2_score(y_test, y_pred_linear_2d)
print(f"R² Score: {linear_r2_2d:.4f}")
print(f"Equation: y = {linear_model_2d.intercept_:.2f} + {linear_model_2d.coef_[0]:.2f}x₁ + {linear_model_2d.coef_[1]:.2f}x₂")
# QUADRATIC REGRESSION
print("\n--- QUADRATIC REGRESSION (2 features) ---")
poly_2d = PolynomialFeatures(degree=2)
X_train_poly_2d = poly_2d.fit_transform(X_train)
X_test_poly_2d = poly_2d.transform(X_test)
print(f"Original features: {len(features)}")
print(f"Quadratic features: {X_train_poly_2d.shape[1]}")
quadratic_model_2d = LinearRegression()
quadratic_model_2d.fit(X_train_poly_2d, y_train)
y_pred_quad_2d = quadratic_model_2d.predict(X_test_poly_2d)
quad_r2_2d = r2_score(y_test, y_pred_quad_2d)
print(f"R² Score: {quad_r2_2d:.4f}")
# Calculate improvement
improvement = ((quad_r2_2d - linear_r2_2d) / linear_r2_2d) * 100
print(f"\nImprovement with quadratic terms: {improvement:.1f}%")
# Example 3: Try different features
print("\n" + "="*60)
print("EXAMPLE 3: Linear vs Quadratic with Different Feature Sets")
print("="*60)
feature_sets = [
['pace', 'shooting'],
['passing', 'dribbling'],
['defending', 'physic'],
['pace', 'shooting', 'passing']
]
results = []
for i, features in enumerate(feature_sets, 1):
    print(f"\nFeature Set {i}: {features}")
    # Clean data
    clean_df = df[features + [target]].dropna()
    if len(clean_df) < 100:  # Skip if not enough data
        print(" Not enough data, skipping...")
        continue
    X = clean_df[features].values
    y = clean_df[target].values
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Linear
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_linear = linear_model.predict(X_test)
    linear_r2 = r2_score(y_test, y_pred_linear)
    # Quadratic
    poly = PolynomialFeatures(degree=2)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    quadratic_model = LinearRegression()
    quadratic_model.fit(X_train_poly, y_train)
    y_pred_quad = quadratic_model.predict(X_test_poly)
    quad_r2 = r2_score(y_test, y_pred_quad)
    # Improvement
    improvement = ((quad_r2 - linear_r2) / linear_r2) * 100 if linear_r2 != 0 else 0
    results.append({
        'features': ', '.join(features),
        'linear_r2': linear_r2,
        'quad_r2': quad_r2,
        'improvement': improvement,
        'quad_features': X_train_poly.shape[1]
    })
    print(f" Linear R²: {linear_r2:.4f}")
    print(f" Quadratic R²: {quad_r2:.4f}")
    print(f" Improvement: {improvement:.1f}%")
    print(f" Quadratic features: {X_train_poly.shape[1]}")
# Display results table
print("\n" + "="*60)
print("SUMMARY: Linear vs Quadratic Performance")
print("="*60)
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))
# Plot comparison
plt.figure(figsize=(12, 6))
# Bar plot of R² scores
x_pos = np.arange(len(results))
width = 0.35
linear_scores = [r['linear_r2'] for r in results]
quad_scores = [r['quad_r2'] for r in results]
feature_labels = [r['features'][:20] + '...' if len(r['features']) > 20 else r['features']
for r in results]
plt.subplot(1, 2, 1)
bars1 = plt.bar(x_pos - width/2, linear_scores, width, label='Linear', color='blue', alpha=0.7)
bars2 = plt.bar(x_pos + width/2, quad_scores, width, label='Quadratic', color='red', alpha=0.7)
plt.ylabel('R² Score')
plt.title('Linear vs Quadratic R² Scores')
plt.xticks(x_pos, feature_labels, rotation=45, ha='right')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
# Add value labels
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.3f}', ha='center', va='bottom', fontsize=8)
# Improvement plot
plt.subplot(1, 2, 2)
improvements = [r['improvement'] for r in results]
colors = ['green' if imp > 0 else 'red' for imp in improvements]
bars = plt.bar(x_pos, improvements, color=colors, alpha=0.7)
plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
plt.ylabel('Improvement (%)')
plt.title('Improvement with Quadratic Terms')
plt.xticks(x_pos, feature_labels, rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')
# Add value labels
for bar, imp in zip(bars, improvements):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.,
             height + (1 if height >= 0 else -3),
             f'{imp:.1f}%', ha='center', va='bottom' if height >= 0 else 'top',
             fontsize=9, fontweight='bold')
plt.tight_layout()
plt.show()
print("\nKey Insights:")
print("1. Quadratic regression can capture non-linear relationships")
print("2. Watch for overfitting - quadratic has more parameters")
print("3. Improvement varies based on feature relationships")
print("4. Always check if the complexity is justified by the performance gain")
Dataset loaded!
Shape: (18944, 106)
============================================================
EXAMPLE 1: Linear vs Quadratic - Pace vs Overall Rating
============================================================
Data points: 16861

--- LINEAR REGRESSION ---
R² Score: 0.0371
MAE: 5.3021
RMSE: 6.6870
Equation: y = 57.01 + 0.13x

--- QUADRATIC REGRESSION ---
R² Score: 0.0853
MAE: 5.1695
RMSE: 6.5175
Equation: y = 90.74 + (-0.94)x + (0.01)x²
============================================================
EXAMPLE 2: Linear vs Quadratic - 2 Features
============================================================
Data points: 16861
--- LINEAR REGRESSION (2 features) ---
R² Score: 0.2255
Equation: y = 51.82 + 0.02x₁ + 0.24x₂
--- QUADRATIC REGRESSION (2 features) ---
Original features: 2
Quadratic features: 6
R² Score: 0.3234
Improvement with quadratic terms: 43.4%
============================================================
EXAMPLE 3: Linear vs Quadratic with Different Feature Sets
============================================================
Feature Set 1: ['pace', 'shooting']
Linear R²: 0.2255
Quadratic R²: 0.3234
Improvement: 43.4%
Quadratic features: 6
Feature Set 2: ['passing', 'dribbling']
Linear R²: 0.5055
Quadratic R²: 0.5700
Improvement: 12.8%
Quadratic features: 6
Feature Set 3: ['defending', 'physic']
Linear R²: 0.2964
Quadratic R²: 0.3544
Improvement: 19.6%
Quadratic features: 6
Feature Set 4: ['pace', 'shooting', 'passing']
Linear R²: 0.5001
Quadratic R²: 0.6480
Improvement: 29.6%
Quadratic features: 10
============================================================
SUMMARY: Linear vs Quadratic Performance
============================================================
features linear_r2 quad_r2 improvement quad_features
pace, shooting 0.225492 0.323443 43.438847 6
passing, dribbling 0.505487 0.569983 12.759051 6
defending, physic 0.296367 0.354391 19.578583 6
pace, shooting, passing 0.500144 0.648009 29.564385 10
Key Insights:
1. Quadratic regression can capture non-linear relationships
2. Watch for overfitting - quadratic has more parameters
3. Improvement varies based on feature relationships
4. Always check if the complexity is justified by the performance gain
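One quick way to act on insight 4, checking whether the extra quadratic terms earn their keep, is adjusted R², which penalizes every added parameter. A minimal sketch, assuming `linear_r2`, `quad_r2`, `y_test`, `X_train`, and `X_train_poly` from the last feature set above are still in scope (applying the adjustment to held-out scores is a rough heuristic, not a formal test):

In [ ]:
def adjusted_r2(r2, n, p):
    # n = number of samples, p = number of predictors (excluding intercept)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)
n = len(y_test)  # test-set size from the last feature set above
print(f"Linear adjusted R²:    {adjusted_r2(linear_r2, n, X_train.shape[1]):.4f}")
print(f"Quadratic adjusted R²: {adjusted_r2(quad_r2, n, X_train_poly.shape[1] - 1):.4f}")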
In [ ]: