In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
In [2]:
plt.style.use('default')
sns.set_palette("husl")
print("Libraries imported successfully!")
Libraries imported successfully!
In [4]:
df = pd.read_csv('datasets/mental_health.csv')
print(f"Dataset loaded with {len(df)} rows and {len(df.columns)} columns")
Dataset loaded with 500 rows and 10 columns
In [5]:
# Display basic info
print("\nDataset Overview:")
print("=" * 50)
print(df.info())
print("\nFirst 5 rows:")
print(df.head())
Dataset Overview:
==================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   User_ID                    500 non-null    object
 1   Age                        500 non-null    int64
 2   Gender                     500 non-null    object
 3   Daily_Screen_Time(hrs)     500 non-null    float64
 4   Sleep_Quality(1-10)        500 non-null    float64
 5   Stress_Level(1-10)         500 non-null    float64
 6   Days_Without_Social_Media  500 non-null    float64
 7   Exercise_Frequency(week)   500 non-null    float64
 8   Social_Media_Platform      500 non-null    object
 9   Happiness_Index(1-10)      500 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 39.2+ KB
None

First 5 rows:
  User_ID  Age  Gender  Daily_Screen_Time(hrs)  Sleep_Quality(1-10)  \
0    U001   44    Male                     3.1                  7.0
1    U002   30   Other                     5.1                  7.0
2    U003   23   Other                     7.4                  6.0
3    U004   36  Female                     5.7                  7.0
4    U005   34  Female                     7.0                  4.0

   Stress_Level(1-10)  Days_Without_Social_Media  Exercise_Frequency(week)  \
0                 6.0                        2.0                       5.0
1                 8.0                        5.0                       3.0
2                 7.0                        1.0                       3.0
3                 8.0                        1.0                       1.0
4                 7.0                        5.0                       1.0

  Social_Media_Platform  Happiness_Index(1-10)
0              Facebook                   10.0
1              LinkedIn                   10.0
2               YouTube                    6.0
3                TikTok                    8.0
4           X (Twitter)                    8.0
In [6]:
# Data preprocessing - check for missing values (none are handled here, only counted)
print(f"\nMissing values: {df.isnull().sum().sum()}")
Missing values: 0
In [7]:
# 1. SIMPLE LINEAR REGRESSION - Screen Time vs Happiness
print("\n" + "="*60)
print("1. SIMPLE LINEAR REGRESSION: Screen Time vs Happiness")
print("="*60)
============================================================
1. SIMPLE LINEAR REGRESSION: Screen Time vs Happiness
============================================================
In [9]:
# Prepare data
x_screen = df['Daily_Screen_Time(hrs)'].values.reshape(-1, 1)
y_happiness = df['Happiness_Index(1-10)'].values
# Fit linear regression
lin_reg = LinearRegression()
lin_reg.fit(x_screen, y_happiness)
y_pred_linear = lin_reg.predict(x_screen)
# Get coefficients
slope = lin_reg.coef_[0]
intercept = lin_reg.intercept_
r2_linear = r2_score(y_happiness, y_pred_linear)
mse_linear = mean_squared_error(y_happiness, y_pred_linear)
print(f"Linear Regression Equation: Happiness = {slope:.3f} Ă— ScreenTime + {intercept:.3f}")
print(f"R² Score: {r2_linear:.3f}")
print(f"Mean Squared Error: {mse_linear:.3f}")
Linear Regression Equation: Happiness = -0.620 × ScreenTime + 11.802
R² Score: 0.497
Mean Squared Error: 1.166
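The slope by itself carries no uncertainty estimate. As a minimal sketch (an addition, not part of the original run), scipy.stats.linregress refits the same simple model and additionally reports the slope's standard error and p-value:
In [ ]:
# Hedged sketch: quantify the uncertainty of the fitted slope.
# linregress fits the identical simple linear model as above.
from scipy import stats
res = stats.linregress(df['Daily_Screen_Time(hrs)'], df['Happiness_Index(1-10)'])
print(f"slope = {res.slope:.3f} ± {res.stderr:.3f}, p-value = {res.pvalue:.2g}")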
In [10]:
# 2. POLYNOMIAL REGRESSION
print("\n" + "="*60)
print("2. POLYNOMIAL REGRESSION")
print("="*60)
# Try different polynomial degrees
degrees = [2, 3, 4]
poly_results = {}
for degree in degrees:
    poly = PolynomialFeatures(degree=degree)
    x_poly = poly.fit_transform(x_screen)
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly, y_happiness)
    y_pred_poly = poly_reg.predict(x_poly)
    r2_poly = r2_score(y_happiness, y_pred_poly)
    mse_poly = mean_squared_error(y_happiness, y_pred_poly)
    poly_results[degree] = {
        'model': poly_reg,
        'transformer': poly,
        'r2': r2_poly,
        'mse': mse_poly
    }
    print(f"Degree {degree}: R² = {r2_poly:.3f}, MSE = {mse_poly:.3f}")
============================================================
2. POLYNOMIAL REGRESSION
============================================================
Degree 2: R² = 0.513, MSE = 1.128
Degree 3: R² = 0.516, MSE = 1.123
Degree 4: R² = 0.516, MSE = 1.122
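The gain from degree 2 to 4 is marginal (0.513 → 0.516), and in-sample R² can only rise with degree, so it cannot flag overfitting. A short sketch (an addition, not from the original run) that scores each degree with 5-fold cross-validation instead, reusing the x_screen and y_happiness arrays above:
In [ ]:
# Hedged sketch: choose the polynomial degree by cross-validated R²
# rather than in-sample R², which rewards complexity for free.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

for degree in [1, 2, 3, 4]:
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    scores = cross_val_score(model, x_screen, y_happiness, cv=5, scoring='r2')
    print(f"Degree {degree}: CV R² = {scores.mean():.3f} ± {scores.std():.3f}")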
In [11]:
# 3. MULTIPLE LINEAR REGRESSION
print("\n" + "="*60)
print("3. MULTIPLE LINEAR REGRESSION")
print("="*60)
# Select multiple features
features = ['Daily_Screen_Time(hrs)', 'Sleep_Quality(1-10)', 'Stress_Level(1-10)',
            'Days_Without_Social_Media', 'Exercise_Frequency(week)', 'Age']
X_multi = df[features]
y_multi = df['Happiness_Index(1-10)']
# Fit multiple regression
multi_reg = LinearRegression()
multi_reg.fit(X_multi, y_multi)
y_pred_multi = multi_reg.predict(X_multi)
r2_multi = r2_score(y_multi, y_pred_multi)
mse_multi = mean_squared_error(y_multi, y_pred_multi)
print("Multiple Regression Coefficients:")
for feature, coef in zip(features, multi_reg.coef_):
print(f" {feature}: {coef:.3f}")
print(f"Intercept: {multi_reg.intercept_:.3f}")
print(f"R² Score: {r2_multi:.3f}")
print(f"Mean Squared Error: {mse_multi:.3f}")
============================================================
3. MULTIPLE LINEAR REGRESSION
============================================================
Multiple Regression Coefficients:
 Daily_Screen_Time(hrs): -0.105
 Sleep_Quality(1-10): 0.316
 Stress_Level(1-10): -0.457
 Days_Without_Social_Media: 0.035
 Exercise_Frequency(week): 0.010
 Age: 0.007
Intercept: 9.623
R² Score: 0.645
Mean Squared Error: 0.822
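These raw coefficients are not directly comparable because the features live on different scales (hours, 1-10 ratings, days, years of age). A sketch, assuming the X_multi and y_multi objects defined above, that standardizes the features first so coefficient magnitudes reflect relative influence:
In [ ]:
# Hedged sketch: refit on z-scored features so coefficients are comparable.
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X_multi)
scaled_reg = LinearRegression().fit(X_scaled, y_multi)
for feature, coef in zip(features, scaled_reg.coef_):
    print(f" {feature} (per std. dev.): {coef:.3f}")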
In [12]:
# 4. CUSTOM FUNCTION FITTING USING curve_fit
print("\n" + "="*60)
print("4. CUSTOM FUNCTION FITTING")
print("="*60)
# Define custom functions to fit
def linear_func(x, a, b):
    return a * x + b

def quadratic_func(x, a, b, c):
    return a * x**2 + b * x + c

def exponential_func(x, a, b, c):
    return a * np.exp(-b * x) + c  # negative exponent models decay

def power_func(x, a, b):
    return a * x**b

def logistic_func(x, a, b, c):
    return a / (1 + np.exp(-b * (x - c)))
# Fit different functions to screen time vs happiness
x_data = df['Daily_Screen_Time(hrs)'].values
y_data = df['Happiness_Index(1-10)'].values
# Remove any potential NaN values
mask = ~np.isnan(x_data) & ~np.isnan(y_data)
x_clean = x_data[mask]
y_clean = y_data[mask]
functions = {
    'Linear': linear_func,
    'Quadratic': quadratic_func,
    'Exponential Decay': exponential_func,
    'Power Law': power_func
}
fit_results = {}
for name, func in functions.items():
    try:
        if name == 'Exponential Decay':
            # Provide an initial guess for better convergence
            popt, pcov = curve_fit(func, x_clean, y_clean, p0=[10, 0.1, 5], maxfev=5000)
        else:
            popt, pcov = curve_fit(func, x_clean, y_clean, maxfev=5000)
        y_pred = func(x_clean, *popt)
        r2 = r2_score(y_clean, y_pred)
        mse = mean_squared_error(y_clean, y_pred)
        fit_results[name] = {
            'parameters': popt,
            'r2': r2,
            'mse': mse,
            'predictions': y_pred
        }
        print(f"{name}: R² = {r2:.3f}, MSE = {mse:.3f}")
        print(f" Parameters: {popt}")
    except Exception as e:
        print(f"Could not fit {name}: {e}")
============================================================
4. CUSTOM FUNCTION FITTING
============================================================
Linear: R² = 0.497, MSE = 1.166
 Parameters: [-0.61957933 11.80227369]
Quadratic: R² = 0.513, MSE = 1.128
 Parameters: [-0.04754136 -0.09092248 10.47546284]
Exponential Decay: R² = 0.497, MSE = 1.166
 Parameters: [ 1.79832433e+04  3.44586963e-05 -1.79714408e+04]
Power Law: R² = 0.385, MSE = 1.427
 Parameters: [13.25918678 -0.28086346]
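Note that the exponential fit has degenerated into a straight line: with b ≈ 3.4e-05, a·e^(−bx) + c ≈ (a + c) − a·b·x over the observed range (a + c ≈ 11.8 and a·b ≈ 0.620, the linear intercept and slope), which is why its R² matches the linear fit to three decimals. A sketch with illustrative bounds, not values from the original analysis, that constrains b away from zero to force a genuinely curved decay:
In [ ]:
# Hedged sketch: bound the decay rate b away from zero so curve_fit
# cannot collapse the exponential into its near-linear limit.
# The bounds and initial guess are illustrative assumptions.
popt_b, pcov_b = curve_fit(exponential_func, x_clean, y_clean,
                           p0=[10, 0.5, 5],
                           bounds=([0, 0.01, -10], [50, 5, 15]),
                           maxfev=5000)
print(f"Bounded exponential parameters: {popt_b}")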
In [13]:
# 5. VISUALIZATION OF ALL FITS
print("\n" + "="*60)
print("5. CREATING VISUALIZATIONS")
print("="*60)
# Create comprehensive visualization
fig = plt.figure(figsize=(20, 15))
# Plot 1: Linear regression
plt.subplot(3, 3, 1)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
x_line = np.linspace(x_clean.min(), x_clean.max(), 100)
y_line_linear = linear_func(x_line, *fit_results['Linear']['parameters'])
plt.plot(x_line, y_line_linear, 'r-', linewidth=2, label='Linear fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Linear Fit (R² = {fit_results["Linear"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
============================================================
5. CREATING VISUALIZATIONS
============================================================
In [14]:
# Plot 2: Quadratic fit
plt.subplot(3, 3, 2)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_quad = quadratic_func(x_line, *fit_results['Quadratic']['parameters'])
plt.plot(x_line, y_line_quad, 'g-', linewidth=2, label='Quadratic fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Quadratic Fit (R² = {fit_results["Quadratic"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [15]:
# Plot 3: Exponential decay fit
plt.subplot(3, 3, 3)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_exp = exponential_func(x_line, *fit_results['Exponential Decay']['parameters'])
plt.plot(x_line, y_line_exp, 'purple', linewidth=2, label='Exponential decay fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Exponential Decay Fit (R² = {fit_results["Exponential Decay"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [16]:
# Plot 4: Power law fit
plt.subplot(3, 3, 4)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_power = power_func(x_line, *fit_results['Power Law']['parameters'])
plt.plot(x_line, y_line_power, 'orange', linewidth=2, label='Power law fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Power Law Fit (R² = {fit_results["Power Law"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [17]:
# Plot 5: Comparison of all fits
plt.subplot(3, 3, 5)
plt.scatter(x_clean, y_clean, alpha=0.3, label='Data', color='gray')
plt.plot(x_line, y_line_linear, 'r-', linewidth=2, label='Linear')
plt.plot(x_line, y_line_quad, 'g-', linewidth=2, label='Quadratic')
plt.plot(x_line, y_line_exp, 'purple', linewidth=2, label='Exponential')
plt.plot(x_line, y_line_power, 'orange', linewidth=2, label='Power Law')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title('Comparison of All Fits')
plt.legend()
plt.grid(True, alpha=0.3)
In [18]:
# Plot 6: Residuals for linear fit
plt.subplot(3, 3, 6)
residuals_linear = y_clean - linear_func(x_clean, *fit_results['Linear']['parameters'])
plt.scatter(x_clean, residuals_linear, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Residuals')
plt.title('Residuals - Linear Fit')
plt.grid(True, alpha=0.3)
In [19]:
# Plot 7: Residuals for quadratic fit
plt.subplot(3, 3, 7)
residuals_quad = y_clean - quadratic_func(x_clean, *fit_results['Quadratic']['parameters'])
plt.scatter(x_clean, residuals_quad, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Residuals')
plt.title('Residuals - Quadratic Fit')
plt.grid(True, alpha=0.3)
In [21]:
# Plot 8: Distribution of residuals comparison
plt.subplot(3, 3, 8)
plt.hist(residuals_linear, alpha=0.7, label='Linear', bins=20, density=True)
plt.hist(residuals_quad, alpha=0.7, label='Quadratic', bins=20, density=True)
plt.xlabel('Residual Value')
plt.ylabel('Density')
plt.title('Distribution of Residuals')
plt.legend()
plt.grid(True, alpha=0.3)
In [22]:
# Plot 9: Actual vs Predicted for best model
best_model_name = max(fit_results.items(), key=lambda x: x[1]['r2'])[0]
best_predictions = fit_results[best_model_name]['predictions']
plt.subplot(3, 3, 9)
plt.scatter(y_clean, best_predictions, alpha=0.5)
plt.plot([y_clean.min(), y_clean.max()], [y_clean.min(), y_clean.max()], 'r--', linewidth=2)
plt.xlabel('Actual Happiness')
plt.ylabel('Predicted Happiness')
plt.title(f'Actual vs Predicted ({best_model_name})')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [23]:
# 6. MULTI-VARIABLE FUNCTION FITTING
print("\n" + "="*60)
print("6. MULTI-VARIABLE FUNCTION FITTING")
print("="*60)
# Define a multi-variable function
def multi_var_func(X, a, b, c, d, e, f):
    x1, x2, x3, x4, x5, x6 = X
    return a*x1 + b*x2 + c*x3 + d*x4 + e*x5 + f*x6
# Prepare data for multi-variable fitting
x1 = df['Daily_Screen_Time(hrs)'].values
x2 = df['Sleep_Quality(1-10)'].values
x3 = df['Stress_Level(1-10)'].values
x4 = df['Days_Without_Social_Media'].values
x5 = df['Exercise_Frequency(week)'].values
x6 = df['Age'].values
y = df['Happiness_Index(1-10)'].values
# Remove NaN values
mask = ~(np.isnan(x1) | np.isnan(x2) | np.isnan(x3) | np.isnan(x4) | np.isnan(x5) | np.isnan(x6) | np.isnan(y))
x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean, y_clean = \
    x1[mask], x2[mask], x3[mask], x4[mask], x5[mask], x6[mask], y[mask]
try:
    # Fit multi-variable function
    popt_multi, pcov_multi = curve_fit(multi_var_func,
                                       (x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean),
                                       y_clean, maxfev=5000)
    a, b, c, d, e, f = popt_multi
    y_pred_multi_func = multi_var_func((x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean), *popt_multi)
    r2_multi_func = r2_score(y_clean, y_pred_multi_func)
    print("Multi-variable Function:")
    print(f"Happiness = {a:.3f}×ScreenTime + {b:.3f}×SleepQuality + {c:.3f}×StressLevel + {d:.3f}×DaysWithoutSM + {e:.3f}×Exercise + {f:.3f}×Age")
    print(f"R² Score: {r2_multi_func:.3f}")
except Exception as e:
    print(f"Multi-variable fitting failed: {e}")
============================================================
6. MULTI-VARIABLE FUNCTION FITTING
============================================================
Multi-variable Function:
Happiness = 0.358×ScreenTime + 1.014×SleepQuality + -0.256×StressLevel + 0.096×DaysWithoutSM + 0.117×Exercise + 0.032×Age
R² Score: 0.373
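Note that multi_var_func has no constant term, so it is strictly more constrained than the section-3 regression (sklearn's LinearRegression fits an intercept by default); that alone explains the drop from R² = 0.645 to 0.373 on the same features. A hypothetical variant (not from the original analysis) with an added intercept parameter g:
In [ ]:
# Hedged sketch: the same linear form plus an intercept g. With the
# intercept included, this should recover essentially the section-3 fit.
def multi_var_func_i(X, a, b, c, d, e, f, g):
    x1, x2, x3, x4, x5, x6 = X
    return a*x1 + b*x2 + c*x3 + d*x4 + e*x5 + f*x6 + g

X_tuple = (x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean)
popt_i, pcov_i = curve_fit(multi_var_func_i, X_tuple, y_clean, maxfev=5000)
print(f"R² with intercept: {r2_score(y_clean, multi_var_func_i(X_tuple, *popt_i)):.3f}")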
In [24]:
# 7. MODEL COMPARISON AND SUMMARY
print("\n" + "="*60)
print("7. MODEL COMPARISON SUMMARY")
print("="*60)
# Create comparison table
comparison_data = {
    'Model': ['Simple Linear', 'Multiple Linear'] + list(fit_results.keys()),
    'R² Score': [r2_linear, r2_multi] + [fit_results[name]['r2'] for name in fit_results],
    'MSE': [mse_linear, mse_multi] + [fit_results[name]['mse'] for name in fit_results]
}
comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('R² Score', ascending=False)
print(comparison_df.to_string(index=False))
# Find best model
best_overall = comparison_df.iloc[0]
print(f"\n🏆 BEST MODEL: {best_overall['Model']} (R² = {best_overall['R² Score']:.3f})")
============================================================
7. MODEL COMPARISON SUMMARY
============================================================
Model R² Score MSE
Multiple Linear 0.645409 0.822164
Quadratic 0.513426 1.128182
Simple Linear 0.497315 1.165537
Linear 0.497315 1.165537
Exponential Decay 0.497308 1.165554
Power Law 0.384568 1.426954
🏆 BEST MODEL: Multiple Linear (R² = 0.645)
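Two caveats on this table: 'Simple Linear' and 'Linear' are the same fit obtained through two APIs (hence identical scores), and plain R² never penalizes extra parameters. A small sketch (an addition) computing adjusted R², where n is the sample size and p the number of predictors excluding the intercept:
In [ ]:
# Hedged sketch: adjusted R² penalizes parameter count, unlike plain R².
# adj_R² = 1 - (1 - R²)(n - 1)/(n - p - 1)
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"Multiple Linear adjusted R²: {adjusted_r2(r2_multi, len(df), len(features)):.3f}")
print(f"Quadratic adjusted R²: {adjusted_r2(fit_results['Quadratic']['r2'], len(df), 2):.3f}")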
In [25]:
# 8. PREDICTION ON NEW DATA
print("\n" + "="*60)
print("8. PREDICTION EXAMPLES")
print("="*60)
# Use the best single-variable model for prediction
if best_model_name in fit_results:
    best_func = functions[best_model_name]
    best_params = fit_results[best_model_name]['parameters']
    # Predict happiness for some example screen times using the best fit
    example_times = [2, 5, 8, 10]
    print("Predicted Happiness for different screen times:")
    for time in example_times:
        pred = best_func(time, *best_params)
        print(f" {time} hours screen time → {pred:.1f} happiness")
============================================================
8. PREDICTION EXAMPLES
============================================================
Predicted Happiness for different screen times:
 2 hours screen time → 10.1 happiness
 5 hours screen time → 8.8 happiness
 8 hours screen time → 6.7 happiness
 10 hours screen time → 4.8 happiness
In [26]:
# 9. ADDITIONAL ANALYSIS: STRESS vs HAPPINESS
print("\n" + "="*60)
print("9. ADDITIONAL ANALYSIS: Stress vs Happiness")
print("="*60)
# Fit stress vs happiness
x_stress = df['Stress_Level(1-10)'].values.reshape(-1, 1)
y_happiness = df['Happiness_Index(1-10)'].values
stress_reg = LinearRegression()
stress_reg.fit(x_stress, y_happiness)
y_pred_stress = stress_reg.predict(x_stress)
r2_stress = r2_score(y_happiness, y_pred_stress)
print(f"Stress vs Happiness: Happiness = {stress_reg.coef_[0]:.3f} Ă— Stress + {stress_reg.intercept_:.3f}")
print(f"R² Score: {r2_stress:.3f}")
# Final summary
print("\n" + "="*60)
print("FUNCTION FITTING COMPLETE!")
print("="*60)
print("Key Insights:")
print(f"- Screen time shows a { 'negative' if slope < 0 else 'positive'} correlation with happiness")
print(f"- The best fitting model explains {best_overall['R² Score']*100:.1f}% of variance in happiness")
print("- Multiple regression provides better prediction using all available features")
print("- Check residuals to ensure model assumptions are met")
print("\nNext steps:")
print("1. Consider feature engineering")
print("2. Try regularization techniques (Ridge, Lasso)")
print("3. Explore interaction terms")
print("4. Validate with train-test split")
============================================================
9. ADDITIONAL ANALYSIS: Stress vs Happiness
============================================================
Stress vs Happiness: Happiness = -0.728 × Stress + 13.196
R² Score: 0.543

============================================================
FUNCTION FITTING COMPLETE!
============================================================
Key Insights:
- Screen time shows a negative correlation with happiness
- The best fitting model explains 64.5% of variance in happiness
- Multiple regression provides better prediction using all available features
- Check residuals to ensure model assumptions are met

Next steps:
1. Consider feature engineering
2. Try regularization techniques (Ridge, Lasso)
3. Explore interaction terms
4. Validate with train-test split
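All scores above are in-sample, even though train_test_split was imported in cell 1 and never used. A minimal sketch of next steps 2 and 4, assuming the X_multi and y_multi objects from section 3; the 80/20 split, alpha=1.0, and random_state=42 are illustrative choices, not tuned values:
In [ ]:
# Hedged sketch: held-out validation plus a regularized baseline.
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42)
ols = LinearRegression().fit(X_train, y_train)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
print(f"OLS test R²:   {r2_score(y_test, ols.predict(X_test)):.3f}")
print(f"Ridge test R²: {r2_score(y_test, ridge.predict(X_test)):.3f}")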
In [ ]: