In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
In [2]:
plt.style.use('default')
sns.set_palette("husl")
print("Libraries imported successfully!")
Libraries imported successfully!
In [4]:
df = pd.read_csv('datasets/mental_health.csv')
print(f"Dataset loaded with {len(df)} rows and {len(df.columns)} columns")
Dataset loaded with 500 rows and 10 columns
In [5]:
# Display basic info
print("\nDataset Overview:")
print("=" * 50)
print(df.info())
print("\nFirst 5 rows:")
print(df.head())
Dataset Overview:
==================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   User_ID                    500 non-null    object
 1   Age                        500 non-null    int64
 2   Gender                     500 non-null    object
 3   Daily_Screen_Time(hrs)     500 non-null    float64
 4   Sleep_Quality(1-10)        500 non-null    float64
 5   Stress_Level(1-10)         500 non-null    float64
 6   Days_Without_Social_Media  500 non-null    float64
 7   Exercise_Frequency(week)   500 non-null    float64
 8   Social_Media_Platform      500 non-null    object
 9   Happiness_Index(1-10)      500 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 39.2+ KB
None

First 5 rows:
  User_ID  Age  Gender  Daily_Screen_Time(hrs)  Sleep_Quality(1-10)  \
0    U001   44    Male                     3.1                  7.0
1    U002   30   Other                     5.1                  7.0
2    U003   23   Other                     7.4                  6.0
3    U004   36  Female                     5.7                  7.0
4    U005   34  Female                     7.0                  4.0

   Stress_Level(1-10)  Days_Without_Social_Media  Exercise_Frequency(week)  \
0                 6.0                        2.0                       5.0
1                 8.0                        5.0                       3.0
2                 7.0                        1.0                       3.0
3                 8.0                        1.0                       1.0
4                 7.0                        5.0                       1.0

  Social_Media_Platform  Happiness_Index(1-10)
0              Facebook                   10.0
1              LinkedIn                   10.0
2               YouTube                    6.0
3                TikTok                    8.0
4           X (Twitter)                    8.0
In [6]:
# Data preprocessing - check for missing values (none are handled here, only counted)
print(f"\nMissing values: {df.isnull().sum().sum()}")
Missing values: 0
In [7]:
# 1. SIMPLE LINEAR REGRESSION - Screen Time vs Happiness
print("\n" + "="*60)
print("1. SIMPLE LINEAR REGRESSION: Screen Time vs Happiness")
print("="*60)
============================================================
1. SIMPLE LINEAR REGRESSION: Screen Time vs Happiness
============================================================
In [9]:
# Prepare data
x_screen = df['Daily_Screen_Time(hrs)'].values.reshape(-1, 1)
y_happiness = df['Happiness_Index(1-10)'].values
# Fit linear regression
lin_reg = LinearRegression()
lin_reg.fit(x_screen, y_happiness)
y_pred_linear = lin_reg.predict(x_screen)
# Get coefficients
slope = lin_reg.coef_[0]
intercept = lin_reg.intercept_
r2_linear = r2_score(y_happiness, y_pred_linear)
mse_linear = mean_squared_error(y_happiness, y_pred_linear)
print(f"Linear Regression Equation: Happiness = {slope:.3f} Ă— ScreenTime + {intercept:.3f}")
print(f"R² Score: {r2_linear:.3f}")
print(f"Mean Squared Error: {mse_linear:.3f}")
Linear Regression Equation: Happiness = -0.620 × ScreenTime + 11.802
R² Score: 0.497
Mean Squared Error: 1.166
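The slope by itself carries no uncertainty estimate. As a minimal sketch (an addition, not part of the original run), scipy.stats.linregress refits the same simple model and additionally reports the slope's standard error and p-value:
In [ ]:
# Hedged sketch: quantify the uncertainty of the fitted slope.
# linregress fits the identical simple linear model as above.
from scipy import stats
res = stats.linregress(df['Daily_Screen_Time(hrs)'], df['Happiness_Index(1-10)'])
print(f"slope = {res.slope:.3f} ± {res.stderr:.3f}, p-value = {res.pvalue:.2g}")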
In [10]:
# 2. POLYNOMIAL REGRESSION
print("\n" + "="*60)
print("2. POLYNOMIAL REGRESSION")
print("="*60)
# Try different polynomial degrees
degrees = [2, 3, 4]
poly_results = {}
for degree in degrees:
    poly = PolynomialFeatures(degree=degree)
    x_poly = poly.fit_transform(x_screen)
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly, y_happiness)
    y_pred_poly = poly_reg.predict(x_poly)
    r2_poly = r2_score(y_happiness, y_pred_poly)
    mse_poly = mean_squared_error(y_happiness, y_pred_poly)
    poly_results[degree] = {
        'model': poly_reg,
        'transformer': poly,
        'r2': r2_poly,
        'mse': mse_poly
    }
    print(f"Degree {degree}: R² = {r2_poly:.3f}, MSE = {mse_poly:.3f}")
============================================================
2. POLYNOMIAL REGRESSION
============================================================
Degree 2: R² = 0.513, MSE = 1.128
Degree 3: R² = 0.516, MSE = 1.123
Degree 4: R² = 0.516, MSE = 1.122
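The gain from degree 2 to 4 is marginal (0.513 → 0.516), and in-sample R² can only rise with degree, so it cannot flag overfitting. A short sketch (an addition, not from the original run) that scores each degree with 5-fold cross-validation instead, reusing the x_screen and y_happiness arrays above:
In [ ]:
# Hedged sketch: choose the polynomial degree by cross-validated R²
# rather than in-sample R², which rewards complexity for free.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

for degree in [1, 2, 3, 4]:
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    scores = cross_val_score(model, x_screen, y_happiness, cv=5, scoring='r2')
    print(f"Degree {degree}: CV R² = {scores.mean():.3f} ± {scores.std():.3f}")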
In [11]:
# 3. MULTIPLE LINEAR REGRESSION
print("\n" + "="*60)
print("3. MULTIPLE LINEAR REGRESSION")
print("="*60)
# Select multiple features
features = ['Daily_Screen_Time(hrs)', 'Sleep_Quality(1-10)', 'Stress_Level(1-10)',
            'Days_Without_Social_Media', 'Exercise_Frequency(week)', 'Age']
X_multi = df[features]
y_multi = df['Happiness_Index(1-10)']
# Fit multiple regression
multi_reg = LinearRegression()
multi_reg.fit(X_multi, y_multi)
y_pred_multi = multi_reg.predict(X_multi)
r2_multi = r2_score(y_multi, y_pred_multi)
mse_multi = mean_squared_error(y_multi, y_pred_multi)
print("Multiple Regression Coefficients:")
for feature, coef in zip(features, multi_reg.coef_):
print(f" {feature}: {coef:.3f}")
print(f"Intercept: {multi_reg.intercept_:.3f}")
print(f"R² Score: {r2_multi:.3f}")
print(f"Mean Squared Error: {mse_multi:.3f}")
============================================================
3. MULTIPLE LINEAR REGRESSION
============================================================
Multiple Regression Coefficients:
 Daily_Screen_Time(hrs): -0.105
 Sleep_Quality(1-10): 0.316
 Stress_Level(1-10): -0.457
 Days_Without_Social_Media: 0.035
 Exercise_Frequency(week): 0.010
 Age: 0.007
Intercept: 9.623
R² Score: 0.645
Mean Squared Error: 0.822
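These raw coefficients are not directly comparable because the features live on different scales (hours, 1-10 ratings, days, years of age). A sketch, assuming the X_multi and y_multi objects defined above, that standardizes the features first so coefficient magnitudes reflect relative influence:
In [ ]:
# Hedged sketch: refit on z-scored features so coefficients are comparable.
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X_multi)
scaled_reg = LinearRegression().fit(X_scaled, y_multi)
for feature, coef in zip(features, scaled_reg.coef_):
    print(f" {feature} (per std. dev.): {coef:.3f}")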
In [12]:
# 4. CUSTOM FUNCTION FITTING USING curve_fit
print("\n" + "="*60)
print("4. CUSTOM FUNCTION FITTING")
print("="*60)
# Define custom functions to fit
def linear_func(x, a, b):
    return a * x + b

def quadratic_func(x, a, b, c):
    return a * x**2 + b * x + c

def exponential_func(x, a, b, c):
    return a * np.exp(-b * x) + c  # negative exponent models decay

def power_func(x, a, b):
    return a * x**b

def logistic_func(x, a, b, c):
    return a / (1 + np.exp(-b * (x - c)))
# Fit different functions to screen time vs happiness
x_data = df['Daily_Screen_Time(hrs)'].values
y_data = df['Happiness_Index(1-10)'].values
# Remove any potential NaN values
mask = ~np.isnan(x_data) & ~np.isnan(y_data)
x_clean = x_data[mask]
y_clean = y_data[mask]
functions = {
    'Linear': linear_func,
    'Quadratic': quadratic_func,
    'Exponential Decay': exponential_func,
    'Power Law': power_func
}
fit_results = {}
for name, func in functions.items():
    try:
        if name == 'Exponential Decay':
            # Provide an initial guess for better convergence
            popt, pcov = curve_fit(func, x_clean, y_clean, p0=[10, 0.1, 5], maxfev=5000)
        else:
            popt, pcov = curve_fit(func, x_clean, y_clean, maxfev=5000)
        y_pred = func(x_clean, *popt)
        r2 = r2_score(y_clean, y_pred)
        mse = mean_squared_error(y_clean, y_pred)
        fit_results[name] = {
            'parameters': popt,
            'r2': r2,
            'mse': mse,
            'predictions': y_pred
        }
        print(f"{name}: R² = {r2:.3f}, MSE = {mse:.3f}")
        print(f" Parameters: {popt}")
    except Exception as e:
        print(f"Could not fit {name}: {e}")
============================================================
4. CUSTOM FUNCTION FITTING
============================================================
Linear: R² = 0.497, MSE = 1.166
 Parameters: [-0.61957933 11.80227369]
Quadratic: R² = 0.513, MSE = 1.128
 Parameters: [-0.04754136 -0.09092248 10.47546284]
Exponential Decay: R² = 0.497, MSE = 1.166
 Parameters: [ 1.79832433e+04  3.44586963e-05 -1.79714408e+04]
Power Law: R² = 0.385, MSE = 1.427
 Parameters: [13.25918678 -0.28086346]
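Note that the exponential fit has degenerated into a straight line: with b ≈ 3.4e-05, a·e^(−bx) + c ≈ (a + c) − a·b·x over the observed range (a + c ≈ 11.8 and a·b ≈ 0.620, the linear intercept and slope), which is why its R² matches the linear fit to three decimals. A sketch with illustrative bounds, not values from the original analysis, that constrains b away from zero to force a genuinely curved decay:
In [ ]:
# Hedged sketch: bound the decay rate b away from zero so curve_fit
# cannot collapse the exponential into its near-linear limit.
# The bounds and initial guess are illustrative assumptions.
popt_b, pcov_b = curve_fit(exponential_func, x_clean, y_clean,
                           p0=[10, 0.5, 5],
                           bounds=([0, 0.01, -10], [50, 5, 15]),
                           maxfev=5000)
print(f"Bounded exponential parameters: {popt_b}")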
In [13]:
# 5. VISUALIZATION OF ALL FITS
print("\n" + "="*60)
print("5. CREATING VISUALIZATIONS")
print("="*60)
# Create comprehensive visualization
fig = plt.figure(figsize=(20, 15))
# Plot 1: Linear regression
plt.subplot(3, 3, 1)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
x_line = np.linspace(x_clean.min(), x_clean.max(), 100)
y_line_linear = linear_func(x_line, *fit_results['Linear']['parameters'])
plt.plot(x_line, y_line_linear, 'r-', linewidth=2, label='Linear fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Linear Fit (R² = {fit_results["Linear"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
============================================================
5. CREATING VISUALIZATIONS
============================================================
In [14]:
# Plot 2: Quadratic fit
plt.subplot(3, 3, 2)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_quad = quadratic_func(x_line, *fit_results['Quadratic']['parameters'])
plt.plot(x_line, y_line_quad, 'g-', linewidth=2, label='Quadratic fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Quadratic Fit (R² = {fit_results["Quadratic"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [15]:
# Plot 3: Exponential decay fit
plt.subplot(3, 3, 3)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_exp = exponential_func(x_line, *fit_results['Exponential Decay']['parameters'])
plt.plot(x_line, y_line_exp, 'purple', linewidth=2, label='Exponential decay fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Exponential Decay Fit (R² = {fit_results["Exponential Decay"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [16]:
# Plot 4: Power law fit
plt.subplot(3, 3, 4)
plt.scatter(x_clean, y_clean, alpha=0.5, label='Data', color='blue')
y_line_power = power_func(x_line, *fit_results['Power Law']['parameters'])
plt.plot(x_line, y_line_power, 'orange', linewidth=2, label='Power law fit')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title(f'Power Law Fit (R² = {fit_results["Power Law"]["r2"]:.3f})')
plt.legend()
plt.grid(True, alpha=0.3)
In [17]:
# Plot 5: Comparison of all fits
plt.subplot(3, 3, 5)
plt.scatter(x_clean, y_clean, alpha=0.3, label='Data', color='gray')
plt.plot(x_line, y_line_linear, 'r-', linewidth=2, label='Linear')
plt.plot(x_line, y_line_quad, 'g-', linewidth=2, label='Quadratic')
plt.plot(x_line, y_line_exp, 'purple', linewidth=2, label='Exponential')
plt.plot(x_line, y_line_power, 'orange', linewidth=2, label='Power Law')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Happiness Index')
plt.title('Comparison of All Fits')
plt.legend()
plt.grid(True, alpha=0.3)
In [18]:
# Plot 6: Residuals for linear fit
plt.subplot(3, 3, 6)
residuals_linear = y_clean - linear_func(x_clean, *fit_results['Linear']['parameters'])
plt.scatter(x_clean, residuals_linear, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Residuals')
plt.title('Residuals - Linear Fit')
plt.grid(True, alpha=0.3)
In [19]:
# Plot 7: Residuals for quadratic fit
plt.subplot(3, 3, 7)
residuals_quad = y_clean - quadratic_func(x_clean, *fit_results['Quadratic']['parameters'])
plt.scatter(x_clean, residuals_quad, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Daily Screen Time (hrs)')
plt.ylabel('Residuals')
plt.title('Residuals - Quadratic Fit')
plt.grid(True, alpha=0.3)
In [21]:
# Plot 8: Distribution of residuals comparison
plt.subplot(3, 3, 8)
plt.hist(residuals_linear, alpha=0.7, label='Linear', bins=20, density=True)
plt.hist(residuals_quad, alpha=0.7, label='Quadratic', bins=20, density=True)
plt.xlabel('Residual Value')
plt.ylabel('Density')
plt.title('Distribution of Residuals')
plt.legend()
plt.grid(True, alpha=0.3)
In [22]:
# Plot 9: Actual vs Predicted for best model
best_model_name = max(fit_results.items(), key=lambda x: x[1]['r2'])[0]
best_predictions = fit_results[best_model_name]['predictions']
plt.subplot(3, 3, 9)
plt.scatter(y_clean, best_predictions, alpha=0.5)
plt.plot([y_clean.min(), y_clean.max()], [y_clean.min(), y_clean.max()], 'r--', linewidth=2)
plt.xlabel('Actual Happiness')
plt.ylabel('Predicted Happiness')
plt.title(f'Actual vs Predicted ({best_model_name})')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [23]:
# 6. MULTI-VARIABLE FUNCTION FITTING
print("\n" + "="*60)
print("6. MULTI-VARIABLE FUNCTION FITTING")
print("="*60)
# Define a multi-variable function
def multi_var_func(X, a, b, c, d, e, f):
    x1, x2, x3, x4, x5, x6 = X
    return a*x1 + b*x2 + c*x3 + d*x4 + e*x5 + f*x6
# Prepare data for multi-variable fitting
x1 = df['Daily_Screen_Time(hrs)'].values
x2 = df['Sleep_Quality(1-10)'].values
x3 = df['Stress_Level(1-10)'].values
x4 = df['Days_Without_Social_Media'].values
x5 = df['Exercise_Frequency(week)'].values
x6 = df['Age'].values
y = df['Happiness_Index(1-10)'].values
# Remove NaN values
mask = ~(np.isnan(x1) | np.isnan(x2) | np.isnan(x3) | np.isnan(x4) | np.isnan(x5) | np.isnan(x6) | np.isnan(y))
x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean, y_clean = \
    x1[mask], x2[mask], x3[mask], x4[mask], x5[mask], x6[mask], y[mask]
try:
    # Fit multi-variable function
    popt_multi, pcov_multi = curve_fit(multi_var_func,
                                       (x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean),
                                       y_clean, maxfev=5000)
    a, b, c, d, e, f = popt_multi
    y_pred_multi_func = multi_var_func((x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean), *popt_multi)
    r2_multi_func = r2_score(y_clean, y_pred_multi_func)
    print("Multi-variable Function:")
    print(f"Happiness = {a:.3f}×ScreenTime + {b:.3f}×SleepQuality + {c:.3f}×StressLevel + {d:.3f}×DaysWithoutSM + {e:.3f}×Exercise + {f:.3f}×Age")
    print(f"R² Score: {r2_multi_func:.3f}")
except Exception as e:
    print(f"Multi-variable fitting failed: {e}")
============================================================
6. MULTI-VARIABLE FUNCTION FITTING
============================================================
Multi-variable Function:
Happiness = 0.358×ScreenTime + 1.014×SleepQuality + -0.256×StressLevel + 0.096×DaysWithoutSM + 0.117×Exercise + 0.032×Age
R² Score: 0.373
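Note that multi_var_func has no constant term, so it is strictly more constrained than the section-3 regression (sklearn's LinearRegression fits an intercept by default); that alone explains the drop from R² = 0.645 to 0.373 on the same features. A hypothetical variant (not from the original analysis) with an added intercept parameter g:
In [ ]:
# Hedged sketch: the same linear form plus an intercept g. With the
# intercept included, this should recover essentially the section-3 fit.
def multi_var_func_i(X, a, b, c, d, e, f, g):
    x1, x2, x3, x4, x5, x6 = X
    return a*x1 + b*x2 + c*x3 + d*x4 + e*x5 + f*x6 + g

X_tuple = (x1_clean, x2_clean, x3_clean, x4_clean, x5_clean, x6_clean)
popt_i, pcov_i = curve_fit(multi_var_func_i, X_tuple, y_clean, maxfev=5000)
print(f"R² with intercept: {r2_score(y_clean, multi_var_func_i(X_tuple, *popt_i)):.3f}")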
In [24]:
# 7. MODEL COMPARISON AND SUMMARY
print("\n" + "="*60)
print("7. MODEL COMPARISON SUMMARY")
print("="*60)
# Create comparison table
comparison_data = {
    'Model': ['Simple Linear', 'Multiple Linear'] + list(fit_results.keys()),
    'R² Score': [r2_linear, r2_multi] + [fit_results[name]['r2'] for name in fit_results],
    'MSE': [mse_linear, mse_multi] + [fit_results[name]['mse'] for name in fit_results]
}
comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('R² Score', ascending=False)
print(comparison_df.to_string(index=False))
# Find best model
best_overall = comparison_df.iloc[0]
print(f"\n🏆 BEST MODEL: {best_overall['Model']} (R² = {best_overall['R² Score']:.3f})")
============================================================
7. MODEL COMPARISON SUMMARY
============================================================
Model R² Score MSE
Multiple Linear 0.645409 0.822164
Quadratic 0.513426 1.128182
Simple Linear 0.497315 1.165537
Linear 0.497315 1.165537
Exponential Decay 0.497308 1.165554
Power Law 0.384568 1.426954
🏆 BEST MODEL: Multiple Linear (R² = 0.645)
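Two caveats on this table: 'Simple Linear' and 'Linear' are the same fit obtained through two APIs (hence identical scores), and plain R² never penalizes extra parameters. A small sketch (an addition) computing adjusted R², where n is the sample size and p the number of predictors excluding the intercept:
In [ ]:
# Hedged sketch: adjusted R² penalizes parameter count, unlike plain R².
# adj_R² = 1 - (1 - R²)(n - 1)/(n - p - 1)
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"Multiple Linear adjusted R²: {adjusted_r2(r2_multi, len(df), len(features)):.3f}")
print(f"Quadratic adjusted R²: {adjusted_r2(fit_results['Quadratic']['r2'], len(df), 2):.3f}")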
In [25]:
# 8. PREDICTION ON NEW DATA
print("\n" + "="*60)
print("8. PREDICTION EXAMPLES")
print("="*60)
# Use the best single-variable model for prediction
if best_model_name in fit_results:
    best_func = functions[best_model_name]
    best_params = fit_results[best_model_name]['parameters']
    # Predict happiness for some example screen times using the best fit
    example_times = [2, 5, 8, 10]
    print("Predicted Happiness for different screen times:")
    for time in example_times:
        pred = best_func(time, *best_params)
        print(f" {time} hours screen time → {pred:.1f} happiness")
============================================================
8. PREDICTION EXAMPLES
============================================================
Predicted Happiness for different screen times:
 2 hours screen time → 10.1 happiness
 5 hours screen time → 8.8 happiness
 8 hours screen time → 6.7 happiness
 10 hours screen time → 4.8 happiness
In [26]:
# 9. ADDITIONAL ANALYSIS: STRESS vs HAPPINESS
print("\n" + "="*60)
print("9. ADDITIONAL ANALYSIS: Stress vs Happiness")
print("="*60)
# Fit stress vs happiness
x_stress = df['Stress_Level(1-10)'].values.reshape(-1, 1)
y_happiness = df['Happiness_Index(1-10)'].values
stress_reg = LinearRegression()
stress_reg.fit(x_stress, y_happiness)
y_pred_stress = stress_reg.predict(x_stress)
r2_stress = r2_score(y_happiness, y_pred_stress)
print(f"Stress vs Happiness: Happiness = {stress_reg.coef_[0]:.3f} Ă— Stress + {stress_reg.intercept_:.3f}")
print(f"R² Score: {r2_stress:.3f}")
# Final summary
print("\n" + "="*60)
print("FUNCTION FITTING COMPLETE!")
print("="*60)
print("Key Insights:")
print(f"- Screen time shows a { 'negative' if slope < 0 else 'positive'} correlation with happiness")
print(f"- The best fitting model explains {best_overall['R² Score']*100:.1f}% of variance in happiness")
print("- Multiple regression provides better prediction using all available features")
print("- Check residuals to ensure model assumptions are met")
print("\nNext steps:")
print("1. Consider feature engineering")
print("2. Try regularization techniques (Ridge, Lasso)")
print("3. Explore interaction terms")
print("4. Validate with train-test split")
============================================================
9. ADDITIONAL ANALYSIS: Stress vs Happiness
============================================================
Stress vs Happiness: Happiness = -0.728 × Stress + 13.196
R² Score: 0.543

============================================================
FUNCTION FITTING COMPLETE!
============================================================
Key Insights:
- Screen time shows a negative correlation with happiness
- The best fitting model explains 64.5% of variance in happiness
- Multiple regression provides better prediction using all available features
- Check residuals to ensure model assumptions are met

Next steps:
1. Consider feature engineering
2. Try regularization techniques (Ridge, Lasso)
3. Explore interaction terms
4. Validate with train-test split
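All scores above are in-sample, even though train_test_split was imported in cell 1 and never used. A minimal sketch of next steps 2 and 4, assuming the X_multi and y_multi objects from section 3; the 80/20 split, alpha=1.0, and random_state=42 are illustrative choices, not tuned values:
In [ ]:
# Hedged sketch: held-out validation plus a regularized baseline.
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42)
ols = LinearRegression().fit(X_train, y_train)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
print(f"OLS test R²:   {r2_score(y_test, ols.predict(X_test)):.3f}")
print(f"Ridge test R²: {r2_score(y_test, ridge.predict(X_test)):.3f}")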
In [ ]: