import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot styling: matplotlib's default style sheet with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv('datasets/mental_health.csv')

# Structural overview of the loaded frame: shape, sample rows, dtypes,
# summary statistics and per-column missing-value counts.
print("Dataset Overview:")
print("=" * 50)
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
df.info()
print("\nDescriptive Statistics:")
print(df.describe())
print("\nMissing values:")
print(df.isnull().sum())

Dataset Overview:
==================================================
Dataset shape: (500, 10)

First few rows:
  User_ID  Age  Gender  Daily_Screen_Time(hrs)  Sleep_Quality(1-10)  \
0    U001   44    Male                     3.1                  7.0   
1    U002   30   Other                     5.1                  7.0   
2    U003   23   Other                     7.4                  6.0   
3    U004   36  Female                     5.7                  7.0   
4    U005   34  Female                     7.0                  4.0   

   Stress_Level(1-10)  Days_Without_Social_Media  Exercise_Frequency(week)  \
0                 6.0                        2.0                       5.0   
1                 8.0                        5.0                       3.0   
2                 7.0                        1.0                       3.0   
3                 8.0                        1.0                       1.0   
4                 7.0                        5.0                       1.0   

  Social_Media_Platform  Happiness_Index(1-10)  
0              Facebook                   10.0  
1              LinkedIn                   10.0  
2               YouTube                    6.0  
3                TikTok                    8.0  
4           X (Twitter)                    8.0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   User_ID                    500 non-null    object 
 1   Age                        500 non-null    int64  
 2   Gender                     500 non-null    object 
 3   Daily_Screen_Time(hrs)     500 non-null    float64
 4   Sleep_Quality(1-10)        500 non-null    float64
 5   Stress_Level(1-10)         500 non-null    float64
 6   Days_Without_Social_Media  500 non-null    float64
 7   Exercise_Frequency(week)   500 non-null    float64
 8   Social_Media_Platform      500 non-null    object 
 9   Happiness_Index(1-10)      500 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 39.2+ KB
None

Descriptive Statistics:
              Age  Daily_Screen_Time(hrs)  Sleep_Quality(1-10)  \
count  500.000000              500.000000           500.000000   
mean    32.988000                5.530000             6.304000   
std      9.960637                1.734877             1.529792   
min     16.000000                1.000000             2.000000   
25%     24.000000                4.300000             5.000000   
50%     34.000000                5.600000             6.000000   
75%     41.000000                6.700000             7.000000   
max     49.000000               10.800000            10.000000   

       Stress_Level(1-10)  Days_Without_Social_Media  \
count          500.000000                 500.000000   
mean             6.618000                   3.134000   
std              1.542996                   1.858751   
min              2.000000                   0.000000   
25%              6.000000                   2.000000   
50%              7.000000                   3.000000   
75%              8.000000                   5.000000   
max             10.000000                   9.000000   

       Exercise_Frequency(week)  Happiness_Index(1-10)  
count                500.000000             500.000000  
mean                   2.448000               8.376000  
std                    1.428067               1.524228  
min                    0.000000               4.000000  
25%                    1.000000               7.000000  
50%                    2.000000               9.000000  
75%                    3.000000              10.000000  
max                    7.000000              10.000000  

Missing values:
User_ID                      0
Age                          0
Gender                       0
Daily_Screen_Time(hrs)       0
Sleep_Quality(1-10)          0
Stress_Level(1-10)           0
Days_Without_Social_Media    0
Exercise_Frequency(week)     0
Social_Media_Platform        0
Happiness_Index(1-10)        0
dtype: int64

# Figure 1: demographic overview on a 2x2 grid (age, gender, platform, exercise).
# The figure is created, drawn, laid out and shown as one contiguous unit, and
# the layout is requested on `fig` itself rather than on pyplot's implicit
# "current figure", so it cannot be applied to some other (possibly new and
# empty) figure when the steps are executed separately.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Demographic Distribution', fontsize=16, fontweight='bold')

# Top-left: age histogram.
axes[0, 0].hist(df['Age'], bins=20, edgecolor='black', alpha=0.7, color='skyblue')
axes[0, 0].set_title('Age Distribution')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Frequency')

# Top-right: share of each gender value as a pie chart.
gender_counts = df['Gender'].value_counts()
axes[0, 1].pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
axes[0, 1].set_title('Gender Distribution')

# Bottom-left: number of rows per platform (value_counts orders by count).
platform_counts = df['Social_Media_Platform'].value_counts()
axes[1, 0].bar(platform_counts.index, platform_counts.values, color='lightcoral')
axes[1, 0].set_title('Social Media Platform Usage')
axes[1, 0].set_xlabel('Platform')
axes[1, 0].set_ylabel('Count')
# Rotate the platform names so neighbouring labels do not overlap.
plt.setp(axes[1, 0].xaxis.get_majorticklabels(), rotation=45)

# Bottom-right: rows per weekly exercise frequency, ordered by frequency value.
exercise_counts = df['Exercise_Frequency(week)'].value_counts().sort_index()
axes[1, 1].bar(exercise_counts.index, exercise_counts.values, color='lightgreen')
axes[1, 1].set_title('Exercise Frequency per Week')
axes[1, 1].set_xlabel('Exercise Frequency (times/week)')
axes[1, 1].set_ylabel('Count')

fig.tight_layout()
plt.show()

<Figure size 640x480 with 0 Axes>

# Figure 2: distributions of the well-being metrics on a 2x3 grid.
# Built, laid out and shown as one unit; layout is requested on `fig` itself
# so it always targets this figure rather than pyplot's "current figure".
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Mental Health Metrics Analysis', fontsize=16, fontweight='bold')

# One entry per histogram, in row-major grid order:
# (column, number of bins, bar colour, panel title, x-axis label).
histogram_specs = [
    ('Stress_Level(1-10)', 10, 'red',
     'Stress Level Distribution (1-10)', 'Stress Level'),
    ('Sleep_Quality(1-10)', 10, 'purple',
     'Sleep Quality Distribution (1-10)', 'Sleep Quality'),
    ('Happiness_Index(1-10)', 10, 'orange',
     'Happiness Index Distribution (1-10)', 'Happiness Index'),
    ('Daily_Screen_Time(hrs)', 20, 'blue',
     'Daily Screen Time Distribution (hours)', 'Daily Screen Time (hours)'),
    ('Days_Without_Social_Media', 15, 'green',
     'Days Without Social Media', 'Days Without Social Media'),
]

# zip() stops after the five specs, leaving the sixth axes untouched.
for hist_ax, (column, bins, color, title, xlabel) in zip(axes.flat, histogram_specs):
    hist_ax.hist(df[column], bins=bins, edgecolor='black', alpha=0.7, color=color)
    hist_ax.set_title(title)
    hist_ax.set_xlabel(xlabel)
    hist_ax.set_ylabel('Frequency')

# Only five metrics are plotted; remove the unused bottom-right axes.
fig.delaxes(axes[1, 2])

fig.tight_layout()
plt.show()

<Figure size 640x480 with 0 Axes>

# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap. `numerical_cols` and `correlation_matrix` stay module-level names.
numerical_cols = [
    'Age',
    'Daily_Screen_Time(hrs)',
    'Sleep_Quality(1-10)',
    'Stress_Level(1-10)',
    'Days_Without_Social_Media',
    'Exercise_Frequency(week)',
    'Happiness_Index(1-10)',
]
numeric_frame = df[numerical_cols]
correlation_matrix = numeric_frame.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,        # write each coefficient into its cell
    cmap='coolwarm',
    center=0,          # anchor the diverging colour map at zero correlation
    square=True,
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Numerical Variables', fontsize=16, fontweight='bold')
heatmap_fig.tight_layout()
plt.show()

# 4. Relationships between key variables
# Six scatter plots on a 2x3 grid. The figure is built, laid out and shown as
# one unit; layout is requested on `fig` itself so it always targets this
# figure rather than pyplot's "current figure".
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Relationships Between Key Variables', fontsize=16, fontweight='bold')

# One entry per panel, in row-major grid order:
# (x column, y column, marker colour or None, x-axis label, y-axis label, title).
# A colour of None leaves the choice to the active colour cycle.
scatter_specs = [
    ('Daily_Screen_Time(hrs)', 'Happiness_Index(1-10)', None,
     'Daily Screen Time (hours)', 'Happiness Index', 'Screen Time vs Happiness'),
    ('Daily_Screen_Time(hrs)', 'Stress_Level(1-10)', 'red',
     'Daily Screen Time (hours)', 'Stress Level', 'Screen Time vs Stress'),
    ('Sleep_Quality(1-10)', 'Happiness_Index(1-10)', 'green',
     'Sleep Quality', 'Happiness Index', 'Sleep Quality vs Happiness'),
    ('Exercise_Frequency(week)', 'Happiness_Index(1-10)', 'orange',
     'Exercise Frequency (times/week)', 'Happiness Index', 'Exercise vs Happiness'),
    ('Days_Without_Social_Media', 'Happiness_Index(1-10)', 'purple',
     'Days Without Social Media', 'Happiness Index', 'Social Media Break vs Happiness'),
    ('Stress_Level(1-10)', 'Happiness_Index(1-10)', 'brown',
     'Stress Level', 'Happiness Index', 'Stress vs Happiness'),
]

for scatter_ax, (x_col, y_col, color, xlabel, ylabel, title) in zip(axes.flat, scatter_specs):
    # Only pass `color` when one was specified, exactly as the explicit
    # per-panel calls did.
    color_kwargs = {} if color is None else {'color': color}
    scatter_ax.scatter(df[x_col], df[y_col], alpha=0.6, **color_kwargs)
    scatter_ax.set_xlabel(xlabel)
    scatter_ax.set_ylabel(ylabel)
    scatter_ax.set_title(title)

fig.tight_layout()
plt.show()

<Figure size 640x480 with 0 Axes>

# Figure: per-platform averages of four metrics on a 2x2 grid of bar charts.
# The grid is created with plt.subplots() and every panel draws on an explicit
# Axes, so all four charts belong to the same 15x10 figure instead of relying
# on pyplot's implicit "current figure" surviving between steps.
platform_fig, platform_axes = plt.subplots(2, 2, figsize=(15, 10))

# Group once and derive each metric's mean, sorted from highest to lowest.
platform_groups = df.groupby('Social_Media_Platform')
platform_happiness = platform_groups['Happiness_Index(1-10)'].mean().sort_values(ascending=False)
platform_stress = platform_groups['Stress_Level(1-10)'].mean().sort_values(ascending=False)
platform_screen = platform_groups['Daily_Screen_Time(hrs)'].mean().sort_values(ascending=False)
platform_sleep = platform_groups['Sleep_Quality(1-10)'].mean().sort_values(ascending=False)

# One entry per panel, in row-major grid order:
# (series to plot, bar colour, panel title, y-axis label).
platform_panels = [
    (platform_happiness, 'lightblue',
     'Average Happiness Index by Social Media Platform', 'Average Happiness Index'),
    (platform_stress, 'lightcoral',
     'Average Stress Level by Social Media Platform', 'Average Stress Level'),
    (platform_screen, 'lightgreen',
     'Average Screen Time by Social Media Platform', 'Average Screen Time (hours)'),
    (platform_sleep, 'lightyellow',
     'Average Sleep Quality by Social Media Platform', 'Average Sleep Quality'),
]

for panel_ax, (platform_means, color, title, ylabel) in zip(platform_axes.flat, platform_panels):
    platform_means.plot(kind='bar', color=color, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Social Media Platform')
    panel_ax.set_ylabel(ylabel)
    # Rotate the platform names so neighbouring labels do not overlap.
    plt.setp(panel_ax.get_xticklabels(), rotation=45)

platform_fig.tight_layout()
plt.show()

# Bucket ages into four labelled groups. pd.cut intervals are right-closed by
# default, so the bins are (15, 25], (25, 35], (35, 45] and (45, 55].
df['Age_Group'] = pd.cut(df['Age'], bins=[15, 25, 35, 45, 55], labels=['15-25', '26-35', '36-45', '46-55'])

# 'Age_Group' is categorical. Passing observed=False states the current
# default explicitly: results are unchanged, and pandas no longer emits the
# FutureWarning about that default changing in a future version.
age_groups = df.groupby('Age_Group', observed=False)
age_happiness = age_groups['Happiness_Index(1-10)'].mean()
age_screen = age_groups['Daily_Screen_Time(hrs)'].mean()
age_stress = age_groups['Stress_Level(1-10)'].mean()
age_exercise = age_groups['Exercise_Frequency(week)'].mean()

# Figure: per-age-group averages on a 2x2 grid of bar charts. Every panel
# draws on an explicit Axes so all four charts belong to the same 15x10
# figure instead of relying on pyplot's implicit "current figure".
age_fig, age_axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel, in row-major grid order:
# (series to plot, bar colour, panel title, y-axis label).
age_panels = [
    (age_happiness, 'skyblue',
     'Average Happiness Index by Age Group', 'Average Happiness Index'),
    (age_screen, 'lightcoral',
     'Average Screen Time by Age Group', 'Average Screen Time (hours)'),
    (age_stress, 'lightgreen',
     'Average Stress Level by Age Group', 'Average Stress Level'),
    (age_exercise, 'gold',
     'Average Exercise Frequency by Age Group', 'Average Exercise Frequency'),
]

for panel_ax, (group_means, color, title, ylabel) in zip(age_axes.flat, age_panels):
    group_means.plot(kind='bar', color=color, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Age Group')
    panel_ax.set_ylabel(ylabel)

age_fig.tight_layout()
plt.show()

/tmp/ipykernel_2578/2839182975.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  age_exercise = df.groupby('Age_Group')['Exercise_Frequency(week)'].mean()

# Headline numbers: the mean of each tracked metric, printed to two decimals
# under a banner.
banner = "=" * 60
print("\n" + banner)
print("KEY INSIGHTS SUMMARY")
print(banner)

happiness_mean = df['Happiness_Index(1-10)'].mean()
stress_mean = df['Stress_Level(1-10)'].mean()
screen_time_mean = df['Daily_Screen_Time(hrs)'].mean()
sleep_quality_mean = df['Sleep_Quality(1-10)'].mean()
offline_days_mean = df['Days_Without_Social_Media'].mean()
exercise_mean = df['Exercise_Frequency(week)'].mean()

print(f"\nOverall Average Happiness: {happiness_mean:.2f}")
print(f"Overall Average Stress Level: {stress_mean:.2f}")
print(f"Overall Average Screen Time: {screen_time_mean:.2f} hours")
print(f"Overall Average Sleep Quality: {sleep_quality_mean:.2f}")
print(f"Average Days Without Social Media: {offline_days_mean:.2f} days")
print(f"Average Exercise Frequency: {exercise_mean:.2f} times/week")

============================================================
KEY INSIGHTS SUMMARY
============================================================

Overall Average Happiness: 8.38
Overall Average Stress Level: 6.62
Overall Average Screen Time: 5.53 hours
Overall Average Sleep Quality: 6.30
Average Days Without Social Media: 3.13 days
Average Exercise Frequency: 2.45 times/week

# Correlation of every numeric column with the happiness index, listed from
# the most positive to the most negative coefficient.
happiness_correlations = correlation_matrix['Happiness_Index(1-10)'].sort_values(ascending=False)
print("\nTop correlations with Happiness:")
for var, corr in happiness_correlations.items():
    # Skip the happiness column's correlation with itself.
    if var != 'Happiness_Index(1-10)':
        print(f"  {var}: {corr:.3f}")

# Compute the per-platform mean once and read both extremes from it, instead
# of repeating the same group-by for idxmax() and idxmin().
mean_happiness_by_platform = df.groupby('Social_Media_Platform')['Happiness_Index(1-10)'].mean()
max_happy_platform = mean_happiness_by_platform.idxmax()
min_happy_platform = mean_happiness_by_platform.idxmin()
print(f"\nPlatform with highest average happiness: {max_happy_platform}")
print(f"Platform with lowest average happiness: {min_happy_platform}")

Top correlations with Happiness:
  Sleep_Quality(1-10): 0.679
  Days_Without_Social_Media: 0.064
  Exercise_Frequency(week): 0.041
  Age: 0.019
  Daily_Screen_Time(hrs): -0.705
  Stress_Level(1-10): -0.737

Platform with highest average happiness: X (Twitter)
Platform with lowest average happiness: Instagram

# Figure: per-platform distribution of four metrics as box plots on a 2x2 grid.
box_fig, box_axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel, in row-major grid order: (metric column, panel title).
boxplot_panels = [
    ('Happiness_Index(1-10)', 'Happiness Index Distribution by Platform'),
    ('Stress_Level(1-10)', 'Stress Level Distribution by Platform'),
    ('Daily_Screen_Time(hrs)', 'Screen Time Distribution by Platform'),
    ('Sleep_Quality(1-10)', 'Sleep Quality Distribution by Platform'),
]

for panel_ax, (metric, panel_title) in zip(box_axes.flat, boxplot_panels):
    sns.boxplot(data=df, x='Social_Media_Platform', y=metric, ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Rotate the platform names so neighbouring labels do not overlap.
    plt.setp(panel_ax.get_xticklabels(), rotation=45)

box_fig.tight_layout()
plt.show()

print("\nVisualization completed! Check the charts above for insights.")

Visualization completed! Check the charts above for insights.