In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
In [2]:
plt.style.use('default')
sns.set_palette("husl")
In [3]:
df = pd.read_csv('datasets/mental_health.csv')
In [4]:
print("Dataset Overview:")
print("=" * 50)
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nDescriptive Statistics:")
print(df.describe())
print("\nMissing values:")
print(df.isnull().sum())
Dataset Overview:
==================================================
Dataset shape: (500, 10)
First few rows:
User_ID Age Gender Daily_Screen_Time(hrs) Sleep_Quality(1-10) \
0 U001 44 Male 3.1 7.0
1 U002 30 Other 5.1 7.0
2 U003 23 Other 7.4 6.0
3 U004 36 Female 5.7 7.0
4 U005 34 Female 7.0 4.0
Stress_Level(1-10) Days_Without_Social_Media Exercise_Frequency(week) \
0 6.0 2.0 5.0
1 8.0 5.0 3.0
2 7.0 1.0 3.0
3 8.0 1.0 1.0
4 7.0 5.0 1.0
Social_Media_Platform Happiness_Index(1-10)
0 Facebook 10.0
1 LinkedIn 10.0
2 YouTube 6.0
3 TikTok 8.0
4 X (Twitter) 8.0
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 500 non-null object
1 Age 500 non-null int64
2 Gender 500 non-null object
3 Daily_Screen_Time(hrs) 500 non-null float64
4 Sleep_Quality(1-10) 500 non-null float64
5 Stress_Level(1-10) 500 non-null float64
6 Days_Without_Social_Media 500 non-null float64
7 Exercise_Frequency(week) 500 non-null float64
8 Social_Media_Platform 500 non-null object
9 Happiness_Index(1-10) 500 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 39.2+ KB
None
Descriptive Statistics:
Age Daily_Screen_Time(hrs) Sleep_Quality(1-10) \
count 500.000000 500.000000 500.000000
mean 32.988000 5.530000 6.304000
std 9.960637 1.734877 1.529792
min 16.000000 1.000000 2.000000
25% 24.000000 4.300000 5.000000
50% 34.000000 5.600000 6.000000
75% 41.000000 6.700000 7.000000
max 49.000000 10.800000 10.000000
Stress_Level(1-10) Days_Without_Social_Media \
count 500.000000 500.000000
mean 6.618000 3.134000
std 1.542996 1.858751
min 2.000000 0.000000
25% 6.000000 2.000000
50% 7.000000 3.000000
75% 8.000000 5.000000
max 10.000000 9.000000
Exercise_Frequency(week) Happiness_Index(1-10)
count 500.000000 500.000000
mean 2.448000 8.376000
std 1.428067 1.524228
min 0.000000 4.000000
25% 1.000000 7.000000
50% 2.000000 9.000000
75% 3.000000 10.000000
max 7.000000 10.000000
Missing values:
User_ID 0
Age 0
Gender 0
Daily_Screen_Time(hrs) 0
Sleep_Quality(1-10) 0
Stress_Level(1-10) 0
Days_Without_Social_Media 0
Exercise_Frequency(week) 0
Social_Media_Platform 0
Happiness_Index(1-10) 0
dtype: int64
In [5]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Demographic Distribution', fontsize=16, fontweight='bold')
Out[5]:
Text(0.5, 0.98, 'Demographic Distribution')
In [6]:
axes[0,0].hist(df['Age'], bins=20, edgecolor='black', alpha=0.7, color='skyblue')
axes[0,0].set_title('Age Distribution')
axes[0,0].set_xlabel('Age')
axes[0,0].set_ylabel('Frequency')
Out[6]:
Text(4.444444444444452, 0.5, 'Frequency')
In [7]:
gender_counts = df['Gender'].value_counts()
axes[0,1].pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
axes[0,1].set_title('Gender Distribution')
Out[7]:
Text(0.5, 1.0, 'Gender Distribution')
In [8]:
platform_counts = df['Social_Media_Platform'].value_counts()
axes[1,0].bar(platform_counts.index, platform_counts.values, color='lightcoral')
axes[1,0].set_title('Social Media Platform Usage')
axes[1,0].set_xlabel('Platform')
axes[1,0].set_ylabel('Count')
plt.setp(axes[1,0].xaxis.get_majorticklabels(), rotation=45)
Out[8]:
[None, None, None, None, None, None]
In [9]:
exercise_counts = df['Exercise_Frequency(week)'].value_counts().sort_index()
axes[1,1].bar(exercise_counts.index, exercise_counts.values, color='lightgreen')
axes[1,1].set_title('Exercise Frequency per Week')
axes[1,1].set_xlabel('Exercise Frequency (times/week)')
axes[1,1].set_ylabel('Count')
Out[9]:
Text(638.5353535353535, 0.5, 'Count')
In [10]:
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
In [11]:
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Mental Health Metrics Analysis', fontsize=16, fontweight='bold')
Out[11]:
Text(0.5, 0.98, 'Mental Health Metrics Analysis')
In [12]:
axes[0,0].hist(df['Stress_Level(1-10)'], bins=10, edgecolor='black', alpha=0.7, color='red')
axes[0,0].set_title('Stress Level Distribution (1-10)')
axes[0,0].set_xlabel('Stress Level')
axes[0,0].set_ylabel('Frequency')
Out[12]:
Text(4.444444444444452, 0.5, 'Frequency')
In [13]:
axes[0,1].hist(df['Sleep_Quality(1-10)'], bins=10, edgecolor='black', alpha=0.7, color='purple')
axes[0,1].set_title('Sleep Quality Distribution (1-10)')
axes[0,1].set_xlabel('Sleep Quality')
axes[0,1].set_ylabel('Frequency')
Out[13]:
Text(496.7973856209152, 0.5, 'Frequency')
In [14]:
axes[0,2].hist(df['Happiness_Index(1-10)'], bins=10, edgecolor='black', alpha=0.7, color='orange')
axes[0,2].set_title('Happiness Index Distribution (1-10)')
axes[0,2].set_xlabel('Happiness Index')
axes[0,2].set_ylabel('Frequency')
Out[14]:
Text(989.1503267973857, 0.5, 'Frequency')
In [15]:
axes[1,0].hist(df['Daily_Screen_Time(hrs)'], bins=20, edgecolor='black', alpha=0.7, color='blue')
axes[1,0].set_title('Daily Screen Time Distribution (hours)')
axes[1,0].set_xlabel('Daily Screen Time (hours)')
axes[1,0].set_ylabel('Frequency')
Out[15]:
Text(4.444444444444452, 0.5, 'Frequency')
In [16]:
axes[1,1].hist(df['Days_Without_Social_Media'], bins=15, edgecolor='black', alpha=0.7, color='green')
axes[1,1].set_title('Days Without Social Media')
axes[1,1].set_xlabel('Days Without Social Media')
axes[1,1].set_ylabel('Frequency')
Out[16]:
Text(496.7973856209152, 0.5, 'Frequency')
In [17]:
fig.delaxes(axes[1,2])
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
In [18]:
numerical_cols = ['Age', 'Daily_Screen_Time(hrs)', 'Sleep_Quality(1-10)',
'Stress_Level(1-10)', 'Days_Without_Social_Media',
'Exercise_Frequency(week)', 'Happiness_Index(1-10)']
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
square=True, linewidths=0.5)
plt.title('Correlation Matrix of Numerical Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
# 4. Relationships between key variables
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Relationships Between Key Variables', fontsize=16, fontweight='bold')
Out[18]:
Text(0.5, 0.98, 'Relationships Between Key Variables')
In [19]:
axes[0,0].scatter(df['Daily_Screen_Time(hrs)'], df['Happiness_Index(1-10)'], alpha=0.6)
axes[0,0].set_xlabel('Daily Screen Time (hours)')
axes[0,0].set_ylabel('Happiness Index')
axes[0,0].set_title('Screen Time vs Happiness')
Out[19]:
Text(0.5, 1.0, 'Screen Time vs Happiness')
In [20]:
axes[0,1].scatter(df['Daily_Screen_Time(hrs)'], df['Stress_Level(1-10)'], alpha=0.6, color='red')
axes[0,1].set_xlabel('Daily Screen Time (hours)')
axes[0,1].set_ylabel('Stress Level')
axes[0,1].set_title('Screen Time vs Stress')
Out[20]:
Text(0.5, 1.0, 'Screen Time vs Stress')
In [21]:
axes[0,2].scatter(df['Sleep_Quality(1-10)'], df['Happiness_Index(1-10)'], alpha=0.6, color='green')
axes[0,2].set_xlabel('Sleep Quality')
axes[0,2].set_ylabel('Happiness Index')
axes[0,2].set_title('Sleep Quality vs Happiness')
Out[21]:
Text(0.5, 1.0, 'Sleep Quality vs Happiness')
In [22]:
axes[1,0].scatter(df['Exercise_Frequency(week)'], df['Happiness_Index(1-10)'], alpha=0.6, color='orange')
axes[1,0].set_xlabel('Exercise Frequency (times/week)')
axes[1,0].set_ylabel('Happiness Index')
axes[1,0].set_title('Exercise vs Happiness')
Out[22]:
Text(0.5, 1.0, 'Exercise vs Happiness')
In [23]:
axes[1,1].scatter(df['Days_Without_Social_Media'], df['Happiness_Index(1-10)'], alpha=0.6, color='purple')
axes[1,1].set_xlabel('Days Without Social Media')
axes[1,1].set_ylabel('Happiness Index')
axes[1,1].set_title('Social Media Break vs Happiness')
Out[23]:
Text(0.5, 1.0, 'Social Media Break vs Happiness')
In [24]:
axes[1,2].scatter(df['Stress_Level(1-10)'], df['Happiness_Index(1-10)'], alpha=0.6, color='brown')
axes[1,2].set_xlabel('Stress Level')
axes[1,2].set_ylabel('Happiness Index')
axes[1,2].set_title('Stress vs Happiness')
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
In [25]:
plt.figure(figsize=(15, 10))
Out[25]:
<Figure size 1500x1000 with 0 Axes>
<Figure size 1500x1000 with 0 Axes>
In [26]:
plt.subplot(2, 2, 1)
platform_happiness = df.groupby('Social_Media_Platform')['Happiness_Index(1-10)'].mean().sort_values(ascending=False)
platform_happiness.plot(kind='bar', color='lightblue')
plt.title('Average Happiness Index by Social Media Platform')
plt.xlabel('Social Media Platform')
plt.ylabel('Average Happiness Index')
plt.xticks(rotation=45)
Out[26]:
(array([0, 1, 2, 3, 4, 5]), [Text(0, 0, 'X (Twitter)'), Text(1, 0, 'LinkedIn'), Text(2, 0, 'TikTok'), Text(3, 0, 'Facebook'), Text(4, 0, 'YouTube'), Text(5, 0, 'Instagram')])
In [27]:
plt.subplot(2, 2, 2)
platform_stress = df.groupby('Social_Media_Platform')['Stress_Level(1-10)'].mean().sort_values(ascending=False)
platform_stress.plot(kind='bar', color='lightcoral')
plt.title('Average Stress Level by Social Media Platform')
plt.xlabel('Social Media Platform')
plt.ylabel('Average Stress Level')
plt.xticks(rotation=45)
Out[27]:
(array([0, 1, 2, 3, 4, 5]), [Text(0, 0, 'Instagram'), Text(1, 0, 'Facebook'), Text(2, 0, 'YouTube'), Text(3, 0, 'TikTok'), Text(4, 0, 'LinkedIn'), Text(5, 0, 'X (Twitter)')])
In [28]:
plt.subplot(2, 2, 3)
platform_screen = df.groupby('Social_Media_Platform')['Daily_Screen_Time(hrs)'].mean().sort_values(ascending=False)
platform_screen.plot(kind='bar', color='lightgreen')
plt.title('Average Screen Time by Social Media Platform')
plt.xlabel('Social Media Platform')
plt.ylabel('Average Screen Time (hours)')
plt.xticks(rotation=45)
Out[28]:
(array([0, 1, 2, 3, 4, 5]), [Text(0, 0, 'Instagram'), Text(1, 0, 'Facebook'), Text(2, 0, 'YouTube'), Text(3, 0, 'TikTok'), Text(4, 0, 'X (Twitter)'), Text(5, 0, 'LinkedIn')])
In [29]:
plt.subplot(2, 2, 4)
platform_sleep = df.groupby('Social_Media_Platform')['Sleep_Quality(1-10)'].mean().sort_values(ascending=False)
platform_sleep.plot(kind='bar', color='lightyellow')
plt.title('Average Sleep Quality by Social Media Platform')
plt.xlabel('Social Media Platform')
plt.ylabel('Average Sleep Quality')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [30]:
df['Age_Group'] = pd.cut(df['Age'], bins=[15, 25, 35, 45, 55], labels=['15-25', '26-35', '36-45', '46-55'])
plt.figure(figsize=(15, 10))
Out[30]:
<Figure size 1500x1000 with 0 Axes>
<Figure size 1500x1000 with 0 Axes>
In [31]:
plt.subplot(2, 2, 1)
age_happiness = df.groupby('Age_Group')['Happiness_Index(1-10)'].mean()
age_happiness.plot(kind='bar', color='skyblue')
plt.title('Average Happiness Index by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Average Happiness Index')
/tmp/ipykernel_2578/4115539301.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
age_happiness = df.groupby('Age_Group')['Happiness_Index(1-10)'].mean()
Out[31]:
Text(0, 0.5, 'Average Happiness Index')
In [32]:
plt.subplot(2, 2, 2)
age_screen = df.groupby('Age_Group')['Daily_Screen_Time(hrs)'].mean()
age_screen.plot(kind='bar', color='lightcoral')
plt.title('Average Screen Time by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Average Screen Time (hours)')
/tmp/ipykernel_2578/4145469376.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
age_screen = df.groupby('Age_Group')['Daily_Screen_Time(hrs)'].mean()
Out[32]:
Text(0, 0.5, 'Average Screen Time (hours)')
In [33]:
plt.subplot(2, 2, 3)
age_stress = df.groupby('Age_Group')['Stress_Level(1-10)'].mean()
age_stress.plot(kind='bar', color='lightgreen')
plt.title('Average Stress Level by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Average Stress Level')
/tmp/ipykernel_2578/1582472313.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
age_stress = df.groupby('Age_Group')['Stress_Level(1-10)'].mean()
Out[33]:
Text(0, 0.5, 'Average Stress Level')
In [34]:
plt.subplot(2, 2, 4)
age_exercise = df.groupby('Age_Group')['Exercise_Frequency(week)'].mean()
age_exercise.plot(kind='bar', color='gold')
plt.title('Average Exercise Frequency by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Average Exercise Frequency')
plt.tight_layout()
plt.show()
/tmp/ipykernel_2578/2839182975.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
age_exercise = df.groupby('Age_Group')['Exercise_Frequency(week)'].mean()
In [35]:
print("\n" + "="*60)
print("KEY INSIGHTS SUMMARY")
print("="*60)
print(f"\nOverall Average Happiness: {df['Happiness_Index(1-10)'].mean():.2f}")
print(f"Overall Average Stress Level: {df['Stress_Level(1-10)'].mean():.2f}")
print(f"Overall Average Screen Time: {df['Daily_Screen_Time(hrs)'].mean():.2f} hours")
print(f"Overall Average Sleep Quality: {df['Sleep_Quality(1-10)'].mean():.2f}")
print(f"Average Days Without Social Media: {df['Days_Without_Social_Media'].mean():.2f} days")
print(f"Average Exercise Frequency: {df['Exercise_Frequency(week)'].mean():.2f} times/week")
============================================================ KEY INSIGHTS SUMMARY ============================================================ Overall Average Happiness: 8.38 Overall Average Stress Level: 6.62 Overall Average Screen Time: 5.53 hours Overall Average Sleep Quality: 6.30 Average Days Without Social Media: 3.13 days Average Exercise Frequency: 2.45 times/week
In [36]:
happiness_correlations = correlation_matrix['Happiness_Index(1-10)'].sort_values(ascending=False)
print(f"\nTop correlations with Happiness:")
for var, corr in happiness_correlations.items():
if var != 'Happiness_Index(1-10)':
print(f" {var}: {corr:.3f}")
max_happy_platform = df.groupby('Social_Media_Platform')['Happiness_Index(1-10)'].mean().idxmax()
min_happy_platform = df.groupby('Social_Media_Platform')['Happiness_Index(1-10)'].mean().idxmin()
print(f"\nPlatform with highest average happiness: {max_happy_platform}")
print(f"Platform with lowest average happiness: {min_happy_platform}")
Top correlations with Happiness: Sleep_Quality(1-10): 0.679 Days_Without_Social_Media: 0.064 Exercise_Frequency(week): 0.041 Age: 0.019 Daily_Screen_Time(hrs): -0.705 Stress_Level(1-10): -0.737 Platform with highest average happiness: X (Twitter) Platform with lowest average happiness: Instagram
In [37]:
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
sns.boxplot(data=df, x='Social_Media_Platform', y='Happiness_Index(1-10)')
plt.title('Happiness Index Distribution by Platform')
plt.xticks(rotation=45)
plt.subplot(2, 2, 2)
sns.boxplot(data=df, x='Social_Media_Platform', y='Stress_Level(1-10)')
plt.title('Stress Level Distribution by Platform')
plt.xticks(rotation=45)
plt.subplot(2, 2, 3)
sns.boxplot(data=df, x='Social_Media_Platform', y='Daily_Screen_Time(hrs)')
plt.title('Screen Time Distribution by Platform')
plt.xticks(rotation=45)
plt.subplot(2, 2, 4)
sns.boxplot(data=df, x='Social_Media_Platform', y='Sleep_Quality(1-10)')
plt.title('Sleep Quality Distribution by Platform')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
print("\nVisualization completed! Check the charts above for insights.")
Visualization completed! Check the charts above for insights.
In [ ]: